- fix for wrong entries in the NOLOAD indexing queue (which caused URLs to be indexed based only on their URL instead of being loaded)

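A minimal sketch of the first fix, condensed from the CrawlStacker.stackCrawl hunk further down (entry, maxFileSize, nextQueue and log are locals/fields of that method): the file-extension/parser shortcut that used to push URLs onto the NOLOAD stack is commented out, so only documents larger than the configured maximum file size are stacked without being loaded.

    if (entry.size() > maxFileSize /*||
        (entry.url().getFileExtension().length() > 0 && TextParser.supports(entry.url(), null) != null)
        */) {
        // only oversized documents go to NOLOAD (indexed from URL metadata, never fetched)
        warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.NOLOAD, entry);
        if (warning != null) this.log.logWarning("CrawlStacker.stackCrawl of URL " + entry.url().toNormalform(true, false) + " - not pushed: " + warning);
        return null;
    }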
- patch for better URLs to the Solr admin interface

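A condensed sketch of the second fix, the new admin-URL construction in SolrChardingConnector.getAdminInterfaceList() (full hunk at the end of the diff): occurrences of "localhost" or "127.0.0.1" in the connector URLs are replaced by the peer's public address from Domains.myPublicLocalIP(), so the returned Solr admin links are reachable from outside the local machine.

    final InetAddress localhostExternAddress = Domains.myPublicLocalIP();
    final String localhostExtern = localhostExternAddress == null ? "127.0.0.1" : localhostExternAddress.getHostAddress();
    for (String u: this.urls) {
        // "localhost" and "127.0.0.1" are both 9 characters long, so one substring offset covers both
        int p = u.indexOf("localhost"); if (p < 0) p = u.indexOf("127.0.0.1");
        if (p >= 0) u = u.substring(0, p) + localhostExtern + u.substring(p + 9);
        urlAdmin[i++] = u + (u.endsWith("/") ? "admin/" : "/admin/");
    }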
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7938 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 14 years ago
parent 2842ce30d6
commit 2cba860693

@@ -44,7 +44,6 @@ import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.ftp.FTPClient;
import net.yacy.document.TextParser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.logging.Log;
@@ -52,7 +51,6 @@ import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.workflow.WorkflowProcessor;
import net.yacy.repository.Blacklist;
import net.yacy.repository.FilterEngine;
import de.anomic.crawler.ResultURLs.EventOrigin;
import de.anomic.crawler.ZURL.FailCategory;
import de.anomic.crawler.retrieval.FTPLoader;
@@ -93,18 +91,18 @@ public final class CrawlStacker {
}
private Map<String, DomProfile> doms;
private final Map<String, DomProfile> doms;
// this is the process that checks url for double-occurrences and for allowance/disallowance by robots.txt
public CrawlStacker(
CrawlQueues cq,
CrawlSwitchboard cs,
Segment indexSegment,
yacySeedDB peers,
boolean acceptLocalURLs,
boolean acceptGlobalURLs,
FilterEngine domainList) {
final CrawlQueues cq,
final CrawlSwitchboard cs,
final Segment indexSegment,
final yacySeedDB peers,
final boolean acceptLocalURLs,
final boolean acceptGlobalURLs,
final FilterEngine domainList) {
this.nextQueue = cq;
this.crawler = cs;
this.indexSegment = indexSegment;
@@ -122,17 +120,17 @@ public final class CrawlStacker {
}
private void domInc(final String domain, final String referrer, final int depth) {
final DomProfile dp = doms.get(domain);
final DomProfile dp = this.doms.get(domain);
if (dp == null) {
// new domain
doms.put(domain, new DomProfile(referrer, depth));
this.doms.put(domain, new DomProfile(referrer, depth));
} else {
// increase counter
dp.inc();
}
}
public String domName(final boolean attr, final int index){
final Iterator<Map.Entry<String, DomProfile>> domnamesi = doms.entrySet().iterator();
final Iterator<Map.Entry<String, DomProfile>> domnamesi = this.doms.entrySet().iterator();
String domname="";
Map.Entry<String, DomProfile> ey;
DomProfile dp;
@@ -195,7 +193,7 @@ public final class CrawlStacker {
return false;
}
public Request job(Request entry) {
public Request job(final Request entry) {
// this is the method that is called by the busy thread from outside
if (entry == null) return null;
@@ -204,7 +202,7 @@ public final class CrawlStacker {
// if the url was rejected we store it into the error URL db
if (rejectReason != null) {
nextQueue.errorURL.push(entry, ASCII.getBytes(peers.mySeed().hash), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, rejectReason, -1);
this.nextQueue.errorURL.push(entry, ASCII.getBytes(this.peers.mySeed().hash), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, rejectReason, -1);
}
} catch (final Exception e) {
CrawlStacker.this.log.logWarning("Error while processing stackCrawl entry.\n" + "Entry: " + entry.toString() + "Error: " + e.toString(), e);
@@ -216,25 +214,25 @@ public final class CrawlStacker {
public void enqueueEntry(final Request entry) {
// DEBUG
if (log.isFinest()) log.logFinest("ENQUEUE " + entry.url() + ", referer=" + entry.referrerhash() + ", initiator=" + ((entry.initiator() == null) ? "" : ASCII.String(entry.initiator())) + ", name=" + entry.name() + ", appdate=" + entry.appdate() + ", depth=" + entry.depth());
if (this.log.isFinest()) this.log.logFinest("ENQUEUE " + entry.url() + ", referer=" + entry.referrerhash() + ", initiator=" + ((entry.initiator() == null) ? "" : ASCII.String(entry.initiator())) + ", name=" + entry.name() + ", appdate=" + entry.appdate() + ", depth=" + entry.depth());
if (prefetchHost(entry.url().getHost())) {
try {
this.fastQueue.enQueue(entry);
//this.dnsHit++;
} catch (InterruptedException e) {
} catch (final InterruptedException e) {
Log.logException(e);
}
} else {
try {
this.slowQueue.enQueue(entry);
this.dnsMiss++;
} catch (InterruptedException e) {
} catch (final InterruptedException e) {
Log.logException(e);
}
}
}
public void enqueueEntriesAsynchronous(final byte[] initiator, final String profileHandle, final Map<MultiProtocolURI, Properties> hyperlinks, boolean replace) {
public void enqueueEntriesAsynchronous(final byte[] initiator, final String profileHandle, final Map<MultiProtocolURI, Properties> hyperlinks, final boolean replace) {
new Thread() {
public void run() {
enqueueEntries(initiator, profileHandle, hyperlinks, true);
@@ -242,15 +240,15 @@ public final class CrawlStacker {
}.start();
}
private void enqueueEntries(byte[] initiator, String profileHandle, Map<MultiProtocolURI, Properties> hyperlinks, boolean replace) {
for (Map.Entry<MultiProtocolURI, Properties> e: hyperlinks.entrySet()) {
private void enqueueEntries(final byte[] initiator, final String profileHandle, final Map<MultiProtocolURI, Properties> hyperlinks, final boolean replace) {
for (final Map.Entry<MultiProtocolURI, Properties> e: hyperlinks.entrySet()) {
if (e.getKey() == null) continue;
// delete the old entry, if it exists, to force a re-load of the url (that's wanted here)
final DigestURI url = new DigestURI(e.getKey());
final byte[] urlhash = url.hash();
if (replace) {
indexSegment.urlMetadata().remove(urlhash);
this.indexSegment.urlMetadata().remove(urlhash);
this.nextQueue.urlRemove(urlhash);
String u = url.toNormalform(true, true);
if (u.endsWith("/")) {
@@ -259,11 +257,11 @@ public final class CrawlStacker {
u = u + "/index.html";
}
try {
byte[] uh = new DigestURI(u, null).hash();
indexSegment.urlMetadata().remove(uh);
final byte[] uh = new DigestURI(u, null).hash();
this.indexSegment.urlMetadata().remove(uh);
this.nextQueue.noticeURL.removeByURLHash(uh);
this.nextQueue.errorURL.remove(uh);
} catch (MalformedURLException e1) {}
} catch (final MalformedURLException e1) {}
}
if (url.getProtocol().equals("ftp")) {
@@ -301,12 +299,12 @@ public final class CrawlStacker {
DigestURI url = null;
try {
url = new DigestURI("ftp://" + host + (port == 21 ? "" : ":" + port) + MultiProtocolURI.escape(entry.name));
} catch (MalformedURLException e) {
} catch (final MalformedURLException e) {
continue;
}
final byte[] urlhash = url.hash();
if (replace) {
indexSegment.urlMetadata().remove(urlhash);
CrawlStacker.this.indexSegment.urlMetadata().remove(urlhash);
cq.noticeURL.removeByURLHash(urlhash);
cq.errorURL.remove(urlhash);
}
@@ -325,8 +323,8 @@ public final class CrawlStacker {
entry.size
));
}
} catch (IOException e1) {
} catch (InterruptedException e) {
} catch (final IOException e1) {
} catch (final InterruptedException e) {
}
}
}.start();
@@ -338,9 +336,9 @@ public final class CrawlStacker {
* @return null if successful, a reason string if not successful
*/
public String stackSimpleCrawl(final DigestURI url) {
CrawlProfile pe = this.crawler.defaultSurrogateProfile;
final CrawlProfile pe = this.crawler.defaultSurrogateProfile;
return stackCrawl(new Request(
peers.mySeed().hash.getBytes(),
this.peers.mySeed().hash.getBytes(),
url,
null,
"CRAWLING-ROOT",
@@ -361,11 +359,11 @@ public final class CrawlStacker {
public String stackCrawl(final Request entry) {
//this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'");
final CrawlProfile profile = crawler.getActive(UTF8.getBytes(entry.profileHandle()));
final CrawlProfile profile = this.crawler.getActive(UTF8.getBytes(entry.profileHandle()));
String error;
if (profile == null) {
error = "LOST STACKER PROFILE HANDLE '" + entry.profileHandle() + "' for URL " + entry.url();
log.logWarning(error);
this.log.logWarning(error);
return error;
}
@@ -373,16 +371,16 @@ public final class CrawlStacker {
if (error != null) return error;
// store information
final boolean local = Base64Order.enhancedCoder.equal(entry.initiator(), UTF8.getBytes(peers.mySeed().hash));
final boolean proxy = (entry.initiator() == null || entry.initiator().length == 0 || ASCII.String(entry.initiator()).equals("------------")) && profile.handle().equals(crawler.defaultProxyProfile.handle());
final boolean remote = profile.handle().equals(crawler.defaultRemoteProfile.handle());
final boolean local = Base64Order.enhancedCoder.equal(entry.initiator(), UTF8.getBytes(this.peers.mySeed().hash));
final boolean proxy = (entry.initiator() == null || entry.initiator().length == 0 || ASCII.String(entry.initiator()).equals("------------")) && profile.handle().equals(this.crawler.defaultProxyProfile.handle());
final boolean remote = profile.handle().equals(this.crawler.defaultRemoteProfile.handle());
final boolean global =
(profile.remoteIndexing()) /* granted */ &&
(entry.depth() == profile.depth()) /* leaf node */ &&
//(initiatorHash.equals(yacyCore.seedDB.mySeed.hash)) /* not proxy */ &&
(
(peers.mySeed().isSenior()) ||
(peers.mySeed().isPrincipal())
(this.peers.mySeed().isSenior()) ||
(this.peers.mySeed().isPrincipal())
) /* qualified */;
if (!local && !global && !remote && !proxy) {
@@ -393,7 +391,7 @@ public final class CrawlStacker {
long maxFileSize = Long.MAX_VALUE;
if (entry.size() > 0) {
String protocol = entry.url().getProtocol();
final String protocol = entry.url().getProtocol();
if (protocol.equals("http") || protocol.equals("https")) maxFileSize = Switchboard.getSwitchboard().getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
if (protocol.equals("ftp")) maxFileSize = Switchboard.getSwitchboard().getConfigLong("crawler.ftp.maxFileSize", FTPLoader.DEFAULT_MAXFILESIZE);
if (protocol.equals("smb")) maxFileSize = Switchboard.getSwitchboard().getConfigLong("crawler.smb.maxFileSize", SMBLoader.DEFAULT_MAXFILESIZE);
@@ -401,15 +399,15 @@ public final class CrawlStacker {
// check availability of parser and maxfilesize
String warning = null;
if (entry.size() > maxFileSize ||
if (entry.size() > maxFileSize /*||
(entry.url().getFileExtension().length() > 0 && TextParser.supports(entry.url(), null) != null)
) {
warning = nextQueue.noticeURL.push(NoticedURL.StackType.NOLOAD, entry);
*/) {
warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.NOLOAD, entry);
if (warning != null) this.log.logWarning("CrawlStacker.stackCrawl of URL " + entry.url().toNormalform(true, false) + " - not pushed: " + warning);
return null;
}
final DigestURI referrerURL = (entry.referrerhash() == null || entry.referrerhash().length == 0) ? null : nextQueue.getURL(entry.referrerhash());
final DigestURI referrerURL = (entry.referrerhash() == null || entry.referrerhash().length == 0) ? null : this.nextQueue.getURL(entry.referrerhash());
// add domain to profile domain list
if (profile.domMaxPages() != Integer.MAX_VALUE) {
@@ -420,23 +418,23 @@ public final class CrawlStacker {
// it may be possible that global == true and local == true, so do not check an error case against it
if (proxy) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: global = true, proxy = true, initiator = proxy" + ", profile.handle = " + profile.handle());
if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: global = true, remote = true, initiator = " + ASCII.String(entry.initiator()) + ", profile.handle = " + profile.handle());
warning = nextQueue.noticeURL.push(NoticedURL.StackType.LIMIT, entry);
warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.LIMIT, entry);
} else if (local) {
if (proxy) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: local = true, proxy = true, initiator = proxy" + ", profile.handle = " + profile.handle());
if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: local = true, remote = true, initiator = " + ASCII.String(entry.initiator()) + ", profile.handle = " + profile.handle());
warning = nextQueue.noticeURL.push(NoticedURL.StackType.CORE, entry);
warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.CORE, entry);
} else if (proxy) {
if (remote) this.log.logWarning("URL '" + entry.url().toString() + "' has conflicting initiator properties: proxy = true, remote = true, initiator = " + ASCII.String(entry.initiator()) + ", profile.handle = " + profile.handle());
warning = nextQueue.noticeURL.push(NoticedURL.StackType.CORE, entry);
warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.CORE, entry);
} else if (remote) {
warning = nextQueue.noticeURL.push(NoticedURL.StackType.REMOTE, entry);
warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.REMOTE, entry);
}
if (warning != null) this.log.logWarning("CrawlStacker.stackCrawl of URL " + entry.url().toNormalform(true, false) + " - not pushed: " + warning);
return null;
}
public String checkAcceptance(final DigestURI url, final CrawlProfile profile, int depth) {
public String checkAcceptance(final DigestURI url, final CrawlProfile profile, final int depth) {
// check if the protocol is supported
final String urlProtocol = url.getProtocol();
@@ -483,14 +481,14 @@ public final class CrawlStacker {
}
// check if the url is double registered
final String dbocc = nextQueue.urlExists(url.hash()); // returns the name of the queue if entry exists
URIMetadataRow oldEntry = indexSegment.urlMetadata().load(url.hash());
final String dbocc = this.nextQueue.urlExists(url.hash()); // returns the name of the queue if entry exists
final URIMetadataRow oldEntry = this.indexSegment.urlMetadata().load(url.hash());
if (oldEntry == null) {
if (dbocc != null) {
// do double-check
if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' is double registered in '" + dbocc + "'.");
if (dbocc.equals("errors")) {
ZURL.Entry errorEntry = nextQueue.errorURL.get(url.hash());
final ZURL.Entry errorEntry = this.nextQueue.errorURL.get(url.hash());
return "double in: errors (" + errorEntry.anycause() + ")";
} else {
return "double in: " + dbocc;
@@ -508,7 +506,7 @@ public final class CrawlStacker {
} else {
if (this.log.isInfo()) this.log.logInfo("URL '" + url.toString() + "' is double registered in '" + dbocc + "'. " + "Stack processing time:");
if (dbocc.equals("errors")) {
ZURL.Entry errorEntry = nextQueue.errorURL.get(url.hash());
final ZURL.Entry errorEntry = this.nextQueue.errorURL.get(url.hash());
return "double in: errors (" + errorEntry.anycause() + ")";
} else {
return "double in: " + dbocc;
@@ -520,7 +518,7 @@ public final class CrawlStacker {
// deny urls that exceed allowed number of occurrences
final int maxAllowedPagesPerDomain = profile.domMaxPages();
if (maxAllowedPagesPerDomain < Integer.MAX_VALUE) {
final DomProfile dp = doms.get(url.getHost());
final DomProfile dp = this.doms.get(url.getHost());
if (dp != null && dp.count >= maxAllowedPagesPerDomain) {
if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' appeared too often in crawl stack, a maximum of " + profile.domMaxPages() + " is allowed.");
return "crawl stack domain counter exceeded";
@@ -559,7 +557,7 @@ public final class CrawlStacker {
// check if this is a local address and we are allowed to index local pages:
//boolean local = hostAddress.isSiteLocalAddress() || hostAddress.isLoopbackAddress();
//assert local == yacyURL.isLocalDomain(url.hash()); // TODO: remove the dnsResolve above!
InetAddress ia = Domains.dnsResolve(host);
final InetAddress ia = Domains.dnsResolve(host);
return (local) ?
("the host '" + host + "' is local, but local addresses are not accepted: " + ((ia == null) ? "null" : ia.getHostAddress())) :
("the host '" + host + "' is global, but global addresses are not accepted: " + ((ia == null) ? "null" : ia.getHostAddress()));

@@ -25,10 +25,12 @@
package net.yacy.cora.services.federated.solr;
import java.io.IOException;
import java.net.InetAddress;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.document.Document;
import net.yacy.kelondro.data.meta.DigestURI;
@@ -183,7 +185,11 @@ public class SolrChardingConnector {
public String[] getAdminInterfaceList() {
final String[] urlAdmin = new String[this.connectors.size()];
int i = 0;
for (final String u: this.urls) {
final InetAddress localhostExternAddress = Domains.myPublicLocalIP();
final String localhostExtern = localhostExternAddress == null ? "127.0.0.1" : localhostExternAddress.getHostAddress();
for (String u: this.urls) {
int p = u.indexOf("localhost"); if (p < 0) p = u.indexOf("127.0.0.1");
if (p >= 0) u = u.substring(0, p) + localhostExtern + u.substring(p + 9);
urlAdmin[i++] = u + (u.endsWith("/") ? "admin/" : "/admin/");
}
return urlAdmin;
