refactoring

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5723 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
borg-0300 16 years ago
parent 83792d9233
commit fd0976c0a7

File diff suppressed because it is too large Load Diff

@ -41,17 +41,17 @@ import de.anomic.server.serverProcessor;
import de.anomic.yacy.yacyURL;
public final class CrawlStacker {
final Log log = new Log("STACKCRAWL");
private Log log = new Log("STACKCRAWL");
private serverProcessor<CrawlEntry> fastQueue, slowQueue;
private long dnsHit, dnsMiss;
private CrawlQueues nextQueue;
private plasmaWordIndex wordIndex;
private boolean acceptLocalURLs, acceptGlobalURLs;
// this is the process that checks URLs for double occurrences and for allowance/disallowance by robots.txt
public CrawlStacker(CrawlQueues cq, plasmaWordIndex wordIndex, boolean acceptLocalURLs, boolean acceptGlobalURLs) {
this.nextQueue = cq;
this.wordIndex = wordIndex;
@ -59,10 +59,10 @@ public final class CrawlStacker {
this.dnsMiss = 0;
this.acceptLocalURLs = acceptLocalURLs;
this.acceptGlobalURLs = acceptGlobalURLs;
this.fastQueue = new serverProcessor<CrawlEntry>("CrawlStackerFast", "This process checks new urls before they are enqueued into the balancer (proper, double-check, correct domain, filter)", new String[]{"Balancer"}, this, "job", 10000, null, 2);
this.slowQueue = new serverProcessor<CrawlEntry>("CrawlStackerSlow", "This is like CrawlStackerFast, but does additionaly a DNS lookup. The CrawlStackerFast does not need this because it can use the DNS cache.", new String[]{"Balancer"}, this, "job", 1000, null, 5);
this.log.logInfo("STACKCRAWL thread initialized.");
}
@ -74,20 +74,20 @@ public final class CrawlStacker {
this.fastQueue.clear();
this.slowQueue.clear();
}
/**
 * Announces the shutdown to both stacker queues.
 * Before doing so, logs how many job entries size() currently reports.
 */
public void announceClose() {
    final String notice = "Flushing remaining " + size() + " crawl stacker job entries.";
    log.logInfo(notice);
    // fast queue first, then slow queue
    fastQueue.announceShutdown();
    slowQueue.announceShutdown();
}
public void close() {
this.log.logInfo("Shutdown. waiting for remaining " + size() + " crawl stacker job entries. please wait.");
this.fastQueue.announceShutdown();
this.slowQueue.announceShutdown();
this.fastQueue.awaitShutdown(2000);
this.slowQueue.awaitShutdown(2000);
this.log.logInfo("Shutdown. Closing stackCrawl queue.");
clear();
@ -105,7 +105,7 @@ public final class CrawlStacker {
// we just don't know anything about that host
return false;
}
/*
public boolean job() {
if (this.fastQueue.queueSize() > 0 && job(this.fastQueue)) return true;
@ -113,7 +113,7 @@ public final class CrawlStacker {
return job(this.slowQueue);
}
*/
public CrawlEntry job(CrawlEntry entry) {
// this is the method that is called by the busy thread from outside
if (entry == null) return null;
@ -133,11 +133,11 @@ public final class CrawlStacker {
}
return null;
}
public void enqueueEntry(final CrawlEntry entry) {
// DEBUG
if (log.isFinest()) log.logFinest("ENQUEUE "+ entry.url() +", referer="+entry.referrerhash() +", initiator="+entry.initiator() +", name="+entry.name() +", load="+entry.loaddate() +", depth="+entry.depth());
if (log.isFinest()) log.logFinest("ENQUEUE " + entry.url() + ", referer=" + entry.referrerhash() + ", initiator=" + entry.initiator() + ", name=" + entry.name() + ", load=" + entry.loaddate() + ", depth=" + entry.depth());
if (prefetchHost(entry.url().getHost())) {
try {
@ -149,89 +149,79 @@ public final class CrawlStacker {
} else {
try {
this.slowQueue.enQueue(entry);
this.dnsMiss++;
this.dnsMiss++;
} catch (InterruptedException e) {
e.printStackTrace();
}
}
}
public String stackCrawl(final CrawlEntry entry) {
// stacks a crawl item. The position can also be remote
// returns null if successful, a reason string if not successful
//this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'");
final long startTime = System.currentTimeMillis();
String reason = null; // failure reason
// check if the protocol is supported
final String urlProtocol = entry.url().getProtocol();
if (!nextQueue.isSupportedProtocol(urlProtocol)) {
reason = "unsupported protocol";
this.log.logSevere("Unsupported protocol in URL '" + entry.url().toString() + "'. " +
"Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
return reason;
this.log.logSevere("Unsupported protocol in URL '" + entry.url().toString() + "'. " +
"Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
return "unsupported protocol";
}
// check if ip is local ip address
final String urlRejectReason = urlInAcceptedDomain(entry.url());
if (urlRejectReason != null) {
reason = "denied_(" + urlRejectReason + ")";
if (this.log.isFine()) this.log.logFine(reason + "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
return reason;
if (this.log.isFine()) this.log.logFine("denied_(" + urlRejectReason + ") Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
return "denied_(" + urlRejectReason + ")";
}
// check blacklist
if (plasmaSwitchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, entry.url())) {
reason = "url in blacklist";
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is in blacklist. " +
"Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
return reason;
"Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
return "url in blacklist";
}
final CrawlProfile.entry profile = wordIndex.profilesActiveCrawls.getEntry(entry.profileHandle());
if (profile == null) {
final String errorMsg = "LOST STACKER PROFILE HANDLE '" + entry.profileHandle() + "' for URL " + entry.url();
log.logWarning(errorMsg);
return errorMsg;
}
// filter with must-match
if ((entry.depth() > 0) && !profile.mustMatchPattern().matcher(entry.url().toString()).matches()) {
reason = "url does not match must-match filter";
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' does not match must-match crawling filter '" + profile.mustMatchPattern().toString() + "'. " +
"Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
return reason;
"Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
return "url does not match must-match filter";
}
// filter with must-not-match
if ((entry.depth() > 0) && profile.mustNotMatchPattern().matcher(entry.url().toString()).matches()) {
reason = "url matches must-not-match filter";
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' does matches do-not-match crawling filter '" + profile.mustNotMatchPattern().toString() + "'. " +
"Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
return reason;
"Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
return "url matches must-not-match filter";
}
// deny cgi
if (entry.url().isCGI()) {
reason = "cgi url not allowed";
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is CGI URL. " +
"Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
return reason;
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is CGI URL. " +
"Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
return "cgi url not allowed";
}
// deny post properties
if (entry.url().isPOST() && !(profile.crawlingQ())) {
reason = "post url not allowed";
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is post URL. " +
"Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
return reason;
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is post URL. " +
"Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
return "post url not allowed";
}
final yacyURL referrerURL = (entry.referrerhash() == null) ? null : nextQueue.getURL(entry.referrerhash());
// add domain to profile domain list
if ((profile.domFilterDepth() != Integer.MAX_VALUE) || (profile.domMaxPages() != Integer.MAX_VALUE)) {
profile.domInc(entry.url().getHost(), (referrerURL == null) ? null : referrerURL.getHost().toLowerCase(), entry.depth());
@ -239,18 +229,16 @@ public final class CrawlStacker {
// deny urls that do not match with the profile domain list
if (!(profile.grantedDomAppearance(entry.url().getHost()))) {
reason = "url does not match domain filter";
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is not listed in granted domains. " +
"Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
return reason;
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is not listed in granted domains. " +
"Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
return "url does not match domain filter";
}
// deny urls that exceed allowed number of occurrences
if (!(profile.grantedDomCount(entry.url().getHost()))) {
reason = "domain counter exceeded";
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' appeared too often, a maximum of " + profile.domMaxPages() + " is allowed. "+
"Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
return reason;
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' appeared too often, a maximum of " + profile.domMaxPages() + " is allowed. " +
"Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
return "domain counter exceeded";
}
// check if the url is double registered
@ -260,36 +248,34 @@ public final class CrawlStacker {
final boolean recrawl = (oldEntry != null) && (profile.recrawlIfOlder() > oldEntry.loaddate().getTime());
// do double-check
if ((dbocc != null) && (!recrawl)) {
reason = "double " + dbocc;
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is double registered in '" + dbocc + "'. " + "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
return reason;
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is double registered in '" + dbocc + "'. " + "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
return "double " + dbocc;
}
if ((oldEntry != null) && (!recrawl)) {
reason = "double LURL";
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is double registered in 'LURL'. " + "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
return reason;
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is double registered in 'LURL'. " + "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
return "double LURL";
}
// show potential re-crawl
if (recrawl && oldEntry != null) {
if (this.log.isFine()) this.log.logFine("RE-CRAWL of URL '" + entry.url().toString() + "': this url was crawled " +
((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000 / 60 / 24) + " days ago.");
}
}
// store information
final boolean local = entry.initiator().equals(wordIndex.peers().mySeed().hash);
final boolean proxy = (entry.initiator() == null || entry.initiator().equals("------------")) && profile.handle().equals(wordIndex.defaultProxyProfile.handle());
final boolean remote = profile.handle().equals(wordIndex.defaultRemoteProfile.handle());
final boolean global =
final boolean global =
(profile.remoteIndexing()) /* granted */ &&
(entry.depth() == profile.depth()) /* leaf node */ &&
(entry.depth() == profile.depth()) /* leaf node */ &&
//(initiatorHash.equals(yacyCore.seedDB.mySeed.hash)) /* not proxy */ &&
(
(wordIndex.peers().mySeed().isSenior()) ||
(wordIndex.peers().mySeed().isPrincipal())
) /* qualified */;
if (!local && !global && !remote && !proxy) {
this.log.logSevere("URL '" + entry.url().toString() + "' cannot be crawled. initiator = " + entry.initiator() + ", profile.handle = " + profile.handle());
} else {
@ -309,10 +295,9 @@ public final class CrawlStacker {
nextQueue.noticeURL.push(NoticedURL.STACK_TYPE_REMOTE, entry);
}
}
return null;
}
/**
* Test a url if it can be used for crawling/indexing
@ -344,11 +329,11 @@ public final class CrawlStacker {
("the host '" + host + "' is local, but local addresses are not accepted") :
("the host '" + host + "' is global, but global addresses are not accepted");
}
/**
 * Reports the current value of the acceptLocalURLs flag of this stacker.
 *
 * @return the stored acceptLocalURLs flag
 */
public boolean acceptLocalURLs() {
    return acceptLocalURLs;
}
/**
 * Reports the current value of the acceptGlobalURLs flag of this stacker.
 *
 * @return the stored acceptGlobalURLs flag
 */
public boolean acceptGlobalURLs() {
    return acceptGlobalURLs;
}

Loading…
Cancel
Save