refactoring

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5723 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
borg-0300 16 years ago
parent 83792d9233
commit fd0976c0a7

@ -4,9 +4,9 @@
// //
// This is a part of YaCy, a peer-to-peer based web search engine // This is a part of YaCy, a peer-to-peer based web search engine
// //
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ // $LastChangedDate$
// $LastChangedRevision: 1986 $ // $LastChangedRevision$
// $LastChangedBy: orbiter $ // $LastChangedBy$
// //
// LICENSE // LICENSE
// //
@ -282,27 +282,36 @@ public class CrawlQueues {
* @return * @return
*/ */
private boolean crawlIsPossible(int stackType, final String type) { private boolean crawlIsPossible(int stackType, final String type) {
int value;
if (noticeURL.stackSize(stackType) == 0) { if (noticeURL.stackSize(stackType) == 0) {
//log.logDebug("GlobalCrawl: queue is empty"); //log.logDebug("GlobalCrawl: queue is empty");
return false; return false;
} }
if (sb.webIndex.queuePreStack.size() >= (int) sb.getConfigLong(plasmaSwitchboardConstants.INDEXER_SLOTS, 30)) { value = (int) sb.getConfigLong(plasmaSwitchboardConstants.INDEXER_SLOTS, 30);
if (this.log.isFine()) log.logFine(type + "Crawl: too many processes in indexing queue, dismissed (" + "sbQueueSize=" + sb.webIndex.queuePreStack.size() + ")"); if (sb.webIndex.queuePreStack.size() >= value) {
if (this.log.isFine()) {
log.logFine(type + "Crawl: too many processes in indexing queue, dismissed (" + "sbQueueSize=" + sb.webIndex.queuePreStack.size() + ")");
}
return false; return false;
} }
if (this.size() >= sb.getConfigLong(plasmaSwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX, 10)) { value = (int) sb.getConfigLong(plasmaSwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX, 10);
if (this.size() >= value) {
// try a cleanup // try a cleanup
this.cleanup(); this.cleanup();
} }
// check again // check again
if (this.size() >= sb.getConfigLong(plasmaSwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX, 10)) { if (this.size() >= value) {
if (this.log.isFine()) log.logFine(type + "Crawl: too many processes in loader queue, dismissed (" + "cacheLoader=" + this.size() + ")"); if (this.log.isFine()) {
log.logFine(type + "Crawl: too many processes in loader queue, dismissed (" + "cacheLoader=" + this.size() + ")");
}
return false; return false;
} }
if (sb.onlineCaution()) { if (sb.onlineCaution()) {
if (this.log.isFine()) log.logFine(type + "Crawl: online caution, omitting processing"); if (this.log.isFine()) {
log.logFine(type + "Crawl: online caution, omitting processing");
}
return false; return false;
} }
return true; return true;

@ -42,7 +42,7 @@ import de.anomic.yacy.yacyURL;
public final class CrawlStacker { public final class CrawlStacker {
final Log log = new Log("STACKCRAWL"); private Log log = new Log("STACKCRAWL");
private serverProcessor<CrawlEntry> fastQueue, slowQueue; private serverProcessor<CrawlEntry> fastQueue, slowQueue;
private long dnsHit, dnsMiss; private long dnsHit, dnsMiss;
@ -137,7 +137,7 @@ public final class CrawlStacker {
public void enqueueEntry(final CrawlEntry entry) { public void enqueueEntry(final CrawlEntry entry) {
// DEBUG // DEBUG
if (log.isFinest()) log.logFinest("ENQUEUE "+ entry.url() +", referer="+entry.referrerhash() +", initiator="+entry.initiator() +", name="+entry.name() +", load="+entry.loaddate() +", depth="+entry.depth()); if (log.isFinest()) log.logFinest("ENQUEUE " + entry.url() + ", referer=" + entry.referrerhash() + ", initiator=" + entry.initiator() + ", name=" + entry.name() + ", load=" + entry.loaddate() + ", depth=" + entry.depth());
if (prefetchHost(entry.url().getHost())) { if (prefetchHost(entry.url().getHost())) {
try { try {
@ -162,31 +162,27 @@ public final class CrawlStacker {
//this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'"); //this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'");
final long startTime = System.currentTimeMillis(); final long startTime = System.currentTimeMillis();
String reason = null; // failure reason
// check if the protocol is supported // check if the protocol is supported
final String urlProtocol = entry.url().getProtocol(); final String urlProtocol = entry.url().getProtocol();
if (!nextQueue.isSupportedProtocol(urlProtocol)) { if (!nextQueue.isSupportedProtocol(urlProtocol)) {
reason = "unsupported protocol";
this.log.logSevere("Unsupported protocol in URL '" + entry.url().toString() + "'. " + this.log.logSevere("Unsupported protocol in URL '" + entry.url().toString() + "'. " +
"Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms"); "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
return reason; return "unsupported protocol";
} }
// check if ip is local ip address // check if ip is local ip address
final String urlRejectReason = urlInAcceptedDomain(entry.url()); final String urlRejectReason = urlInAcceptedDomain(entry.url());
if (urlRejectReason != null) { if (urlRejectReason != null) {
reason = "denied_(" + urlRejectReason + ")"; if (this.log.isFine()) this.log.logFine("denied_(" + urlRejectReason + ") Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
if (this.log.isFine()) this.log.logFine(reason + "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms"); return "denied_(" + urlRejectReason + ")";
return reason;
} }
// check blacklist // check blacklist
if (plasmaSwitchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, entry.url())) { if (plasmaSwitchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, entry.url())) {
reason = "url in blacklist";
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is in blacklist. " + if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is in blacklist. " +
"Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms"); "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
return reason; return "url in blacklist";
} }
final CrawlProfile.entry profile = wordIndex.profilesActiveCrawls.getEntry(entry.profileHandle()); final CrawlProfile.entry profile = wordIndex.profilesActiveCrawls.getEntry(entry.profileHandle());
@ -198,36 +194,30 @@ public final class CrawlStacker {
// filter with must-match // filter with must-match
if ((entry.depth() > 0) && !profile.mustMatchPattern().matcher(entry.url().toString()).matches()) { if ((entry.depth() > 0) && !profile.mustMatchPattern().matcher(entry.url().toString()).matches()) {
reason = "url does not match must-match filter";
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' does not match must-match crawling filter '" + profile.mustMatchPattern().toString() + "'. " + if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' does not match must-match crawling filter '" + profile.mustMatchPattern().toString() + "'. " +
"Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms"); "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
return reason; return "url does not match must-match filter";
} }
// filter with must-not-match // filter with must-not-match
if ((entry.depth() > 0) && profile.mustNotMatchPattern().matcher(entry.url().toString()).matches()) { if ((entry.depth() > 0) && profile.mustNotMatchPattern().matcher(entry.url().toString()).matches()) {
reason = "url matches must-not-match filter";
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' does matches do-not-match crawling filter '" + profile.mustNotMatchPattern().toString() + "'. " + if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' does matches do-not-match crawling filter '" + profile.mustNotMatchPattern().toString() + "'. " +
"Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms"); "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
return reason; return "url matches must-not-match filter";
} }
// deny cgi // deny cgi
if (entry.url().isCGI()) { if (entry.url().isCGI()) {
reason = "cgi url not allowed";
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is CGI URL. " + if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is CGI URL. " +
"Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms"); "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
return reason; return "cgi url not allowed";
} }
// deny post properties // deny post properties
if (entry.url().isPOST() && !(profile.crawlingQ())) { if (entry.url().isPOST() && !(profile.crawlingQ())) {
reason = "post url not allowed";
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is post URL. " + if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is post URL. " +
"Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms"); "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
return reason; return "post url not allowed";
} }
final yacyURL referrerURL = (entry.referrerhash() == null) ? null : nextQueue.getURL(entry.referrerhash()); final yacyURL referrerURL = (entry.referrerhash() == null) ? null : nextQueue.getURL(entry.referrerhash());
@ -239,18 +229,16 @@ public final class CrawlStacker {
// deny urls that do not match with the profile domain list // deny urls that do not match with the profile domain list
if (!(profile.grantedDomAppearance(entry.url().getHost()))) { if (!(profile.grantedDomAppearance(entry.url().getHost()))) {
reason = "url does not match domain filter";
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is not listed in granted domains. " + if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is not listed in granted domains. " +
"Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms"); "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
return reason; return "url does not match domain filter";
} }
// deny urls that exceed allowed number of occurrences // deny urls that exceed allowed number of occurrences
if (!(profile.grantedDomCount(entry.url().getHost()))) { if (!(profile.grantedDomCount(entry.url().getHost()))) {
reason = "domain counter exceeded"; if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' appeared too often, a maximum of " + profile.domMaxPages() + " is allowed. " +
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' appeared too often, a maximum of " + profile.domMaxPages() + " is allowed. "+ "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
"Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms"); return "domain counter exceeded";
return reason;
} }
// check if the url is double registered // check if the url is double registered
@ -260,14 +248,12 @@ public final class CrawlStacker {
final boolean recrawl = (oldEntry != null) && (profile.recrawlIfOlder() > oldEntry.loaddate().getTime()); final boolean recrawl = (oldEntry != null) && (profile.recrawlIfOlder() > oldEntry.loaddate().getTime());
// do double-check // do double-check
if ((dbocc != null) && (!recrawl)) { if ((dbocc != null) && (!recrawl)) {
reason = "double " + dbocc; if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is double registered in '" + dbocc + "'. " + "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is double registered in '" + dbocc + "'. " + "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms"); return "double " + dbocc;
return reason;
} }
if ((oldEntry != null) && (!recrawl)) { if ((oldEntry != null) && (!recrawl)) {
reason = "double LURL"; if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is double registered in 'LURL'. " + "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is double registered in 'LURL'. " + "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms"); return "double LURL";
return reason;
} }
// show potential re-crawl // show potential re-crawl
@ -313,7 +299,6 @@ public final class CrawlStacker {
return null; return null;
} }
/** /**
* Test a url if it can be used for crawling/indexing * Test a url if it can be used for crawling/indexing
* This mainly checks if the url is in the declared domain (local/global) * This mainly checks if the url is in the declared domain (local/global)

Loading…
Cancel
Save