refactoring

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5723 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
borg-0300 16 years ago
parent 83792d9233
commit fd0976c0a7

@ -4,9 +4,9 @@
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
//
@ -282,27 +282,36 @@ public class CrawlQueues {
* @return
*/
private boolean crawlIsPossible(int stackType, final String type) {
int value;
if (noticeURL.stackSize(stackType) == 0) {
//log.logDebug("GlobalCrawl: queue is empty");
return false;
}
if (sb.webIndex.queuePreStack.size() >= (int) sb.getConfigLong(plasmaSwitchboardConstants.INDEXER_SLOTS, 30)) {
if (this.log.isFine()) log.logFine(type + "Crawl: too many processes in indexing queue, dismissed (" + "sbQueueSize=" + sb.webIndex.queuePreStack.size() + ")");
value = (int) sb.getConfigLong(plasmaSwitchboardConstants.INDEXER_SLOTS, 30);
if (sb.webIndex.queuePreStack.size() >= value) {
if (this.log.isFine()) {
log.logFine(type + "Crawl: too many processes in indexing queue, dismissed (" + "sbQueueSize=" + sb.webIndex.queuePreStack.size() + ")");
}
return false;
}
if (this.size() >= sb.getConfigLong(plasmaSwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX, 10)) {
value = (int) sb.getConfigLong(plasmaSwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX, 10);
if (this.size() >= value) {
// try a cleanup
this.cleanup();
}
// check again
if (this.size() >= sb.getConfigLong(plasmaSwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX, 10)) {
if (this.log.isFine()) log.logFine(type + "Crawl: too many processes in loader queue, dismissed (" + "cacheLoader=" + this.size() + ")");
if (this.size() >= value) {
if (this.log.isFine()) {
log.logFine(type + "Crawl: too many processes in loader queue, dismissed (" + "cacheLoader=" + this.size() + ")");
}
return false;
}
if (sb.onlineCaution()) {
if (this.log.isFine()) log.logFine(type + "Crawl: online caution, omitting processing");
if (this.log.isFine()) {
log.logFine(type + "Crawl: online caution, omitting processing");
}
return false;
}
return true;

@ -42,7 +42,7 @@ import de.anomic.yacy.yacyURL;
public final class CrawlStacker {
final Log log = new Log("STACKCRAWL");
private Log log = new Log("STACKCRAWL");
private serverProcessor<CrawlEntry> fastQueue, slowQueue;
private long dnsHit, dnsMiss;
@ -137,7 +137,7 @@ public final class CrawlStacker {
public void enqueueEntry(final CrawlEntry entry) {
// DEBUG
if (log.isFinest()) log.logFinest("ENQUEUE "+ entry.url() +", referer="+entry.referrerhash() +", initiator="+entry.initiator() +", name="+entry.name() +", load="+entry.loaddate() +", depth="+entry.depth());
if (log.isFinest()) log.logFinest("ENQUEUE " + entry.url() + ", referer=" + entry.referrerhash() + ", initiator=" + entry.initiator() + ", name=" + entry.name() + ", load=" + entry.loaddate() + ", depth=" + entry.depth());
if (prefetchHost(entry.url().getHost())) {
try {
@ -162,31 +162,27 @@ public final class CrawlStacker {
//this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'");
final long startTime = System.currentTimeMillis();
String reason = null; // failure reason
// check if the protocol is supported
final String urlProtocol = entry.url().getProtocol();
if (!nextQueue.isSupportedProtocol(urlProtocol)) {
reason = "unsupported protocol";
this.log.logSevere("Unsupported protocol in URL '" + entry.url().toString() + "'. " +
"Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
return reason;
"Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
return "unsupported protocol";
}
// check if ip is local ip address
final String urlRejectReason = urlInAcceptedDomain(entry.url());
if (urlRejectReason != null) {
reason = "denied_(" + urlRejectReason + ")";
if (this.log.isFine()) this.log.logFine(reason + "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
return reason;
if (this.log.isFine()) this.log.logFine("denied_(" + urlRejectReason + ") Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
return "denied_(" + urlRejectReason + ")";
}
// check blacklist
if (plasmaSwitchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, entry.url())) {
reason = "url in blacklist";
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is in blacklist. " +
"Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
return reason;
"Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
return "url in blacklist";
}
final CrawlProfile.entry profile = wordIndex.profilesActiveCrawls.getEntry(entry.profileHandle());
@ -198,36 +194,30 @@ public final class CrawlStacker {
// filter with must-match
if ((entry.depth() > 0) && !profile.mustMatchPattern().matcher(entry.url().toString()).matches()) {
reason = "url does not match must-match filter";
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' does not match must-match crawling filter '" + profile.mustMatchPattern().toString() + "'. " +
"Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
return reason;
"Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
return "url does not match must-match filter";
}
// filter with must-not-match
if ((entry.depth() > 0) && profile.mustNotMatchPattern().matcher(entry.url().toString()).matches()) {
reason = "url matches must-not-match filter";
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' does matches do-not-match crawling filter '" + profile.mustNotMatchPattern().toString() + "'. " +
"Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
return reason;
"Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
return "url matches must-not-match filter";
}
// deny cgi
if (entry.url().isCGI()) {
reason = "cgi url not allowed";
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is CGI URL. " +
"Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
return reason;
"Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
return "cgi url not allowed";
}
// deny post properties
if (entry.url().isPOST() && !(profile.crawlingQ())) {
reason = "post url not allowed";
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is post URL. " +
"Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
return reason;
"Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
return "post url not allowed";
}
final yacyURL referrerURL = (entry.referrerhash() == null) ? null : nextQueue.getURL(entry.referrerhash());
@ -239,18 +229,16 @@ public final class CrawlStacker {
// deny urls that do not match with the profile domain list
if (!(profile.grantedDomAppearance(entry.url().getHost()))) {
reason = "url does not match domain filter";
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is not listed in granted domains. " +
"Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
return reason;
"Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
return "url does not match domain filter";
}
// deny urls that exceed allowed number of occurrences
if (!(profile.grantedDomCount(entry.url().getHost()))) {
reason = "domain counter exceeded";
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' appeared too often, a maximum of " + profile.domMaxPages() + " is allowed. "+
"Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
return reason;
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' appeared too often, a maximum of " + profile.domMaxPages() + " is allowed. " +
"Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
return "domain counter exceeded";
}
// check if the url is double registered
@ -260,14 +248,12 @@ public final class CrawlStacker {
final boolean recrawl = (oldEntry != null) && (profile.recrawlIfOlder() > oldEntry.loaddate().getTime());
// do double-check
if ((dbocc != null) && (!recrawl)) {
reason = "double " + dbocc;
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is double registered in '" + dbocc + "'. " + "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
return reason;
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is double registered in '" + dbocc + "'. " + "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
return "double " + dbocc;
}
if ((oldEntry != null) && (!recrawl)) {
reason = "double LURL";
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is double registered in 'LURL'. " + "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
return reason;
if (this.log.isFine()) this.log.logFine("URL '" + entry.url().toString() + "' is double registered in 'LURL'. " + "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
return "double LURL";
}
// show potential re-crawl
@ -313,7 +299,6 @@ public final class CrawlStacker {
return null;
}
/**
* Tests whether a URL can be used for crawling/indexing.
* This mainly checks whether the URL is in the declared domain (local/global)

Loading…
Cancel
Save