fixed a crawler bug where a double-occurring url was not re-crawled

because the double-check error was written to the error-db and never
deleted. No the error-db is cleared on every start and these
double-messages are not written to the error-db any more.
pull/1/head
Michael Peter Christen 12 years ago
parent 765943a4b7
commit e4cbe9232d

@ -148,7 +148,7 @@ public final class CrawlStacker {
final String rejectReason = stackCrawl(entry);
// if the url was rejected we store it into the error URL db
if (rejectReason != null) {
if (rejectReason != null && !rejectReason.startsWith("double in")) {
final CrawlProfile profile = this.crawler.getActive(UTF8.getBytes(entry.profileHandle()));
this.nextQueue.errorURL.push(entry, profile, ASCII.getBytes(this.peers.mySeed().hash), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, rejectReason, -1);
}
@ -436,11 +436,12 @@ public final class CrawlStacker {
if (dbocc == null) {
return "double in: LURL-DB, oldDate = " + oldDate.toString();
}
if (this.log.isInfo()) this.log.info("URL '" + urlstring + "' is double registered in '" + dbocc.toString() + "'. " + "Stack processing time:");
if (dbocc == HarvestProcess.ERRORS) {
final ZURL.Entry errorEntry = this.nextQueue.errorURL.get(url.hash());
if (this.log.isInfo()) this.log.info("URL '" + urlstring + "' is double registered in '" + dbocc.toString() + "', previous cause: " + errorEntry.anycause());
return "double in: errors (" + errorEntry.anycause() + "), oldDate = " + oldDate.toString();
}
if (this.log.isInfo()) this.log.info("URL '" + urlstring + "' is double registered in '" + dbocc.toString() + "'. ");
return "double in: " + dbocc.toString() + ", oldDate = " + oldDate.toString();
}
}

@ -85,6 +85,7 @@ public class CrawlQueues {
FileUtils.deletedelete(new File(queuePath, ERROR_DB_FILENAME));
this.errorURL = new ZURL(sb.index.fulltext(), queuePath, ERROR_DB_FILENAME, false, sb.useTailCache, sb.exceed134217727);
this.delegatedURL = new ZURL(sb.index.fulltext(), queuePath, DELEGATED_DB_FILENAME, true, sb.useTailCache, sb.exceed134217727);
try {this.errorURL.clear();} catch (IOException e) {} // start with empty errors each time
}
public void relocate(final File newQueuePath) {
@ -97,6 +98,7 @@ public class CrawlQueues {
FileUtils.deletedelete(new File(newQueuePath, ERROR_DB_FILENAME));
this.errorURL = new ZURL(this.sb.index.fulltext(), newQueuePath, ERROR_DB_FILENAME, false, this.sb.useTailCache, this.sb.exceed134217727);
this.delegatedURL = new ZURL(this.sb.index.fulltext(), newQueuePath, DELEGATED_DB_FILENAME, true, this.sb.useTailCache, this.sb.exceed134217727);
try {this.errorURL.clear();} catch (IOException e) {} // start with empty errors each time
}
public synchronized void close() {

Loading…
Cancel
Save