fixed bug with double-check in crawler

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4269 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 17 years ago
parent b1e08d354c
commit 01554f4012

@@ -389,7 +389,8 @@ public final class indexRAMRI implements indexRI {
} }
public int sizeContainer(String wordHash) { public int sizeContainer(String wordHash) {
return ((indexContainer) cache.get(wordHash)).size(); indexContainer c = (indexContainer) cache.get(wordHash);
return (c == null) ? 0 : c.size();
} }
public synchronized indexContainer getContainer(String wordHash, Set urlselection) { public synchronized indexContainer getContainer(String wordHash, Set urlselection) {

@@ -354,7 +354,7 @@ public class plasmaCrawlProfile {
long l = Long.parseLong(r) * 60000L; long l = Long.parseLong(r) * 60000L;
return (l < 0) ? Long.MAX_VALUE : l; return (l < 0) ? Long.MAX_VALUE : l;
} catch (NumberFormatException e) { } catch (NumberFormatException e) {
return 0; return Long.MAX_VALUE;
} }
} }
public int domFilterDepth() { public int domFilterDepth() {

@@ -457,12 +457,17 @@ public final class plasmaCrawlStacker extends Thread {
String dbocc = sb.crawlQueues.urlExists(entry.url().hash()); String dbocc = sb.crawlQueues.urlExists(entry.url().hash());
indexURLEntry oldEntry = this.sb.wordIndex.loadedURL.load(entry.url().hash(), null, 0); indexURLEntry oldEntry = this.sb.wordIndex.loadedURL.load(entry.url().hash(), null, 0);
boolean recrawl = (oldEntry != null) && ((System.currentTimeMillis() - oldEntry.loaddate().getTime()) > profile.recrawlIfOlder()); boolean recrawl = (oldEntry != null) && ((System.currentTimeMillis() - oldEntry.loaddate().getTime()) > profile.recrawlIfOlder());
// apply recrawl rule // do double-check
if ((dbocc != null) && (!(recrawl))) { if ((dbocc != null) && (!recrawl)) {
reason = plasmaCrawlEURL.DOUBLE_REGISTERED + dbocc + ")"; reason = plasmaCrawlEURL.DOUBLE_REGISTERED + dbocc + ")";
this.log.logFine("URL '" + entry.url().toString() + "' is double registered in '" + dbocc + "'. " + "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms"); this.log.logFine("URL '" + entry.url().toString() + "' is double registered in '" + dbocc + "'. " + "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
return reason; return reason;
} }
if ((oldEntry != null) && (!recrawl)) {
reason = plasmaCrawlEURL.DOUBLE_REGISTERED + "LURL)";
this.log.logFine("URL '" + entry.url().toString() + "' is double registered in 'LURL'. " + "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
return reason;
}
// show potential re-crawl // show potential re-crawl
if (recrawl) { if (recrawl) {

Loading…
Cancel
Save