From 01554f40129e066aef32ee5b66d0768b12ebe827 Mon Sep 17 00:00:00 2001 From: orbiter Date: Wed, 12 Dec 2007 01:32:25 +0000 Subject: [PATCH] fixed bug with double-check in crawler git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4269 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- source/de/anomic/index/indexRAMRI.java | 3 ++- source/de/anomic/plasma/plasmaCrawlProfile.java | 2 +- source/de/anomic/plasma/plasmaCrawlStacker.java | 9 +++++++-- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/source/de/anomic/index/indexRAMRI.java b/source/de/anomic/index/indexRAMRI.java index 0d6bc4d7a..0ddd2abfd 100644 --- a/source/de/anomic/index/indexRAMRI.java +++ b/source/de/anomic/index/indexRAMRI.java @@ -389,7 +389,8 @@ public final class indexRAMRI implements indexRI { } public int sizeContainer(String wordHash) { - return ((indexContainer) cache.get(wordHash)).size(); + indexContainer c = (indexContainer) cache.get(wordHash); + return (c == null) ? 0 : c.size(); } public synchronized indexContainer getContainer(String wordHash, Set urlselection) { diff --git a/source/de/anomic/plasma/plasmaCrawlProfile.java b/source/de/anomic/plasma/plasmaCrawlProfile.java index 93c12ece0..4269c0d43 100644 --- a/source/de/anomic/plasma/plasmaCrawlProfile.java +++ b/source/de/anomic/plasma/plasmaCrawlProfile.java @@ -354,7 +354,7 @@ public class plasmaCrawlProfile { long l = Long.parseLong(r) * 60000L; return (l < 0) ? Long.MAX_VALUE : l; } catch (NumberFormatException e) { - return 0; + return Long.MAX_VALUE; } } public int domFilterDepth() { diff --git a/source/de/anomic/plasma/plasmaCrawlStacker.java b/source/de/anomic/plasma/plasmaCrawlStacker.java index 66ed2436b..d062d3688 100644 --- a/source/de/anomic/plasma/plasmaCrawlStacker.java +++ b/source/de/anomic/plasma/plasmaCrawlStacker.java @@ -457,12 +457,17 @@ public final class plasmaCrawlStacker extends Thread { String dbocc = sb.crawlQueues.urlExists(entry.url().hash()); indexURLEntry oldEntry = this.sb.wordIndex.loadedURL.load(entry.url().hash(), null, 0); boolean recrawl = (oldEntry != null) && ((System.currentTimeMillis() - oldEntry.loaddate().getTime()) > profile.recrawlIfOlder()); - // apply recrawl rule - if ((dbocc != null) && (!(recrawl))) { + // do double-check + if ((dbocc != null) && (!recrawl)) { reason = plasmaCrawlEURL.DOUBLE_REGISTERED + dbocc + ")"; this.log.logFine("URL '" + entry.url().toString() + "' is double registered in '" + dbocc + "'. " + "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms"); return reason; } + if ((oldEntry != null) && (!recrawl)) { + reason = plasmaCrawlEURL.DOUBLE_REGISTERED + "LURL)"; + this.log.logFine("URL '" + entry.url().toString() + "' is double registered in 'LURL'. " + "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms"); + return reason; + } // show potential re-crawl if (recrawl) {