diff --git a/source/net/yacy/crawler/Balancer.java b/source/net/yacy/crawler/Balancer.java index a15b75165..bd99c53a3 100644 --- a/source/net/yacy/crawler/Balancer.java +++ b/source/net/yacy/crawler/Balancer.java @@ -68,6 +68,7 @@ public class Balancer { private static final String indexSuffix = "A.db"; private static final int EcoFSBufferSize = 1000; private static final int objectIndexBufferSize = 1000; + private static final int MAX_DOUBLE_PUSH_CHECK = 100000; // class variables filled with external values private final File cacheStacksPath; @@ -274,7 +275,7 @@ public class Balancer { if (this.double_push_check.has(hash)) return "double occurrence in double_push_check"; if (this.urlFileIndex.has(hash)) return "double occurrence in urlFileIndex"; - if (this.double_push_check.size() > 10000 || MemoryControl.shortStatus()) this.double_push_check.clear(); + if (this.double_push_check.size() > MAX_DOUBLE_PUSH_CHECK || MemoryControl.shortStatus()) this.double_push_check.clear(); this.double_push_check.put(hash); // add to index diff --git a/source/net/yacy/crawler/CrawlStacker.java b/source/net/yacy/crawler/CrawlStacker.java index 77f3e59df..bf0bdca41 100644 --- a/source/net/yacy/crawler/CrawlStacker.java +++ b/source/net/yacy/crawler/CrawlStacker.java @@ -465,14 +465,14 @@ public final class CrawlStacker { ((System.currentTimeMillis() - oldDate.getTime()) / 60000 / 60 / 24) + " days ago."); } else { if (dbocc == null) { - return "double in: LURL-DB"; + return "double in: LURL-DB, oldDate = " + oldDate.toString(); } if (this.log.isInfo()) this.log.logInfo("URL '" + urlstring + "' is double registered in '" + dbocc.toString() + "'. 
" + "Stack processing time:"); if (dbocc == HarvestProcess.ERRORS) { final ZURL.Entry errorEntry = this.nextQueue.errorURL.get(url.hash()); - return "double in: errors (" + errorEntry.anycause() + ")"; + return "double in: errors (" + errorEntry.anycause() + "), oldDate = " + oldDate.toString(); } - return "double in: " + dbocc.toString(); + return "double in: " + dbocc.toString() + ", oldDate = " + oldDate.toString(); } } diff --git a/source/net/yacy/crawler/data/CrawlQueues.java b/source/net/yacy/crawler/data/CrawlQueues.java index 7edae18f5..f0e5fad87 100644 --- a/source/net/yacy/crawler/data/CrawlQueues.java +++ b/source/net/yacy/crawler/data/CrawlQueues.java @@ -148,9 +148,9 @@ public class CrawlQueues { if (this.errorURL.exists(hash)) { return HarvestProcess.ERRORS; } - if (this.noticeURL.existsInStack(hash)) { - return HarvestProcess.CRAWLER; - } + //if (this.noticeURL.existsInStack(hash)) { + // return HarvestProcess.CRAWLER; + //} // this is disabled because it prevents proper crawling of smb shares. The cause is unknown for (final Loader worker: this.workers.values()) { if (Base64Order.enhancedCoder.equal(worker.request.url().hash(), hash)) { return HarvestProcess.WORKER; diff --git a/source/net/yacy/crawler/robots/RobotsTxt.java b/source/net/yacy/crawler/robots/RobotsTxt.java index cd43677eb..1070100ad 100644 --- a/source/net/yacy/crawler/robots/RobotsTxt.java +++ b/source/net/yacy/crawler/robots/RobotsTxt.java @@ -235,6 +235,7 @@ public class RobotsTxt { } public void ensureExist(final MultiProtocolURI theURL, final Set thisAgents, boolean concurrent) { + if (theURL.isLocal()) return; final String urlHostPort = getHostPort(theURL); if (urlHostPort == null) return; final BEncodedHeap robotsTable;