From 2fa037ae1db03a221e522eab22ae126eae648214 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Thu, 23 Feb 2012 01:20:24 +0100 Subject: [PATCH] enhanced crawler --- source/de/anomic/crawler/Balancer.java | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/source/de/anomic/crawler/Balancer.java b/source/de/anomic/crawler/Balancer.java index b6bf3af44..dd39affc3 100644 --- a/source/de/anomic/crawler/Balancer.java +++ b/source/de/anomic/crawler/Balancer.java @@ -517,11 +517,13 @@ public class Balancer { if (!this.domainStacks.isEmpty() && System.currentTimeMillis() - this.lastDomainStackFill < 60000L) return; this.domainStacks.clear(); this.lastDomainStackFill = System.currentTimeMillis(); - final HandleSet handles = this.urlFileIndex.keysFromBuffer(objectIndexBufferSize / 2); - final CloneableIterator i = handles.keys(true, null); + //final HandleSet handles = this.urlFileIndex.keysFromBuffer(objectIndexBufferSize / 2); + //final CloneableIterator i = handles.keys(true, null); + final CloneableIterator i = this.urlFileIndex.keys(true, null); byte[] handle; String host; Request request; + int count = 0; while (i.hasNext()) { handle = i.next(); final Row.Entry entry = this.urlFileIndex.get(handle, false); @@ -533,6 +535,8 @@ public class Balancer { } catch (final RowSpaceExceededException e) { break; } + count++; + if (this.domainStacks.size() > 0 && count > 120 * this.domainStacks.size()) break; } Log.logInfo("BALANCER", "re-fill of domain stacks; fileIndex.size() = " + this.urlFileIndex.size() + ", domainStacks.size = " + this.domainStacks.size() + ", collection time = " + (System.currentTimeMillis() - this.lastDomainStackFill) + " ms"); this.domStackInitSize = this.domainStacks.size();