From c6a6f159e8f68c23ca04c30e9dfda551363c6945 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Tue, 16 Jul 2013 18:18:55 +0200 Subject: [PATCH] fix for crawl stack domain counter --- source/net/yacy/crawler/Balancer.java | 7 ++++++- source/net/yacy/crawler/CrawlStacker.java | 15 +++++---------- source/net/yacy/crawler/data/NoticedURL.java | 12 ++++++------ source/net/yacy/search/Switchboard.java | 4 ++-- 4 files changed, 19 insertions(+), 19 deletions(-) diff --git a/source/net/yacy/crawler/Balancer.java b/source/net/yacy/crawler/Balancer.java index f09abbbe5..f47073086 100644 --- a/source/net/yacy/crawler/Balancer.java +++ b/source/net/yacy/crawler/Balancer.java @@ -267,7 +267,7 @@ public class Balancer { * @throws IOException * @throws SpaceExceededException */ - public String push(final Request entry, final RobotsTxt robots) throws IOException, SpaceExceededException { + public String push(final Request entry, CrawlProfile profile, final RobotsTxt robots) throws IOException, SpaceExceededException { assert entry != null; final byte[] hash = entry.url().hash(); synchronized (this) { @@ -278,6 +278,11 @@ public class Balancer { if (this.double_push_check.size() > MAX_DOUBLE_PUSH_CHECK || MemoryControl.shortStatus()) this.double_push_check.clear(); this.double_push_check.put(hash); + // increase dom counter + if (profile != null && profile.domMaxPages() != Integer.MAX_VALUE && profile.domMaxPages() > 0) { + profile.domInc(entry.url().getHost()); + } + // add to index final int s = this.urlFileIndex.size(); this.urlFileIndex.put(entry.toRow()); diff --git a/source/net/yacy/crawler/CrawlStacker.java b/source/net/yacy/crawler/CrawlStacker.java index ee8fbf685..f785524f6 100644 --- a/source/net/yacy/crawler/CrawlStacker.java +++ b/source/net/yacy/crawler/CrawlStacker.java @@ -341,30 +341,25 @@ public final class CrawlStacker { entry.url().getContentDomain() == ContentDomain.AUDIO || entry.url().getContentDomain() == ContentDomain.VIDEO || entry.url().getContentDomain() == ContentDomain.CTRL) { - warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.NOLOAD, entry, this.robots); + warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.NOLOAD, entry, profile, this.robots); //if (warning != null && this.log.isFine()) this.log.logFine("CrawlStacker.stackCrawl of URL " + entry.url().toNormalform(true, false) + " - not pushed: " + warning); return null; } - - // add domain to profile domain list - if (profile.domMaxPages() != Integer.MAX_VALUE && profile.domMaxPages() > 0) { - profile.domInc(entry.url().getHost()); - } if (global) { // it may be possible that global == true and local == true, so do not check an error case against it if (proxy) this.log.warn("URL '" + entry.url().toString() + "' has conflicting initiator properties: global = true, proxy = true, initiator = proxy" + ", profile.handle = " + profile.handle()); if (remote) this.log.warn("URL '" + entry.url().toString() + "' has conflicting initiator properties: global = true, remote = true, initiator = " + ASCII.String(entry.initiator()) + ", profile.handle = " + profile.handle()); - warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.GLOBAL, entry, this.robots); + warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.GLOBAL, entry, profile, this.robots); } else if (local) { if (proxy) this.log.warn("URL '" + entry.url().toString() + "' has conflicting initiator properties: local = true, proxy = true, initiator = proxy" + ", profile.handle = " + profile.handle()); if (remote) this.log.warn("URL '" + entry.url().toString() + "' has conflicting initiator properties: local = true, remote = true, initiator = " + ASCII.String(entry.initiator()) + ", profile.handle = " + profile.handle()); - warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.LOCAL, entry, this.robots); + warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.LOCAL, entry, profile, this.robots); } else if (proxy) { if (remote) this.log.warn("URL '" + entry.url().toString() + "' has conflicting initiator properties: proxy = true, remote = true, initiator = " + ASCII.String(entry.initiator()) + ", profile.handle = " + profile.handle()); - warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.LOCAL, entry, this.robots); + warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.LOCAL, entry, profile, this.robots); } else if (remote) { - warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.REMOTE, entry, this.robots); + warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.REMOTE, entry, profile, this.robots); } if (warning != null && this.log.isFine()) this.log.fine("CrawlStacker.stackCrawl of URL " + entry.url().toNormalform(true) + " - not pushed: " + warning); diff --git a/source/net/yacy/crawler/data/NoticedURL.java b/source/net/yacy/crawler/data/NoticedURL.java index 58b6f1a85..297c465cc 100644 --- a/source/net/yacy/crawler/data/NoticedURL.java +++ b/source/net/yacy/crawler/data/NoticedURL.java @@ -170,13 +170,13 @@ public class NoticedURL { * @param entry * @return null if this was successful or a String explaining what went wrong in case of an error */ - public String push(final StackType stackType, final Request entry, final RobotsTxt robots) { + public String push(final StackType stackType, final Request entry, CrawlProfile profile, final RobotsTxt robots) { try { switch (stackType) { - case LOCAL: return this.coreStack.push(entry, robots); - case GLOBAL: return this.limitStack.push(entry, robots); - case REMOTE: return this.remoteStack.push(entry, robots); - case NOLOAD: return this.noloadStack.push(entry, robots); + case LOCAL: return this.coreStack.push(entry, profile, robots); + case GLOBAL: return this.limitStack.push(entry, profile, robots); + case REMOTE: return this.remoteStack.push(entry, profile, robots); + case NOLOAD: return this.noloadStack.push(entry, profile, robots); default: return "stack type unknown"; } } catch (final Exception er) { @@ -269,7 +269,7 @@ public class NoticedURL { try { final Request entry = pop(fromStack, false, cs, robots); if (entry != null) { - final String warning = push(toStack, entry, robots); + final String warning = push(toStack, entry, null, robots); if (warning != null) { ConcurrentLog.warn("NoticedURL", "shift from " + fromStack + " to " + toStack + ": " + warning); } diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index d7c481ac0..e2b427fb8 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -3071,9 +3071,9 @@ public final class Switchboard extends serverSwitch { } final String s; if (asglobal) { - s = this.crawlQueues.noticeURL.push(StackType.GLOBAL, request, this.robots); + s = this.crawlQueues.noticeURL.push(StackType.GLOBAL, request, profile, this.robots); } else { - s = this.crawlQueues.noticeURL.push(StackType.LOCAL, request, this.robots); + s = this.crawlQueues.noticeURL.push(StackType.LOCAL, request, profile, this.robots); } if (s != null) {