fix for crawl stack domain counter

pull/1/head
Michael Peter Christen 12 years ago
parent 93d1bac140
commit c6a6f159e8

@ -267,7 +267,7 @@ public class Balancer {
* @throws IOException
* @throws SpaceExceededException
*/
public String push(final Request entry, final RobotsTxt robots) throws IOException, SpaceExceededException {
public String push(final Request entry, CrawlProfile profile, final RobotsTxt robots) throws IOException, SpaceExceededException {
assert entry != null;
final byte[] hash = entry.url().hash();
synchronized (this) {
@ -278,6 +278,11 @@ public class Balancer {
if (this.double_push_check.size() > MAX_DOUBLE_PUSH_CHECK || MemoryControl.shortStatus()) this.double_push_check.clear();
this.double_push_check.put(hash);
// increase dom counter
if (profile != null && profile.domMaxPages() != Integer.MAX_VALUE && profile.domMaxPages() > 0) {
profile.domInc(entry.url().getHost());
}
// add to index
final int s = this.urlFileIndex.size();
this.urlFileIndex.put(entry.toRow());

@ -341,30 +341,25 @@ public final class CrawlStacker {
entry.url().getContentDomain() == ContentDomain.AUDIO ||
entry.url().getContentDomain() == ContentDomain.VIDEO ||
entry.url().getContentDomain() == ContentDomain.CTRL) {
warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.NOLOAD, entry, this.robots);
warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.NOLOAD, entry, profile, this.robots);
//if (warning != null && this.log.isFine()) this.log.logFine("CrawlStacker.stackCrawl of URL " + entry.url().toNormalform(true, false) + " - not pushed: " + warning);
return null;
}
// add domain to profile domain list
if (profile.domMaxPages() != Integer.MAX_VALUE && profile.domMaxPages() > 0) {
profile.domInc(entry.url().getHost());
}
if (global) {
// it may be possible that global == true and local == true, so do not check an error case against it
if (proxy) this.log.warn("URL '" + entry.url().toString() + "' has conflicting initiator properties: global = true, proxy = true, initiator = proxy" + ", profile.handle = " + profile.handle());
if (remote) this.log.warn("URL '" + entry.url().toString() + "' has conflicting initiator properties: global = true, remote = true, initiator = " + ASCII.String(entry.initiator()) + ", profile.handle = " + profile.handle());
warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.GLOBAL, entry, this.robots);
warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.GLOBAL, entry, profile, this.robots);
} else if (local) {
if (proxy) this.log.warn("URL '" + entry.url().toString() + "' has conflicting initiator properties: local = true, proxy = true, initiator = proxy" + ", profile.handle = " + profile.handle());
if (remote) this.log.warn("URL '" + entry.url().toString() + "' has conflicting initiator properties: local = true, remote = true, initiator = " + ASCII.String(entry.initiator()) + ", profile.handle = " + profile.handle());
warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.LOCAL, entry, this.robots);
warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.LOCAL, entry, profile, this.robots);
} else if (proxy) {
if (remote) this.log.warn("URL '" + entry.url().toString() + "' has conflicting initiator properties: proxy = true, remote = true, initiator = " + ASCII.String(entry.initiator()) + ", profile.handle = " + profile.handle());
warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.LOCAL, entry, this.robots);
warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.LOCAL, entry, profile, this.robots);
} else if (remote) {
warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.REMOTE, entry, this.robots);
warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.REMOTE, entry, profile, this.robots);
}
if (warning != null && this.log.isFine()) this.log.fine("CrawlStacker.stackCrawl of URL " + entry.url().toNormalform(true) + " - not pushed: " + warning);

@ -170,13 +170,13 @@ public class NoticedURL {
* @param entry
* @return null if this was successful or a String explaining what went wrong in case of an error
*/
public String push(final StackType stackType, final Request entry, final RobotsTxt robots) {
public String push(final StackType stackType, final Request entry, CrawlProfile profile, final RobotsTxt robots) {
try {
switch (stackType) {
case LOCAL: return this.coreStack.push(entry, robots);
case GLOBAL: return this.limitStack.push(entry, robots);
case REMOTE: return this.remoteStack.push(entry, robots);
case NOLOAD: return this.noloadStack.push(entry, robots);
case LOCAL: return this.coreStack.push(entry, profile, robots);
case GLOBAL: return this.limitStack.push(entry, profile, robots);
case REMOTE: return this.remoteStack.push(entry, profile, robots);
case NOLOAD: return this.noloadStack.push(entry, profile, robots);
default: return "stack type unknown";
}
} catch (final Exception er) {
@ -269,7 +269,7 @@ public class NoticedURL {
try {
final Request entry = pop(fromStack, false, cs, robots);
if (entry != null) {
final String warning = push(toStack, entry, robots);
final String warning = push(toStack, entry, null, robots);
if (warning != null) {
ConcurrentLog.warn("NoticedURL", "shift from " + fromStack + " to " + toStack + ": " + warning);
}

@ -3071,9 +3071,9 @@ public final class Switchboard extends serverSwitch {
}
final String s;
if (asglobal) {
s = this.crawlQueues.noticeURL.push(StackType.GLOBAL, request, this.robots);
s = this.crawlQueues.noticeURL.push(StackType.GLOBAL, request, profile, this.robots);
} else {
s = this.crawlQueues.noticeURL.push(StackType.LOCAL, request, this.robots);
s = this.crawlQueues.noticeURL.push(StackType.LOCAL, request, profile, this.robots);
}
if (s != null) {

Loading…
Cancel
Save