Fix for the failing crawl limit on the number of pages per domain

pull/1/head
Michael Peter Christen 11 years ago
parent 2645dc816a
commit 025516f682

@ -380,19 +380,6 @@ public final class CrawlStacker {
ConcurrentLog.logException(e); ConcurrentLog.logException(e);
return "exception during double-test: " + e.getMessage(); return "exception during double-test: " + e.getMessage();
} }
final Long oldDate = oldEntry == null ? null : oldEntry.date;
if (oldDate == null) {
return null; // no evidence that we know that url
}
final boolean recrawl = profile.recrawlIfOlder() > oldDate.longValue();
final String urlstring = url.toNormalform(false);
if (recrawl) {
if (CrawlStacker.log.isFine())
CrawlStacker.log.fine("RE-CRAWL of URL '" + urlstring + "': this url was crawled " +
((System.currentTimeMillis() - oldDate.longValue()) / 60000 / 60 / 24) + " days ago.");
} else {
return "double in: LURL-DB, oldDate = " + oldDate.toString();
}
// deny urls that exceed allowed number of occurrences // deny urls that exceed allowed number of occurrences
final int maxAllowedPagesPerDomain = profile.domMaxPages(); final int maxAllowedPagesPerDomain = profile.domMaxPages();
@ -410,6 +397,20 @@ public final class CrawlStacker {
} }
*/ */
} }
final Long oldDate = oldEntry == null ? null : oldEntry.date;
if (oldDate == null) {
return null; // no evidence that we know that url
}
final boolean recrawl = profile.recrawlIfOlder() > oldDate.longValue();
final String urlstring = url.toNormalform(false);
if (recrawl) {
if (CrawlStacker.log.isFine())
CrawlStacker.log.fine("RE-CRAWL of URL '" + urlstring + "': this url was crawled " +
((System.currentTimeMillis() - oldDate.longValue()) / 60000 / 60 / 24) + " days ago.");
} else {
return "double in: LURL-DB, oldDate = " + oldDate.toString();
}
return null; return null;
} }

@ -379,8 +379,12 @@ public class HostQueue implements Balancer {
if (this.has(hash)) return "double occurrence in urlFileIndex"; if (this.has(hash)) return "double occurrence in urlFileIndex";
// increase dom counter // increase dom counter
if (profile != null && profile.domMaxPages() != Integer.MAX_VALUE && profile.domMaxPages() > 0) { if (profile != null) {
profile.domInc(entry.url().getHost()); int maxPages = profile.domMaxPages();
if (maxPages != Integer.MAX_VALUE && maxPages > 0) {
String host = entry.url().getHost();
profile.domInc(host);
}
} }
// add to index // add to index

Loading…
Cancel
Save