Fix for the failing crawl limit on the number of pages per domain

pull/1/head
Michael Peter Christen 10 years ago
parent 2645dc816a
commit 025516f682

@ -380,19 +380,6 @@ public final class CrawlStacker {
ConcurrentLog.logException(e);
return "exception during double-test: " + e.getMessage();
}
final Long oldDate = oldEntry == null ? null : oldEntry.date;
if (oldDate == null) {
return null; // no evidence that we know that url
}
final boolean recrawl = profile.recrawlIfOlder() > oldDate.longValue();
final String urlstring = url.toNormalform(false);
if (recrawl) {
if (CrawlStacker.log.isFine())
CrawlStacker.log.fine("RE-CRAWL of URL '" + urlstring + "': this url was crawled " +
((System.currentTimeMillis() - oldDate.longValue()) / 60000 / 60 / 24) + " days ago.");
} else {
return "double in: LURL-DB, oldDate = " + oldDate.toString();
}
// deny urls that exceed allowed number of occurrences
final int maxAllowedPagesPerDomain = profile.domMaxPages();
@ -410,6 +397,20 @@ public final class CrawlStacker {
}
*/
}
final Long oldDate = oldEntry == null ? null : oldEntry.date;
if (oldDate == null) {
return null; // no evidence that we know that url
}
final boolean recrawl = profile.recrawlIfOlder() > oldDate.longValue();
final String urlstring = url.toNormalform(false);
if (recrawl) {
if (CrawlStacker.log.isFine())
CrawlStacker.log.fine("RE-CRAWL of URL '" + urlstring + "': this url was crawled " +
((System.currentTimeMillis() - oldDate.longValue()) / 60000 / 60 / 24) + " days ago.");
} else {
return "double in: LURL-DB, oldDate = " + oldDate.toString();
}
return null;
}

@ -379,8 +379,12 @@ public class HostQueue implements Balancer {
if (this.has(hash)) return "double occurrence in urlFileIndex";
// increase dom counter
if (profile != null && profile.domMaxPages() != Integer.MAX_VALUE && profile.domMaxPages() > 0) {
profile.domInc(entry.url().getHost());
if (profile != null) {
int maxPages = profile.domMaxPages();
if (maxPages != Integer.MAX_VALUE && maxPages > 0) {
String host = entry.url().getHost();
profile.domInc(host);
}
}
// add to index

Loading…
Cancel
Save