cleanup of checkAcceptanceInitially in CrawlStacker, should avoid double-calling of solr
Michael Peter Christen 10 years ago
parent dd5cdfe212
commit 62c72360ee
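
The point of the cleanup: CrawlQueues.exists() ends by probing the error store (errorURL, which per the commit message is queried through Solr), and checkAcceptanceInitially then fetched the same error document again via errorURL.get(), so a single stacked URL could cost two Solr round-trips. The new boolean parameter lets CrawlStacker skip the probe inside exists() and rely on its own single lookup. A minimal, self-contained sketch of that pattern; class and member names here are illustrative stand-ins, not the YaCy API:

    // Sketch only: a membership check whose expensive branch can be skipped
    // when the caller performs that lookup itself.
    import java.util.HashSet;
    import java.util.Set;

    class DoubleCheckSketch {
        private final Set<String> queued = new HashSet<>();     // cheap in-memory queues
        private final Set<String> errorStore = new HashSet<>(); // stands in for the Solr-backed errorURL

        String exists(String hash, boolean checkErrorCache) {
            if (queued.contains(hash)) return "QUEUE";
            if (checkErrorCache && expensiveErrorLookup(hash)) return "ERRORS";
            return null;
        }

        boolean expensiveErrorLookup(String hash) { // one remote round-trip in the real system
            return errorStore.contains(hash);
        }

        // Caller in the style of the rewritten checkAcceptanceInitially:
        // exists() skips the error store, so it is queried exactly once below.
        String checkAcceptance(String hash) {
            final String dbocc = exists(hash, false);
            if (dbocc != null) return "double in: " + dbocc;
            return expensiveErrorLookup(hash) ? "double in: LURL-DB" : null; // null = accepted
        }
    }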

@@ -466,7 +466,7 @@ public class HostBrowser {
     prop.putHTML("files_list_" + c + "_type_admin", admin ? "true" : "false");
     StoreType type = (StoreType) entry.getValue();
     try {uri = new DigestURL(entry.getKey());} catch (final MalformedURLException e) {uri = null;}
-    HarvestProcess process = uri == null ? null : sb.crawlQueues.exists(uri.hash());
+    HarvestProcess process = uri == null ? null : sb.crawlQueues.exists(uri.hash(), true);
     boolean loading = load.equals(entry.getKey()) || (process != null && process != HarvestProcess.ERRORS);
     boolean error = process == HarvestProcess.ERRORS || type == StoreType.EXCLUDED || type == StoreType.FAILED;
     boolean dc = type != StoreType.INDEX && !error && !loading && list.containsKey(entry.getKey() + "/");

@@ -378,37 +378,25 @@ public final class CrawlStacker {
      */
     public String checkAcceptanceInitially(final DigestURL url, final CrawlProfile profile) {
-        final String urlstring = url.toString();
         // check if the url is double registered
+        final HarvestProcess dbocc = this.nextQueue.exists(url.hash(), false); // returns the name of the queue if entry exists
+        if (dbocc != null) {
+            return "double in: " + dbocc.name();
+        }
         String urlhash = ASCII.String(url.hash());
         final CollectionConfiguration.FailDoc errorEntry = this.nextQueue.errorURL.get(urlhash);
         final Date oldDate = errorEntry == null ? null : errorEntry.getFailDate();
-        final HarvestProcess dbocc = this.nextQueue.exists(url.hash()); // returns the name of the queue if entry exists
         if (oldDate == null) {
-            if (dbocc != null) {
-                // do double-check
-                if (dbocc == HarvestProcess.ERRORS) {
-                    return "double in: errors (" + (errorEntry == null ? "NULL" : errorEntry.getFailReason()) + ")";
-                }
-                return "double in: " + dbocc.toString();
-            }
-        } else {
-            final boolean recrawl = profile.recrawlIfOlder() > oldDate.getTime();
-            if (recrawl) {
-                if (CrawlStacker.log.isInfo())
-                    CrawlStacker.log.info("RE-CRAWL of URL '" + urlstring + "': this url was crawled " +
-                        ((System.currentTimeMillis() - oldDate.getTime()) / 60000 / 60 / 24) + " days ago.");
-            } else {
-                if (dbocc == null) {
-                    return "double in: LURL-DB, oldDate = " + oldDate.toString();
-                }
-                if (dbocc == HarvestProcess.ERRORS) {
-                    if (CrawlStacker.log.isInfo()) CrawlStacker.log.info("URL '" + urlstring + "' is double registered in '" + dbocc.toString() + "', previous cause: " + (errorEntry == null ? "NULL" : errorEntry.getFailReason()));
-                    return "double in: errors (" + (errorEntry == null ? "NULL" : errorEntry.getFailReason()) + "), oldDate = " + oldDate.toString();
-                }
-                if (CrawlStacker.log.isInfo()) CrawlStacker.log.info("URL '" + urlstring + "' is double registered in '" + dbocc.toString() + "'. ");
-                return "double in: " + dbocc.toString() + ", oldDate = " + oldDate.toString();
-            }
+            return null; // no evidence that we know that url
+        }
+        final boolean recrawl = profile.recrawlIfOlder() > oldDate.getTime();
+        final String urlstring = url.toString();
+        if (recrawl) {
+            if (CrawlStacker.log.isInfo())
+                CrawlStacker.log.info("RE-CRAWL of URL '" + urlstring + "': this url was crawled " +
+                    ((System.currentTimeMillis() - oldDate.getTime()) / 60000 / 60 / 24) + " days ago.");
+        } else {
+            return "double in: LURL-DB, oldDate = " + oldDate.toString();
         }
         // deny urls that exceed allowed number of occurrences
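
Stripped of logging, the rewritten method reduces to a short decision sequence. A hedged paraphrase with plain parameters standing in for the YaCy types (not the literal code of the hunk above):

    class AcceptanceSketch {
        // queueName: result of the cheap queue check, or null;
        // failDateMillis: fail date from the error store, or null;
        // recrawlIfOlderMillis: threshold from the crawl profile.
        static String check(String queueName, Long failDateMillis, long recrawlIfOlderMillis) {
            if (queueName != null) return "double in: " + queueName;  // already in a crawl queue
            if (failDateMillis == null) return null;                  // no evidence we know this url: accept
            if (recrawlIfOlderMillis > failDateMillis) return null;   // old failure, profile allows re-crawl
            return "double in: LURL-DB, oldDate = " + new java.util.Date(failDateMillis);
        }
    }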
@@ -416,7 +404,7 @@ public final class CrawlStacker {
         if (maxAllowedPagesPerDomain < Integer.MAX_VALUE && maxAllowedPagesPerDomain > 0) {
             final AtomicInteger dp = profile.getCount(url.getHost());
             if (dp != null && dp.get() >= maxAllowedPagesPerDomain) {
-                if (CrawlStacker.log.isFine()) CrawlStacker.log.fine("URL '" + urlstring + "' appeared too often in crawl stack, a maximum of " + maxAllowedPagesPerDomain + " is allowed.");
+                if (CrawlStacker.log.isFine()) CrawlStacker.log.fine("URL '" + url.toString() + "' appeared too often in crawl stack, a maximum of " + maxAllowedPagesPerDomain + " is allowed.");
                 return "crawl stack domain counter exceeded (test by profile)";
             }

@@ -143,7 +143,7 @@ public class CrawlQueues {
      * @param hash
      * @return if the hash exists, the name of the database is returned, otherwise null is returned
      */
-    public HarvestProcess exists(final byte[] hash) {
+    public HarvestProcess exists(final byte[] hash, final boolean checkErrorCache) {
         if (this.delegatedURL.containsKey(ASCII.String(hash))) {
             return HarvestProcess.DELEGATED;
         }
@@ -155,7 +155,7 @@ public class CrawlQueues {
                 return HarvestProcess.WORKER;
             }
         }
-        if (this.errorURL.exists(hash)) {
+        if (checkErrorCache && this.errorURL.exists(hash)) {
             return HarvestProcess.ERRORS;
         }
         return null;
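
Every other call site is updated to pass true for the new parameter and keeps the old behavior; only CrawlStacker passes false, and the short-circuiting && above means the Solr-backed lookup is never issued in that case. A stand-in sketch of the guard (ErrorCache is a hypothetical interface, not the YaCy class):

    class GuardSketch {
        interface ErrorCache { boolean exists(byte[] hash); } // Solr-backed in the real system

        static boolean errorHit(ErrorCache errorURL, byte[] hash, boolean checkErrorCache) {
            // && short-circuits: with checkErrorCache == false the remote lookup never runs
            return checkErrorCache && errorURL.exists(hash);
        }
    }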

@@ -1616,7 +1616,7 @@ public final class Switchboard extends serverSwitch {
      */
     public HarvestProcess urlExists(final String hash) {
         if (this.index.getLoadTime(hash) >= 0) return HarvestProcess.LOADED;
-        return this.crawlQueues.exists(ASCII.getBytes(hash));
+        return this.crawlQueues.exists(ASCII.getBytes(hash), true);
     }

     public void urlRemove(final Segment segment, final byte[] hash) {
