reduced number of solr queries during crawling

pull/1/head
Michael Peter Christen 10 years ago
parent 5326970d6c
commit b5d78ba156

@@ -381,30 +381,28 @@ public final class CrawlStacker {
         final String urlstring = url.toString();
         // check if the url is double registered
         String urlhash = ASCII.String(url.hash());
+        final CollectionConfiguration.FailDoc errorEntry = this.nextQueue.errorURL.get(urlhash);
+        final Date oldDate = errorEntry == null ? null : errorEntry.getFailDate();
         final HarvestProcess dbocc = this.nextQueue.exists(url.hash()); // returns the name of the queue if entry exists
-        final long oldTime = this.indexSegment.fulltext().getLoadTime(urlhash);
-        if (oldTime < 0) {
+        if (oldDate == null) {
             if (dbocc != null) {
                 // do double-check
                 if (dbocc == HarvestProcess.ERRORS) {
-                    final CollectionConfiguration.FailDoc errorEntry = this.nextQueue.errorURL.get(urlhash);
                     return "double in: errors (" + (errorEntry == null ? "NULL" : errorEntry.getFailReason()) + ")";
                 }
                 return "double in: " + dbocc.toString();
             }
         } else {
-            final boolean recrawl = profile.recrawlIfOlder() > oldTime;
+            final boolean recrawl = profile.recrawlIfOlder() > oldDate.getTime();
             if (recrawl) {
                 if (CrawlStacker.log.isInfo())
                     CrawlStacker.log.info("RE-CRAWL of URL '" + urlstring + "': this url was crawled " +
-                        ((System.currentTimeMillis() - oldTime) / 60000 / 60 / 24) + " days ago.");
+                        ((System.currentTimeMillis() - oldDate.getTime()) / 60000 / 60 / 24) + " days ago.");
             } else {
-                Date oldDate = new Date(oldTime);
                 if (dbocc == null) {
                     return "double in: LURL-DB, oldDate = " + oldDate.toString();
                 }
                 if (dbocc == HarvestProcess.ERRORS) {
-                    final CollectionConfiguration.FailDoc errorEntry = this.nextQueue.errorURL.get(urlhash);
                     if (CrawlStacker.log.isInfo()) CrawlStacker.log.info("URL '" + urlstring + "' is double registered in '" + dbocc.toString() + "', previous cause: " + (errorEntry == null ? "NULL" : errorEntry.getFailReason()));
                     return "double in: errors (" + (errorEntry == null ? "NULL" : errorEntry.getFailReason()) + "), oldDate = " + oldDate.toString();
                 }
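Reviewer note: the net effect of this hunk is that a single errorURL.get() lookup now drives both the double-registration check and the recrawl-age decision, replacing the extra getLoadTime() index call. A minimal, self-contained sketch of that flow follows; FailDoc, HarvestProcess and checkDouble are simplified stand-ins for the YaCy types, not the real API.

import java.util.Date;

// Simplified sketch of the double-check flow after this commit. The stub
// types below are stand-ins; only the control flow mirrors the patch.
public class DoubleCheckSketch {

    enum HarvestProcess { DELEGATED, ERRORS, WORKER }

    interface FailDoc {
        Date getFailDate();
        String getFailReason();
    }

    // returns a rejection reason, or null if the URL may be stacked
    static String checkDouble(FailDoc errorEntry, HarvestProcess dbocc, long recrawlIfOlder) {
        final Date oldDate = errorEntry == null ? null : errorEntry.getFailDate();
        if (oldDate == null) {
            // no recorded failure: only the queue lookup can flag a double
            return dbocc == null ? null : "double in: " + dbocc;
        }
        // one FailDoc answers both questions: is it a double, and how old is it
        if (recrawlIfOlder > oldDate.getTime()) return null; // stale enough: re-crawl
        return "double in: errors (" + errorEntry.getFailReason() + ")";
    }
}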

@@ -147,9 +147,6 @@ public class CrawlQueues {
         if (this.delegatedURL.containsKey(ASCII.String(hash))) {
             return HarvestProcess.DELEGATED;
         }
-        if (this.errorURL.exists(hash)) {
-            return HarvestProcess.ERRORS;
-        }
         //if (this.noticeURL.existsInStack(hash)) {
         //    return HarvestProcess.CRAWLER;
         //} // this is disabled because it prevents proper crawling of smb shares. The cause is unknown
@@ -158,6 +155,9 @@ public class CrawlQueues {
                 return HarvestProcess.WORKER;
             }
         }
+        if (this.errorURL.exists(hash)) {
+            return HarvestProcess.ERRORS;
+        }
         return null;
     }
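These two CrawlQueues hunks move the errorURL.exists() check to the end of the method. Since the error cache can fall through to Solr (see the ErrorCache hunk below), the reordering means the purely in-memory checks run first, and a Solr round trip only happens for URLs found in none of the local queues. A sketch of the resulting lookup order, with stand-in names for the real CrawlQueues fields:

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

// Sketch of the reordered exists() lookup: cheap in-memory structures first,
// the potentially Solr-backed error cache last. All names are stand-ins.
public class ExistsOrderSketch {

    enum HarvestProcess { DELEGATED, WORKER, ERRORS }

    private final Map<String, Object> delegatedURL = new HashMap<>();
    private final Set<String> workers = new HashSet<>();     // URLs a worker thread is loading
    private final Set<String> errorCache = new HashSet<>();  // stands in for the Solr-backed ErrorCache

    public HarvestProcess exists(String hash) {
        if (this.delegatedURL.containsKey(hash)) return HarvestProcess.DELEGATED; // in-memory
        if (this.workers.contains(hash)) return HarvestProcess.WORKER;            // in-memory
        if (this.errorCache.contains(hash)) return HarvestProcess.ERRORS;         // may hit Solr, so checked last
        return null;
    }
}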

@@ -175,10 +175,10 @@ public class ErrorCache {
         }
         if (failDoc != null) return failDoc;
         try {
-            final SolrDocumentList docs = this.fulltext.getDefaultConnector().getDocumentListByQuery(CollectionSchema.id + ":\"" + urlhash + "\" AND " + CollectionSchema.failtype_s.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM, null, 0, 1);
-            if (docs == null || docs.isEmpty()) return null;
-            SolrDocument doc = docs.get(0);
+            final SolrDocument doc = this.fulltext.getDefaultConnector().getDocumentById(urlhash);
             if (doc == null) return null;
+            Object failreason = doc.getFieldValue(CollectionSchema.failreason_s.getSolrFieldName());
+            if (failreason == null || failreason.toString().length() == 0) return null;
             return new CollectionConfiguration.FailDoc(doc);
         } catch (final IOException e) {
             ConcurrentLog.logException(e);
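This hunk replaces a one-row filter query with a lookup by document id and moves the "is this actually a fail document" test into application code. The snippet below illustrates the same trade in plain SolrJ (not YaCy's connector API): getById() can be served by Solr's real-time get handler, while the query form goes through full query parsing and search. The field names are taken from the diff; everything else is illustrative.

import java.io.IOException;
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;

public class ByIdLookupSketch {

    // before: a full search per URL hash, restricted to fail documents
    static SolrDocument byQuery(SolrClient client, String urlhash)
            throws SolrServerException, IOException {
        SolrQuery q = new SolrQuery("id:\"" + urlhash + "\" AND failtype_s:[* TO *]");
        q.setRows(1);
        SolrDocumentList docs = client.query(q).getResults();
        return docs.isEmpty() ? null : docs.get(0);
    }

    // after: a direct id lookup; the fail-document test moves into Java
    static SolrDocument byId(SolrClient client, String urlhash)
            throws SolrServerException, IOException {
        SolrDocument doc = client.getById(urlhash); // may return a non-error document
        if (doc == null) return null;
        Object failreason = doc.getFieldValue("failreason_s");
        if (failreason == null || failreason.toString().isEmpty()) return null;
        return doc;
    }
}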

@@ -1587,6 +1587,9 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
         public FailType getFailType() {
             return failType;
         }
+        public Date getFailDate() {
+            return this.failtime;
+        }
         public int getHttpstatus() {
             return httpstatus;
         }
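The new accessor is what enables the CrawlStacker change above: the fail date travels inside the FailDoc value object instead of requiring a separate index lookup. A minimal stand-in for the extended class (fields are illustrative, not the full FailDoc):

import java.util.Date;

// Stand-in for CollectionConfiguration.FailDoc after this hunk: the fail
// date is stored alongside the reason and exposed through the new getter,
// so callers avoid a second index query.
public class FailDocSketch {
    private final String failReason;
    private final Date failtime;

    public FailDocSketch(String failReason, Date failtime) {
        this.failReason = failReason;
        this.failtime = failtime;
    }

    public String getFailReason() { return this.failReason; }
    public Date getFailDate() { return this.failtime; } // the accessor added in this hunk
}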
