reduced number of solr queries during crawling

pull/1/head
Michael Peter Christen 10 years ago
parent 5326970d6c
commit b5d78ba156

@@ -381,30 +381,28 @@ public final class CrawlStacker {
         final String urlstring = url.toString();
         // check if the url is double registered
         String urlhash = ASCII.String(url.hash());
+        final CollectionConfiguration.FailDoc errorEntry = this.nextQueue.errorURL.get(urlhash);
+        final Date oldDate = errorEntry == null ? null : errorEntry.getFailDate();
         final HarvestProcess dbocc = this.nextQueue.exists(url.hash()); // returns the name of the queue if entry exists
-        final long oldTime = this.indexSegment.fulltext().getLoadTime(urlhash);
-        if (oldTime < 0) {
+        if (oldDate == null) {
             if (dbocc != null) {
                 // do double-check
                 if (dbocc == HarvestProcess.ERRORS) {
-                    final CollectionConfiguration.FailDoc errorEntry = this.nextQueue.errorURL.get(urlhash);
                     return "double in: errors (" + (errorEntry == null ? "NULL" : errorEntry.getFailReason()) + ")";
                 }
                 return "double in: " + dbocc.toString();
             }
         } else {
-            final boolean recrawl = profile.recrawlIfOlder() > oldTime;
+            final boolean recrawl = profile.recrawlIfOlder() > oldDate.getTime();
             if (recrawl) {
                 if (CrawlStacker.log.isInfo())
                     CrawlStacker.log.info("RE-CRAWL of URL '" + urlstring + "': this url was crawled " +
-                        ((System.currentTimeMillis() - oldTime) / 60000 / 60 / 24) + " days ago.");
+                        ((System.currentTimeMillis() - oldDate.getTime()) / 60000 / 60 / 24) + " days ago.");
             } else {
-                Date oldDate = new Date(oldTime);
                 if (dbocc == null) {
                     return "double in: LURL-DB, oldDate = " + oldDate.toString();
                 }
                 if (dbocc == HarvestProcess.ERRORS) {
-                    final CollectionConfiguration.FailDoc errorEntry = this.nextQueue.errorURL.get(urlhash);
                     if (CrawlStacker.log.isInfo()) CrawlStacker.log.info("URL '" + urlstring + "' is double registered in '" + dbocc.toString() + "', previous cause: " + (errorEntry == null ? "NULL" : errorEntry.getFailReason()));
                     return "double in: errors (" + (errorEntry == null ? "NULL" : errorEntry.getFailReason()) + "), oldDate = " + oldDate.toString();
                 }
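Reviewer note: the net effect of this hunk is that a single errorURL.get() lookup now drives both the double-registration check and the recrawl-age decision, replacing the extra getLoadTime() index call. A minimal, self-contained sketch of that flow follows; FailDoc, HarvestProcess and checkDouble are simplified stand-ins for the YaCy types, not the real API.

import java.util.Date;

// Simplified sketch of the double-check flow after this commit. The stub
// types below are stand-ins; only the control flow mirrors the patch.
public class DoubleCheckSketch {

    enum HarvestProcess { DELEGATED, ERRORS, WORKER }

    interface FailDoc {
        Date getFailDate();
        String getFailReason();
    }

    // returns a rejection reason, or null if the URL may be stacked
    static String checkDouble(FailDoc errorEntry, HarvestProcess dbocc, long recrawlIfOlder) {
        final Date oldDate = errorEntry == null ? null : errorEntry.getFailDate();
        if (oldDate == null) {
            // no recorded failure: only the queue lookup can flag a double
            return dbocc == null ? null : "double in: " + dbocc;
        }
        // one FailDoc answers both questions: is it a double, and how old is it
        if (recrawlIfOlder > oldDate.getTime()) return null; // stale enough: re-crawl
        return "double in: errors (" + errorEntry.getFailReason() + ")";
    }
}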

@@ -147,9 +147,6 @@ public class CrawlQueues {
         if (this.delegatedURL.containsKey(ASCII.String(hash))) {
             return HarvestProcess.DELEGATED;
         }
-        if (this.errorURL.exists(hash)) {
-            return HarvestProcess.ERRORS;
-        }
         //if (this.noticeURL.existsInStack(hash)) {
         //    return HarvestProcess.CRAWLER;
         //} // this is disabled because it prevents proper crawling of smb shares. The cause is unknown
@@ -158,6 +155,9 @@ public class CrawlQueues {
                 return HarvestProcess.WORKER;
             }
         }
+        if (this.errorURL.exists(hash)) {
+            return HarvestProcess.ERRORS;
+        }
         return null;
     }
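These two CrawlQueues hunks move the errorURL.exists() check to the end of the method. Since the error cache can fall through to Solr (see the ErrorCache hunk below), the reordering means the purely in-memory checks run first, and a Solr round trip only happens for URLs found in none of the local queues. A sketch of the resulting lookup order, with stand-in names for the real CrawlQueues fields:

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

// Sketch of the reordered exists() lookup: cheap in-memory structures first,
// the potentially Solr-backed error cache last. All names are stand-ins.
public class ExistsOrderSketch {

    enum HarvestProcess { DELEGATED, WORKER, ERRORS }

    private final Map<String, Object> delegatedURL = new HashMap<>();
    private final Set<String> workers = new HashSet<>();     // URLs a worker thread is loading
    private final Set<String> errorCache = new HashSet<>();  // stands in for the Solr-backed ErrorCache

    public HarvestProcess exists(String hash) {
        if (this.delegatedURL.containsKey(hash)) return HarvestProcess.DELEGATED; // in-memory
        if (this.workers.contains(hash)) return HarvestProcess.WORKER;            // in-memory
        if (this.errorCache.contains(hash)) return HarvestProcess.ERRORS;         // may hit Solr, so checked last
        return null;
    }
}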

@@ -175,10 +175,10 @@ public class ErrorCache {
         }
         if (failDoc != null) return failDoc;
         try {
-            final SolrDocumentList docs = this.fulltext.getDefaultConnector().getDocumentListByQuery(CollectionSchema.id + ":\"" + urlhash + "\" AND " + CollectionSchema.failtype_s.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM, null, 0, 1);
-            if (docs == null || docs.isEmpty()) return null;
-            SolrDocument doc = docs.get(0);
+            final SolrDocument doc = this.fulltext.getDefaultConnector().getDocumentById(urlhash);
             if (doc == null) return null;
+            Object failreason = doc.getFieldValue(CollectionSchema.failreason_s.getSolrFieldName());
+            if (failreason == null || failreason.toString().length() == 0) return null;
             return new CollectionConfiguration.FailDoc(doc);
         } catch (final IOException e) {
             ConcurrentLog.logException(e);
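This hunk replaces a one-row filter query with a lookup by document id and moves the "is this actually a fail document" test into application code. The snippet below illustrates the same trade in plain SolrJ (not YaCy's connector API): getById() can be served by Solr's real-time get handler, while the query form goes through full query parsing and search. The field names are taken from the diff; everything else is illustrative.

import java.io.IOException;
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;

public class ByIdLookupSketch {

    // before: a full search per URL hash, restricted to fail documents
    static SolrDocument byQuery(SolrClient client, String urlhash)
            throws SolrServerException, IOException {
        SolrQuery q = new SolrQuery("id:\"" + urlhash + "\" AND failtype_s:[* TO *]");
        q.setRows(1);
        SolrDocumentList docs = client.query(q).getResults();
        return docs.isEmpty() ? null : docs.get(0);
    }

    // after: a direct id lookup; the fail-document test moves into Java
    static SolrDocument byId(SolrClient client, String urlhash)
            throws SolrServerException, IOException {
        SolrDocument doc = client.getById(urlhash); // may return a non-error document
        if (doc == null) return null;
        Object failreason = doc.getFieldValue("failreason_s");
        if (failreason == null || failreason.toString().isEmpty()) return null;
        return doc;
    }
}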

@@ -1587,6 +1587,9 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
         public FailType getFailType() {
             return failType;
         }
+        public Date getFailDate() {
+            return this.failtime;
+        }
         public int getHttpstatus() {
             return httpstatus;
         }
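The new accessor is what enables the CrawlStacker change above: the fail date travels inside the FailDoc value object instead of requiring a separate index lookup. A minimal stand-in for the extended class (fields are illustrative, not the full FailDoc):

import java.util.Date;

// Stand-in for CollectionConfiguration.FailDoc after this hunk: the fail
// date is stored alongside the reason and exposed through the new getter,
// so callers avoid a second index query.
public class FailDocSketch {
    private final String failReason;
    private final Date failtime;

    public FailDocSketch(String failReason, Date failtime) {
        this.failReason = failReason;
        this.failtime = failtime;
    }

    public String getFailReason() { return this.failReason; }
    public Date getFailDate() { return this.failtime; } // the accessor added in this hunk
}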
