added option to record redirected urls in the solr index

pull/1/head
Michael Peter Christen 13 years ago
parent d763e4d94b
commit 3fd4a01286

@@ -73,7 +73,7 @@ scriptscount_i
## content of <meta name="robots" content=#content#> tag and the "X-Robots-Tag" HTTP property
robots_i
## html status return code (i.e. "200" for ok), -1 if not loaded, int
## html status return code (i.e. "200" for ok), -1 if not loaded (see content of failreason_t for this case), int
httpstatus_i
## content of <meta name="generator" content=#content#> tag, text
@@ -259,7 +259,6 @@ iframesscount_i
## number of matching title expressions, textgen
#ext_title_val
## fail reason if a page was not loaded. if the page was loaded then this field is empty, text
failreason_t
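Taken together, these two schema fields describe failed loads: httpstatus_i keeps the HTTP status (or -1 if nothing was loaded) and failreason_t carries the FailCategory name plus the reason string assembled in ZURL.push below. An illustrative error document for a recorded redirect (field values are hypothetical):

httpstatus_i: 301
failreason_t: FINAL_REDIRECT_RULE redirect to http://example.org/new (http return code = 301)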

@@ -700,6 +700,7 @@ crawler.http.acceptLanguage=en-us,en;q=0.5
crawler.http.acceptCharset=ISO-8859-1,utf-8;q=0.7,*;q=0.7
crawler.http.maxFileSize=10485760
crawler.http.FollowRedirects=true
crawler.http.RecordRedirects=false
# ftp crawler specific settings; size in bytes
crawler.ftp.maxFileSize=10485760
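For reference, a minimal sketch of how the crawler reads the two redirect flags, assuming the Switchboard instance `sb` as used in HTTPLoader below. Note that the in-code default is true while yacy.conf ships RecordRedirects as false:

// minimal sketch, assuming the Switchboard instance `sb` from HTTPLoader below
boolean follow = sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true); // follow the Location header?
boolean record = sb.getConfigBool(SwitchboardConstants.CRAWLER_RECORD_REDIRECTS, true); // store the redirect as an error document?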

@@ -36,9 +36,9 @@ import java.util.concurrent.LinkedBlockingQueue;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.services.federated.solr.ShardSolrConnector;
import net.yacy.cora.services.federated.solr.SolrConnector;
import net.yacy.cora.services.federated.solr.SolrDoc;
import net.yacy.cora.services.federated.solr.ShardSolrConnector;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.index.Index;
@@ -61,10 +61,17 @@ public class ZURL implements Iterable<ZURL.Entry> {
public enum FailCategory {
// TEMPORARY categories are such failure cases that should be tried again
// FINAL categories are such failure cases that are final and should not be tried again
TEMPORARY_NETWORK_FAILURE, // an entity could not be loaded
FINAL_PROCESS_CONTEXT, // because of a processing context we do not want that url again (i.e. remote crawling)
FINAL_LOAD_CONTEXT, // the crawler configuration does not want to load the entity
FINAL_ROBOTS_RULE; // a remote server denies indexing or loading
TEMPORARY_NETWORK_FAILURE(true), // an entity could not be loaded
FINAL_PROCESS_CONTEXT(false), // because of a processing context we do not want that url again (i.e. remote crawling)
FINAL_LOAD_CONTEXT(false), // the crawler configuration does not want to load the entity
FINAL_ROBOTS_RULE(true), // a remote server denies indexing or loading
FINAL_REDIRECT_RULE(true); // the remote server redirects this page, thus disallowing reading of content
public final boolean store;
private FailCategory(boolean store) {
this.store = store;
}
}
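// Note: the new per-category `store` flag replaces the former hard-coded check
// against TEMPORARY_NETWORK_FAILURE and FINAL_ROBOTS_RULE in ZURL.push (see the
// hunk below), so each category now declares itself whether a failed URL is
// forwarded to Solr.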
private final static Row rowdef = new Row(
@@ -153,6 +160,7 @@ public class ZURL implements Iterable<ZURL.Entry> {
String anycause,
final int httpcode) {
// assert executor != null; // null == proxy !
assert failCategory.store || httpcode == -1 : "failCategory=" + failCategory.name();
if (exists(bentry.url().hash())) return; // don't insert double causes
if (anycause == null) anycause = "unknown";
final String reason = anycause + ((httpcode >= 0) ? " (http return code = " + httpcode + ")" : "");
@@ -160,7 +168,7 @@ public class ZURL implements Iterable<ZURL.Entry> {
put(entry);
this.stack.add(entry.hash());
Log.logInfo("Rejected URL", bentry.url().toNormalform(false, false) + " - " + reason);
if (this.solrConnector != null && (failCategory == FailCategory.TEMPORARY_NETWORK_FAILURE || failCategory == FailCategory.FINAL_ROBOTS_RULE)) {
if (this.solrConnector != null && failCategory.store) {
// send the error to solr
try {
SolrDoc errorDoc = this.solrConfiguration.err(bentry.url(), failCategory.name() + " " + reason, httpcode);

@@ -79,8 +79,10 @@ public final class HTTPLoader {
private Response load(final Request request, final int retryCount, final int maxFileSize, final boolean checkBlacklist) throws IOException {
byte[] myHash = this.sb.peers.mySeed().hash.getBytes();
if (retryCount < 0) {
this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "retry counter exceeded", -1);
this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "retry counter exceeded", -1);
throw new IOException("retry counter exceeded for URL " + request.url().toString() + ". Processing aborted.");
}
@@ -96,7 +98,7 @@ public final class HTTPLoader {
// check if url is in blacklist
final String hostlow = host.toLowerCase();
if (checkBlacklist && Switchboard.urlBlacklist.isListed(BlacklistType.CRAWLER, hostlow, path)) {
this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.");
}
@@ -131,60 +133,60 @@ public final class HTTPLoader {
// send request
final byte[] responseBody = client.GETbytes(url, maxFileSize);
final int code = client.getHttpResponse().getStatusLine().getStatusCode();
final ResponseHeader responseHeader = new ResponseHeader(code, client.getHttpResponse().getAllHeaders());
final int statusCode = client.getHttpResponse().getStatusLine().getStatusCode();
final ResponseHeader responseHeader = new ResponseHeader(statusCode, client.getHttpResponse().getAllHeaders());
String requestURLString = request.url().toNormalform(false, false);
if (code > 299 && code < 310) {
// redirection (content may be empty)
if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true)) {
if (responseHeader.containsKey(HeaderFramework.LOCATION)) {
// getting redirection URL
String redirectionUrlString = responseHeader.get(HeaderFramework.LOCATION);
redirectionUrlString = redirectionUrlString.trim();
// check redirection
if (statusCode > 299 && statusCode < 310) {
if (redirectionUrlString.length() == 0) {
this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "redirection header empy", code);
throw new IOException("CRAWLER Redirection of URL=" + request.url().toString() + " aborted. Location header is empty.");
}
// read redirection URL
String redirectionUrlString = responseHeader.get(HeaderFramework.LOCATION);
redirectionUrlString = redirectionUrlString == null ? "" : redirectionUrlString.trim();
// normalizing URL
final DigestURI redirectionUrl = new DigestURI(MultiProtocolURI.newURL(request.url(), redirectionUrlString));
if (redirectionUrlString.length() == 0) {
this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "no redirection url provided, field '" + HeaderFramework.LOCATION + "' is empty", statusCode);
throw new IOException("REJECTED EMTPY REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString);
}
// normalize URL
final DigestURI redirectionUrl = new DigestURI(MultiProtocolURI.newURL(request.url(), redirectionUrlString));
// restart crawling with new url
this.log.logInfo("CRAWLER Redirection detected ('" + client.getHttpResponse().getStatusLine() + "') for URL " + requestURLString);
this.log.logInfo("CRAWLER ..Redirecting request to: " + redirectionUrl);
// restart crawling with new url
this.log.logInfo("CRAWLER Redirection detected ('" + client.getHttpResponse().getStatusLine() + "') for URL " + request.url().toString());
this.log.logInfo("CRAWLER ..Redirecting request to: " + redirectionUrl);
if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_RECORD_REDIRECTS, true)) {
this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.FINAL_REDIRECT_RULE, "redirect to " + redirectionUrlString, statusCode);
}
if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true)) {
// if we are already doing a shutdown we don't need to retry crawling
if (Thread.currentThread().isInterrupted()) {
this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "server shutdown", code);
throw new IOException("CRAWLER Retry of URL=" + request.url().toString() + " aborted because of server shutdown.");
this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "server shutdown", statusCode);
throw new IOException("CRAWLER Retry of URL=" + requestURLString + " aborted because of server shutdown.");
}
// check if the url was already indexed
final String dbname = this.sb.urlExists(Segments.Process.LOCALCRAWLING, redirectionUrl.hash());
if (dbname != null) { //OTTO
this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "redirection to double content", code);
throw new IOException("CRAWLER Redirection of URL=" + request.url().toString() + " ignored. The url appears already in db " + dbname);
if (dbname != null) { // customer request
this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "redirection to double content", statusCode);
throw new IOException("CRAWLER Redirection of URL=" + requestURLString + " ignored. The url appears already in db " + dbname);
}
// retry crawling with new url
request.redirectURL(redirectionUrl);
return load(request, retryCount - 1, maxFileSize, checkBlacklist);
} else {
// no redirection url provided
this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "no redirection url provided", code);
throw new IOException("REJECTED EMTPY REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString());
}
} else {
// we don't want to follow redirects
this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", code);
throw new IOException("REJECTED UNWANTED REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString());
this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode);
throw new IOException("REJECTED UNWANTED REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString);
}
} else if (responseBody == null) {
// no response, reject file
this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "no response body", code);
throw new IOException("REJECTED EMPTY RESPONSE BODY '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString());
} else if (code == 200 || code == 203) {
this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "no response body", statusCode);
throw new IOException("REJECTED EMPTY RESPONSE BODY '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString);
} else if (statusCode == 200 || statusCode == 203) {
// the transfer is ok
// we write the new cache entry to file system directly
@@ -193,7 +195,7 @@ public final class HTTPLoader {
// check length again in case it was not possible to get the length before loading
if (maxFileSize > 0 && contentLength > maxFileSize) {
this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.FINAL_PROCESS_CONTEXT, "file size limit exceeded", code);
this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.FINAL_PROCESS_CONTEXT, "file size limit exceeded", statusCode);
throw new IOException("REJECTED URL " + request.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes. (GET)");
}
@@ -211,8 +213,8 @@ public final class HTTPLoader {
return response;
} else {
// if the response has not the right response type then reject file
this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", code);
throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString());
this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", statusCode);
throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString);
}
}
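Condensed, the new redirect handling in HTTPLoader.load reads the Location header first, optionally records the redirect, and only then decides whether to follow it. A simplified sketch of the flow from the hunk above (error paths and the double-content check omitted):

if (statusCode > 299 && statusCode < 310) {
    // read and normalize the redirection target
    String location = responseHeader.get(HeaderFramework.LOCATION);
    location = location == null ? "" : location.trim();
    final DigestURI redirectionUrl = new DigestURI(MultiProtocolURI.newURL(request.url(), location));
    if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_RECORD_REDIRECTS, true)) {
        // new in this commit: store the redirect itself as a FINAL_REDIRECT_RULE error document
        this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1,
                FailCategory.FINAL_REDIRECT_RULE, "redirect to " + location, statusCode);
    }
    if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true)) {
        // retry crawling with the new url
        request.redirectURL(redirectionUrl);
        return load(request, retryCount - 1, maxFileSize, checkBlacklist);
    }
}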

@@ -296,7 +296,8 @@ public final class SwitchboardConstants {
* <p>Name of the setting how many active crawler-threads may maximal be running on the same time</p>
*/
public static final String CRAWLER_THREADS_ACTIVE_MAX = "crawler.MaxActiveThreads";
public static final String CRAWLER_FOLLOW_REDIRECTS = "crawler.http.FollowRedirects";
public static final String CRAWLER_FOLLOW_REDIRECTS = "crawler.http.FollowRedirects"; // ignore the target url and follow to the redirect
public static final String CRAWLER_RECORD_REDIRECTS = "crawler.http.RecordRedirects"; // record the ignored redirected page to the index store
public static final String YACY_MODE_DEBUG = "yacyDebugMode";
/**

@@ -102,14 +102,14 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
this.localSolr = new EmbeddedSolrConnector(solrLocation, new File(new File(Switchboard.getSwitchboard().appPath,"defaults"), "solr"));
}
public SolrConnector getRemoteSolr() {
return this.remoteSolr;
}
public SolrConnector getLocalSolr() {
return this.localSolr;
}
public SolrConnector getRemoteSolr() {
return this.remoteSolr;
}
public void clearCache() {
if (this.urlIndexFile instanceof Cache) ((Cache) this.urlIndexFile).clearCache();
if (this.statsDump != null) this.statsDump.clear();
@@ -137,8 +137,8 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
this.urlIndexFile.close();
this.urlIndexFile = null;
}
if (this.remoteSolr != null) this.remoteSolr.close();
if (this.localSolr != null) this.localSolr.close();
if (this.remoteSolr != null) this.remoteSolr.close();
}
public int writeCacheSize() {
@@ -208,10 +208,23 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
if (MemoryControl.shortStatus()) clearCache() ;
}
public boolean remove(final byte[] urlHashBytes) {
if (urlHashBytes == null) return false;
public boolean remove(final byte[] urlHash) {
if (urlHash == null) return false;
if (this.localSolr != null || this.remoteSolr != null) {
String urls = ASCII.String(urlHash);
if (this.localSolr != null) try {
this.localSolr.delete(urls);
} catch (final Throwable e) {
Log.logException(e);
}
if (this.remoteSolr != null) try {
this.remoteSolr.delete(urls);
} catch (final Throwable e) {
Log.logException(e);
}
}
try {
final Row.Entry r = this.urlIndexFile.remove(urlHashBytes);
final Row.Entry r = this.urlIndexFile.remove(urlHash);
if (r != null) this.statsDump = null;
return r != null;
} catch (final IOException e) {
@@ -221,11 +234,22 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
public boolean exists(final byte[] urlHash) {
if (urlHash == null) return false;
try {
if (this.remoteSolr != null && this.remoteSolr.exists(ASCII.String(urlHash))) {
return true;
if (this.localSolr != null || this.remoteSolr != null) {
String urls = ASCII.String(urlHash);
try {
if (this.localSolr != null && this.localSolr.exists(urls)) {
return true;
}
} catch (final Throwable e) {
Log.logException(e);
}
try {
if (this.remoteSolr != null && this.remoteSolr.exists(urls)) {
return true;
}
} catch (final Throwable e) {
Log.logException(e);
}
} catch (final Throwable e) {
}
if (this.urlIndexFile == null) return false; // case may happen during shutdown
return this.urlIndexFile.has(urlHash);
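A hypothetical usage sketch of the extended MetadataRepository behaviour; the instance name `metadata` and the hash literal are placeholders, and ASCII is net.yacy.cora.document.ASCII as imported above:

// hypothetical example, assuming a MetadataRepository instance `metadata`
byte[] urlHash = ASCII.getBytes("ABCDEFGHIJKL");   // placeholder 12-char YaCy url hash
metadata.remove(urlHash);                          // now also deletes the document from local and remote Solr
boolean known = metadata.exists(urlHash);          // checks local Solr, then remote Solr, then the url index file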
