From 3fd4a012861dfb8b7256f5412063a4ebb76043ee Mon Sep 17 00:00:00 2001
From: Michael Peter Christen <mc@yacy.net>
Date: Tue, 26 Jun 2012 13:54:48 +0200
Subject: [PATCH] added option to record urls that are forwarded to the solr
 index

---
 defaults/solr.keys.list                       |  3 +-
 defaults/yacy.init                            |  1 +
 source/de/anomic/crawler/ZURL.java            | 20 +++--
 .../anomic/crawler/retrieval/HTTPLoader.java  | 78 ++++++++++---------
 .../net/yacy/search/SwitchboardConstants.java |  3 +-
 .../yacy/search/index/MetadataRepository.java | 48 +++++++++---
 6 files changed, 94 insertions(+), 59 deletions(-)
diff --git a/defaults/solr.keys.list b/defaults/solr.keys.list
index fa5ca6e23..3394a7fd6 100644
--- a/defaults/solr.keys.list
+++ b/defaults/solr.keys.list
@@ -73,7 +73,7 @@ scriptscount_i
 ## content of <meta name="robots" content=#content#> tag and the "X-Robots-Tag" HTTP property
 robots_i
 
-## html status return code (i.e. "200" for ok), -1 if not loaded, int
+## http status return code (i.e. "200" for ok), -1 if not loaded (see content of failreason_t for this case), int
 httpstatus_i
 
 ## content of <meta name="generator" content=#content#> tag, text
@@ -259,7 +259,6 @@ iframesscount_i
 ## number of matching title expressions, textgen
 #ext_title_val
 
-
 ## fail reason if a page was not loaded. if the page was loaded then this field is empty, text
 failreason_t
 
diff --git a/defaults/yacy.init b/defaults/yacy.init
index e9d67aaf3..390914d1f 100644
--- a/defaults/yacy.init
+++ b/defaults/yacy.init
@@ -700,6 +700,7 @@ crawler.http.acceptLanguage=en-us,en;q=0.5
 crawler.http.acceptCharset=ISO-8859-1,utf-8;q=0.7,*;q=0.7
 crawler.http.maxFileSize=10485760
 crawler.http.FollowRedirects=true
+crawler.http.RecordRedirects=false
 
 # ftp crawler specific settings; size in bytes
 crawler.ftp.maxFileSize=10485760
diff --git a/source/de/anomic/crawler/ZURL.java b/source/de/anomic/crawler/ZURL.java
index 603e517d5..83dc8798b 100644
--- a/source/de/anomic/crawler/ZURL.java
+++ b/source/de/anomic/crawler/ZURL.java
@@ -36,9 +36,9 @@ import java.util.concurrent.LinkedBlockingQueue;
 
 import net.yacy.cora.document.ASCII;
 import net.yacy.cora.document.UTF8;
+import net.yacy.cora.services.federated.solr.ShardSolrConnector;
 import net.yacy.cora.services.federated.solr.SolrConnector;
 import net.yacy.cora.services.federated.solr.SolrDoc;
-import net.yacy.cora.services.federated.solr.ShardSolrConnector;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.data.word.Word;
 import net.yacy.kelondro.index.Index;
@@ -61,10 +61,17 @@ public class ZURL implements Iterable<ZURL.Entry> {
     public enum FailCategory {
         // TEMPORARY categories are such failure cases that should be tried again
         // FINAL categories are such failure cases that are final and should not be tried again
-        TEMPORARY_NETWORK_FAILURE, // an entity could not been loaded
-        FINAL_PROCESS_CONTEXT,     // because of a processing context we do not want that url again (i.e. remote crawling)
-        FINAL_LOAD_CONTEXT,        // the crawler configuration does not want to load the entity
-        FINAL_ROBOTS_RULE;         // a remote server denies indexing or loading
+        TEMPORARY_NETWORK_FAILURE(true), // an entity could not be loaded
+        FINAL_PROCESS_CONTEXT(false),    // because of a processing context we do not want that url again (i.e. remote crawling)
+        FINAL_LOAD_CONTEXT(false),       // the crawler configuration does not want to load the entity
+        FINAL_ROBOTS_RULE(true),         // a remote server denies indexing or loading
+        FINAL_REDIRECT_RULE(true);       // the remote server redirects this page, thus disallowing reading of content
+
+        public final boolean store;
+
+        private FailCategory(boolean store) {
+            this.store = store;
+        }
     }
 
     private final static Row rowdef = new Row(
@@ -153,6 +160,7 @@ public class ZURL implements Iterable<ZURL.Entry> {
             String anycause,
             final int httpcode) {
         // assert executor != null; // null == proxy !
+        assert failCategory.store || httpcode == -1 : "failCategory=" + failCategory.name();
         if (exists(bentry.url().hash())) return; // don't insert double causes
         if (anycause == null) anycause = "unknown";
         final String reason = anycause + ((httpcode >= 0) ? " (http return code = " + httpcode + ")" : "");
@@ -160,7 +168,7 @@ public class ZURL implements Iterable<ZURL.Entry> {
         put(entry);
         this.stack.add(entry.hash());
         Log.logInfo("Rejected URL", bentry.url().toNormalform(false, false) + " - " + reason);
-        if (this.solrConnector != null && (failCategory == FailCategory.TEMPORARY_NETWORK_FAILURE || failCategory == FailCategory.FINAL_ROBOTS_RULE)) {
+        if (this.solrConnector != null && failCategory.store) {
             // send the error to solr
             try {
                 SolrDoc errorDoc = this.solrConfiguration.err(bentry.url(), failCategory.name() + " " + reason, httpcode);
diff --git a/source/de/anomic/crawler/retrieval/HTTPLoader.java b/source/de/anomic/crawler/retrieval/HTTPLoader.java
index 57b71434a..0d07ed5f0 100644
--- a/source/de/anomic/crawler/retrieval/HTTPLoader.java
+++ b/source/de/anomic/crawler/retrieval/HTTPLoader.java
@@ -79,8 +79,10 @@ public final class HTTPLoader {
 
     private Response load(final Request request, final int retryCount, final int maxFileSize, final boolean checkBlacklist) throws IOException {
 
+        byte[] myHash = this.sb.peers.mySeed().hash.getBytes();
+
         if (retryCount < 0) {
-            this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "retry counter exceeded", -1);
+            this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "retry counter exceeded", -1);
             throw new IOException("retry counter exceeded for URL " + request.url().toString() + ". Processing aborted.");
         }
 
@@ -96,7 +98,7 @@ public final class HTTPLoader {
         // check if url is in blacklist
         final String hostlow = host.toLowerCase();
         if (checkBlacklist && Switchboard.urlBlacklist.isListed(BlacklistType.CRAWLER, hostlow, path)) {
-            this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
+            this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
             throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.");
         }
 
@@ -131,60 +133,60 @@ public final class HTTPLoader {
 
         // send request
     	final byte[] responseBody = client.GETbytes(url, maxFileSize);
-        final int code = client.getHttpResponse().getStatusLine().getStatusCode();
-    	final ResponseHeader responseHeader = new ResponseHeader(code, client.getHttpResponse().getAllHeaders());
+        final int statusCode = client.getHttpResponse().getStatusLine().getStatusCode();
+    	final ResponseHeader responseHeader = new ResponseHeader(statusCode, client.getHttpResponse().getAllHeaders());
+        String requestURLString = request.url().toNormalform(false, false);
 
-    	if (code > 299 && code < 310) {
-    		// redirection (content may be empty)
-    	    if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true)) {
-                if (responseHeader.containsKey(HeaderFramework.LOCATION)) {
-                    // getting redirection URL
-                	String redirectionUrlString = responseHeader.get(HeaderFramework.LOCATION);
-                    redirectionUrlString = redirectionUrlString.trim();
+        // check redirection
+    	if (statusCode > 299 && statusCode < 310) {
 
-                    if (redirectionUrlString.length() == 0) {
-                        this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "redirection header empy", code);
-                        throw new IOException("CRAWLER Redirection of URL=" + request.url().toString() + " aborted. Location header is empty.");
-                    }
+    	    // read redirection URL
+            String redirectionUrlString = responseHeader.get(HeaderFramework.LOCATION);
+            redirectionUrlString = redirectionUrlString == null ? "" : redirectionUrlString.trim();
 
-                    // normalizing URL
-                    final DigestURI redirectionUrl = new DigestURI(MultiProtocolURI.newURL(request.url(), redirectionUrlString));
+            if (redirectionUrlString.length() == 0) {
+                this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "no redirection url provided, field '" + HeaderFramework.LOCATION + "' is empty", statusCode);
+                throw new IOException("REJECTED EMTPY REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString);
+            }
+
+            // normalize URL
+            final DigestURI redirectionUrl = new DigestURI(MultiProtocolURI.newURL(request.url(), redirectionUrlString));
+
+            // restart crawling with new url
+            this.log.logInfo("CRAWLER Redirection detected ('" + client.getHttpResponse().getStatusLine() + "') for URL " + requestURLString);
+            this.log.logInfo("CRAWLER ..Redirecting request to: " + redirectionUrl);
 
-                    // restart crawling with new url
-                    this.log.logInfo("CRAWLER Redirection detected ('" + client.getHttpResponse().getStatusLine() + "') for URL " + request.url().toString());
-                    this.log.logInfo("CRAWLER ..Redirecting request to: " + redirectionUrl);
+            if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_RECORD_REDIRECTS, true)) {
+                this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.FINAL_REDIRECT_RULE, "redirect to " + redirectionUrlString, statusCode);
+            }
 
+    	    if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true)) {
                     // if we are already doing a shutdown we don't need to retry crawling
                     if (Thread.currentThread().isInterrupted()) {
-                        this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "server shutdown", code);
-                        throw new IOException("CRAWLER Retry of URL=" + request.url().toString() + " aborted because of server shutdown.");
+                        this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "server shutdown", statusCode);
+                        throw new IOException("CRAWLER Retry of URL=" + requestURLString + " aborted because of server shutdown.");
                     }
 
                     // check if the url was already indexed
                     final String dbname = this.sb.urlExists(Segments.Process.LOCALCRAWLING, redirectionUrl.hash());
-                    if (dbname != null) { //OTTO
-                        this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "redirection to double content", code);
-                        throw new IOException("CRAWLER Redirection of URL=" + request.url().toString() + " ignored. The url appears already in db " + dbname);
+                    if (dbname != null) { // customer request
+                        this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "redirection to double content", statusCode);
+                        throw new IOException("CRAWLER Redirection of URL=" + requestURLString + " ignored. The url appears already in db " + dbname);
                     }
 
                     // retry crawling with new url
                     request.redirectURL(redirectionUrl);
                     return load(request, retryCount - 1, maxFileSize, checkBlacklist);
-                } else {
-                	// no redirection url provided
-                    this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "no redirection url provided", code);
-                    throw new IOException("REJECTED EMTPY REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString());
-                }
     	    } else {
     	        // we don't want to follow redirects
-                this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", code);
-                throw new IOException("REJECTED UNWANTED REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString());
+                this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode);
+                throw new IOException("REJECTED UNWANTED REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString);
     	    }
         } else if (responseBody == null) {
     	    // no response, reject file
-            this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "no response body", code);
-            throw new IOException("REJECTED EMPTY RESPONSE BODY '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString());
-    	} else if (code == 200 || code == 203) {
+            this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "no response body", statusCode);
+            throw new IOException("REJECTED EMPTY RESPONSE BODY '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString);
+    	} else if (statusCode == 200 || statusCode == 203) {
             // the transfer is ok
 
             // we write the new cache entry to file system directly
@@ -193,7 +195,7 @@ public final class HTTPLoader {
 
             // check length again in case it was not possible to get the length before loading
             if (maxFileSize > 0 && contentLength > maxFileSize) {
-            	this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.FINAL_PROCESS_CONTEXT, "file size limit exceeded", code);
+            	this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.FINAL_PROCESS_CONTEXT, "file size limit exceeded", statusCode);
             	throw new IOException("REJECTED URL " + request.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes. (GET)");
             }
 
@@ -211,8 +213,8 @@ public final class HTTPLoader {
             return response;
     	} else {
             // if the response has not the right response type then reject file
-        	this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", code);
-            throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString());
+        	this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", statusCode);
+            throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString);
         }
     }
 
diff --git a/source/net/yacy/search/SwitchboardConstants.java b/source/net/yacy/search/SwitchboardConstants.java
index b59d3d395..5dc46dca8 100644
--- a/source/net/yacy/search/SwitchboardConstants.java
+++ b/source/net/yacy/search/SwitchboardConstants.java
@@ -296,7 +296,8 @@ public final class SwitchboardConstants {
      * <p>Name of the setting how many active crawler-threads may maximal be running on the same time</p>
      */
     public static final String CRAWLER_THREADS_ACTIVE_MAX       = "crawler.MaxActiveThreads";
-    public static final String CRAWLER_FOLLOW_REDIRECTS         = "crawler.http.FollowRedirects";
+    public static final String CRAWLER_FOLLOW_REDIRECTS         = "crawler.http.FollowRedirects"; // follow http redirects: load the redirect target instead of the requested url
+    public static final String CRAWLER_RECORD_REDIRECTS         = "crawler.http.RecordRedirects"; // record the redirecting url (whose content is not read) in the index store
     public static final String YACY_MODE_DEBUG                  = "yacyDebugMode";
 
     /**
diff --git a/source/net/yacy/search/index/MetadataRepository.java b/source/net/yacy/search/index/MetadataRepository.java
index 6ae2f93eb..b61e2ba9b 100644
--- a/source/net/yacy/search/index/MetadataRepository.java
+++ b/source/net/yacy/search/index/MetadataRepository.java
@@ -102,14 +102,14 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
         this.localSolr = new EmbeddedSolrConnector(solrLocation, new File(new File(Switchboard.getSwitchboard().appPath,"defaults"), "solr"));
     }
 
-    public SolrConnector getRemoteSolr() {
-        return this.remoteSolr;
-    }
-
     public SolrConnector getLocalSolr() {
         return this.localSolr;
     }
 
+    public SolrConnector getRemoteSolr() {
+        return this.remoteSolr;
+    }
+
     public void clearCache() {
         if (this.urlIndexFile instanceof Cache) ((Cache) this.urlIndexFile).clearCache();
         if (this.statsDump != null) this.statsDump.clear();
@@ -137,8 +137,8 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
             this.urlIndexFile.close();
             this.urlIndexFile = null;
         }
-        if (this.remoteSolr != null) this.remoteSolr.close();
         if (this.localSolr != null) this.localSolr.close();
+        if (this.remoteSolr != null) this.remoteSolr.close();
     }
 
     public int writeCacheSize() {
@@ -208,10 +208,23 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
         if (MemoryControl.shortStatus()) clearCache() ;
     }
 
-    public boolean remove(final byte[] urlHashBytes) {
-        if (urlHashBytes == null) return false;
+    public boolean remove(final byte[] urlHash) {
+        if (urlHash == null) return false;
+        if (this.localSolr != null || this.remoteSolr != null) {
+            String urls = ASCII.String(urlHash);
+            if (this.localSolr != null) try {
+                this.localSolr.delete(urls);
+            } catch (final Throwable e) {
+                Log.logException(e);
+            }
+            if (this.remoteSolr != null) try {
+                this.remoteSolr.delete(urls);
+            } catch (final Throwable e) {
+                Log.logException(e);
+            }
+        }
         try {
-            final Row.Entry r = this.urlIndexFile.remove(urlHashBytes);
+            final Row.Entry r = this.urlIndexFile.remove(urlHash);
             if (r != null) this.statsDump = null;
             return r != null;
         } catch (final IOException e) {
@@ -221,11 +234,22 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
 
     public boolean exists(final byte[] urlHash) {
         if (urlHash == null) return false;
-        try {
-            if (this.remoteSolr != null && this.remoteSolr.exists(ASCII.String(urlHash))) {
-                return true;
+        if (this.localSolr != null || this.remoteSolr != null) {
+            String urls = ASCII.String(urlHash);
+            try {
+                if (this.localSolr != null && this.localSolr.exists(urls)) {
+                    return true;
+                }
+            } catch (final Throwable e) {
+                Log.logException(e);
+            }
+            try {
+                if (this.remoteSolr != null && this.remoteSolr.exists(urls)) {
+                    return true;
+                }
+            } catch (final Throwable e) {
+                Log.logException(e);
             }
-        } catch (final Throwable e) {
         }
         if (this.urlIndexFile == null) return false; // case may happen during shutdown
         return this.urlIndexFile.has(urlHash);