From 3fd4a012861dfb8b7256f5412063a4ebb76043ee Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Tue, 26 Jun 2012 13:54:48 +0200 Subject: [PATCH] added option to record urls that are forwarded to the solr index --- defaults/solr.keys.list | 3 +- defaults/yacy.init | 1 + source/de/anomic/crawler/ZURL.java | 20 +++-- .../anomic/crawler/retrieval/HTTPLoader.java | 78 ++++++++++--------- .../net/yacy/search/SwitchboardConstants.java | 3 +- .../yacy/search/index/MetadataRepository.java | 48 +++++++++--- 6 files changed, 94 insertions(+), 59 deletions(-) diff --git a/defaults/solr.keys.list b/defaults/solr.keys.list index fa5ca6e23..3394a7fd6 100644 --- a/defaults/solr.keys.list +++ b/defaults/solr.keys.list @@ -73,7 +73,7 @@ scriptscount_i ## content of tag and the "X-Robots-Tag" HTTP property robots_i -## html status return code (i.e. "200" for ok), -1 if not loaded, int +## html status return code (i.e. "200" for ok), -1 if not loaded (see content of failreason_t for this case), int httpstatus_i ## content of tag, text @@ -259,7 +259,6 @@ iframesscount_i ## number of matching title expressions, textgen #ext_title_val - ## fail reason if a page was not loaded. 
if the page was loaded then this field is empty, text failreason_t diff --git a/defaults/yacy.init b/defaults/yacy.init index e9d67aaf3..390914d1f 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -700,6 +700,7 @@ crawler.http.acceptLanguage=en-us,en;q=0.5 crawler.http.acceptCharset=ISO-8859-1,utf-8;q=0.7,*;q=0.7 crawler.http.maxFileSize=10485760 crawler.http.FollowRedirects=true +crawler.http.RecordRedirects=false # ftp crawler specific settings; size in bytes crawler.ftp.maxFileSize=10485760 diff --git a/source/de/anomic/crawler/ZURL.java b/source/de/anomic/crawler/ZURL.java index 603e517d5..83dc8798b 100644 --- a/source/de/anomic/crawler/ZURL.java +++ b/source/de/anomic/crawler/ZURL.java @@ -36,9 +36,9 @@ import java.util.concurrent.LinkedBlockingQueue; import net.yacy.cora.document.ASCII; import net.yacy.cora.document.UTF8; +import net.yacy.cora.services.federated.solr.ShardSolrConnector; import net.yacy.cora.services.federated.solr.SolrConnector; import net.yacy.cora.services.federated.solr.SolrDoc; -import net.yacy.cora.services.federated.solr.ShardSolrConnector; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.index.Index; @@ -61,10 +61,17 @@ public class ZURL implements Iterable { public enum FailCategory { // TEMPORARY categories are such failure cases that should be tried again // FINAL categories are such failure cases that are final and should not be tried again - TEMPORARY_NETWORK_FAILURE, // an entity could not been loaded - FINAL_PROCESS_CONTEXT, // because of a processing context we do not want that url again (i.e. remote crawling) - FINAL_LOAD_CONTEXT, // the crawler configuration does not want to load the entity - FINAL_ROBOTS_RULE; // a remote server denies indexing or loading + TEMPORARY_NETWORK_FAILURE(true), // an entity could not been loaded + FINAL_PROCESS_CONTEXT(false), // because of a processing context we do not want that url again (i.e. 
remote crawling) + FINAL_LOAD_CONTEXT(false), // the crawler configuration does not want to load the entity + FINAL_ROBOTS_RULE(true), // a remote server denies indexing or loading + FINAL_REDIRECT_RULE(true); // the remote server redirects this page, thus disallowing reading of content + + public final boolean store; + + private FailCategory(boolean store) { + this.store = store; + } } private final static Row rowdef = new Row( @@ -153,6 +160,7 @@ public class ZURL implements Iterable { String anycause, final int httpcode) { // assert executor != null; // null == proxy ! + assert failCategory.store || httpcode == -1 : "failCategory=" + failCategory.name(); if (exists(bentry.url().hash())) return; // don't insert double causes if (anycause == null) anycause = "unknown"; final String reason = anycause + ((httpcode >= 0) ? " (http return code = " + httpcode + ")" : ""); @@ -160,7 +168,7 @@ public class ZURL implements Iterable { put(entry); this.stack.add(entry.hash()); Log.logInfo("Rejected URL", bentry.url().toNormalform(false, false) + " - " + reason); - if (this.solrConnector != null && (failCategory == FailCategory.TEMPORARY_NETWORK_FAILURE || failCategory == FailCategory.FINAL_ROBOTS_RULE)) { + if (this.solrConnector != null && failCategory.store) { // send the error to solr try { SolrDoc errorDoc = this.solrConfiguration.err(bentry.url(), failCategory.name() + " " + reason, httpcode); diff --git a/source/de/anomic/crawler/retrieval/HTTPLoader.java b/source/de/anomic/crawler/retrieval/HTTPLoader.java index 57b71434a..0d07ed5f0 100644 --- a/source/de/anomic/crawler/retrieval/HTTPLoader.java +++ b/source/de/anomic/crawler/retrieval/HTTPLoader.java @@ -79,8 +79,10 @@ public final class HTTPLoader { private Response load(final Request request, final int retryCount, final int maxFileSize, final boolean checkBlacklist) throws IOException { + byte[] myHash = this.sb.peers.mySeed().hash.getBytes(); + if (retryCount < 0) { - this.sb.crawlQueues.errorURL.push(request, 
this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "retry counter exceeded", -1); + this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "retry counter exceeded", -1); throw new IOException("retry counter exceeded for URL " + request.url().toString() + ". Processing aborted."); } @@ -96,7 +98,7 @@ public final class HTTPLoader { // check if url is in blacklist final String hostlow = host.toLowerCase(); if (checkBlacklist && Switchboard.urlBlacklist.isListed(BlacklistType.CRAWLER, hostlow, path)) { - this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1); + this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1); throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist."); } @@ -131,60 +133,60 @@ public final class HTTPLoader { // send request final byte[] responseBody = client.GETbytes(url, maxFileSize); - final int code = client.getHttpResponse().getStatusLine().getStatusCode(); - final ResponseHeader responseHeader = new ResponseHeader(code, client.getHttpResponse().getAllHeaders()); + final int statusCode = client.getHttpResponse().getStatusLine().getStatusCode(); + final ResponseHeader responseHeader = new ResponseHeader(statusCode, client.getHttpResponse().getAllHeaders()); + String requestURLString = request.url().toNormalform(false, false); - if (code > 299 && code < 310) { - // redirection (content may be empty) - if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true)) { - if (responseHeader.containsKey(HeaderFramework.LOCATION)) { - // getting redirection URL - String redirectionUrlString = responseHeader.get(HeaderFramework.LOCATION); - redirectionUrlString = redirectionUrlString.trim(); + // check redirection + if (statusCode > 
299 && statusCode < 310) { - if (redirectionUrlString.length() == 0) { - this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "redirection header empy", code); - throw new IOException("CRAWLER Redirection of URL=" + request.url().toString() + " aborted. Location header is empty."); - } + // read redirection URL + String redirectionUrlString = responseHeader.get(HeaderFramework.LOCATION); + redirectionUrlString = redirectionUrlString == null ? "" : redirectionUrlString.trim(); - // normalizing URL - final DigestURI redirectionUrl = new DigestURI(MultiProtocolURI.newURL(request.url(), redirectionUrlString)); + if (redirectionUrlString.length() == 0) { + this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "no redirection url provided, field '" + HeaderFramework.LOCATION + "' is empty", statusCode); + throw new IOException("REJECTED EMTPY REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString); + } + + // normalize URL + final DigestURI redirectionUrl = new DigestURI(MultiProtocolURI.newURL(request.url(), redirectionUrlString)); + + // restart crawling with new url + this.log.logInfo("CRAWLER Redirection detected ('" + client.getHttpResponse().getStatusLine() + "') for URL " + requestURLString); + this.log.logInfo("CRAWLER ..Redirecting request to: " + redirectionUrl); - // restart crawling with new url - this.log.logInfo("CRAWLER Redirection detected ('" + client.getHttpResponse().getStatusLine() + "') for URL " + request.url().toString()); - this.log.logInfo("CRAWLER ..Redirecting request to: " + redirectionUrl); + if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_RECORD_REDIRECTS, true)) { + this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.FINAL_REDIRECT_RULE, "redirect to " + redirectionUrlString, statusCode); + } + if 
(this.sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true)) { // if we are already doing a shutdown we don't need to retry crawling if (Thread.currentThread().isInterrupted()) { - this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "server shutdown", code); - throw new IOException("CRAWLER Retry of URL=" + request.url().toString() + " aborted because of server shutdown."); + this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "server shutdown", statusCode); + throw new IOException("CRAWLER Retry of URL=" + requestURLString + " aborted because of server shutdown."); } // check if the url was already indexed final String dbname = this.sb.urlExists(Segments.Process.LOCALCRAWLING, redirectionUrl.hash()); - if (dbname != null) { //OTTO - this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "redirection to double content", code); - throw new IOException("CRAWLER Redirection of URL=" + request.url().toString() + " ignored. The url appears already in db " + dbname); + if (dbname != null) { // customer request + this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "redirection to double content", statusCode); + throw new IOException("CRAWLER Redirection of URL=" + requestURLString + " ignored. 
The url appears already in db " + dbname); } // retry crawling with new url request.redirectURL(redirectionUrl); return load(request, retryCount - 1, maxFileSize, checkBlacklist); - } else { - // no redirection url provided - this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "no redirection url provided", code); - throw new IOException("REJECTED EMTPY REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString()); - } } else { // we don't want to follow redirects - this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", code); - throw new IOException("REJECTED UNWANTED REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString()); + this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode); + throw new IOException("REJECTED UNWANTED REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString); } } else if (responseBody == null) { // no response, reject file - this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "no response body", code); - throw new IOException("REJECTED EMPTY RESPONSE BODY '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString()); - } else if (code == 200 || code == 203) { + this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "no response body", statusCode); + throw new IOException("REJECTED EMPTY RESPONSE BODY '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString); + } else if (statusCode == 200 || statusCode == 203) { // the transfer is ok // we write the new cache entry to 
file system directly @@ -193,7 +195,7 @@ public final class HTTPLoader { // check length again in case it was not possible to get the length before loading if (maxFileSize > 0 && contentLength > maxFileSize) { - this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.FINAL_PROCESS_CONTEXT, "file size limit exceeded", code); + this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.FINAL_PROCESS_CONTEXT, "file size limit exceeded", statusCode); throw new IOException("REJECTED URL " + request.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes. (GET)"); } @@ -211,8 +213,8 @@ public final class HTTPLoader { return response; } else { // if the response has not the right response type then reject file - this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", code); - throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString()); + this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", statusCode); + throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString); } } diff --git a/source/net/yacy/search/SwitchboardConstants.java b/source/net/yacy/search/SwitchboardConstants.java index b59d3d395..5dc46dca8 100644 --- a/source/net/yacy/search/SwitchboardConstants.java +++ b/source/net/yacy/search/SwitchboardConstants.java @@ -296,7 +296,8 @@ public final class SwitchboardConstants { *

Name of the setting for the maximum number of crawler threads that may be active at the same time

*/ public static final String CRAWLER_THREADS_ACTIVE_MAX = "crawler.MaxActiveThreads"; - public static final String CRAWLER_FOLLOW_REDIRECTS = "crawler.http.FollowRedirects"; + public static final String CRAWLER_FOLLOW_REDIRECTS = "crawler.http.FollowRedirects"; // ignore the target url and follow to the redirect + public static final String CRAWLER_RECORD_REDIRECTS = "crawler.http.RecordRedirects"; // record the ignored redirected page to the index store public static final String YACY_MODE_DEBUG = "yacyDebugMode"; /** diff --git a/source/net/yacy/search/index/MetadataRepository.java b/source/net/yacy/search/index/MetadataRepository.java index 6ae2f93eb..b61e2ba9b 100644 --- a/source/net/yacy/search/index/MetadataRepository.java +++ b/source/net/yacy/search/index/MetadataRepository.java @@ -102,14 +102,14 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable this.localSolr = new EmbeddedSolrConnector(solrLocation, new File(new File(Switchboard.getSwitchboard().appPath,"defaults"), "solr")); } - public SolrConnector getRemoteSolr() { - return this.remoteSolr; - } - public SolrConnector getLocalSolr() { return this.localSolr; } + public SolrConnector getRemoteSolr() { + return this.remoteSolr; + } + public void clearCache() { if (this.urlIndexFile instanceof Cache) ((Cache) this.urlIndexFile).clearCache(); if (this.statsDump != null) this.statsDump.clear(); @@ -137,8 +137,8 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable this.urlIndexFile.close(); this.urlIndexFile = null; } - if (this.remoteSolr != null) this.remoteSolr.close(); if (this.localSolr != null) this.localSolr.close(); + if (this.remoteSolr != null) this.remoteSolr.close(); } public int writeCacheSize() { @@ -208,10 +208,23 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable if (MemoryControl.shortStatus()) clearCache() ; } - public boolean remove(final byte[] urlHashBytes) { - if (urlHashBytes == null) return false; + public boolean 
remove(final byte[] urlHash) { + if (urlHash == null) return false; + if (this.localSolr != null || this.remoteSolr != null) { + String urls = ASCII.String(urlHash); + if (this.localSolr != null) try { + this.localSolr.delete(urls); + } catch (final Throwable e) { + Log.logException(e); + } + if (this.remoteSolr != null) try { + this.remoteSolr.delete(urls); + } catch (final Throwable e) { + Log.logException(e); + } + } try { - final Row.Entry r = this.urlIndexFile.remove(urlHashBytes); + final Row.Entry r = this.urlIndexFile.remove(urlHash); if (r != null) this.statsDump = null; return r != null; } catch (final IOException e) { @@ -221,11 +234,22 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable public boolean exists(final byte[] urlHash) { if (urlHash == null) return false; - try { - if (this.remoteSolr != null && this.remoteSolr.exists(ASCII.String(urlHash))) { - return true; + if (this.localSolr != null || this.remoteSolr != null) { + String urls = ASCII.String(urlHash); + try { + if (this.localSolr != null && this.localSolr.exists(urls)) { + return true; + } + } catch (final Throwable e) { + Log.logException(e); + } + try { + if (this.remoteSolr != null && this.remoteSolr.exists(urls)) { + return true; + } + } catch (final Throwable e) { + Log.logException(e); } - } catch (final Throwable e) { } if (this.urlIndexFile == null) return false; // case may happen during shutdown return this.urlIndexFile.has(urlHash);