From 10cf8215bd5aa5099e780c2703184fc7d5779f14 Mon Sep 17 00:00:00 2001
From: Michael Peter Christen
Date: Thu, 17 Apr 2014 13:21:43 +0200
Subject: [PATCH] added crawl depth for failed documents

---
 htroot/Crawler_p.java                              |  2 +-
 htroot/HostBrowser.java                            |  4 ++--
 htroot/yacy/crawlReceipt.java                      |  2 +-
 source/net/yacy/crawler/CrawlStacker.java          |  2 +-
 source/net/yacy/crawler/data/CrawlQueues.java      |  6 +++---
 .../net/yacy/crawler/retrieval/FTPLoader.java      |  2 +-
 .../net/yacy/crawler/retrieval/HTTPLoader.java     | 18 +++++++++---------
 .../net/yacy/repository/LoaderDispatcher.java      |  2 +-
 source/net/yacy/search/Switchboard.java            | 18 +++++++++---------
 .../net/yacy/search/index/DocumentIndex.java       |  2 +-
 source/net/yacy/search/index/ErrorCache.java       |  4 ++--
 .../search/schema/CollectionConfiguration.java     |  6 +++++-
 .../net/yacy/search/snippet/MediaSnippet.java      |  2 +-
 13 files changed, 37 insertions(+), 33 deletions(-)

diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java
index 691af66c4..544ed68be 100644
--- a/htroot/Crawler_p.java
+++ b/htroot/Crawler_p.java
@@ -475,7 +475,7 @@ public class Crawler_p {
             } else {
                 StringBuilder fr = new StringBuilder();
                 for (Map.Entry failure: failurls.entrySet()) {
-                    sb.crawlQueues.errorURL.push(failure.getKey(), null, FailCategory.FINAL_LOAD_CONTEXT, failure.getValue(), -1);
+                    sb.crawlQueues.errorURL.push(failure.getKey(), 0, null, FailCategory.FINAL_LOAD_CONTEXT, failure.getValue(), -1);
                     fr.append(failure.getValue()).append('/');
                 }
diff --git a/htroot/HostBrowser.java b/htroot/HostBrowser.java
index fcd674b8d..95ed59cff 100644
--- a/htroot/HostBrowser.java
+++ b/htroot/HostBrowser.java
@@ -573,7 +573,7 @@ public class HostBrowser {
             Integer rc_internal = (Integer) doc.getFieldValue(CollectionSchema.references_internal_i.getSolrFieldName());
             Integer rc_external = (Integer) doc.getFieldValue(CollectionSchema.references_external_i.getSolrFieldName());
             Integer rc_exthosts = (Integer) doc.getFieldValue(CollectionSchema.references_exthosts_i.getSolrFieldName());
-            this.crawldepth = (cr == null || cr.intValue() < 0) ? 999 : cr.intValue();
+            this.crawldepth = (cr == null || cr.intValue() < 0) ? 998 : cr.intValue();
             this.references = (rc == null || rc.intValue() <= 0) ? 0 : rc.intValue();
             this.references_internal = (rc_internal == null || rc_internal.intValue() <= 0) ? 0 : rc_internal.intValue();
             // calculate the url reference list
@@ -625,7 +625,7 @@ public class HostBrowser {
             }
             if (sbe.length() > 0) sbe.insert(0, "<br/>external referrer:<br/>");
             return
-                (this.crawldepth >= 0 ? "crawldepth: " + this.crawldepth : "") +
+                (this.crawldepth == 998 ? "unknown crawldepth" : this.crawldepth >= 0 ? "crawldepth: " + this.crawldepth : "") +
                 (this.cr_c != null ? ", cr=" + (Math.round(this.cr_c * 1000.0d) / 1000.0d) : "") +
                 (this.cr_n != null ? ", crn=" + this.cr_n : "") +
                 (this.references >= 0 ? ", refs: " + this.references_exthosts + " hosts, " + this.references_external + " ext, " + this.references_internal + " int" + sbi.toString() + sbe.toString() : "");
diff --git a/htroot/yacy/crawlReceipt.java b/htroot/yacy/crawlReceipt.java
index e11b347df..1b175b121 100644
--- a/htroot/yacy/crawlReceipt.java
+++ b/htroot/yacy/crawlReceipt.java
@@ -160,7 +160,7 @@ public final class crawlReceipt {
         }
         sb.crawlQueues.delegatedURL.remove(entry.hash()); // the delegated work is transformed into an error case
-        sb.crawlQueues.errorURL.push(entry.url(), null, FailCategory.FINAL_LOAD_CONTEXT, result + ":" + reason, -1);
+        sb.crawlQueues.errorURL.push(entry.url(), 997, null, FailCategory.FINAL_LOAD_CONTEXT, result + ":" + reason, -1);
         //switchboard.noticeURL.remove(receivedUrlhash);
         prop.put("delay", "3600");
         return prop;
diff --git a/source/net/yacy/crawler/CrawlStacker.java b/source/net/yacy/crawler/CrawlStacker.java
index fd75c990e..6a728a415 100644
--- a/source/net/yacy/crawler/CrawlStacker.java
+++ b/source/net/yacy/crawler/CrawlStacker.java
@@ -150,7 +150,7 @@ public final class CrawlStacker {
             // if the url was rejected we store it into the error URL db
             if (rejectReason != null && !rejectReason.startsWith("double in")) {
                 final CrawlProfile profile = this.crawler.get(UTF8.getBytes(entry.profileHandle()));
-                this.nextQueue.errorURL.push(entry.url(), profile, FailCategory.FINAL_LOAD_CONTEXT, rejectReason, -1);
+                this.nextQueue.errorURL.push(entry.url(), entry.depth(), profile, FailCategory.FINAL_LOAD_CONTEXT, rejectReason, -1);
             }
         } catch (final Exception e) {
             CrawlStacker.log.warn("Error while processing stackCrawl entry.\n" + "Entry: " + entry.toString() + "Error: " + e.toString(), e);
diff --git a/source/net/yacy/crawler/data/CrawlQueues.java b/source/net/yacy/crawler/data/CrawlQueues.java
index 7d66b039b..31154fa3c 100644
--- a/source/net/yacy/crawler/data/CrawlQueues.java
+++ b/source/net/yacy/crawler/data/CrawlQueues.java
@@ -640,7 +640,7 @@ public class CrawlQueues {
                     (robotsEntry = CrawlQueues.this.sb.robots.getEntry(request.url(), profile.getAgent())) != null &&
                     robotsEntry.isDisallowed(request.url())) {
                     //if (log.isFine()) log.logFine("Crawling of URL '" + request.url().toString() + "' disallowed by robots.txt.");
-                    CrawlQueues.this.errorURL.push(request.url(), profile, FailCategory.FINAL_ROBOTS_RULE, "denied by robots.txt", -1);
+                    CrawlQueues.this.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_ROBOTS_RULE, "denied by robots.txt", -1);
                     request.setStatus("worker-disallowed", WorkflowJob.STATUS_FINISHED);
                 } else {
                     // starting a load from the internet
@@ -678,7 +678,7 @@ public class CrawlQueues {
                             // thus we only push this message if we don't have that mark
                             error = error.substring(0, error.length() - 1).trim();
                         } else {
-                            CrawlQueues.this.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "cannot load: " + error, -1);
+                            CrawlQueues.this.errorURL.push(request.url(), request.depth(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "cannot load: " + error, -1);
                         }
                         request.setStatus("worker-error", WorkflowJob.STATUS_FINISHED);
                     } else {
                     }
                 }
             }
         } catch (final Exception e) {
-            CrawlQueues.this.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, e.getMessage() + " - in worker", -1);
+            CrawlQueues.this.errorURL.push(request.url(), request.depth(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, e.getMessage() + " - in worker", -1);
             ConcurrentLog.logException(e);
             request.setStatus("worker-exception", WorkflowJob.STATUS_FINISHED);
         } finally {
diff --git a/source/net/yacy/crawler/retrieval/FTPLoader.java b/source/net/yacy/crawler/retrieval/FTPLoader.java
index 02e4b371b..ae2b893dc 100644
--- a/source/net/yacy/crawler/retrieval/FTPLoader.java
+++ b/source/net/yacy/crawler/retrieval/FTPLoader.java
@@ -156,7 +156,7 @@ public class FTPLoader {
         if (berr.size() > 0 || response == null) {
             // some error logging
             final String detail = (berr.size() > 0) ? "Errorlog: " + berr.toString() : "";
-            this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, " ftp server download, " + detail, -1);
+            this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, " ftp server download, " + detail, -1);
             throw new IOException("FTPLoader: Unable to download URL '" + request.url().toString() + "': " + detail);
         }
diff --git a/source/net/yacy/crawler/retrieval/HTTPLoader.java b/source/net/yacy/crawler/retrieval/HTTPLoader.java
index f132c5c1d..a00b563af 100644
--- a/source/net/yacy/crawler/retrieval/HTTPLoader.java
+++ b/source/net/yacy/crawler/retrieval/HTTPLoader.java
@@ -77,7 +77,7 @@ public final class HTTPLoader {
     private Response load(final Request request, CrawlProfile profile, final int retryCount, final int maxFileSize, final BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
         if (retryCount < 0) {
-            this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "retry counter exceeded", -1);
+            this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "retry counter exceeded", -1);
             throw new IOException("retry counter exceeded for URL " + request.url().toString() + ". Processing aborted.$");
         }
@@ -93,7 +93,7 @@ public final class HTTPLoader {
         // check if url is in blacklist
         final String hostlow = host.toLowerCase();
         if (blacklistType != null && Switchboard.urlBlacklist.isListed(blacklistType, hostlow, path)) {
-            this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
+            this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
             throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.$");
         }
@@ -140,7 +140,7 @@ public final class HTTPLoader {
             redirectionUrlString = redirectionUrlString == null ? "" : redirectionUrlString.trim();
             if (redirectionUrlString.isEmpty()) {
-                this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "no redirection url provided, field '" + HeaderFramework.LOCATION + "' is empty", statusCode);
+                this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "no redirection url provided, field '" + HeaderFramework.LOCATION + "' is empty", statusCode);
                 throw new IOException("REJECTED EMTPY REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL '" + requestURLString + "'$");
             }
@@ -154,13 +154,13 @@ public final class HTTPLoader {
             this.sb.webStructure.generateCitationReference(url, redirectionUrl);
             if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_RECORD_REDIRECTS, true)) {
-                this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.FINAL_REDIRECT_RULE, "redirect to " + redirectionUrlString, statusCode);
+                this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_REDIRECT_RULE, "redirect to " + redirectionUrlString, statusCode);
             }
             if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true)) {
                 // if we are already doing a shutdown we don't need to retry crawling
                 if (Thread.currentThread().isInterrupted()) {
-                    this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.FINAL_LOAD_CONTEXT, "server shutdown", statusCode);
+                    this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_LOAD_CONTEXT, "server shutdown", statusCode);
                     throw new IOException("CRAWLER Retry of URL=" + requestURLString + " aborted because of server shutdown.$");
                 }
@@ -169,11 +169,11 @@ public final class HTTPLoader {
                 return load(request, profile, retryCount - 1, maxFileSize, blacklistType, agent);
             }
             // we don't want to follow redirects
-            this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode);
+            this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode);
             throw new IOException("REJECTED UNWANTED REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL '" + requestURLString + "'$");
         } else if (responseBody == null) {
             // no response, reject file
-            this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "no response body", statusCode);
+            this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "no response body", statusCode);
             throw new IOException("REJECTED EMPTY RESPONSE BODY '" + client.getHttpResponse().getStatusLine() + "' for URL '" + requestURLString + "'$");
         } else if (statusCode == 200 || statusCode == 203) {
             // the transfer is ok
@@ -184,7 +184,7 @@ public final class HTTPLoader {
             // check length again in case it was not possible to get the length before loading
             if (maxFileSize >= 0 && contentLength > maxFileSize) {
-                this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "file size limit exceeded", statusCode);
+                this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "file size limit exceeded", statusCode);
                 throw new IOException("REJECTED URL " + request.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes. (GET)$");
             }
@@ -201,7 +201,7 @@ public final class HTTPLoader {
             return response;
         } else {
             // if the response has not the right response type then reject file
-            this.sb.crawlQueues.errorURL.push(request.url(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", statusCode);
+            this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", statusCode);
             throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL '" + requestURLString + "'$");
         }
     }
diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java
index 8c631e904..ca15d13fb 100644
--- a/source/net/yacy/repository/LoaderDispatcher.java
+++ b/source/net/yacy/repository/LoaderDispatcher.java
@@ -190,7 +190,7 @@ public final class LoaderDispatcher {
         // check if url is in blacklist
         if (blacklistType != null && host != null && Switchboard.urlBlacklist.isListed(blacklistType, host.toLowerCase(), url.getFile())) {
-            this.sb.crawlQueues.errorURL.push(request.url(), crawlProfile, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
+            this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), crawlProfile, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
             throw new IOException("DISPATCHER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.$");
         }
diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java
index 63a07ce98..a7705abdd 100644
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -1816,7 +1816,7 @@ public final class Switchboard extends serverSwitch {
             // log cause and close queue
             //if (log.isFine()) log.logFine("deQueue: not indexed any word in URL " + response.url() + "; cause: " + noIndexReason);
             // create a new errorURL DB entry
-            this.crawlQueues.errorURL.push(response.url(), response.profile(), FailCategory.FINAL_PROCESS_CONTEXT, noIndexReason, -1);
+            this.crawlQueues.errorURL.push(response.url(), response.depth(), response.profile(), FailCategory.FINAL_PROCESS_CONTEXT, noIndexReason, -1);
             // finish this entry
             return "not allowed: " + noIndexReason;
         }
@@ -2530,7 +2530,7 @@ public final class Switchboard extends serverSwitch {
                 if ( response.getContent() == null ) {
                     this.log.warn("the resource '" + response.url() + "' is missing in the cache.");
                     // create a new errorURL DB entry
-                    this.crawlQueues.errorURL.push(response.url(), response.profile(), FailCategory.FINAL_LOAD_CONTEXT, "missing in cache", -1);
+                    this.crawlQueues.errorURL.push(response.url(), response.depth(), response.profile(), FailCategory.FINAL_LOAD_CONTEXT, "missing in cache", -1);
                     return null;
                 }
             }
@@ -2550,7 +2550,7 @@ public final class Switchboard extends serverSwitch {
         } catch (final Parser.Failure e ) {
             this.log.warn("Unable to parse the resource '" + response.url() + "'. " + e.getMessage());
             // create a new errorURL DB entry
-            this.crawlQueues.errorURL.push(response.url(), response.profile(), FailCategory.FINAL_PROCESS_CONTEXT, e.getMessage(), -1);
+            this.crawlQueues.errorURL.push(response.url(), response.depth(), response.profile(), FailCategory.FINAL_PROCESS_CONTEXT, e.getMessage(), -1);
             return null;
         }
         final long parsingEndTime = System.currentTimeMillis();
@@ -2570,7 +2570,7 @@ public final class Switchboard extends serverSwitch {
             } else {
                 // we consider this as fail urls to have a tracking of the problem
                 if (rejectReason != null && !rejectReason.startsWith("double in")) {
-                    this.crawlStacker.nextQueue.errorURL.push(response.url(), response.profile(), FailCategory.FINAL_LOAD_CONTEXT, rejectReason, -1);
+                    this.crawlStacker.nextQueue.errorURL.push(response.url(), response.depth(), response.profile(), FailCategory.FINAL_LOAD_CONTEXT, rejectReason, -1);
                 }
             }
         }
@@ -2678,7 +2678,7 @@ public final class Switchboard extends serverSwitch {
                 (profile.indexUrlMustNotMatchPattern() != CrawlProfile.MATCH_NEVER_PATTERN && profile.indexUrlMustNotMatchPattern().matcher(urls).matches())) {
                 if (this.log.isInfo()) this.log.info("Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + profile.indexUrlMustMatchPattern().pattern() + ", indexUrlMustNotMatchPattern = " + profile.indexUrlMustNotMatchPattern().pattern());
                 // create a new errorURL DB entry
-                this.crawlQueues.errorURL.push(in.queueEntry.url(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + profile.indexUrlMustMatchPattern().pattern() + ", indexUrlMustNotMatchPattern = " + profile.indexUrlMustNotMatchPattern().pattern(), -1);
+                this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + profile.indexUrlMustMatchPattern().pattern() + ", indexUrlMustNotMatchPattern = " + profile.indexUrlMustNotMatchPattern().pattern(), -1);
                 return new IndexingQueueEntry(in.queueEntry, in.documents, null);
             }
@@ -2688,14 +2688,14 @@ public final class Switchboard extends serverSwitch {
                 if (document.indexingDenied() && profile.obeyHtmlRobotsNoindex()) {
                     if (this.log.isInfo()) this.log.info("Not Condensed Resource '" + urls + "': denied by document-attached noindexing rule");
                     // create a new errorURL DB entry
-                    this.crawlQueues.errorURL.push(in.queueEntry.url(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "denied by document-attached noindexing rule", -1);
+                    this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "denied by document-attached noindexing rule", -1);
                     continue docloop;
                 }
                 if (!(profile.indexContentMustMatchPattern() == CrawlProfile.MATCH_ALL_PATTERN || profile.indexContentMustMatchPattern().matcher(document.getTextString()).matches()) ||
                     (profile.indexContentMustNotMatchPattern() != CrawlProfile.MATCH_NEVER_PATTERN && profile.indexContentMustNotMatchPattern().matcher(document.getTextString()).matches())) {
                     if (this.log.isInfo()) this.log.info("Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustMatchPattern = " + profile.indexContentMustMatchPattern().pattern() + ", indexContentMustNotMatchPattern = " + profile.indexContentMustNotMatchPattern().pattern());
                     // create a new errorURL DB entry
-                    this.crawlQueues.errorURL.push(in.queueEntry.url(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "indexing prevented by regular expression on content; indexContentMustMatchPattern = " + profile.indexContentMustMatchPattern().pattern() + ", indexContentMustNotMatchPattern = " + profile.indexContentMustNotMatchPattern().pattern(), -1);
+                    this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "indexing prevented by regular expression on content; indexContentMustMatchPattern = " + profile.indexContentMustMatchPattern().pattern() + ", indexContentMustNotMatchPattern = " + profile.indexContentMustNotMatchPattern().pattern(), -1);
                     continue docloop;
                 }
                 doclist.add(document);
@@ -2783,14 +2783,14 @@ public final class Switchboard extends serverSwitch {
         if (condenser == null || (document.indexingDenied() && profile.obeyHtmlRobotsNoindex())) {
             //if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by rule in document, process case=" + processCase);
             // create a new errorURL DB entry
-            this.crawlQueues.errorURL.push(url, profile, FailCategory.FINAL_PROCESS_CONTEXT, "denied by rule in document, process case=" + processCase, -1);
+            this.crawlQueues.errorURL.push(url, queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "denied by rule in document, process case=" + processCase, -1);
             return;
         }
         if ( profile != null && !profile.indexText() && !profile.indexMedia() ) {
             //if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by profile rule, process case=" + processCase + ", profile name = " + queueEntry.profile().name());
             // create a new errorURL DB entry
-            this.crawlQueues.errorURL.push(url, profile, FailCategory.FINAL_LOAD_CONTEXT, "denied by profile rule, process case="
+            this.crawlQueues.errorURL.push(url, queueEntry.depth(), profile, FailCategory.FINAL_LOAD_CONTEXT, "denied by profile rule, process case="
                 + processCase
                 + ", profile name = "
                 + profile.collectionName(), -1);
diff --git a/source/net/yacy/search/index/DocumentIndex.java b/source/net/yacy/search/index/DocumentIndex.java
index a62338e1b..03161e768 100644
--- a/source/net/yacy/search/index/DocumentIndex.java
+++ b/source/net/yacy/search/index/DocumentIndex.java
@@ -149,7 +149,7 @@ public class DocumentIndex extends Segment {
             length = -1;
         }
         try {
-            documents = TextParser.parseSource(url, null, null, 999, length, url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent, null, null));
+            documents = TextParser.parseSource(url, null, null, 0, length, url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent, null, null));
        } catch (final Exception e ) {
             throw new IOException("cannot parse " + url.toString() + ": " + e.getMessage());
         }
diff --git a/source/net/yacy/search/index/ErrorCache.java b/source/net/yacy/search/index/ErrorCache.java
index a6eee9b2f..6796e5a76 100644
--- a/source/net/yacy/search/index/ErrorCache.java
+++ b/source/net/yacy/search/index/ErrorCache.java
@@ -99,7 +99,7 @@ public class ErrorCache {
         }
     }
-    public void push(final DigestURL url, final CrawlProfile profile, final FailCategory failCategory, String anycause, final int httpcode) {
+    public void push(final DigestURL url, final int crawldepth, final CrawlProfile profile, final FailCategory failCategory, String anycause, final int httpcode) {
         // assert executor != null; // null == proxy !
         assert failCategory.store || httpcode == -1 : "failCategory=" + failCategory.name();
         if (exists(url.hash()))
@@ -110,7 +110,7 @@ public class ErrorCache {
         CollectionConfiguration.FailDoc failDoc = new CollectionConfiguration.FailDoc(
                 url, profile == null ? null : profile.collections(),
                 failCategory.name() + " " + reason, failCategory.failType,
-                httpcode);
+                httpcode, crawldepth);
         if (this.fulltext.getDefaultConnector() != null && failCategory.store) {
             // send the error to solr
             try {
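ErrorCache.push() now simply forwards the depth into the FailDoc that is written to Solr (see the CollectionConfiguration.java hunks below). A minimal sketch of what such a record looks like, with placeholder values; only the constructor signature comes from this patch, the helper method is hypothetical:

    // Illustration only: a FailDoc now carries the crawl depth of the failed URL.
    // ErrorCache sends it to Solr when a default connector is available.
    static CollectionConfiguration.FailDoc exampleFailDoc(final DigestURL url, final int depth) {
        return new CollectionConfiguration.FailDoc(
                url,
                null,                     // collections: none assigned in this sketch
                "load error (example)",   // failReason, placeholder text
                FailType.fail,            // failType
                -1,                       // httpstatus unknown
                depth);                   // new: crawl depth at which the failure happened
    }
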
diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java
index 1fc0b9d31..66ca14b26 100644
--- a/source/net/yacy/search/schema/CollectionConfiguration.java
+++ b/source/net/yacy/search/schema/CollectionConfiguration.java
@@ -1478,13 +1478,15 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
         final FailType failType;
         final int httpstatus;
         final Date failtime;
-        public FailDoc(final DigestURL digestURL, final Map collections, final String failReason, final FailType failType, final int httpstatus) {
+        final int crawldepth;
+        public FailDoc(final DigestURL digestURL, final Map collections, final String failReason, final FailType failType, final int httpstatus, final int crawldepth) {
             this.digestURL = digestURL;
             this.collections = collections;
             this.failReason = failReason;
             this.failType = failType;
             this.httpstatus = httpstatus;
             this.failtime = new Date();
+            this.crawldepth = crawldepth;
         }
         public FailDoc(final SolrDocument doc) {
             try {
@@ -1501,6 +1503,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
             this.failType = fts == null ? FailType.fail : FailType.valueOf(fts);
             this.httpstatus = (Integer) doc.getFieldValue(CollectionSchema.httpstatus_i.getSolrFieldName());
             this.failtime = (Date) doc.getFieldValue(CollectionSchema.load_date_dt.getSolrFieldName());
+            this.crawldepth = (Integer) doc.getFieldValue(CollectionSchema.crawldepth_i.getSolrFieldName());
         }
         public DigestURL getDigestURL() {
             return digestURL;
         }
@@ -1524,6 +1527,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
             final SolrInputDocument doc = new SolrInputDocument();
             String url = configuration.addURIAttributes(doc, allAttr, this.getDigestURL(), Response.docType(this.getDigestURL()));
             if (allAttr || configuration.contains(CollectionSchema.load_date_dt)) configuration.add(doc, CollectionSchema.load_date_dt, new Date());
+            if (allAttr || configuration.contains(CollectionSchema.crawldepth_i)) configuration.add(doc, CollectionSchema.crawldepth_i, this.crawldepth);
             // fail reason and status
             if (allAttr || configuration.contains(CollectionSchema.failreason_s)) configuration.add(doc, CollectionSchema.failreason_s, this.getFailReason());
diff --git a/source/net/yacy/search/snippet/MediaSnippet.java b/source/net/yacy/search/snippet/MediaSnippet.java
index 4cffbfdfa..b2d2fc2c2 100644
--- a/source/net/yacy/search/snippet/MediaSnippet.java
+++ b/source/net/yacy/search/snippet/MediaSnippet.java
@@ -261,7 +261,7 @@ public class MediaSnippet implements Comparable, Comparator
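On the read side, the HostBrowser.java change at the top of this patch shows how the stored value is consumed: crawldepth_i is read back from the Solr document and a missing or negative value is mapped to the placeholder 998, which the host browser then renders as "unknown crawldepth". A condensed sketch of that logic (assuming the usual SolrDocument and CollectionSchema imports; the method name is made up):

    // Sketch mirroring the HostBrowser change: read the stored crawl depth back
    // from a Solr document, falling back to the 998 placeholder when absent.
    static int crawlDepthOf(final SolrDocument doc) {
        final Integer cr = (Integer) doc.getFieldValue(CollectionSchema.crawldepth_i.getSolrFieldName());
        return (cr == null || cr.intValue() < 0) ? 998 : cr.intValue();
    }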