diff --git a/htroot/HostBrowser.html b/htroot/HostBrowser.html
index 54e5ea915..df2eb4b58 100644
--- a/htroot/HostBrowser.html
+++ b/htroot/HostBrowser.html
@@ -74,7 +74,7 @@ function updatepage(str) {
       Host/URL:
-
+
       #(delete)#::#(/delete)#
@@ -125,12 +125,12 @@ function updatepage(str) {
       #(type)#
       Show Metadata
-      #[url]#&nbsp;
+      #[url]#&nbsp;
       #(stored)#
       #(load)#link, detected from context::load &amp; index#(/load)#::
       indexed::
       loading::
-      load fail: #[error]#
+      #[error]#
       #(/stored)#
       ::
diff --git a/htroot/HostBrowser.java b/htroot/HostBrowser.java
index a78484684..927124ead 100644
--- a/htroot/HostBrowser.java
+++ b/htroot/HostBrowser.java
@@ -42,6 +42,7 @@ import net.yacy.cora.federate.solr.connector.AbstractSolrConnector;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.sorting.ClusteredScoreMap;
 import net.yacy.cora.sorting.ReversibleScoreMap;
+import net.yacy.crawler.HarvestProcess;
 import net.yacy.crawler.data.NoticedURL.StackType;
 import net.yacy.crawler.retrieval.Request;
 import net.yacy.kelondro.data.meta.DigestURI;
@@ -315,8 +316,8 @@ public class HostBrowser {
                 Map<String, StoreType> files = new HashMap<String, StoreType>();
                 for (String u: storedDocs) files.put(u, StoreType.INDEX);
                 for (Map.Entry<String, FailType> e: errorDocs.entrySet()) files.put(e.getKey(), e.getValue() == FailType.fail ? StoreType.FAILED : StoreType.EXCLUDED);
-                for (String u: inboundLinks) if (!storedDocs.contains(u)) files.put(u, StoreType.LINK);
-                for (String u: loadingLinks) if (u.startsWith(path) && !storedDocs.contains(u)) files.put(u, StoreType.LINK);
+                for (String u: inboundLinks) if (!files.containsKey(u)) files.put(u, StoreType.LINK);
+                for (String u: loadingLinks) if (u.startsWith(path) && !files.containsKey(u)) files.put(u, StoreType.LINK);
                 Log.logInfo("HostBrowser", "collected " + files.size() + " urls for path " + path);

                 // distinguish files and folders
@@ -391,15 +392,28 @@ public class HostBrowser {
                     prop.put("files_list_" + c + "_type_url", entry.getKey());
                     StoreType type = (StoreType) entry.getValue();
                     try {uri = new DigestURI(entry.getKey());} catch (MalformedURLException e) {uri = null;}
-                    boolean loading = load.equals(entry.getKey()) || (uri != null && sb.crawlQueues.urlExists(uri.hash()) != null);
-                    prop.put("files_list_" + c + "_type_stored", type == StoreType.INDEX ? 1 : (type == StoreType.EXCLUDED || type == StoreType.FAILED) ? 3 : loading ? 2 : 0 /*linked*/);
-                    prop.put("files_list_" + c + "_type_stored_load", loadRight ? 1 : 0);
-                    if (type == StoreType.EXCLUDED || type == StoreType.FAILED) prop.put("files_list_" + c + "_type_stored_error", errorDocs.get(entry.getKey()).name());
-                    if (loadRight) {
-                        prop.put("files_list_" + c + "_type_stored_load_url", entry.getKey());
-                        prop.put("files_list_" + c + "_type_stored_load_path", path);
+                    HarvestProcess process = uri == null ? null : sb.crawlQueues.urlExists(uri.hash());
+                    boolean loading = load.equals(entry.getKey()) || (process != null && process != HarvestProcess.ERRORS);
+                    boolean error = process == HarvestProcess.ERRORS || type == StoreType.EXCLUDED || type == StoreType.FAILED;
+                    boolean dc = type != StoreType.INDEX && !error && !loading && list.containsKey(entry.getKey() + "/");
+                    if (!dc) {
+                        prop.put("files_list_" + c + "_type_stored", type == StoreType.INDEX ? 1 : error ? 3 : loading ? 2 : 0 /*linked*/);
+                        prop.put("files_list_" + c + "_type_stored_load", loadRight ? 1 : 0);
+                        if (error) {
+                            FailType failType = errorDocs.get(entry.getKey());
+                            if (failType == null) {
+                                // maybe this is only in the errorURL
+                                prop.put("files_list_" + c + "_type_stored_error", process == HarvestProcess.ERRORS ? sb.crawlQueues.errorURL.get(uri.hash()).anycause() : "unknown error");
+                            } else {
+                                prop.put("files_list_" + c + "_type_stored_error", failType == FailType.excl ? "excluded from indexing" : "load fail");
+                            }
+                        }
+                        if (loadRight) {
+                            prop.put("files_list_" + c + "_type_stored_load_url", entry.getKey());
+                            prop.put("files_list_" + c + "_type_stored_load_path", path);
+                        }
+                        if (++c >= maxcount) break;
                     }
-                    if (++c >= maxcount) break;
                 }
             }
             prop.put("files_list", c);
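The rewritten HostBrowser loop folds the crawl-queue state and the error records into one numeric state per file entry, and suppresses plain link entries that duplicate a folder entry (the dc flag). As a reading aid only, the state decision condenses to the sketch below; the helper name storedState is invented here and is not part of the patch:

    // Sketch of the value written into files_list_*_type_stored:
    // 1 = indexed, 3 = error (load fail, excluded, or in errorURL), 2 = loading, 0 = linked only
    static int storedState(StoreType type, HarvestProcess process, boolean loading) {
        boolean error = process == HarvestProcess.ERRORS
                || type == StoreType.EXCLUDED || type == StoreType.FAILED;
        if (type == StoreType.INDEX) return 1;
        if (error) return 3;
        if (loading) return 2;
        return 0;
    }
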
"excluded from indexing" : "load fail"); + } + } + if (loadRight) { + prop.put("files_list_" + c + "_type_stored_load_url", entry.getKey()); + prop.put("files_list_" + c + "_type_stored_load_path", path); + } + if (++c >= maxcount) break; } - if (++c >= maxcount) break; } } prop.put("files_list", c); diff --git a/source/net/yacy/crawler/CrawlStacker.java b/source/net/yacy/crawler/CrawlStacker.java index 7d84285ab..87b235279 100644 --- a/source/net/yacy/crawler/CrawlStacker.java +++ b/source/net/yacy/crawler/CrawlStacker.java @@ -446,16 +446,16 @@ public final class CrawlStacker { } // check if the url is double registered - final String dbocc = this.nextQueue.urlExists(url.hash()); // returns the name of the queue if entry exists + final HarvestProcess dbocc = this.nextQueue.urlExists(url.hash()); // returns the name of the queue if entry exists final Date oldDate = this.indexSegment.fulltext().getLoadDate(ASCII.String(url.hash())); if (oldDate == null) { if (dbocc != null) { // do double-check - if (dbocc.equals("errors")) { + if (dbocc == HarvestProcess.ERRORS) { final ZURL.Entry errorEntry = this.nextQueue.errorURL.get(url.hash()); return "double in: errors (" + errorEntry.anycause() + ")"; } - return "double in: " + dbocc; + return "double in: " + dbocc.toString(); } } else { final boolean recrawl = profile.recrawlIfOlder() > oldDate.getTime(); @@ -467,12 +467,12 @@ public final class CrawlStacker { if (dbocc == null) { return "double in: LURL-DB"; } - if (this.log.isInfo()) this.log.logInfo("URL '" + urlstring + "' is double registered in '" + dbocc + "'. " + "Stack processing time:"); - if (dbocc.equals("errors")) { + if (this.log.isInfo()) this.log.logInfo("URL '" + urlstring + "' is double registered in '" + dbocc.toString() + "'. " + "Stack processing time:"); + if (dbocc == HarvestProcess.ERRORS) { final ZURL.Entry errorEntry = this.nextQueue.errorURL.get(url.hash()); return "double in: errors (" + errorEntry.anycause() + ")"; } - return "double in: " + dbocc; + return "double in: " + dbocc.toString(); } } diff --git a/source/net/yacy/crawler/HarvestProcess.java b/source/net/yacy/crawler/HarvestProcess.java new file mode 100644 index 000000000..0ddb76e9b --- /dev/null +++ b/source/net/yacy/crawler/HarvestProcess.java @@ -0,0 +1,27 @@ +/** + * HarvestProcess + * Copyright 2012 by Michael Peter Christen + * First released 06.12.2012 at http://yacy.net + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . 
diff --git a/source/net/yacy/crawler/HarvestProcess.java b/source/net/yacy/crawler/HarvestProcess.java
new file mode 100644
index 000000000..0ddb76e9b
--- /dev/null
+++ b/source/net/yacy/crawler/HarvestProcess.java
@@ -0,0 +1,27 @@
+/**
+ * HarvestProcess
+ * Copyright 2012 by Michael Peter Christen
+ * First released 06.12.2012 at http://yacy.net
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program in the file lgpl21.txt
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+package net.yacy.crawler;
+
+public enum HarvestProcess {
+
+    DELEGATED, ERRORS, CRAWLER, WORKER, LOADED;
+
+}
diff --git a/source/net/yacy/crawler/data/CrawlQueues.java b/source/net/yacy/crawler/data/CrawlQueues.java
index 5e1bcee60..8a6bc5859 100644
--- a/source/net/yacy/crawler/data/CrawlQueues.java
+++ b/source/net/yacy/crawler/data/CrawlQueues.java
@@ -42,6 +42,7 @@ import net.yacy.cora.document.UTF8;
 import net.yacy.cora.federate.yacy.CacheStrategy;
 import net.yacy.cora.order.Base64Order;
 import net.yacy.cora.protocol.ConnectionInfo;
+import net.yacy.crawler.HarvestProcess;
 import net.yacy.crawler.data.NoticedURL.StackType;
 import net.yacy.crawler.data.ZURL.FailCategory;
 import net.yacy.crawler.retrieval.Request;
@@ -142,19 +143,19 @@ public class CrawlQueues {
      * @param hash
      * @return if the hash exists, the name of the database is returned, otherwise null is returned
      */
-    public String urlExists(final byte[] hash) {
+    public HarvestProcess urlExists(final byte[] hash) {
         if (this.delegatedURL.exists(hash)) {
-            return "delegated";
+            return HarvestProcess.DELEGATED;
         }
         if (this.errorURL.exists(hash)) {
-            return "errors";
+            return HarvestProcess.ERRORS;
         }
         if (this.noticeURL.existsInStack(hash)) {
-            return "crawler";
+            return HarvestProcess.CRAWLER;
         }
         for (final Loader worker: this.workers.values()) {
             if (Base64Order.enhancedCoder.equal(worker.request.url().hash(), hash)) {
-                return "worker";
+                return HarvestProcess.WORKER;
             }
         }
         return null;
diff --git a/source/net/yacy/crawler/retrieval/HTTPLoader.java b/source/net/yacy/crawler/retrieval/HTTPLoader.java
index 769cf1d65..031ead94e 100644
--- a/source/net/yacy/crawler/retrieval/HTTPLoader.java
+++ b/source/net/yacy/crawler/retrieval/HTTPLoader.java
@@ -34,6 +34,7 @@ import net.yacy.cora.protocol.HeaderFramework;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.protocol.ResponseHeader;
 import net.yacy.cora.protocol.http.HTTPClient;
+import net.yacy.crawler.HarvestProcess;
 import net.yacy.crawler.data.CrawlProfile;
 import net.yacy.crawler.data.Latency;
 import net.yacy.crawler.data.ZURL.FailCategory;
@@ -170,10 +171,10 @@ public final class HTTPLoader {
         }

         // check if the url was already indexed
-        final String dbname = this.sb.urlExists(redirectionUrl.hash());
+        final HarvestProcess dbname = this.sb.urlExists(redirectionUrl.hash());
         if (dbname != null) { // customer request
             this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "redirection to double content", statusCode);
-            throw new IOException("CRAWLER Redirection of URL=" + requestURLString + " ignored. The url appears already in db " + dbname);
+            throw new IOException("CRAWLER Redirection of URL=" + requestURLString + " ignored. The url appears already in db " + dbname.toString());
         }

         // retry crawling with new url
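The new HarvestProcess enum replaces the bare strings "delegated", "errors", "crawler", "worker" and "loaded" that were previously passed around. Note that CrawlQueues.urlExists probes its queues in a fixed order, so a hash present in more than one queue reports the first match (DELEGATED before ERRORS before CRAWLER before WORKER). A small self-contained demo; the class HarvestProcessDemo is invented for illustration and not part of the patch:

    package net.yacy.crawler;

    // Demo only: enum constants compare by identity and print their names,
    // keeping messages like "double in: CRAWLER" readable.
    public class HarvestProcessDemo {
        public static void main(String[] args) {
            HarvestProcess state = HarvestProcess.CRAWLER;
            System.out.println(state == HarvestProcess.ERRORS); // false
            System.out.println("double in: " + state);          // double in: CRAWLER
        }
    }
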
diff --git a/source/net/yacy/crawler/retrieval/SitemapImporter.java b/source/net/yacy/crawler/retrieval/SitemapImporter.java
index f7f5dd132..ddbe84045 100644
--- a/source/net/yacy/crawler/retrieval/SitemapImporter.java
+++ b/source/net/yacy/crawler/retrieval/SitemapImporter.java
@@ -29,6 +29,7 @@ import java.net.MalformedURLException;
 import java.util.Date;

 import net.yacy.cora.document.ASCII;
+import net.yacy.crawler.HarvestProcess;
 import net.yacy.crawler.data.CrawlProfile;
 import net.yacy.document.parser.sitemapParser;
 import net.yacy.document.parser.sitemapParser.URLEntry;
@@ -81,8 +82,8 @@ public class SitemapImporter extends Thread {
             // check if the url is known and needs to be recrawled
             Date lastMod = entry.lastmod(null);
             if (lastMod != null) {
-                final String dbocc = this.sb.urlExists(nexturlhash);
-                if ((dbocc != null) && (dbocc.equalsIgnoreCase("loaded"))) {
+                final HarvestProcess dbocc = this.sb.urlExists(nexturlhash);
+                if (dbocc != null && dbocc == HarvestProcess.LOADED) {
                     // the url was already loaded. we need to check the date
                     final URIMetadataNode oldEntry = this.sb.index.fulltext().getMetadata(nexturlhash);
                     if (oldEntry != null) {
diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java
index d06c3363c..83a8ccc2d 100644
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -114,6 +114,7 @@ import net.yacy.cora.storage.HandleSet;
 import net.yacy.cora.util.SpaceExceededException;
 import net.yacy.crawler.CrawlStacker;
 import net.yacy.crawler.CrawlSwitchboard;
+import net.yacy.crawler.HarvestProcess;
 import net.yacy.crawler.data.Cache;
 import net.yacy.crawler.data.CrawlProfile;
 import net.yacy.crawler.data.CrawlQueues;
@@ -1510,11 +1511,11 @@ public final class Switchboard extends serverSwitch {
         return false;
     }

-    public String urlExists(final byte[] hash) {
+    public HarvestProcess urlExists(final byte[] hash) {
         // tests if hash occurrs in any database
         // if it exists, the name of the database is returned,
         // if it not exists, null is returned
-        if (this.index.exists(hash)) return "loaded";
+        if (this.index.exists(hash)) return HarvestProcess.LOADED;
         return this.crawlQueues.urlExists(hash);
     }
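Taken together, Switchboard.urlExists now answers LOADED from the fulltext index first and falls back to the crawl queues for the transient stages. A hedged sketch of what a caller sees after this patch; sb and url are illustrative local names:

    HarvestProcess state = sb.urlExists(url.hash());
    if (state == null) {
        // unknown URL hash: eligible for stacking
    } else if (state == HarvestProcess.LOADED) {
        // already indexed; recrawl only if the profile's recrawlIfOlder() permits it
    } else if (state == HarvestProcess.ERRORS) {
        // failed earlier; the cause can be read from crawlQueues.errorURL
    } else {
        // DELEGATED, CRAWLER or WORKER: currently in the crawl pipeline
    }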