fix for wrong display of error URLs in HostBrowser

pull/1/head
Michael Peter Christen 12 years ago
parent 756772fbd3
commit 10527e28ae

@@ -74,7 +74,7 @@ function updatepage(str) {
<form action="HostBrowser.html" id="searchform" method="get" onkeyup="xmlhttpPost(); return false;">
<fieldset class="yacys">
Host/URL:
<input id="search" type="text" name="path" value="#[path]#" size="40" maxlength="250" />
<input id="search" type="text" name="path" value="#[path]#" size="80" maxlength="250" />
<input type="submit" name="list" value="Browse Host" class="submitready" style="width:240px;"/>
#(delete)#::<input type="submit" name="delete" value="Delete Subpath" class="submitready" style="width:240px;" onclick="return confirm('Confirm Deletion')"/>#(/delete)#
<br />
@@ -125,12 +125,12 @@ function updatepage(str) {
#(type)#<!--file-->
<tr class="TableCell#(dark)#Light::Dark::Summary#(/dark)#">
<td align="center"><div id="info"><a href="/ViewFile.html?url=#[url]#"><img src="/env/grafics/doc.gif"/></a><span>Show Metadata</span></div></td>
<td align="left" nowrap class=#(stored)#"listingem"::"listing"#(/stored)#>#[url]#&nbsp;<a href="#[url]#"><img src="/env/grafics/link.gif"/></a></td>
<td align="left" nowrap class=#(stored)#"listingem"::"listing"#(/stored)#>#[url]#&nbsp;<a href="#[url]#" target="_blank"><img src="/env/grafics/link.gif"/></a></td>
#(stored)#
#(load)#<td align="left" colspan="5" nowrap class="listingem">link, detected from context</td>::<td align="left" colspan="5" nowrap class="listingnok"><a href="/HostBrowser.html?load=#[url]#&path=#[path]#">load &amp; index</a>#(/load)#</td>::
<td align="left" colspan="5" nowrap class="listingok">indexed</td>::
<td align="left" colspan="5" nowrap class="pending">loading</td>::
<td align="left" colspan="5" nowrap class="listingnok">load fail: #[error]#</td>
<td align="left" colspan="5" nowrap class="listingnok">#[error]#</td>
#(/stored)#
</tr>::<!--folder-->
<tr class="TableCell#(dark)#Light::Dark::Summary#(/dark)#">
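Side note (not part of the commit): the #(key)#a::b::c#(/key)# constructs above are YaCy servlet-template switches; the integer the servlet stores under the key picks one of the ::-separated branches, which is how the error cell gets selected. A minimal sketch of that selection rule, an illustration only and not YaCy's real template engine, with simplified branch labels:

public class TemplateSwitchSketch {
    // mimics how a #(key)#a::b::c::d#(/key)# switch picks a branch by the stored integer
    static String select(String branches, int value) {
        String[] parts = branches.split("::", -1);
        return parts[Math.min(value, parts.length - 1)];
    }
    public static void main(String[] args) {
        // simplified stand-ins for the four #(stored)# branches above
        String stored = "linked::indexed::loading::error message";
        System.out.println(select(stored, 3)); // value 3 selects the error branch
    }
}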

@@ -42,6 +42,7 @@ import net.yacy.cora.federate.solr.connector.AbstractSolrConnector;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.sorting.ClusteredScoreMap;
import net.yacy.cora.sorting.ReversibleScoreMap;
import net.yacy.crawler.HarvestProcess;
import net.yacy.crawler.data.NoticedURL.StackType;
import net.yacy.crawler.retrieval.Request;
import net.yacy.kelondro.data.meta.DigestURI;
@@ -315,8 +316,8 @@ public class HostBrowser {
Map<String, StoreType> files = new HashMap<String, StoreType>();
for (String u: storedDocs) files.put(u, StoreType.INDEX);
for (Map.Entry<String, FailType> e: errorDocs.entrySet()) files.put(e.getKey(), e.getValue() == FailType.fail ? StoreType.FAILED : StoreType.EXCLUDED);
for (String u: inboundLinks) if (!storedDocs.contains(u)) files.put(u, StoreType.LINK);
for (String u: loadingLinks) if (u.startsWith(path) && !storedDocs.contains(u)) files.put(u, StoreType.LINK);
for (String u: inboundLinks) if (!files.containsKey(u)) files.put(u, StoreType.LINK);
for (String u: loadingLinks) if (u.startsWith(path) && !files.containsKey(u)) files.put(u, StoreType.LINK);
Log.logInfo("HostBrowser", "collected " + files.size() + " urls for path " + path);
// distinguish files and folders
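Side note (not part of the commit): the move from storedDocs.contains(u) to files.containsKey(u) is the heart of the fix. A URL that failed sits in files as FAILED but not in storedDocs, so the old guard let a later inbound link overwrite the marker with LINK, and the error message was never shown. A self-contained sketch of the hazard, using a hypothetical URL:

import java.util.HashMap;
import java.util.Map;

public class StoreTypeOrderingSketch {
    enum StoreType { LINK, INDEX, EXCLUDED, FAILED } // mirrors HostBrowser's marker enum
    public static void main(String[] args) {
        Map<String, StoreType> files = new HashMap<String, StoreType>();
        String u = "http://example.org/dead-link"; // hypothetical: failed once, still linked
        files.put(u, StoreType.FAILED);            // entered via errorDocs
        // old guard (!storedDocs.contains(u)) was true here and replaced FAILED
        // with LINK; the new guard consults the map that already holds the marker
        if (!files.containsKey(u)) files.put(u, StoreType.LINK);
        System.out.println(files.get(u)); // FAILED, so the error survives for display
    }
}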
@@ -391,15 +392,28 @@ public class HostBrowser {
prop.put("files_list_" + c + "_type_url", entry.getKey());
StoreType type = (StoreType) entry.getValue();
try {uri = new DigestURI(entry.getKey());} catch (MalformedURLException e) {uri = null;}
boolean loading = load.equals(entry.getKey()) || (uri != null && sb.crawlQueues.urlExists(uri.hash()) != null);
prop.put("files_list_" + c + "_type_stored", type == StoreType.INDEX ? 1 : (type == StoreType.EXCLUDED || type == StoreType.FAILED) ? 3 : loading ? 2 : 0 /*linked*/);
prop.put("files_list_" + c + "_type_stored_load", loadRight ? 1 : 0);
if (type == StoreType.EXCLUDED || type == StoreType.FAILED) prop.put("files_list_" + c + "_type_stored_error", errorDocs.get(entry.getKey()).name());
if (loadRight) {
prop.put("files_list_" + c + "_type_stored_load_url", entry.getKey());
prop.put("files_list_" + c + "_type_stored_load_path", path);
HarvestProcess process = uri == null ? null : sb.crawlQueues.urlExists(uri.hash());
boolean loading = load.equals(entry.getKey()) || (process != null && process != HarvestProcess.ERRORS);
boolean error = process == HarvestProcess.ERRORS || type == StoreType.EXCLUDED || type == StoreType.FAILED;
boolean dc = type != StoreType.INDEX && !error && !loading && list.containsKey(entry.getKey() + "/");
if (!dc) {
prop.put("files_list_" + c + "_type_stored", type == StoreType.INDEX ? 1 : error ? 3 : loading ? 2 : 0 /*linked*/);
prop.put("files_list_" + c + "_type_stored_load", loadRight ? 1 : 0);
if (error) {
FailType failType = errorDocs.get(entry.getKey());
if (failType == null) {
// maybe this is only in the errorURL
prop.put("files_list_" + c + "_type_stored_error", process == HarvestProcess.ERRORS ? sb.crawlQueues.errorURL.get(uri.hash()).anycause() : "unknown error");
} else {
prop.put("files_list_" + c + "_type_stored_error", failType == FailType.excl ? "excluded from indexing" : "load fail");
}
}
if (loadRight) {
prop.put("files_list_" + c + "_type_stored_load_url", entry.getKey());
prop.put("files_list_" + c + "_type_stored_load_path", path);
}
if (++c >= maxcount) break;
}
if (++c >= maxcount) break;
}
}
prop.put("files_list", c);

@@ -446,16 +446,16 @@ public final class CrawlStacker {
}
// check if the url is double registered
final String dbocc = this.nextQueue.urlExists(url.hash()); // returns the name of the queue if entry exists
final HarvestProcess dbocc = this.nextQueue.urlExists(url.hash()); // returns the harvest process if the entry exists
final Date oldDate = this.indexSegment.fulltext().getLoadDate(ASCII.String(url.hash()));
if (oldDate == null) {
if (dbocc != null) {
// do double-check
if (dbocc.equals("errors")) {
if (dbocc == HarvestProcess.ERRORS) {
final ZURL.Entry errorEntry = this.nextQueue.errorURL.get(url.hash());
return "double in: errors (" + errorEntry.anycause() + ")";
}
return "double in: " + dbocc;
return "double in: " + dbocc.toString();
}
} else {
final boolean recrawl = profile.recrawlIfOlder() > oldDate.getTime();
@@ -467,12 +467,12 @@ public final class CrawlStacker {
if (dbocc == null) {
return "double in: LURL-DB";
}
if (this.log.isInfo()) this.log.logInfo("URL '" + urlstring + "' is double registered in '" + dbocc + "'. " + "Stack processing time:");
if (dbocc.equals("errors")) {
if (this.log.isInfo()) this.log.logInfo("URL '" + urlstring + "' is double registered in '" + dbocc.toString() + "'. " + "Stack processing time:");
if (dbocc == HarvestProcess.ERRORS) {
final ZURL.Entry errorEntry = this.nextQueue.errorURL.get(url.hash());
return "double in: errors (" + errorEntry.anycause() + ")";
}
return "double in: " + dbocc;
return "double in: " + dbocc.toString();
}
}
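Side note (not part of the commit): the explicit dbocc.toString() is redundant inside string concatenation, and an enum's toString() defaults to the constant's name, so these duplicate messages change casing from the old queue names to the enum form. A one-file check:

public class EnumNameSketch {
    enum HarvestProcess { DELEGATED, ERRORS, CRAWLER, WORKER, LOADED } // as in the new file
    public static void main(String[] args) {
        System.out.println("double in: " + HarvestProcess.CRAWLER);
        // prints "double in: CRAWLER"; the old string-based code logged "double in: crawler"
    }
}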

@@ -0,0 +1,27 @@
/**
* HarvestProcess
* Copyright 2012 by Michael Peter Christen
* First released 06.12.2012 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.crawler;
public enum HarvestProcess {
DELEGATED, ERRORS, CRAWLER, WORKER, LOADED;
}
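Side note (not part of the commit): the new enum replaces the stringly-typed queue names ("delegated", "errors", "crawler", "worker", "loaded") that callers used to compare with String.equals. Constants are compile-checked, so a misspelled name no longer slips through silently. A minimal usage sketch:

public class HarvestProcessUsageSketch {
    enum HarvestProcess { DELEGATED, ERRORS, CRAWLER, WORKER, LOADED }
    public static void main(String[] args) {
        HarvestProcess dbocc = HarvestProcess.ERRORS; // e.g. the result of urlExists(hash)
        // dbocc.equals("erors") would have compiled and always been false;
        // HarvestProcess.ERORS does not compile at all
        if (dbocc == HarvestProcess.ERRORS) {
            System.out.println("url is in the error queue");
        }
    }
}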

@@ -42,6 +42,7 @@ import net.yacy.cora.document.UTF8;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.protocol.ConnectionInfo;
import net.yacy.crawler.HarvestProcess;
import net.yacy.crawler.data.NoticedURL.StackType;
import net.yacy.crawler.data.ZURL.FailCategory;
import net.yacy.crawler.retrieval.Request;
@@ -142,19 +143,19 @@ public class CrawlQueues {
* @param hash
* @return if the hash exists, the harvest process holding the URL is returned, otherwise null is returned
*/
public String urlExists(final byte[] hash) {
public HarvestProcess urlExists(final byte[] hash) {
if (this.delegatedURL.exists(hash)) {
return "delegated";
return HarvestProcess.DELEGATED;
}
if (this.errorURL.exists(hash)) {
return "errors";
return HarvestProcess.ERRORS;
}
if (this.noticeURL.existsInStack(hash)) {
return "crawler";
return HarvestProcess.CRAWLER;
}
for (final Loader worker: this.workers.values()) {
if (Base64Order.enhancedCoder.equal(worker.request.url().hash(), hash)) {
return "worker";
return HarvestProcess.WORKER;
}
}
return null;
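Side note (not part of the commit): urlExists probes the queues in a fixed priority, delegated first, then errors, then the crawler stack, then the active workers, and returns null for unknown hashes. Callers can now branch over the result with a compiler-checked switch; a standalone sketch, where the per-state descriptions are my reading rather than YaCy wording:

public class QueueLookupSketch {
    enum HarvestProcess { DELEGATED, ERRORS, CRAWLER, WORKER, LOADED }
    static String describe(HarvestProcess p) {
        if (p == null) return "unknown url";
        switch (p) {
            case DELEGATED: return "delegated to another peer";
            case ERRORS:    return "in the error queue";
            case CRAWLER:   return "stacked for crawling";
            case WORKER:    return "currently being loaded";
            default:        return "already indexed"; // LOADED
        }
    }
    public static void main(String[] args) {
        System.out.println(describe(HarvestProcess.WORKER)); // currently being loaded
        System.out.println(describe(null));                  // unknown url
    }
}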

@@ -34,6 +34,7 @@ import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.crawler.HarvestProcess;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.Latency;
import net.yacy.crawler.data.ZURL.FailCategory;
@@ -170,10 +171,10 @@ public final class HTTPLoader {
}
// check if the url was already indexed
final String dbname = this.sb.urlExists(redirectionUrl.hash());
final HarvestProcess dbname = this.sb.urlExists(redirectionUrl.hash());
if (dbname != null) { // customer request
this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "redirection to double content", statusCode);
throw new IOException("CRAWLER Redirection of URL=" + requestURLString + " ignored. The url appears already in db " + dbname);
throw new IOException("CRAWLER Redirection of URL=" + requestURLString + " ignored. The url appears already in db " + dbname.toString());
}
// retry crawling with new url
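Side note (not part of the commit): the redirect guard treats any non-null HarvestProcess as proof of double content, so the redirection is pushed to errorURL and aborted; only a null lookup lets the crawler retry with the new URL. Sketched with a hypothetical lookup helper standing in for sb.urlExists:

public class RedirectGuardSketch {
    enum HarvestProcess { DELEGATED, ERRORS, CRAWLER, WORKER, LOADED }
    // hypothetical stand-in for sb.urlExists(hash); null means the url is unknown
    static HarvestProcess lookup(String url) {
        return url.endsWith("/known") ? HarvestProcess.LOADED : null;
    }
    public static void main(String[] args) {
        String redirectionUrl = "http://example.org/known"; // hypothetical target
        HarvestProcess dbname = lookup(redirectionUrl);
        if (dbname != null) {
            System.out.println("redirection ignored, url already in db " + dbname);
        } else {
            System.out.println("retry crawling with new url");
        }
    }
}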

@@ -29,6 +29,7 @@ import java.net.MalformedURLException;
import java.util.Date;
import net.yacy.cora.document.ASCII;
import net.yacy.crawler.HarvestProcess;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.document.parser.sitemapParser;
import net.yacy.document.parser.sitemapParser.URLEntry;
@@ -81,8 +82,8 @@ public class SitemapImporter extends Thread {
// check if the url is known and needs to be recrawled
Date lastMod = entry.lastmod(null);
if (lastMod != null) {
final String dbocc = this.sb.urlExists(nexturlhash);
if ((dbocc != null) && (dbocc.equalsIgnoreCase("loaded"))) {
final HarvestProcess dbocc = this.sb.urlExists(nexturlhash);
if (dbocc != null && dbocc == HarvestProcess.LOADED) {
// the url was already loaded. we need to check the date
final URIMetadataNode oldEntry = this.sb.index.fulltext().getMetadata(nexturlhash);
if (oldEntry != null) {
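Side note (not part of the commit): in dbocc != null && dbocc == HarvestProcess.LOADED the null guard is harmless but redundant, because comparing null to an enum constant with == is simply false and cannot throw, unlike the old dbocc.equalsIgnoreCase("loaded"), which did need the guard:

public class NullEnumCompareSketch {
    enum HarvestProcess { DELEGATED, ERRORS, CRAWLER, WORKER, LOADED }
    public static void main(String[] args) {
        HarvestProcess dbocc = null; // url not found in any database
        System.out.println(dbocc == HarvestProcess.LOADED); // false, no NPE
        // the old String form would have crashed here: dbocc.equalsIgnoreCase("loaded")
    }
}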

@@ -114,6 +114,7 @@ import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.CrawlStacker;
import net.yacy.crawler.CrawlSwitchboard;
import net.yacy.crawler.HarvestProcess;
import net.yacy.crawler.data.Cache;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.CrawlQueues;
@@ -1510,11 +1511,11 @@ public final class Switchboard extends serverSwitch {
return false;
}
public String urlExists(final byte[] hash) {
public HarvestProcess urlExists(final byte[] hash) {
// tests if the hash occurs in any database;
// if it exists, the harvest process holding the URL is returned,
// if it does not exist, null is returned
if (this.index.exists(hash)) return "loaded";
if (this.index.exists(hash)) return HarvestProcess.LOADED;
return this.crawlQueues.urlExists(hash);
}
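Side note (not part of the commit): Switchboard.urlExists keeps its two-stage order: the fulltext index wins and maps to LOADED, otherwise the crawl queues decide, and null still means the hash is unknown everywhere. The precedence in isolation:

public class UrlExistsOrderSketch {
    enum HarvestProcess { DELEGATED, ERRORS, CRAWLER, WORKER, LOADED }
    // sketch of the lookup order: the fulltext index first, then the crawl queues
    static HarvestProcess urlExists(boolean inIndex, HarvestProcess inQueues) {
        if (inIndex) return HarvestProcess.LOADED;
        return inQueues; // DELEGATED, ERRORS, CRAWLER, WORKER, or null
    }
    public static void main(String[] args) {
        System.out.println(urlExists(true, HarvestProcess.CRAWLER)); // LOADED wins
        System.out.println(urlExists(false, null));                  // null, unknown
    }
}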
