fix for wrong display of error URLs in HostBrowser

pull/1/head
Michael Peter Christen 12 years ago
parent 756772fbd3
commit 10527e28ae

@@ -74,7 +74,7 @@ function updatepage(str) {
<form action="HostBrowser.html" id="searchform" method="get" onkeyup="xmlhttpPost(); return false;">
<fieldset class="yacys">
Host/URL:
<input id="search" type="text" name="path" value="#[path]#" size="40" maxlength="250" />
<input id="search" type="text" name="path" value="#[path]#" size="80" maxlength="250" />
<input type="submit" name="list" value="Browse Host" class="submitready" style="width:240px;"/>
#(delete)#::<input type="submit" name="delete" value="Delete Subpath" class="submitready" style="width:240px;" onclick="return confirm('Confirm Deletion')"/>#(/delete)#
<br />
@@ -125,12 +125,12 @@ function updatepage(str) {
#(type)#<!--file-->
<tr class="TableCell#(dark)#Light::Dark::Summary#(/dark)#">
<td align="center"><div id="info"><a href="/ViewFile.html?url=#[url]#"><img src="/env/grafics/doc.gif"/></a><span>Show Metadata</span></div></td>
<td align="left" nowrap class=#(stored)#"listingem"::"listing"#(/stored)#>#[url]#&nbsp;<a href="#[url]#"><img src="/env/grafics/link.gif"/></a></td>
<td align="left" nowrap class=#(stored)#"listingem"::"listing"#(/stored)#>#[url]#&nbsp;<a href="#[url]#" target="_blank"><img src="/env/grafics/link.gif"/></a></td>
#(stored)#
#(load)#<td align="left" colspan="5" nowrap class="listingem">link, detected from context</td>::<td align="left" colspan="5" nowrap class="listingnok"><a href="/HostBrowser.html?load=#[url]#&path=#[path]#">load &amp; index</a>#(/load)#</td>::
<td align="left" colspan="5" nowrap class="listingok">indexed</td>::
<td align="left" colspan="5" nowrap class="pending">loading</td>::
<td align="left" colspan="5" nowrap class="listingnok">load fail: #[error]#</td>
<td align="left" colspan="5" nowrap class="listingnok">#[error]#</td>
#(/stored)#
</tr>::<!--folder-->
<tr class="TableCell#(dark)#Light::Dark::Summary#(/dark)#">
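Side note (not part of the commit): the #(key)#a::b::c#(/key)# constructs above are YaCy servlet-template switches; the integer the servlet stores under the key picks one of the ::-separated branches, which is how the error cell gets selected. A minimal sketch of that selection rule, an illustration only and not YaCy's real template engine, with simplified branch labels:

public class TemplateSwitchSketch {
    // mimics how a #(key)#a::b::c::d#(/key)# switch picks a branch by the stored integer
    static String select(String branches, int value) {
        String[] parts = branches.split("::", -1);
        return parts[Math.min(value, parts.length - 1)];
    }
    public static void main(String[] args) {
        // simplified stand-ins for the four #(stored)# branches above
        String stored = "linked::indexed::loading::error message";
        System.out.println(select(stored, 3)); // value 3 selects the error branch
    }
}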

@@ -42,6 +42,7 @@ import net.yacy.cora.federate.solr.connector.AbstractSolrConnector;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.sorting.ClusteredScoreMap;
import net.yacy.cora.sorting.ReversibleScoreMap;
import net.yacy.crawler.HarvestProcess;
import net.yacy.crawler.data.NoticedURL.StackType;
import net.yacy.crawler.retrieval.Request;
import net.yacy.kelondro.data.meta.DigestURI;
@@ -315,8 +316,8 @@ public class HostBrowser {
Map<String, StoreType> files = new HashMap<String, StoreType>();
for (String u: storedDocs) files.put(u, StoreType.INDEX);
for (Map.Entry<String, FailType> e: errorDocs.entrySet()) files.put(e.getKey(), e.getValue() == FailType.fail ? StoreType.FAILED : StoreType.EXCLUDED);
for (String u: inboundLinks) if (!storedDocs.contains(u)) files.put(u, StoreType.LINK);
for (String u: loadingLinks) if (u.startsWith(path) && !storedDocs.contains(u)) files.put(u, StoreType.LINK);
for (String u: inboundLinks) if (!files.containsKey(u)) files.put(u, StoreType.LINK);
for (String u: loadingLinks) if (u.startsWith(path) && !files.containsKey(u)) files.put(u, StoreType.LINK);
Log.logInfo("HostBrowser", "collected " + files.size() + " urls for path " + path);
// distinguish files and folders
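Side note (not part of the commit): the move from storedDocs.contains(u) to files.containsKey(u) is the heart of the fix. A URL that failed sits in files as FAILED but not in storedDocs, so the old guard let a later inbound link overwrite the marker with LINK, and the error message was never shown. A self-contained sketch of the hazard, using a hypothetical URL:

import java.util.HashMap;
import java.util.Map;

public class StoreTypeOrderingSketch {
    enum StoreType { LINK, INDEX, EXCLUDED, FAILED } // mirrors HostBrowser's marker enum
    public static void main(String[] args) {
        Map<String, StoreType> files = new HashMap<String, StoreType>();
        String u = "http://example.org/dead-link"; // hypothetical: failed once, still linked
        files.put(u, StoreType.FAILED);            // entered via errorDocs
        // old guard (!storedDocs.contains(u)) was true here and replaced FAILED
        // with LINK; the new guard consults the map that already holds the marker
        if (!files.containsKey(u)) files.put(u, StoreType.LINK);
        System.out.println(files.get(u)); // FAILED, so the error survives for display
    }
}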
@@ -391,15 +392,28 @@ public class HostBrowser {
prop.put("files_list_" + c + "_type_url", entry.getKey());
StoreType type = (StoreType) entry.getValue();
try {uri = new DigestURI(entry.getKey());} catch (MalformedURLException e) {uri = null;}
boolean loading = load.equals(entry.getKey()) || (uri != null && sb.crawlQueues.urlExists(uri.hash()) != null);
prop.put("files_list_" + c + "_type_stored", type == StoreType.INDEX ? 1 : (type == StoreType.EXCLUDED || type == StoreType.FAILED) ? 3 : loading ? 2 : 0 /*linked*/);
prop.put("files_list_" + c + "_type_stored_load", loadRight ? 1 : 0);
if (type == StoreType.EXCLUDED || type == StoreType.FAILED) prop.put("files_list_" + c + "_type_stored_error", errorDocs.get(entry.getKey()).name());
if (loadRight) {
prop.put("files_list_" + c + "_type_stored_load_url", entry.getKey());
prop.put("files_list_" + c + "_type_stored_load_path", path);
HarvestProcess process = uri == null ? null : sb.crawlQueues.urlExists(uri.hash());
boolean loading = load.equals(entry.getKey()) || (process != null && process != HarvestProcess.ERRORS);
boolean error = process == HarvestProcess.ERRORS || type == StoreType.EXCLUDED || type == StoreType.FAILED;
boolean dc = type != StoreType.INDEX && !error && !loading && list.containsKey(entry.getKey() + "/");
if (!dc) {
prop.put("files_list_" + c + "_type_stored", type == StoreType.INDEX ? 1 : error ? 3 : loading ? 2 : 0 /*linked*/);
prop.put("files_list_" + c + "_type_stored_load", loadRight ? 1 : 0);
if (error) {
FailType failType = errorDocs.get(entry.getKey());
if (failType == null) {
// maybe this is only in the errorURL
prop.put("files_list_" + c + "_type_stored_error", process == HarvestProcess.ERRORS ? sb.crawlQueues.errorURL.get(uri.hash()).anycause() : "unknown error");
} else {
prop.put("files_list_" + c + "_type_stored_error", failType == FailType.excl ? "excluded from indexing" : "load fail");
}
}
if (loadRight) {
prop.put("files_list_" + c + "_type_stored_load_url", entry.getKey());
prop.put("files_list_" + c + "_type_stored_load_path", path);
}
if (++c >= maxcount) break;
}
if (++c >= maxcount) break;
}
}
prop.put("files_list", c);

@@ -446,16 +446,16 @@ public final class CrawlStacker {
}
// check if the url is double registered
final String dbocc = this.nextQueue.urlExists(url.hash()); // returns the name of the queue if entry exists
final HarvestProcess dbocc = this.nextQueue.urlExists(url.hash()); // returns the harvest process if the entry exists
final Date oldDate = this.indexSegment.fulltext().getLoadDate(ASCII.String(url.hash()));
if (oldDate == null) {
if (dbocc != null) {
// do double-check
if (dbocc.equals("errors")) {
if (dbocc == HarvestProcess.ERRORS) {
final ZURL.Entry errorEntry = this.nextQueue.errorURL.get(url.hash());
return "double in: errors (" + errorEntry.anycause() + ")";
}
return "double in: " + dbocc;
return "double in: " + dbocc.toString();
}
} else {
final boolean recrawl = profile.recrawlIfOlder() > oldDate.getTime();
@@ -467,12 +467,12 @@ public final class CrawlStacker {
if (dbocc == null) {
return "double in: LURL-DB";
}
if (this.log.isInfo()) this.log.logInfo("URL '" + urlstring + "' is double registered in '" + dbocc + "'. " + "Stack processing time:");
if (dbocc.equals("errors")) {
if (this.log.isInfo()) this.log.logInfo("URL '" + urlstring + "' is double registered in '" + dbocc.toString() + "'. " + "Stack processing time:");
if (dbocc == HarvestProcess.ERRORS) {
final ZURL.Entry errorEntry = this.nextQueue.errorURL.get(url.hash());
return "double in: errors (" + errorEntry.anycause() + ")";
}
return "double in: " + dbocc;
return "double in: " + dbocc.toString();
}
}
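Side note (not part of the commit): the explicit dbocc.toString() is redundant inside string concatenation, and an enum's toString() defaults to the constant's name, so these duplicate messages change casing from the old queue names to the enum form. A one-file check:

public class EnumNameSketch {
    enum HarvestProcess { DELEGATED, ERRORS, CRAWLER, WORKER, LOADED } // as in the new file
    public static void main(String[] args) {
        System.out.println("double in: " + HarvestProcess.CRAWLER);
        // prints "double in: CRAWLER"; the old string-based code logged "double in: crawler"
    }
}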

@@ -0,0 +1,27 @@
/**
* HarvestProcess
* Copyright 2012 by Michael Peter Christen
* First released 06.12.2012 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.crawler;
public enum HarvestProcess {
DELEGATED, ERRORS, CRAWLER, WORKER, LOADED;
}
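Side note (not part of the commit): the new enum replaces the stringly-typed queue names ("delegated", "errors", "crawler", "worker", "loaded") that callers used to compare with String.equals. Constants are compile-checked, so a misspelled name no longer slips through silently. A minimal usage sketch:

public class HarvestProcessUsageSketch {
    enum HarvestProcess { DELEGATED, ERRORS, CRAWLER, WORKER, LOADED }
    public static void main(String[] args) {
        HarvestProcess dbocc = HarvestProcess.ERRORS; // e.g. the result of urlExists(hash)
        // dbocc.equals("erors") would have compiled and always been false;
        // HarvestProcess.ERORS does not compile at all
        if (dbocc == HarvestProcess.ERRORS) {
            System.out.println("url is in the error queue");
        }
    }
}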

@@ -42,6 +42,7 @@ import net.yacy.cora.document.UTF8;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.protocol.ConnectionInfo;
import net.yacy.crawler.HarvestProcess;
import net.yacy.crawler.data.NoticedURL.StackType;
import net.yacy.crawler.data.ZURL.FailCategory;
import net.yacy.crawler.retrieval.Request;
@@ -142,19 +143,19 @@ public class CrawlQueues {
* @param hash
* @return if the hash exists, the harvest process holding the URL is returned, otherwise null is returned
*/
public String urlExists(final byte[] hash) {
public HarvestProcess urlExists(final byte[] hash) {
if (this.delegatedURL.exists(hash)) {
return "delegated";
return HarvestProcess.DELEGATED;
}
if (this.errorURL.exists(hash)) {
return "errors";
return HarvestProcess.ERRORS;
}
if (this.noticeURL.existsInStack(hash)) {
return "crawler";
return HarvestProcess.CRAWLER;
}
for (final Loader worker: this.workers.values()) {
if (Base64Order.enhancedCoder.equal(worker.request.url().hash(), hash)) {
return "worker";
return HarvestProcess.WORKER;
}
}
return null;
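Side note (not part of the commit): urlExists probes the queues in a fixed priority, delegated first, then errors, then the crawler stack, then the active workers, and returns null for unknown hashes. Callers can now branch over the result with a compiler-checked switch; a standalone sketch, where the per-state descriptions are my reading rather than YaCy wording:

public class QueueLookupSketch {
    enum HarvestProcess { DELEGATED, ERRORS, CRAWLER, WORKER, LOADED }
    static String describe(HarvestProcess p) {
        if (p == null) return "unknown url";
        switch (p) {
            case DELEGATED: return "delegated to another peer";
            case ERRORS:    return "in the error queue";
            case CRAWLER:   return "stacked for crawling";
            case WORKER:    return "currently being loaded";
            default:        return "already indexed"; // LOADED
        }
    }
    public static void main(String[] args) {
        System.out.println(describe(HarvestProcess.WORKER)); // currently being loaded
        System.out.println(describe(null));                  // unknown url
    }
}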

@@ -34,6 +34,7 @@ import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.crawler.HarvestProcess;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.Latency;
import net.yacy.crawler.data.ZURL.FailCategory;
@@ -170,10 +171,10 @@ public final class HTTPLoader {
}
// check if the url was already indexed
final String dbname = this.sb.urlExists(redirectionUrl.hash());
final HarvestProcess dbname = this.sb.urlExists(redirectionUrl.hash());
if (dbname != null) { // customer request
this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "redirection to double content", statusCode);
throw new IOException("CRAWLER Redirection of URL=" + requestURLString + " ignored. The url appears already in db " + dbname);
throw new IOException("CRAWLER Redirection of URL=" + requestURLString + " ignored. The url appears already in db " + dbname.toString());
}
// retry crawling with new url
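Side note (not part of the commit): the redirect guard treats any non-null HarvestProcess as proof of double content, so the redirection is pushed to errorURL and aborted; only a null lookup lets the crawler retry with the new URL. Sketched with a hypothetical lookup helper standing in for sb.urlExists:

public class RedirectGuardSketch {
    enum HarvestProcess { DELEGATED, ERRORS, CRAWLER, WORKER, LOADED }
    // hypothetical stand-in for sb.urlExists(hash); null means the url is unknown
    static HarvestProcess lookup(String url) {
        return url.endsWith("/known") ? HarvestProcess.LOADED : null;
    }
    public static void main(String[] args) {
        String redirectionUrl = "http://example.org/known"; // hypothetical target
        HarvestProcess dbname = lookup(redirectionUrl);
        if (dbname != null) {
            System.out.println("redirection ignored, url already in db " + dbname);
        } else {
            System.out.println("retry crawling with new url");
        }
    }
}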

@@ -29,6 +29,7 @@ import java.net.MalformedURLException;
import java.util.Date;
import net.yacy.cora.document.ASCII;
import net.yacy.crawler.HarvestProcess;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.document.parser.sitemapParser;
import net.yacy.document.parser.sitemapParser.URLEntry;
@@ -81,8 +82,8 @@ public class SitemapImporter extends Thread {
// check if the url is known and needs to be recrawled
Date lastMod = entry.lastmod(null);
if (lastMod != null) {
final String dbocc = this.sb.urlExists(nexturlhash);
if ((dbocc != null) && (dbocc.equalsIgnoreCase("loaded"))) {
final HarvestProcess dbocc = this.sb.urlExists(nexturlhash);
if (dbocc != null && dbocc == HarvestProcess.LOADED) {
// the url was already loaded. we need to check the date
final URIMetadataNode oldEntry = this.sb.index.fulltext().getMetadata(nexturlhash);
if (oldEntry != null) {
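Side note (not part of the commit): in dbocc != null && dbocc == HarvestProcess.LOADED the null guard is harmless but redundant, because comparing null to an enum constant with == is simply false and cannot throw, unlike the old dbocc.equalsIgnoreCase("loaded"), which did need the guard:

public class NullEnumCompareSketch {
    enum HarvestProcess { DELEGATED, ERRORS, CRAWLER, WORKER, LOADED }
    public static void main(String[] args) {
        HarvestProcess dbocc = null; // url not found in any database
        System.out.println(dbocc == HarvestProcess.LOADED); // false, no NPE
        // the old String form would have crashed here: dbocc.equalsIgnoreCase("loaded")
    }
}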

@@ -114,6 +114,7 @@ import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.CrawlStacker;
import net.yacy.crawler.CrawlSwitchboard;
import net.yacy.crawler.HarvestProcess;
import net.yacy.crawler.data.Cache;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.CrawlQueues;
@@ -1510,11 +1511,11 @@ public final class Switchboard extends serverSwitch {
return false;
}
public String urlExists(final byte[] hash) {
public HarvestProcess urlExists(final byte[] hash) {
// tests if the hash occurs in any database;
// if it exists, the harvest process holding the URL is returned,
// if it does not exist, null is returned
if (this.index.exists(hash)) return "loaded";
if (this.index.exists(hash)) return HarvestProcess.LOADED;
return this.crawlQueues.urlExists(hash);
}
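Side note (not part of the commit): Switchboard.urlExists keeps its two-stage order: the fulltext index wins and maps to LOADED, otherwise the crawl queues decide, and null still means the hash is unknown everywhere. The precedence in isolation:

public class UrlExistsOrderSketch {
    enum HarvestProcess { DELEGATED, ERRORS, CRAWLER, WORKER, LOADED }
    // sketch of the lookup order: the fulltext index first, then the crawl queues
    static HarvestProcess urlExists(boolean inIndex, HarvestProcess inQueues) {
        if (inIndex) return HarvestProcess.LOADED;
        return inQueues; // DELEGATED, ERRORS, CRAWLER, WORKER, or null
    }
    public static void main(String[] args) {
        System.out.println(urlExists(true, HarvestProcess.CRAWLER)); // LOADED wins
        System.out.println(urlExists(false, null));                  // null, unknown
    }
}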
