diff --git a/htroot/HostBrowser.html b/htroot/HostBrowser.html
index 0f68d2d4b..f4c78e02f 100644
--- a/htroot/HostBrowser.html
+++ b/htroot/HostBrowser.html
@@ -75,8 +75,11 @@ function updatepage(str) {
Host/URL: - - #(delete)#::#(/delete)# + + #(delete)#:: + + + #(/delete)#
@@ -88,14 +91,15 @@ function updatepage(str) { #{list}#
-
#[count]##(crawler)#::/#[pending]##(/crawler)##(errors)#::/#[exclcount]#/#[failcount]##(/errors)# URLs
+
#[count]##(crawler)#::/#[pending]##(/crawler)##(errors)#::/#[exclcount]#/#[failcount]##(/errors)# URLs
#{/list}#
Count Colors:
   Documents without Errors
   Pending in Crawler
-
   Load Errors (exclusion/failure)
+
   Crawler Excludes
+
   Load Errors
#(/hosts)#
diff --git a/htroot/HostBrowser.java b/htroot/HostBrowser.java
index e8c2ba3b9..ddfba504e 100644
--- a/htroot/HostBrowser.java
+++ b/htroot/HostBrowser.java
@@ -21,15 +21,18 @@ import java.io.IOException; import java.net.MalformedURLException; import java.util.ArrayList; +import java.util.Collection; import java.util.Date; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; +import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Set; import java.util.TreeMap; import java.util.concurrent.BlockingQueue; +import java.util.regex.Pattern; import org.apache.solr.common.SolrDocument;
@@ -45,6 +48,7 @@ import net.yacy.cora.sorting.ReversibleScoreMap; import net.yacy.cora.storage.HandleSet; import net.yacy.cora.util.ConcurrentLog; import net.yacy.crawler.HarvestProcess; +import net.yacy.crawler.data.CrawlProfile; import net.yacy.crawler.data.NoticedURL.StackType; import net.yacy.crawler.retrieval.Request; import net.yacy.kelondro.data.meta.URIMetadataNode;
@@ -53,6 +57,7 @@ import net.yacy.search.Switchboard; import net.yacy.search.index.Fulltext; import net.yacy.search.index.Segment.ReferenceReport; import net.yacy.search.index.Segment.ReferenceReportCache; +import net.yacy.search.query.QueryParams; import net.yacy.search.schema.CollectionSchema; import net.yacy.server.serverObjects; import net.yacy.server.serverSwitch;
@@ -62,10 +67,10 @@ public class HostBrowser { final static long TIMEOUT = 10000L; public static enum StoreType { - LINK, INDEX, EXCLUDED, FAILED; + LINK, INDEX, EXCLUDED, FAILED, RELOAD; } - @SuppressWarnings("deprecation") + @SuppressWarnings({ "deprecation", "unchecked" }) public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { // return variable that accumulates replacements final Switchboard sb = (Switchboard) env;
@@ -223,10 +228,15 @@ public class HostBrowser { if (path.length() > 0) { boolean delete = false; + boolean reload404 = false; if (admin && post.containsKey("delete")) { // delete the complete path!! That includes everything that matches with this prefix. delete = true; } + if (admin && post.containsKey("reload404")) { // try to re-load all urls that have load errors and match this prefix. + reload404 = true; + } int facetcount=post.getInt("facetcount", 0); boolean complete = post.getBoolean("complete"); if (complete) { // we want only root paths for complete lists
@@ -289,8 +299,10 @@ public class HostBrowser { Map infoCache = new HashMap(); int hostsize = 0; final List deleteIDs = new ArrayList(); + final Collection reloadURLs = new ArrayList(); + final Set reloadURLCollection = new HashSet(); long timeoutList = System.currentTimeMillis() + TIMEOUT; - long timeoutReferences = System.currentTimeMillis() + 3000; + long timeoutReferences = System.currentTimeMillis() + 6000; ReferenceReportCache rrCache = sb.index.getReferenceReportCache(); while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) { String u = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName());
@@ -302,10 +314,19 @@ public class HostBrowser { if (delete) { deleteIDs.add(ids); } else { - if (error == null) storedDocs.add(u); else if (admin) errorDocs.put(u, error); + if (error == null) storedDocs.add(u); else { + if (reload404 && error == FailType.fail) { + ArrayList collections = (ArrayList) doc.getFieldValue(CollectionSchema.collection_sxt.getSolrFieldName()); + if (collections != null) reloadURLCollection.addAll(collections); + reloadURLs.add(u); + } + if (admin) errorDocs.put(u, error); + } } } else if (complete) { - if (error == null) storedDocs.add(u); else if (admin) errorDocs.put(u, error); + if (error == null) storedDocs.add(u); else { + if (admin) errorDocs.put(u, error); + } } if ((complete || u.startsWith(path)) && !storedDocs.contains(u)) inboundLinks.add(u); // add the current link if (error == null) {
@@ -337,6 +358,11 @@ public class HostBrowser { if (System.currentTimeMillis() > timeoutList) break; } if (deleteIDs.size() > 0) sb.remove(deleteIDs); + if (reloadURLs.size() > 0) { + final Map cm = new LinkedHashMap(); + for (String collection: reloadURLCollection) cm.put(collection, QueryParams.catchall_pattern); + sb.reload(reloadURLs, cm.size() > 0 ? cm : CrawlProfile.collectionParser("user"), false); + } // collect from crawler List domainStackReferences = (admin) ? sb.crawlQueues.noticeURL.getDomainStackReferences(StackType.LOCAL, host, 1000, 3000) : new ArrayList(0);
@@ -373,17 +399,17 @@ public class HostBrowser { String dir = path + remainingPath; Object c = list.get(dir); if (c == null) { - int[] linkedStoredIncrawlerError = new int[]{0,0,0,0}; + int[] linkedStoredIncrawlerError = new int[]{0,0,0,0,0}; if (type == StoreType.LINK) linkedStoredIncrawlerError[0]++; if (type == StoreType.INDEX) linkedStoredIncrawlerError[1]++; if (loadingLinks.contains(entry.getKey())) linkedStoredIncrawlerError[2]++; - if (errorDocs.containsKey(entry.getKey())) linkedStoredIncrawlerError[3]++; + if (errorDocs.containsKey(entry.getKey())) linkedStoredIncrawlerError[errorDocs.get(entry.getKey()) == FailType.excl ? 3 : 4]++; list.put(dir, linkedStoredIncrawlerError); } else if (c instanceof int[]) { if (type == StoreType.LINK) ((int[]) c)[0]++; if (type == StoreType.INDEX) ((int[]) c)[1]++; if (loadingLinks.contains(entry.getKey())) ((int[]) c)[2]++; - if (errorDocs.containsKey(entry.getKey())) ((int[]) c)[3]++; + if (errorDocs.containsKey(entry.getKey())) ((int[]) c)[errorDocs.get(entry.getKey()) == FailType.excl ? 3 : 4]++; } } }
@@ -403,13 +429,14 @@ public class HostBrowser { int linked = ((int[]) entry.getValue())[0]; int stored = ((int[]) entry.getValue())[1]; int crawler = ((int[]) entry.getValue())[2]; - int error = ((int[]) entry.getValue())[3]; + int excl = ((int[]) entry.getValue())[3]; + int error = ((int[]) entry.getValue())[4]; prop.put("files_list_" + c + "_type_stored", stored); prop.put("files_list_" + c + "_type_linked", linked); prop.put("files_list_" + c + "_type_pendingVisible", crawler > 0 ? 1 : 0); prop.put("files_list_" + c + "_type_pending", crawler); - prop.put("files_list_" + c + "_type_excludedVisible", 0); - prop.put("files_list_" + c + "_type_excluded", 0); + prop.put("files_list_" + c + "_type_excludedVisible", excl > 0 ? 1 : 0); + prop.put("files_list_" + c + "_type_excluded", excl); prop.put("files_list_" + c + "_type_failedVisible", error > 0 ? 1 : 0); prop.put("files_list_" + c + "_type_failed", error); if (++c >= maxcount) break;
@@ -443,7 +470,7 @@ public class HostBrowser { } else { String ids = ASCII.String(uri.hash()); InfoCacheEntry ice = infoCache.get(ids); - prop.put("files_list_" + c + "_type_stored_error", failType == FailType.excl ? "excluded from indexing" : "load fail; " + ice.toString()); + prop.put("files_list_" + c + "_type_stored_error", failType == FailType.excl ? "excluded from indexing" : "load fail" + (ice == null ? "" : "; " + ice.toString())); } } if (loadRight) {
diff --git a/htroot/Load_RSS_p.java b/htroot/Load_RSS_p.java
index dae006c19..d0d3b5b33 100644
--- a/htroot/Load_RSS_p.java
+++ b/htroot/Load_RSS_p.java
@@ -304,7 +304,7 @@ public class Load_RSS_p { ConcurrentLog.logException(e); } } - sb.addToIndex(list, null, null, collections); + sb.addToIndex(list, null, null, collections, true); } if (rss != null && post.containsKey("indexAllItemContent")) {
diff --git a/htroot/env/base.css b/htroot/env/base.css
index 743faa3f3..aa438b5d8 100644
--- a/htroot/env/base.css
+++ b/htroot/env/base.css
@@ -1030,9 +1030,6 @@ div#info:hover span { z-index: 100; } -.info { - float:left; -} .info span { display: none;
diff --git a/source/net/yacy/crawler/retrieval/RSSLoader.java b/source/net/yacy/crawler/retrieval/RSSLoader.java
index 039483386..470ffd99d 100644
--- a/source/net/yacy/crawler/retrieval/RSSLoader.java
+++ b/source/net/yacy/crawler/retrieval/RSSLoader.java
@@ -114,7 +114,7 @@ public class RSSLoader extends Thread { indexTriggered.insertIfAbsent(ASCII.getBytes(e.getKey()), new Date()); loadCount++; } - sb.addToIndex(list, null, null, collections); + sb.addToIndex(list, null, null, collections, true); // update info for loading try {
diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java
index 0ae180ce0..8e8df2174 100644
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -2902,7 +2902,8 @@ public final class Switchboard extends serverSwitch { final Map links, final SearchEvent searchEvent, final String heuristicName, - final Map collections) { + final Map collections, + final boolean doublecheck) { List urls = new ArrayList(); // add the landing page to the index. should not load that again since it should be in the cache
@@ -2922,19 +2923,39 @@ public final class Switchboard extends serverSwitch { for (final Map.Entry entry : links.entrySet()) { urls.add(new DigestURL(entry.getKey(), (byte[]) null)); } - addToIndex(urls, searchEvent, heuristicName, collections); + addToIndex(urls, searchEvent, heuristicName, collections, doublecheck); + } + + public void reload(final Collection reloadURLStrings, final Map collections, final boolean doublecheck) { + final Collection reloadURLs = new ArrayList(reloadURLStrings.size()); + Collection deleteIDs = new ArrayList(reloadURLStrings.size()); + for (String u: reloadURLStrings) { + DigestURL url; + try { + url = new DigestURL(u); + reloadURLs.add(url); + deleteIDs.add(ASCII.String(url.hash())); + } catch (MalformedURLException e) { + } + } + remove(deleteIDs); + if (doublecheck) this.index.fulltext().commit(false); // if not called here the double-check in addToIndex will reject the indexing + addToIndex(reloadURLs, null, null, collections, doublecheck); } public void remove(final Collection deleteIDs) { this.index.fulltext().remove(deleteIDs); for (String id: deleteIDs) { - this.crawlQueues.removeURL(ASCII.getBytes(id)); + byte[] idh = ASCII.getBytes(id); + this.crawlQueues.removeURL(idh); + try {Cache.delete(idh);} catch (IOException e) {} } } public void remove(final byte[] urlhash) { this.index.fulltext().remove(urlhash); this.crawlQueues.removeURL(urlhash); + try {Cache.delete(urlhash);} catch (IOException e) {} } public void stackURLs(Set rootURLs, final CrawlProfile profile, final Set successurls, final Map failurls) {
@@ -3083,17 +3104,17 @@ public final class Switchboard extends serverSwitch { * @throws IOException * @throws Parser.Failure */ - public void addToIndex(final Collection urls, final SearchEvent searchEvent, final String heuristicName, final Map collections) { + public void addToIndex(final Collection urls, final SearchEvent searchEvent, final String heuristicName, final Map collections, boolean doublecheck) { Map urlmap = new HashMap(); for (DigestURL url: urls) urlmap.put(ASCII.String(url.hash()), url); if (searchEvent != null) { for (String id: urlmap.keySet()) searchEvent.addHeuristic(ASCII.getBytes(id), heuristicName, true); } - final Set existing = this.index.exists(urlmap.keySet()); + final Set existing = doublecheck ? this.index.exists(urlmap.keySet()) : null; final List requests = new ArrayList(); for (Map.Entry e: urlmap.entrySet()) { final String urlName = e.getValue().toNormalform(true); - if (existing.contains(e.getKey())) { + if (doublecheck && existing.contains(e.getKey())) { this.log.info("addToIndex: double " + urlName); continue; }
@@ -3493,7 +3514,7 @@ public final class Switchboard extends serverSwitch { } // add all pages to the index - addAllToIndex(url, links, searchEvent, "site", CrawlProfile.collectionParser("site")); + addAllToIndex(url, links, searchEvent, "site", CrawlProfile.collectionParser("site"), true); } } catch (final Throwable e ) { ConcurrentLog.logException(e);
@@ -3607,7 +3628,7 @@ public final class Switchboard extends serverSwitch { + feedName + "' rss feed"); // add all pages to the index - addAllToIndex(null, links, searchEvent, feedName, CrawlProfile.collectionParser("rss")); + addAllToIndex(null, links, searchEvent, feedName, CrawlProfile.collectionParser("rss"), true); } } catch (final Throwable e ) { //Log.logException(e);
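
For orientation, a minimal caller-side sketch of the reload path introduced above. Only Switchboard.reload(...), CrawlProfile.collectionParser(...) and QueryParams.catchall_pattern are taken from the patch; the wrapper class, method and parameter names are hypothetical, and the generic types are inferred from how HostBrowser builds its collection map.

import java.util.ArrayList;
import java.util.Collection;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.regex.Pattern;

import net.yacy.crawler.data.CrawlProfile;
import net.yacy.search.Switchboard;
import net.yacy.search.query.QueryParams;

// Hypothetical helper, not part of the patch: re-queues previously failed URLs
// the same way the HostBrowser "reload404" button does.
public class Reload404Sketch {

    public static void reloadFailedUrls(final Switchboard sb,
                                        final Collection<String> failedUrls,
                                        final Collection<String> collectionNames) {
        // map every known collection to the catch-all pattern, falling back to the
        // "user" collection when the failed documents carried no collection tag
        final Map<String, Pattern> cm = new LinkedHashMap<String, Pattern>();
        for (final String collection : collectionNames) cm.put(collection, QueryParams.catchall_pattern);
        final Map<String, Pattern> collections = cm.isEmpty() ? CrawlProfile.collectionParser("user") : cm;

        // doublecheck=false: the old entries were just removed but not yet committed,
        // so the exists() check in addToIndex() could still see them and skip the reload
        // (this is what the commit(false) comment in the patch refers to)
        sb.reload(new ArrayList<String>(failedUrls), collections, false);
    }
}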