diff --git a/htroot/HostBrowser.html b/htroot/HostBrowser.html
index 0f68d2d4b..f4c78e02f 100644
--- a/htroot/HostBrowser.html
+++ b/htroot/HostBrowser.html
@@ -75,8 +75,11 @@ function updatepage(str) {
@@ -88,14 +91,15 @@ function updatepage(str) {
#{list}#
- #[count]##(crawler)#::/#[pending]##(/crawler)##(errors)#::/#[exclcount]#/#[failcount]##(/errors)# URLs
+ #[count]##(crawler)#::/#[pending]##(/crawler)##(errors)#::/#[exclcount]#/#[failcount]##(/errors)# URLs
#{/list}#
Count Colors:
Documents without Errors
Pending in Crawler
- Load Errors (exclusion/failure)
+ Crawler Excludes
+ Load Errors
#(/hosts)#
diff --git a/htroot/HostBrowser.java b/htroot/HostBrowser.java
index e8c2ba3b9..ddfba504e 100644
--- a/htroot/HostBrowser.java
+++ b/htroot/HostBrowser.java
@@ -21,15 +21,18 @@
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.ArrayList;
+import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
+import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.BlockingQueue;
+import java.util.regex.Pattern;
import org.apache.solr.common.SolrDocument;
@@ -45,6 +48,7 @@ import net.yacy.cora.sorting.ReversibleScoreMap;
import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.HarvestProcess;
+import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.NoticedURL.StackType;
import net.yacy.crawler.retrieval.Request;
import net.yacy.kelondro.data.meta.URIMetadataNode;
@@ -53,6 +57,7 @@ import net.yacy.search.Switchboard;
import net.yacy.search.index.Fulltext;
import net.yacy.search.index.Segment.ReferenceReport;
import net.yacy.search.index.Segment.ReferenceReportCache;
+import net.yacy.search.query.QueryParams;
import net.yacy.search.schema.CollectionSchema;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
@@ -62,10 +67,10 @@ public class HostBrowser {
final static long TIMEOUT = 10000L;
public static enum StoreType {
- LINK, INDEX, EXCLUDED, FAILED;
+ LINK, INDEX, EXCLUDED, FAILED, RELOAD;
}
- @SuppressWarnings("deprecation")
+ @SuppressWarnings({ "deprecation", "unchecked" })
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
// return variable that accumulates replacements
final Switchboard sb = (Switchboard) env;
@@ -223,10 +228,15 @@ public class HostBrowser {
if (path.length() > 0) {
boolean delete = false;
+ boolean reload404 = false;
if (admin && post.containsKey("delete")) {
// delete the complete path!! That includes everything that matches with this prefix.
delete = true;
}
+ if (admin && post.containsKey("reload404")) {
+ // try to re-load all URLs that have load errors and match this prefix.
+ reload404 = true;
+ }
int facetcount=post.getInt("facetcount", 0);
boolean complete = post.getBoolean("complete");
if (complete) { // we want only root paths for complete lists
@@ -289,8 +299,10 @@ public class HostBrowser {
Map<String, InfoCacheEntry> infoCache = new HashMap<String, InfoCacheEntry>();
int hostsize = 0;
final List<String> deleteIDs = new ArrayList<String>();
+ final Collection<String> reloadURLs = new ArrayList<String>();
+ final Set<String> reloadURLCollection = new HashSet<String>();
long timeoutList = System.currentTimeMillis() + TIMEOUT;
- long timeoutReferences = System.currentTimeMillis() + 3000;
+ long timeoutReferences = System.currentTimeMillis() + 6000;
ReferenceReportCache rrCache = sb.index.getReferenceReportCache();
while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
String u = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName());
@@ -302,10 +314,19 @@ public class HostBrowser {
if (delete) {
deleteIDs.add(ids);
} else {
- if (error == null) storedDocs.add(u); else if (admin) errorDocs.put(u, error);
+ if (error == null) storedDocs.add(u); else {
+ if (reload404 && error == FailType.fail) {
+ ArrayList<String> collections = (ArrayList<String>) doc.getFieldValue(CollectionSchema.collection_sxt.getSolrFieldName());
+ if (collections != null) reloadURLCollection.addAll(collections);
+ reloadURLs.add(u);
+ }
+ if (admin) errorDocs.put(u, error);
+ }
}
} else if (complete) {
- if (error == null) storedDocs.add(u); else if (admin) errorDocs.put(u, error);
+ if (error == null) storedDocs.add(u); else {
+ if (admin) errorDocs.put(u, error);
+ }
}
if ((complete || u.startsWith(path)) && !storedDocs.contains(u)) inboundLinks.add(u); // add the current link
if (error == null) {
@@ -337,6 +358,11 @@ public class HostBrowser {
if (System.currentTimeMillis() > timeoutList) break;
}
if (deleteIDs.size() > 0) sb.remove(deleteIDs);
+ if (reloadURLs.size() > 0) {
+ final Map<String, Pattern> cm = new LinkedHashMap<String, Pattern>();
+ for (String collection: reloadURLCollection) cm.put(collection, QueryParams.catchall_pattern);
+ sb.reload(reloadURLs, cm.size() > 0 ? cm : CrawlProfile.collectionParser("user"), false);
+ }
// collect from crawler
List<Request> domainStackReferences = (admin) ? sb.crawlQueues.noticeURL.getDomainStackReferences(StackType.LOCAL, host, 1000, 3000) : new ArrayList<Request>(0);
@@ -373,17 +399,17 @@ public class HostBrowser {
String dir = path + remainingPath;
Object c = list.get(dir);
if (c == null) {
- int[] linkedStoredIncrawlerError = new int[]{0,0,0,0};
+ int[] linkedStoredIncrawlerError = new int[]{0,0,0,0,0};
if (type == StoreType.LINK) linkedStoredIncrawlerError[0]++;
if (type == StoreType.INDEX) linkedStoredIncrawlerError[1]++;
if (loadingLinks.contains(entry.getKey())) linkedStoredIncrawlerError[2]++;
- if (errorDocs.containsKey(entry.getKey())) linkedStoredIncrawlerError[3]++;
+ if (errorDocs.containsKey(entry.getKey())) linkedStoredIncrawlerError[errorDocs.get(entry.getKey()) == FailType.excl ? 3 : 4]++;
list.put(dir, linkedStoredIncrawlerError);
} else if (c instanceof int[]) {
if (type == StoreType.LINK) ((int[]) c)[0]++;
if (type == StoreType.INDEX) ((int[]) c)[1]++;
if (loadingLinks.contains(entry.getKey())) ((int[]) c)[2]++;
- if (errorDocs.containsKey(entry.getKey())) ((int[]) c)[3]++;
+ if (errorDocs.containsKey(entry.getKey())) ((int[]) c)[errorDocs.get(entry.getKey()) == FailType.excl ? 3 : 4]++;
}
}
}
@@ -403,13 +429,14 @@ public class HostBrowser {
int linked = ((int[]) entry.getValue())[0];
int stored = ((int[]) entry.getValue())[1];
int crawler = ((int[]) entry.getValue())[2];
- int error = ((int[]) entry.getValue())[3];
+ int excl = ((int[]) entry.getValue())[3];
+ int error = ((int[]) entry.getValue())[4];
prop.put("files_list_" + c + "_type_stored", stored);
prop.put("files_list_" + c + "_type_linked", linked);
prop.put("files_list_" + c + "_type_pendingVisible", crawler > 0 ? 1 : 0);
prop.put("files_list_" + c + "_type_pending", crawler);
- prop.put("files_list_" + c + "_type_excludedVisible", 0);
- prop.put("files_list_" + c + "_type_excluded", 0);
+ prop.put("files_list_" + c + "_type_excludedVisible", excl > 0 ? 1 : 0);
+ prop.put("files_list_" + c + "_type_excluded", excl);
prop.put("files_list_" + c + "_type_failedVisible", error > 0 ? 1 : 0);
prop.put("files_list_" + c + "_type_failed", error);
if (++c >= maxcount) break;
@@ -443,7 +470,7 @@ public class HostBrowser {
} else {
String ids = ASCII.String(uri.hash());
InfoCacheEntry ice = infoCache.get(ids);
- prop.put("files_list_" + c + "_type_stored_error", failType == FailType.excl ? "excluded from indexing" : "load fail; " + ice.toString());
+ prop.put("files_list_" + c + "_type_stored_error", failType == FailType.excl ? "excluded from indexing" : "load fail" + (ice == null ? "" : "; " + ice.toString()));
}
}
if (loadRight) {
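For orientation, here is a minimal sketch (not part of the patch) of the counter layout that the HostBrowser hunks above introduce: the per-directory int[] grows from four to five slots, {linked, stored, pending-in-crawler, excluded, failed}, so crawler exclusions and load failures are reported separately. All class, enum and method names in the sketch are illustrative; only the five-slot layout is taken from the patch.

    import java.util.HashMap;
    import java.util.Map;

    public class PathCounterSketch {

        // mirrors StoreType plus the pending/excluded/failed split of the patch
        enum Kind { LINK, INDEX, PENDING, EXCLUDED, FAILED }

        private final Map<String, int[]> list = new HashMap<String, int[]>();

        public void count(String dir, Kind kind) {
            int[] c = this.list.get(dir);
            if (c == null) {
                c = new int[]{0, 0, 0, 0, 0}; // five slots, as in the patched HostBrowser
                this.list.put(dir, c);
            }
            c[kind.ordinal()]++;
        }

        public static void main(String[] args) {
            PathCounterSketch s = new PathCounterSketch();
            s.count("/docs/", Kind.INDEX);
            s.count("/docs/", Kind.EXCLUDED); // no longer folded into FAILED
            s.count("/docs/", Kind.FAILED);
            System.out.println(java.util.Arrays.toString(s.list.get("/docs/"))); // [0, 1, 0, 1, 1]
        }
    }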
diff --git a/htroot/Load_RSS_p.java b/htroot/Load_RSS_p.java
index dae006c19..d0d3b5b33 100644
--- a/htroot/Load_RSS_p.java
+++ b/htroot/Load_RSS_p.java
@@ -304,7 +304,7 @@ public class Load_RSS_p {
ConcurrentLog.logException(e);
}
}
- sb.addToIndex(list, null, null, collections);
+ sb.addToIndex(list, null, null, collections, true);
}
if (rss != null && post.containsKey("indexAllItemContent")) {
diff --git a/htroot/env/base.css b/htroot/env/base.css
index 743faa3f3..aa438b5d8 100644
--- a/htroot/env/base.css
+++ b/htroot/env/base.css
@@ -1030,9 +1030,6 @@ div#info:hover span {
z-index: 100;
}
-.info {
- float:left;
-}
.info span {
display: none;
diff --git a/source/net/yacy/crawler/retrieval/RSSLoader.java b/source/net/yacy/crawler/retrieval/RSSLoader.java
index 039483386..470ffd99d 100644
--- a/source/net/yacy/crawler/retrieval/RSSLoader.java
+++ b/source/net/yacy/crawler/retrieval/RSSLoader.java
@@ -114,7 +114,7 @@ public class RSSLoader extends Thread {
indexTriggered.insertIfAbsent(ASCII.getBytes(e.getKey()), new Date());
loadCount++;
}
- sb.addToIndex(list, null, null, collections);
+ sb.addToIndex(list, null, null, collections, true);
// update info for loading
try {
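Both RSS call sites (Load_RSS_p and RSSLoader) now pass true as the new doublecheck argument. Below is a hedged sketch of that contract, assuming the existence lookup returns the subset of already-indexed ids as in the Switchboard hunk that follows; the ExistenceIndex interface and method names are stand-ins, not YaCy classes.

    import java.util.ArrayList;
    import java.util.Collection;
    import java.util.List;
    import java.util.Set;

    public class DoubleCheckSketch {

        // stand-in for the index existence lookup used by Switchboard.addToIndex
        interface ExistenceIndex {
            Set<String> exists(Collection<String> ids); // returns the subset of ids already indexed
        }

        // when doublecheck is true, already indexed ids are skipped ("addToIndex: double ...");
        // when false, the lookup is bypassed entirely, which the new reload path relies on
        static List<String> selectForIndexing(Collection<String> ids, ExistenceIndex index, boolean doublecheck) {
            final Set<String> existing = doublecheck ? index.exists(ids) : null;
            final List<String> fresh = new ArrayList<String>();
            for (String id : ids) {
                if (doublecheck && existing.contains(id)) continue;
                fresh.add(id);
            }
            return fresh;
        }
    }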
diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java
index 0ae180ce0..8e8df2174 100644
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -2902,7 +2902,8 @@ public final class Switchboard extends serverSwitch {
final Map links,
final SearchEvent searchEvent,
final String heuristicName,
- final Map<String, Pattern> collections) {
+ final Map<String, Pattern> collections,
+ final boolean doublecheck) {
List<DigestURL> urls = new ArrayList<DigestURL>();
// add the landing page to the index. should not load that again since it should be in the cache
@@ -2922,19 +2923,39 @@ public final class Switchboard extends serverSwitch {
for (final Map.Entry entry : links.entrySet()) {
urls.add(new DigestURL(entry.getKey(), (byte[]) null));
}
- addToIndex(urls, searchEvent, heuristicName, collections);
+ addToIndex(urls, searchEvent, heuristicName, collections, doublecheck);
+ }
+
+ public void reload(final Collection<String> reloadURLStrings, final Map<String, Pattern> collections, final boolean doublecheck) {
+ final Collection<DigestURL> reloadURLs = new ArrayList<DigestURL>(reloadURLStrings.size());
+ Collection<String> deleteIDs = new ArrayList<String>(reloadURLStrings.size());
+ for (String u: reloadURLStrings) {
+ DigestURL url;
+ try {
+ url = new DigestURL(u);
+ reloadURLs.add(url);
+ deleteIDs.add(ASCII.String(url.hash()));
+ } catch (MalformedURLException e) {
+ // skip strings that cannot be parsed into a URL; they are not re-loaded
+ }
+ }
+ remove(deleteIDs);
+ if (doublecheck) this.index.fulltext().commit(false); // if not called here the double-check in addToIndex will reject the indexing
+ addToIndex(reloadURLs, null, null, collections, doublecheck);
}
public void remove(final Collection<String> deleteIDs) {
this.index.fulltext().remove(deleteIDs);
for (String id: deleteIDs) {
- this.crawlQueues.removeURL(ASCII.getBytes(id));
+ byte[] idh = ASCII.getBytes(id);
+ this.crawlQueues.removeURL(idh);
+ try {Cache.delete(idh);} catch (IOException e) {}
}
}
public void remove(final byte[] urlhash) {
this.index.fulltext().remove(urlhash);
this.crawlQueues.removeURL(urlhash);
+ try {Cache.delete(urlhash);} catch (IOException e) {}
}
public void stackURLs(Set rootURLs, final CrawlProfile profile, final Set successurls, final Map failurls) {
@@ -3083,17 +3104,17 @@ public final class Switchboard extends serverSwitch {
* @throws IOException
* @throws Parser.Failure
*/
- public void addToIndex(final Collection<DigestURL> urls, final SearchEvent searchEvent, final String heuristicName, final Map<String, Pattern> collections) {
+ public void addToIndex(final Collection<DigestURL> urls, final SearchEvent searchEvent, final String heuristicName, final Map<String, Pattern> collections, boolean doublecheck) {
Map<String, DigestURL> urlmap = new HashMap<String, DigestURL>();
for (DigestURL url: urls) urlmap.put(ASCII.String(url.hash()), url);
if (searchEvent != null) {
for (String id: urlmap.keySet()) searchEvent.addHeuristic(ASCII.getBytes(id), heuristicName, true);
}
- final Set<String> existing = this.index.exists(urlmap.keySet());
+ final Set<String> existing = doublecheck ? this.index.exists(urlmap.keySet()) : null;
final List<Request> requests = new ArrayList<Request>();
for (Map.Entry<String, DigestURL> e: urlmap.entrySet()) {
final String urlName = e.getValue().toNormalform(true);
- if (existing.contains(e.getKey())) {
+ if (doublecheck && existing.contains(e.getKey())) {
this.log.info("addToIndex: double " + urlName);
continue;
}
@@ -3493,7 +3514,7 @@ public final class Switchboard extends serverSwitch {
}
// add all pages to the index
- addAllToIndex(url, links, searchEvent, "site", CrawlProfile.collectionParser("site"));
+ addAllToIndex(url, links, searchEvent, "site", CrawlProfile.collectionParser("site"), true);
}
} catch (final Throwable e ) {
ConcurrentLog.logException(e);
@@ -3607,7 +3628,7 @@ public final class Switchboard extends serverSwitch {
+ feedName
+ "' rss feed");
// add all pages to the index
- addAllToIndex(null, links, searchEvent, feedName, CrawlProfile.collectionParser("rss"));
+ addAllToIndex(null, links, searchEvent, feedName, CrawlProfile.collectionParser("rss"), true);
}
} catch (final Throwable e ) {
//Log.logException(e);
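Finally, a hedged usage sketch for the new Switchboard.reload(...) entry point, modeled on the HostBrowser call site above. Only sb.reload, QueryParams.catchall_pattern and CrawlProfile.collectionParser are taken from the patched sources; the helper name and its arguments are illustrative.

    import java.util.Collection;
    import java.util.LinkedHashMap;
    import java.util.Map;
    import java.util.Set;
    import java.util.regex.Pattern;

    import net.yacy.crawler.data.CrawlProfile;
    import net.yacy.search.Switchboard;
    import net.yacy.search.query.QueryParams;

    public class Reload404Sketch {

        // re-queue URLs that previously failed to load, keeping their original collections
        static void reloadFailedDocuments(Switchboard sb, Collection<String> failedURLs, Set<String> collections) {
            final Map<String, Pattern> cm = new LinkedHashMap<String, Pattern>();
            for (String collection : collections) cm.put(collection, QueryParams.catchall_pattern);
            // fall back to the "user" collection when the failed documents carried none, as the
            // HostBrowser hunk does; doublecheck is off because the entries are deleted and re-queued
            sb.reload(failedURLs, cm.size() > 0 ? cm : CrawlProfile.collectionParser("user"), false);
        }
    }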