added button in host browser to re-load 404/failed documents

pull/1/head
Michael Peter Christen 11 years ago
parent f47067b0ce
commit 8b14e92ba4

@@ -75,8 +75,11 @@ function updatepage(str) {
<fieldset class="yacys">
Host/URL:
<input id="search" type="text" name="path" value="#[path]#" size="80" maxlength="250" />
<input type="submit" name="list" value="Browse Host" class="submitready" style="width:240px;"/>
#(delete)#::<input type="submit" name="delete" value="Delete Subpath" class="submitready" style="width:240px;" onclick="return confirm('Confirm Deletion')"/>#(/delete)#
<input type="submit" name="list" value="Browse Host" class="submitready" style="width:160px;"/>
#(delete)#::
<input type="submit" name="delete" value="Delete Subpath" class="submitready" style="width:160px;" onclick="return confirm('Confirm Deletion')"/>
<input type="submit" name="reload404" value="Re-Load 404" class="submitready" style="width:160px;"/>
#(/delete)#
<br />
<div id="searchresults"></div>
</fieldset>
@@ -88,14 +91,15 @@ function updatepage(str) {
#{list}#
<div style="float:left; padding:1px 5px 1px 5px;">
<div style="width:180px; text-align:left; float: left; white-space:nowrap; overflow:hidden;"><div id="info"><img src="/env/grafics/#(type)#invisible.png::burn-e.gif::construction.gif#(/type)#" align="left" width="12" height="8">&nbsp;<a href="/HostBrowser.html?path=#[host]#&facetcount=#[count]#">#[host]#</a></div></div>
<div style="width:120px; text-align:right; float: left; white-space:nowrap; overflow:hidden;"><span class="commit">#[count]#</span>#(crawler)#::/<span class="pending">#[pending]#</span>#(/crawler)##(errors)#::/<span class="error">#[exclcount]#/#[failcount]#</span>#(/errors)# URLs</div>
<div style="width:120px; text-align:right; float: left; white-space:nowrap; overflow:hidden;"><span class="commit">#[count]#</span>#(crawler)#::/<span class="pending">#[pending]#</span>#(/crawler)##(errors)#::/<span class="info">#[exclcount]#</span>/<span class="error">#[failcount]#</span>#(/errors)# URLs</div>
</div>
#{/list}#
<div style="clear:both; float:left; padding:10px 5px 1px 5px;">
<div style="float:left;clear:both;">Count Colors:</div>
<div class="commit" style="float:left;">&nbsp;&nbsp;&nbsp;Documents without Errors</div>
<div class="pending" style="float:left;">&nbsp;&nbsp;&nbsp;Pending in Crawler</div>
<div class="error" style="float:left;">&nbsp;&nbsp;&nbsp;Load Errors (exclusion/failure)</div>
<div class="info" style="float:left;">&nbsp;&nbsp;&nbsp;Crawler Excludes</div>
<div class="error" style="float:left;">&nbsp;&nbsp;&nbsp;Load Errors</div>
</div>
</fieldset>
#(/hosts)#
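
A note on the YaCy template syntax in this file: #[key]# is replaced with the value stored under key by the servlet, #{key}# ... #{/key}# repeats a block for each list entry, and #(key)#branch0::branch1#(/key)# renders one of the alternatives depending on the integer stored under key (an empty alternative, as in the #(delete)#:: block, renders nothing for value 0). A minimal sketch of the servlet side, using the files_list property names that appear in the HostBrowser.java hunks further down; the list index 0 and the counter values are illustrative only:

// Sketch only (not part of the commit): how serverObjects properties drive the
// template alternations above. prop.put(key, 0|1) selects the branch of a
// #(key)#...::...#(/key)# block; prop.put(key, value) fills a #[key]# placeholder.
serverObjects prop = new serverObjects();
int excl = 2, failed = 5;                                        // illustrative per-path counters
prop.put("files_list_0_type_excludedVisible", excl > 0 ? 1 : 0); // 1 shows the excluded-count block
prop.put("files_list_0_type_excluded", excl);                    // substituted into #[excluded]#
prop.put("files_list_0_type_failedVisible", failed > 0 ? 1 : 0);
prop.put("files_list_0_type_failed", failed);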

@@ -21,15 +21,18 @@
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.BlockingQueue;
import java.util.regex.Pattern;
import org.apache.solr.common.SolrDocument;
@@ -45,6 +48,7 @@ import net.yacy.cora.sorting.ReversibleScoreMap;
import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.HarvestProcess;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.NoticedURL.StackType;
import net.yacy.crawler.retrieval.Request;
import net.yacy.kelondro.data.meta.URIMetadataNode;
@@ -53,6 +57,7 @@ import net.yacy.search.Switchboard;
import net.yacy.search.index.Fulltext;
import net.yacy.search.index.Segment.ReferenceReport;
import net.yacy.search.index.Segment.ReferenceReportCache;
import net.yacy.search.query.QueryParams;
import net.yacy.search.schema.CollectionSchema;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
@@ -62,10 +67,10 @@ public class HostBrowser {
final static long TIMEOUT = 10000L;
public static enum StoreType {
LINK, INDEX, EXCLUDED, FAILED;
LINK, INDEX, EXCLUDED, FAILED, RELOAD;
}
@SuppressWarnings("deprecation")
@SuppressWarnings({ "deprecation", "unchecked" })
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
// return variable that accumulates replacements
final Switchboard sb = (Switchboard) env;
@@ -223,10 +228,15 @@ public class HostBrowser {
if (path.length() > 0) {
boolean delete = false;
boolean reload404 = false;
if (admin && post.containsKey("delete")) {
// delete the complete path!! That includes everything that matches with this prefix.
delete = true;
}
if (admin && post.containsKey("reload404")) {
// try to re-load all urls that have load errors and match this prefix.
reload404 = true;
}
int facetcount=post.getInt("facetcount", 0);
boolean complete = post.getBoolean("complete");
if (complete) { // we want only root paths for complete lists
@@ -289,8 +299,10 @@ public class HostBrowser {
Map<String, InfoCacheEntry> infoCache = new HashMap<String, InfoCacheEntry>();
int hostsize = 0;
final List<String> deleteIDs = new ArrayList<String>();
final Collection<String> reloadURLs = new ArrayList<String>();
final Set<String> reloadURLCollection = new HashSet<String>();
long timeoutList = System.currentTimeMillis() + TIMEOUT;
long timeoutReferences = System.currentTimeMillis() + 3000;
long timeoutReferences = System.currentTimeMillis() + 6000;
ReferenceReportCache rrCache = sb.index.getReferenceReportCache();
while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
String u = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName());
@@ -302,10 +314,19 @@ public class HostBrowser {
if (delete) {
deleteIDs.add(ids);
} else {
if (error == null) storedDocs.add(u); else if (admin) errorDocs.put(u, error);
if (error == null) storedDocs.add(u); else {
if (reload404 && error == FailType.fail) {
ArrayList<String> collections = (ArrayList<String>) doc.getFieldValue(CollectionSchema.collection_sxt.getSolrFieldName());
if (collections != null) reloadURLCollection.addAll(collections);
reloadURLs.add(u);
}
if (admin) errorDocs.put(u, error);
}
}
} else if (complete) {
if (error == null) storedDocs.add(u); else if (admin) errorDocs.put(u, error);
if (error == null) storedDocs.add(u); else {
if (admin) errorDocs.put(u, error);
}
}
if ((complete || u.startsWith(path)) && !storedDocs.contains(u)) inboundLinks.add(u); // add the current link
if (error == null) {
@@ -337,6 +358,11 @@ public class HostBrowser {
if (System.currentTimeMillis() > timeoutList) break;
}
if (deleteIDs.size() > 0) sb.remove(deleteIDs);
if (reloadURLs.size() > 0) {
final Map<String, Pattern> cm = new LinkedHashMap<String, Pattern>();
for (String collection: reloadURLCollection) cm.put(collection, QueryParams.catchall_pattern);
sb.reload(reloadURLs, cm.size() > 0 ? cm : CrawlProfile.collectionParser("user"), false);
}
// collect from crawler
List<Request> domainStackReferences = (admin) ? sb.crawlQueues.noticeURL.getDomainStackReferences(StackType.LOCAL, host, 1000, 3000) : new ArrayList<Request>(0);
@@ -373,17 +399,17 @@ public class HostBrowser {
String dir = path + remainingPath;
Object c = list.get(dir);
if (c == null) {
int[] linkedStoredIncrawlerError = new int[]{0,0,0,0};
int[] linkedStoredIncrawlerError = new int[]{0,0,0,0,0};
if (type == StoreType.LINK) linkedStoredIncrawlerError[0]++;
if (type == StoreType.INDEX) linkedStoredIncrawlerError[1]++;
if (loadingLinks.contains(entry.getKey())) linkedStoredIncrawlerError[2]++;
if (errorDocs.containsKey(entry.getKey())) linkedStoredIncrawlerError[3]++;
if (errorDocs.containsKey(entry.getKey())) linkedStoredIncrawlerError[errorDocs.get(entry.getKey()) == FailType.excl ? 3 : 4]++;
list.put(dir, linkedStoredIncrawlerError);
} else if (c instanceof int[]) {
if (type == StoreType.LINK) ((int[]) c)[0]++;
if (type == StoreType.INDEX) ((int[]) c)[1]++;
if (loadingLinks.contains(entry.getKey())) ((int[]) c)[2]++;
if (errorDocs.containsKey(entry.getKey())) ((int[]) c)[3]++;
if (errorDocs.containsKey(entry.getKey())) ((int[]) c)[errorDocs.get(entry.getKey()) == FailType.excl ? 3 : 4]++;
}
}
}
@@ -403,13 +429,14 @@ public class HostBrowser {
int linked = ((int[]) entry.getValue())[0];
int stored = ((int[]) entry.getValue())[1];
int crawler = ((int[]) entry.getValue())[2];
int error = ((int[]) entry.getValue())[3];
int excl = ((int[]) entry.getValue())[3];
int error = ((int[]) entry.getValue())[4];
prop.put("files_list_" + c + "_type_stored", stored);
prop.put("files_list_" + c + "_type_linked", linked);
prop.put("files_list_" + c + "_type_pendingVisible", crawler > 0 ? 1 : 0);
prop.put("files_list_" + c + "_type_pending", crawler);
prop.put("files_list_" + c + "_type_excludedVisible", 0);
prop.put("files_list_" + c + "_type_excluded", 0);
prop.put("files_list_" + c + "_type_excludedVisible", excl > 0 ? 1 : 0);
prop.put("files_list_" + c + "_type_excluded", excl);
prop.put("files_list_" + c + "_type_failedVisible", error > 0 ? 1 : 0);
prop.put("files_list_" + c + "_type_failed", error);
if (++c >= maxcount) break;
@@ -443,7 +470,7 @@ public class HostBrowser {
} else {
String ids = ASCII.String(uri.hash());
InfoCacheEntry ice = infoCache.get(ids);
prop.put("files_list_" + c + "_type_stored_error", failType == FailType.excl ? "excluded from indexing" : "load fail; " + ice.toString());
prop.put("files_list_" + c + "_type_stored_error", failType == FailType.excl ? "excluded from indexing" : "load fail" + (ice == null ? "" : "; " + ice.toString()));
}
}
if (loadRight) {
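
As a reading aid for the hunks above: the per-directory counter array grows from four to five slots in this commit, so that crawler excludes and load failures are counted separately. A short sketch of the layout (the slot names are descriptive only; the code above keeps a bare int[], and "key" stands for the current path entry):

// Sketch only (not part of the commit): layout of the int[5] per-directory counter.
//   [0] incremented for StoreType.LINK entries
//   [1] incremented for StoreType.INDEX entries
//   [2] incremented for entries still pending in the crawler (loadingLinks)
//   [3] incremented for crawler excludes  (errorDocs value == FailType.excl)
//   [4] incremented for other load errors (errorDocs value == FailType.fail)
// Slots [3] and [4] end up in the "excluded" and "failed" template properties.
int[] linkedStoredIncrawlerError = new int[]{0, 0, 0, 0, 0};
FailType error = errorDocs.get(key);
if (error != null) linkedStoredIncrawlerError[error == FailType.excl ? 3 : 4]++;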

@@ -304,7 +304,7 @@ public class Load_RSS_p {
ConcurrentLog.logException(e);
}
}
sb.addToIndex(list, null, null, collections);
sb.addToIndex(list, null, null, collections, true);
}
if (rss != null && post.containsKey("indexAllItemContent")) {

@@ -1030,9 +1030,6 @@ div#info:hover span {
z-index: 100;
}
.info {
float:left;
}
.info span {
display: none;

@@ -114,7 +114,7 @@ public class RSSLoader extends Thread {
indexTriggered.insertIfAbsent(ASCII.getBytes(e.getKey()), new Date());
loadCount++;
}
sb.addToIndex(list, null, null, collections);
sb.addToIndex(list, null, null, collections, true);
// update info for loading
try {
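
Both RSS call sites (Load_RSS_p above and this one) pass true for the new doublecheck parameter and therefore keep the previous behaviour. A brief sketch of what the flag changes, based on the addToIndex hunk in Switchboard.java below:

// Sketch only: effect of the new doublecheck flag on addToIndex.
// true  - URLs already present in the index are detected via index.exists() and
//         skipped (logged as "addToIndex: double <url>"); this keeps the RSS
//         loaders' behaviour unchanged.
// false - the existence check is bypassed and the URLs are queued unconditionally;
//         the new reload path uses this after it has deleted the old entries itself.
sb.addToIndex(list, null, null, collections, true);    // RSS loaders: keep duplicate protection
sb.addToIndex(urls, null, null, collections, false);   // reload path: re-index unconditionally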

@@ -2902,7 +2902,8 @@ public final class Switchboard extends serverSwitch {
final Map<DigestURL, String> links,
final SearchEvent searchEvent,
final String heuristicName,
final Map<String, Pattern> collections) {
final Map<String, Pattern> collections,
final boolean doublecheck) {
List<DigestURL> urls = new ArrayList<DigestURL>();
// add the landing page to the index. should not load that again since it should be in the cache
@@ -2922,19 +2923,39 @@ public final class Switchboard extends serverSwitch {
for (final Map.Entry<DigestURL, String> entry : links.entrySet()) {
urls.add(new DigestURL(entry.getKey(), (byte[]) null));
}
addToIndex(urls, searchEvent, heuristicName, collections);
addToIndex(urls, searchEvent, heuristicName, collections, doublecheck);
}
public void reload(final Collection<String> reloadURLStrings, final Map<String, Pattern> collections, final boolean doublecheck) {
final Collection<DigestURL> reloadURLs = new ArrayList<DigestURL>(reloadURLStrings.size());
Collection<String> deleteIDs = new ArrayList<String>(reloadURLStrings.size());
for (String u: reloadURLStrings) {
DigestURL url;
try {
url = new DigestURL(u);
reloadURLs.add(url);
deleteIDs.add(ASCII.String(url.hash()));
} catch (MalformedURLException e) {
}
}
remove(deleteIDs);
if (doublecheck) this.index.fulltext().commit(false); // if not called here the double-check in addToIndex will reject the indexing
addToIndex(reloadURLs, null, null, collections, doublecheck);
}
public void remove(final Collection<String> deleteIDs) {
this.index.fulltext().remove(deleteIDs);
for (String id: deleteIDs) {
this.crawlQueues.removeURL(ASCII.getBytes(id));
byte[] idh = ASCII.getBytes(id);
this.crawlQueues.removeURL(idh);
try {Cache.delete(idh);} catch (IOException e) {}
}
}
public void remove(final byte[] urlhash) {
this.index.fulltext().remove(urlhash);
this.crawlQueues.removeURL(urlhash);
try {Cache.delete(urlhash);} catch (IOException e) {}
}
public void stackURLs(Set<DigestURL> rootURLs, final CrawlProfile profile, final Set<DigestURL> successurls, final Map<DigestURL,String> failurls) {
@@ -3083,17 +3104,17 @@ public final class Switchboard extends serverSwitch {
* @throws IOException
* @throws Parser.Failure
*/
public void addToIndex(final Collection<DigestURL> urls, final SearchEvent searchEvent, final String heuristicName, final Map<String, Pattern> collections) {
public void addToIndex(final Collection<DigestURL> urls, final SearchEvent searchEvent, final String heuristicName, final Map<String, Pattern> collections, boolean doublecheck) {
Map<String, DigestURL> urlmap = new HashMap<String, DigestURL>();
for (DigestURL url: urls) urlmap.put(ASCII.String(url.hash()), url);
if (searchEvent != null) {
for (String id: urlmap.keySet()) searchEvent.addHeuristic(ASCII.getBytes(id), heuristicName, true);
}
final Set<String> existing = this.index.exists(urlmap.keySet());
final Set<String> existing = doublecheck ? this.index.exists(urlmap.keySet()) : null;
final List<Request> requests = new ArrayList<Request>();
for (Map.Entry<String, DigestURL> e: urlmap.entrySet()) {
final String urlName = e.getValue().toNormalform(true);
if (existing.contains(e.getKey())) {
if (doublecheck && existing.contains(e.getKey())) {
this.log.info("addToIndex: double " + urlName);
continue;
}
@@ -3493,7 +3514,7 @@ public final class Switchboard extends serverSwitch {
}
// add all pages to the index
addAllToIndex(url, links, searchEvent, "site", CrawlProfile.collectionParser("site"));
addAllToIndex(url, links, searchEvent, "site", CrawlProfile.collectionParser("site"), true);
}
} catch (final Throwable e ) {
ConcurrentLog.logException(e);
@@ -3607,7 +3628,7 @@ public final class Switchboard extends serverSwitch {
+ feedName
+ "' rss feed");
// add all pages to the index
addAllToIndex(null, links, searchEvent, feedName, CrawlProfile.collectionParser("rss"));
addAllToIndex(null, links, searchEvent, feedName, CrawlProfile.collectionParser("rss"), true);
}
} catch (final Throwable e ) {
//Log.logException(e);
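
Finally, a minimal usage sketch of the new reload entry point, assuming only the signatures visible in this diff, the usual java.util imports, and a running Switchboard instance sb; the URLs are illustrative, and the "user" collection mirrors the fallback used by HostBrowser:

// Sketch only (not part of the commit): re-load two previously failed URLs.
// reload() removes the old entries (fulltext index, crawl queue, Cache) and
// re-queues the URLs via addToIndex(); with doublecheck == true it additionally
// commits the Solr core first so the duplicate check does not see the stale entries.
Collection<String> failedURLs = new ArrayList<String>();
failedURLs.add("http://example.org/missing-page.html");      // illustrative URL
failedURLs.add("http://example.org/moved-document.pdf");     // illustrative URL
sb.reload(failedURLs, CrawlProfile.collectionParser("user"), false);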
