From 53518a91abdada6eaa5df3ce1efcbf832e0727ba Mon Sep 17 00:00:00 2001 From: Ian Smirlis Date: Thu, 19 Aug 2021 20:49:59 +0300 Subject: [PATCH] In case of reload404, load only failed documents --- htroot/IndexBrowser_p.java | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/htroot/IndexBrowser_p.java b/htroot/IndexBrowser_p.java index 1727e9f9d..6465d10cb 100644 --- a/htroot/IndexBrowser_p.java +++ b/htroot/IndexBrowser_p.java @@ -370,6 +370,10 @@ public class IndexBrowser_p { q.append(" AND ").append(CollectionSchema.url_paths_sxt.getSolrFieldName()).append(AbstractSolrConnector.CATCHALL_DTERM); } } + + // if reload errors is requested, limit results to pages with some failreason set + if(reload404) q.append(" AND ").append(CollectionSchema.failtype_s.getSolrFieldName()).append(AbstractSolrConnector.CATCHALL_DTERM); + final int pageSize = 100; final BlockingQueue docs = new ArrayBlockingQueue<>(pageSize); final List queries = new ArrayList<>(); @@ -387,9 +391,7 @@ public class IndexBrowser_p { CollectionSchema.references_i.getSolrFieldName(), CollectionSchema.references_internal_i.getSolrFieldName(), CollectionSchema.references_external_i.getSolrFieldName(), - CollectionSchema.references_exthosts_i.getSolrFieldName(), - CollectionSchema.cr_host_chance_d.getSolrFieldName(), - CollectionSchema.cr_host_norm_i.getSolrFieldName() + CollectionSchema.references_exthosts_i.getSolrFieldName() )); solrQueryTask.start(); Set storedDocs = new HashSet(); @@ -403,7 +405,7 @@ public class IndexBrowser_p { final Set reloadURLCollection = new HashSet(); long timeoutList = System.currentTimeMillis() + TIMEOUT; long remainingTime = TIMEOUT; - long timeoutReferences = System.currentTimeMillis() + 6000; + long timeoutReferences = System.currentTimeMillis() + TIMEOUT; ReferenceReportCache rrCache = sb.index.getReferenceReportCache(); try { SolrDocument doc = docs.poll(remainingTime, TimeUnit.MILLISECONDS); @@ -474,6 +476,7 @@ public class IndexBrowser_p { final Map cm = new LinkedHashMap(); for (String collection: reloadURLCollection) cm.put(collection, QueryParams.catchall_pattern); sb.reload(reloadURLs, cm.size() > 0 ? cm : CrawlProfile.collectionParser("user"), false); + } // collect from crawler @@ -486,7 +489,7 @@ public class IndexBrowser_p { for (String u: storedDocs) files.put(u, StoreType.INDEX); for (Map.Entry e: errorDocs.entrySet()) files.put(e.getKey(), e.getValue() == FailType.fail ? StoreType.FAILED : StoreType.EXCLUDED); for (String u: inboundLinks) if (!files.containsKey(u)) files.put(u, StoreType.LINK); - for (String u: loadingLinks) if (u.startsWith(path) && !files.containsKey(u)) files.put(u, StoreType.LINK); + if(!reload404) for (String u: loadingLinks) if (u.startsWith(path) && !files.containsKey(u)) files.put(u, StoreType.LINK); ConcurrentLog.info("IndexBrowser_p", "collected " + files.size() + " urls for path " + path); // distinguish files and folders