Merge pull request #420 from iansmirlis/IndexBrowser_p

In case of reload404, load only failed documents
pull/461/head
Burkhard 3 years ago committed by GitHub
commit b943e0fb03
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -370,6 +370,10 @@ public class IndexBrowser_p {
q.append(" AND ").append(CollectionSchema.url_paths_sxt.getSolrFieldName()).append(AbstractSolrConnector.CATCHALL_DTERM);
}
}
// if reloading of failed documents is requested, limit results to documents that have a fail type set
if(reload404) q.append(" AND ").append(CollectionSchema.failtype_s.getSolrFieldName()).append(AbstractSolrConnector.CATCHALL_DTERM);
final int pageSize = 100;
final BlockingQueue<SolrDocument> docs = new ArrayBlockingQueue<>(pageSize);
final List<String> queries = new ArrayList<>();
@ -387,9 +391,7 @@ public class IndexBrowser_p {
CollectionSchema.references_i.getSolrFieldName(),
CollectionSchema.references_internal_i.getSolrFieldName(),
CollectionSchema.references_external_i.getSolrFieldName(),
CollectionSchema.references_exthosts_i.getSolrFieldName(),
CollectionSchema.cr_host_chance_d.getSolrFieldName(),
CollectionSchema.cr_host_norm_i.getSolrFieldName()
CollectionSchema.references_exthosts_i.getSolrFieldName()
));
solrQueryTask.start();
Set<String> storedDocs = new HashSet<String>();
@ -403,7 +405,7 @@ public class IndexBrowser_p {
final Set<String> reloadURLCollection = new HashSet<String>();
long timeoutList = System.currentTimeMillis() + TIMEOUT;
long remainingTime = TIMEOUT;
long timeoutReferences = System.currentTimeMillis() + 6000;
long timeoutReferences = System.currentTimeMillis() + TIMEOUT;
ReferenceReportCache rrCache = sb.index.getReferenceReportCache();
try {
SolrDocument doc = docs.poll(remainingTime, TimeUnit.MILLISECONDS);
@ -474,6 +476,7 @@ public class IndexBrowser_p {
final Map<String, Pattern> cm = new LinkedHashMap<String, Pattern>();
for (String collection: reloadURLCollection) cm.put(collection, QueryParams.catchall_pattern);
sb.reload(reloadURLs, cm.size() > 0 ? cm : CrawlProfile.collectionParser("user"), false);
}
// collect from crawler
@ -486,7 +489,7 @@ public class IndexBrowser_p {
for (String u: storedDocs) files.put(u, StoreType.INDEX);
for (Map.Entry<String, FailType> e: errorDocs.entrySet()) files.put(e.getKey(), e.getValue() == FailType.fail ? StoreType.FAILED : StoreType.EXCLUDED);
for (String u: inboundLinks) if (!files.containsKey(u)) files.put(u, StoreType.LINK);
for (String u: loadingLinks) if (u.startsWith(path) && !files.containsKey(u)) files.put(u, StoreType.LINK);
if(!reload404) for (String u: loadingLinks) if (u.startsWith(path) && !files.containsKey(u)) files.put(u, StoreType.LINK);
ConcurrentLog.info("IndexBrowser_p", "collected " + files.size() + " urls for path " + path);
// distinguish files and folders

Loading…
Cancel
Save