From 790f103f328bdb0307c98eab64587c16846519bb Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Tue, 18 Feb 2014 01:38:56 +0100 Subject: [PATCH] delete fail-docs during postprocessing to prevent that they will appear again and stay in postprocessing forever. --- .../yacy/search/schema/CollectionConfiguration.java | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index eb1080e4a..2ae516356 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -1063,12 +1063,14 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri ConcurrentLog.info("CollectionConfiguration", "collecting " + count + " documents from the collection for harvestkey " + harvestkey); BlockingQueue docs = collectionConnector.concurrentDocumentsByQuery(query, 0, 10000000, 1800000, 100); int countcheck = 0; + Collection failids = new ArrayList(); while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) { // for each to-be-processed entry work on the process tag Collection proctags = doc.getFieldValues(CollectionSchema.process_sxt.getSolrFieldName()); - + final String u = (String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()); + final String i = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName()); try { - DigestURL url = new DigestURL((String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()), ASCII.getBytes((String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName()))); + DigestURL url = new DigestURL(u, ASCII.getBytes(i)); byte[] id = url.hash(); SolrInputDocument sid = this.toSolrInputDocument(doc); @@ -1121,9 +1123,14 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri if (proccount % 100 == 0) ConcurrentLog.info("CollectionConfiguration", "postprocessed " + proccount + " from " + count + " documents; " + (proccount * 1000 / (System.currentTimeMillis() - start)) + " docs/second; " + ((System.currentTimeMillis() - start) * (count - proccount) / proccount / 60000) + " minutes remaining"); } catch (final Throwable e1) { ConcurrentLog.logException(e1); + failids.add(i); } countcheck++; } + if (failids.size() > 0) { + ConcurrentLog.info("CollectionConfiguration", "cleanup_processing: deleting " + failids.size() + " documents which have permanent execution fails"); + collectionConnector.deleteByIds(failids); + } if (count != countcheck) ConcurrentLog.warn("CollectionConfiguration", "ambiguous collection document count for harvestkey " + harvestkey + ": expected=" + count + ", counted=" + countcheck); // big gap for harvestkey = null ConcurrentLog.info("CollectionConfiguration", "cleanup_processing: re-calculated " + proccount+ " new documents, " + proccount_clickdepthchange + " clickdepth changes, " +