From 1d069c5861876878034a51292952c8e3549d481b Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Thu, 27 Feb 2014 12:27:15 +0100 Subject: [PATCH] make sure that postprocessed documents are overwritten --- .../yacy/search/schema/CollectionConfiguration.java | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index 80997b3f2..4a5b8b8b4 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -1023,6 +1023,9 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri SolrDocument doc; int allcount = 0; if (segment.fulltext().useWebgraph()) { + Set omitFields = new HashSet(); + omitFields.add(WebgraphSchema.process_sxt.getSolrFieldName()); + omitFields.add(WebgraphSchema.harvestkey_s.getSolrFieldName()); try { int proccount = 0; long start = System.currentTimeMillis(); @@ -1035,7 +1038,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri BlockingQueue docs = segment.fulltext().getWebgraphConnector().concurrentDocumentsByQuery(query, 0, 10000000, 1800000, 100); int countcheck = 0; while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) { - SolrInputDocument sid = webgraph.toSolrInputDocument(doc, null); + SolrInputDocument sid = webgraph.toSolrInputDocument(doc, omitFields); if (webgraph.contains(WebgraphSchema.source_id_s) && webgraph.contains(WebgraphSchema.source_cr_host_norm_i)) { byte[] id = ASCII.getBytes((String) doc.getFieldValue(WebgraphSchema.source_id_s.getSolrFieldName())); CRV crv = ranking.get(id); @@ -1053,6 +1056,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri try { sid.removeField(WebgraphSchema.process_sxt.getSolrFieldName()); sid.removeField(WebgraphSchema.harvestkey_s.getSolrFieldName()); + segment.fulltext().getWebgraphConnector().deleteById((String) sid.getFieldValue(WebgraphSchema.id.getSolrFieldName())); segment.fulltext().getWebgraphConnector().add(sid); } catch (SolrException e) { ConcurrentLog.logException(e); @@ -1082,6 +1086,9 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri Map hostExtentCache = new HashMap(); // a mapping from the host id to the number of documents which contain this host-id Set uniqueURLs = new HashSet(); try { + Set omitFields = new HashSet(); + omitFields.add(CollectionSchema.process_sxt.getSolrFieldName()); + omitFields.add(CollectionSchema.harvestkey_s.getSolrFieldName()); int proccount = 0, proccount_clickdepthchange = 0, proccount_referencechange = 0, proccount_citationchange = 0, proccount_uniquechange = 0; long count = collectionConnector.getCountByQuery(query); long start = System.currentTimeMillis(); @@ -1097,7 +1104,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri try { DigestURL url = new DigestURL(u, ASCII.getBytes(i)); byte[] id = url.hash(); - SolrInputDocument sid = this.toSolrInputDocument(doc); + SolrInputDocument sid = collection.toSolrInputDocument(doc, omitFields); for (Object tag: proctags) { @@ -1141,7 +1148,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri sid.removeField(CollectionSchema.harvestkey_s.getSolrFieldName()); // send back to index - //connector.deleteById(ASCII.String(id)); + collectionConnector.deleteById(i); collectionConnector.add(sid); proccount++; allcount++;