From 2e5214eb213cd68ebf4e15f3c5aa8178103599ae Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Fri, 17 Oct 2014 14:17:49 +0200 Subject: [PATCH] added field postprocessing.partialUpdate to settings which can be used to switch on or off partial updates. Both options should cause the same result. Default is on. --- defaults/yacy.init | 1 + source/net/yacy/search/Switchboard.java | 6 ++-- .../schema/CollectionConfiguration.java | 28 ++++++++++++++----- 3 files changed, 25 insertions(+), 10 deletions(-) diff --git a/defaults/yacy.init b/defaults/yacy.init index b8e168308..58614cd31 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -1243,6 +1243,7 @@ greedylearning.active = true # postprocessing steering postprocessing.maximum_load = 2.5 postprocessing.minimum_ram = 536870912 +postprocessing.partialUpdate = true # Custom user agents for 'allip' networks: # This user agent is only available if the network is set to 'allip' (which is a non-limited domain 'network' diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 06319f186..9cdc77997 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -989,7 +989,7 @@ public final class Switchboard extends serverSwitch { SwitchboardConstants.CLEANUP_METHOD_START, SwitchboardConstants.CLEANUP_METHOD_JOBCOUNT, SwitchboardConstants.CLEANUP_METHOD_FREEMEM, - 60000, + 30000, Long.MAX_VALUE, 10000, Long.MAX_VALUE), @@ -2312,7 +2312,7 @@ public final class Switchboard extends serverSwitch { if (postprocessing) { // run postprocessing on all profiles ReferenceReportCache rrCache = index.getReferenceReportCache(); - proccount += collection1Configuration.postprocessing(index, rrCache, null); + proccount += collection1Configuration.postprocessing(index, rrCache, null, getConfigBool("postprocessing.partialUpdate", true)); this.index.fulltext().commit(true); // without a commit the success is not visible in the monitoring } this.crawler.cleanProfiles(this.crawler.getActiveProfiles()); @@ -2325,7 +2325,7 @@ public final class Switchboard extends serverSwitch { if (postprocessing) { // run postprocessing on these profiles ReferenceReportCache rrCache = index.getReferenceReportCache(); - for (String profileHash: deletionCandidates) proccount += collection1Configuration.postprocessing(index, rrCache, profileHash); + for (String profileHash: deletionCandidates) proccount += collection1Configuration.postprocessing(index, rrCache, profileHash, getConfigBool("postprocessing.partialUpdate", true)); this.index.fulltext().commit(true); // without a commit the success is not visible in the monitoring } this.crawler.cleanProfiles(deletionCandidates); diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index fced5c2f5..2c2521310 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -994,7 +994,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri * @param urlCitation * @return */ - public int postprocessing(final Segment segment, final ReferenceReportCache rrCache, final String harvestkey) { + public int postprocessing(final Segment segment, final ReferenceReportCache rrCache, final String harvestkey, final boolean byPartialUpdate) { if (!this.contains(CollectionSchema.process_sxt)) return 0; if (!segment.connectedCitation() && !segment.fulltext().useWebgraph()) return 0; final SolrConnector collectionConnector = segment.fulltext().getDefaultConnector(); @@ -1262,6 +1262,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri CollectionSchema.url_protocol_s.getSolrFieldName() + " asc" // sort on protocol to get http before https; that gives an opportunity to set http_unique_b flag to false : null, // null sort is faster! 0, 100000000, Long.MAX_VALUE, concurrency + 1, concurrency, true, + byPartialUpdate ? + new String[]{ CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName(), CollectionSchema.harvestkey_s.getSolrFieldName(), @@ -1279,7 +1281,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri CollectionSchema.url_protocol_s.getSolrFieldName(), CollectionSchema.httpstatus_i.getSolrFieldName(), CollectionSchema.inboundlinkscount_i.getSolrFieldName(), - CollectionSchema.robots_i.getSolrFieldName()); + CollectionSchema.robots_i.getSolrFieldName()} : + new String[0]); final AtomicInteger proccount = new AtomicInteger(); final AtomicInteger proccount_referencechange = new AtomicInteger(); final AtomicInteger proccount_citationchange = new AtomicInteger(); @@ -1305,7 +1308,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri try { DigestURL url = new DigestURL(u, ASCII.getBytes(i)); byte[] id = url.hash(); - SolrInputDocument sid = new SolrInputDocument(); //collection.toSolrInputDocument(doc, omitFields); + SolrInputDocument sid = byPartialUpdate ? new SolrInputDocument() : collection.toSolrInputDocument(doc, omitFields); sid.setField(CollectionSchema.id.getSolrFieldName(), i); for (Object tag: proctags) try { @@ -1346,13 +1349,24 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri } // all processing steps checked, remove the processing and harvesting key - sid.setField(CollectionSchema.process_sxt.getSolrFieldName(), null); // setting this to null will cause a removal when doing a partial update - sid.setField(CollectionSchema.harvestkey_s.getSolrFieldName(), null); + if (byPartialUpdate) { + sid.setField(CollectionSchema.process_sxt.getSolrFieldName(), null); // setting this to null will cause a removal when doing a partial update + sid.setField(CollectionSchema.harvestkey_s.getSolrFieldName(), null); + } else { + sid.removeField(CollectionSchema.process_sxt.getSolrFieldName()); + sid.removeField(CollectionSchema.harvestkey_s.getSolrFieldName()); + } + // with standard solr fields selected, the sid now contains the fields + // id, http_unique_b, www_unique_b, references_i, references_internal_i, references_external_i, references_exthosts_i, host_extent_i + // and the value for host_extent_i is by default 2147483647 // send back to index //collectionConnector.deleteById(i); - collectionConnector.update(sid); - + if (byPartialUpdate) { + collectionConnector.update(sid); + } else { + collectionConnector.add(sid); + } long thiscount = proccount.incrementAndGet(); allcount.incrementAndGet(); if (thiscount % 100 == 0) { postprocessingActivity = "postprocessed " + thiscount + " from " + count + " collection documents; " +