From 0550b54d5603f1789c582d8b9b28e5927037f3d8 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Fri, 14 Nov 2014 16:34:55 +0100 Subject: [PATCH] added fix to postprocessing: avoid caching of postprocessing collection to always get fresh lists of documents. This is necessary since the postprocessing changes the same documents which the postprocessing-collection query selects. --- htroot/Vocabulary_p.html | 11 +++++---- htroot/api/status_p.java | 2 +- .../schema/CollectionConfiguration.java | 24 +++++++++---------- 3 files changed, 19 insertions(+), 18 deletions(-) diff --git a/htroot/Vocabulary_p.html b/htroot/Vocabulary_p.html index 703e164f3..44ca4dbde 100644 --- a/htroot/Vocabulary_p.html +++ b/htroot/Vocabulary_p.html @@ -112,11 +112,12 @@ To see a list of all APIs, please visit the
Discover Terms:
- no auto-discovery (empty vocabulary)   - from file name   - from page title   - from page title (splitted)   - from page author
+ no auto-discovery (empty vocabulary)   + from file name   + from page title   + from page title (splitted)   + from page author + from a csv file
diff --git a/htroot/api/status_p.java b/htroot/api/status_p.java index 8b758ea6b..a46021f3b 100644 --- a/htroot/api/status_p.java +++ b/htroot/api/status_p.java @@ -142,7 +142,7 @@ public class status_p { //postprocessingCollection1Count = 0; //postprocessingsWebgraphCount = 0; long collectionRemainingCount = 0, webgraphRemainingCount = 0; - if (processCollection) try {collectionRemainingCount = sb.index.fulltext().getDefaultConnector().getCountByQuery(CollectionConfiguration.collection1query(sb.index, null));} catch (IOException e) {} + if (processCollection) try {collectionRemainingCount = sb.index.fulltext().getDefaultConnector().getCountByQuery("{!cache=false}" + CollectionConfiguration.collection1query(sb.index, null));} catch (IOException e) {} if (processWebgraph) try {webgraphRemainingCount = sb.index.fulltext().getWebgraphConnector().getCountByQuery(CollectionConfiguration.webgraphquery(sb.index, null));} catch (IOException e) {} long countSinceStart = CollectionConfiguration.postprocessingRunning ? CollectionConfiguration.postprocessingCollection1Count + CollectionConfiguration.postprocessingWebgraphCount - collectionRemainingCount - webgraphRemainingCount : 0; int speed = timeSinceStart == 0 ? 0 : (int) (60000 * countSinceStart / timeSinceStart); // pages per minute diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index 2f6d9c5e6..f153e8070 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -993,13 +993,13 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri public static final String collection1query(final Segment segment, final String harvestkey) { return (harvestkey == null || !segment.fulltext().getDefaultConfiguration().contains(CollectionSchema.harvestkey_s) ? - "" : CollectionSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ") + - CollectionSchema.process_sxt.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM; + "" : CollectionSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ") + + CollectionSchema.process_sxt.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM; } public static final String webgraphquery(final Segment segment, final String harvestkey) { return (harvestkey == null || !segment.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.harvestkey_s) ? - "" : WebgraphSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ") + - WebgraphSchema.process_sxt.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM; + "" : WebgraphSchema.harvestkey_s.getSolrFieldName() + ":\"" + harvestkey + "\" AND ") + + WebgraphSchema.process_sxt.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM; } /** @@ -1025,8 +1025,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri postprocessingActivity = "collecting counts"; ConcurrentLog.info("CollectionConfiguration", postprocessingActivity); try { - postprocessingCollection1Count = (int) collectionConnector.getCountByQuery(collection1query); - postprocessingWebgraphCount = segment.fulltext().useWebgraph() ? (int) segment.fulltext().getWebgraphConnector().getCountByQuery(webgraphquery) : 0; + postprocessingCollection1Count = (int) collectionConnector.getCountByQuery("{!cache=false}" + collection1query); + postprocessingWebgraphCount = segment.fulltext().useWebgraph() ? (int) segment.fulltext().getWebgraphConnector().getCountByQuery("{!cache=false}" + webgraphquery) : 0; } catch (IOException e) { postprocessingCollection1Count = -1; postprocessingWebgraphCount = -1; @@ -1037,7 +1037,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri ConcurrentLog.info("CollectionConfiguration", postprocessingActivity); ReversibleScoreMap collection1hosts; try { - Map> hostfacet = collectionConnector.getFacets(collection1query, 10000000, CollectionSchema.host_s.getSolrFieldName()); + Map> hostfacet = collectionConnector.getFacets("{!cache=false}" + collection1query, 10000000, CollectionSchema.host_s.getSolrFieldName()); collection1hosts = hostfacet.get(CollectionSchema.host_s.getSolrFieldName()); } catch (final IOException e2) { ConcurrentLog.logException(e2); @@ -1265,20 +1265,20 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri final AtomicInteger proccount_citationchange = new AtomicInteger(); try { // partitioning of the index, get a facet for a partitioning key - final long count = collectionConnector.getCountByQuery(collection1query); + final long count = collectionConnector.getCountByQuery("{!cache=false}" + collection1query); String partitioningKey = CollectionSchema.responsetime_i.getSolrFieldName(); postprocessingActivity = "collecting " + count + " documents from the collection for harvestkey " + harvestkey + ", partitioned by " + partitioningKey; if (count > 0) { - Map> partitioningFacet = collectionConnector.getFacets(collection1query, 100000, partitioningKey); + Map> partitioningFacet = collectionConnector.getFacets("{!cache=false}" + collection1query, 100000, partitioningKey); ReversibleScoreMap partitioning = partitioningFacet.get(partitioningKey); - long emptyCount = collectionConnector.getCountByQuery("-" + partitioningKey + ":[* TO *] AND (" + collection1query + ")"); + long emptyCount = collectionConnector.getCountByQuery("{!cache=false}" + "-" + partitioningKey + ":[* TO *] AND (" + collection1query + ")"); if (emptyCount > 0) partitioning.inc("", (int) emptyCount); final long start = System.currentTimeMillis(); List querystrings = new ArrayList<>(partitioning.size()); for (String partitioningValue: partitioning) { - String partitioningQuery = (partitioningValue.length() == 0) ? + String partitioningQuery = "{!cache=false}" + ((partitioningValue.length() == 0) ? "-" + partitioningKey + ":[* TO *] AND (" + collection1query + ")" : - partitioningKey + ":" + partitioningValue + " AND (" + collection1query + ")"; + partitioningKey + ":" + partitioningValue + " AND (" + collection1query + ")"); querystrings.add(partitioningQuery); } // start collection of documents