From 4735bd47f45cf6581a8b61625f0c1a12f5692711 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Wed, 23 Jan 2013 14:40:58 +0100 Subject: [PATCH] - changed solr commit call and added an optimize option. Since Solr 4.0.0 there is a new softcommit feature which implements a near-real-time (NRT) search option. The softcommit does not do IO and does not cause performance issues. YaCy now has an extension in its solr connectors to use the softcommit feature. The softcommit call now replaces all places where a hard commit was used. Furthermore the commit strategy when doing a search from the web interface was changed (it's done every time before a search is done). The softcommit feature was implemented because it was needed for the following changes (customer demands), which are also included in this git commit: - added a feature to identify all documents which have unique titles and/or unique descriptions. These unique flags are disabled by default. - added also a feature to set a flag when the url from a canonical tag is equal to the document url. This is also disabled by default. To support the new softcommit strategy, the commitWithinMs option was set to -1 to disable automatic commit based on document insert times. If documents are inserted continuously then a commit would also happen continuously whenever the commitWithinMs time is reached. This would conflict with the regular autocommit of 10 minutes and the new softcommit strategy. 
--- defaults/yacy.init | 2 +- htroot/Crawler_p.java | 2 +- htroot/HostBrowser.java | 2 +- htroot/IndexFederated_p.java | 2 +- htroot/index.java | 4 -- htroot/yacyinteractive.java | 5 -- htroot/yacysearch.java | 1 - .../solr/connector/EmbeddedSolrConnector.java | 3 +- .../solr/connector/MirrorSolrConnector.java | 17 +++-- .../solr/connector/MultipleSolrConnector.java | 16 +++-- .../solr/connector/RetrySolrConnector.java | 12 +++- .../solr/connector/ShardSolrConnector.java | 12 +++- .../solr/connector/SolrConnector.java | 8 ++- .../solr/connector/SolrServerConnector.java | 25 +++++--- source/net/yacy/migration.java | 2 +- source/net/yacy/search/Switchboard.java | 8 +-- .../net/yacy/search/index/DocumentIndex.java | 2 +- source/net/yacy/search/index/Fulltext.java | 19 ++---- source/net/yacy/search/index/Segment.java | 62 +++++++++++++++++-- .../yacy/search/index/SolrConfiguration.java | 4 ++ source/net/yacy/search/query/SearchEvent.java | 3 + 21 files changed, 150 insertions(+), 61 deletions(-) diff --git a/defaults/yacy.init b/defaults/yacy.init index ac0466a7a..980f8147e 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -1056,7 +1056,7 @@ color_searchurlhover = #008000 # - to check whats in solr after indexing, open http://localhost:8983/solr/admin/ federated.service.solr.indexing.enabled = false federated.service.solr.indexing.url = http://127.0.0.1:8983/solr -federated.service.solr.indexing.commitWithinMs = 180000 +federated.service.solr.indexing.commitWithinMs = -1 federated.service.solr.indexing.sharding = MODULO_HOST_MD5 federated.service.solr.indexing.schemefile = solr.keys.default.list # the lazy attribute causes that fields containing "" or 0 are not added and not written diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index 44063e4d7..389860108 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -371,7 +371,7 @@ public class Crawler_p { try { sb.crawlQueues.errorURL.removeHost(ASCII.getBytes(hosthash)); 
sb.index.fulltext().getSolr().deleteByQuery(YaCySchema.host_id_s.getSolrFieldName() + ":\"" + hosthash + "\" AND " + YaCySchema.failreason_t.getSolrFieldName() + ":[* TO *]"); - sb.index.fulltext().commit(); + sb.index.fulltext().commit(true); } catch (IOException e) {Log.logException(e);} } diff --git a/htroot/HostBrowser.java b/htroot/HostBrowser.java index 550494fa8..d6b73f081 100644 --- a/htroot/HostBrowser.java +++ b/htroot/HostBrowser.java @@ -96,7 +96,7 @@ public class HostBrowser { } String path = post == null ? "" : post.get("path", "").trim(); - if (admin && path.length() == 0) sb.index.fulltext().commit(); + sb.index.fulltext().commit(true); if (post == null || env == null) { return prop; } diff --git a/htroot/IndexFederated_p.java b/htroot/IndexFederated_p.java index a0a585a8e..333846b4d 100644 --- a/htroot/IndexFederated_p.java +++ b/htroot/IndexFederated_p.java @@ -77,7 +77,7 @@ public class IndexFederated_p { final boolean previous_core_fulltext = sb.index.fulltext().connectedLocalSolr() && env.getConfigBool(SwitchboardConstants.CORE_SERVICE_FULLTEXT, false); env.setConfig(SwitchboardConstants.CORE_SERVICE_FULLTEXT, post_core_fulltext); - final int commitWithinMs = post.getInt("solr.indexing.commitWithinMs", env.getConfigInt(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_COMMITWITHINMS, 180000)); + final int commitWithinMs = post.getInt("solr.indexing.commitWithinMs", env.getConfigInt(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_COMMITWITHINMS, -1)); if (previous_core_fulltext && !post_core_fulltext) { // switch off sb.index.fulltext().disconnectLocalSolr(); diff --git a/htroot/index.java b/htroot/index.java index 845b8e78c..d4f3ec44a 100644 --- a/htroot/index.java +++ b/htroot/index.java @@ -61,10 +61,6 @@ public class index { return prop; } } - - if (authorizedAccess) { - sb.index.fulltext().commit(); // call this only as superuser to prevent that this can be misused for DoS - } boolean global = (post == null) ? 
true : post.get("resource", "global").equals("global"); final boolean focus = (post == null) ? true : post.get("focus", "1").equals("1"); diff --git a/htroot/yacyinteractive.java b/htroot/yacyinteractive.java index 27b794fd8..8a050dcd2 100644 --- a/htroot/yacyinteractive.java +++ b/htroot/yacyinteractive.java @@ -47,11 +47,6 @@ public class yacyinteractive { prop.put("promoteSearchPageGreeting.homepage", sb.getConfig(SwitchboardConstants.GREETING_HOMEPAGE, "")); prop.put("promoteSearchPageGreeting.smallImage", sb.getConfig(SwitchboardConstants.GREETING_SMALL_IMAGE, "")); - final boolean admin = sb.verifyAuthentication(header); - if (admin) { - sb.index.fulltext().commit(); - } - final String query = (post == null) ? "" : post.get("query", ""); final String startRecord = (post == null) ? "0" : post.get("startRecord", ""); final String maximumRecords = (post == null) ? "10" : post.get("maximumRecords", ""); diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index e5395e6bd..f7f42da98 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -117,7 +117,6 @@ public class yacysearch { final String originalquerystring = (post == null) ? "" : post.get("query", post.get("search", "")).trim(); String querystring = originalquerystring.replace('+', ' ').trim(); CacheStrategy snippetFetchStrategy = (post == null) ? null : CacheStrategy.parse(post.get("verify", sb.getConfig("search.verify", ""))); - if (authenticated && originalquerystring.length() == 0) sb.index.fulltext().commit(); final servletProperties prop = new servletProperties(); prop.put("topmenu", sb.getConfigBool("publicTopmenu", true) ? 
1 : 0); diff --git a/source/net/yacy/cora/federate/solr/connector/EmbeddedSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/EmbeddedSolrConnector.java index 7293a852f..6a204f4c5 100644 --- a/source/net/yacy/cora/federate/solr/connector/EmbeddedSolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/EmbeddedSolrConnector.java @@ -146,7 +146,7 @@ public class EmbeddedSolrConnector extends SolrServerConnector implements SolrCo @Override public synchronized void close() { - try {this.commit();} catch (Throwable e) {Log.logException(e);} + try {this.commit(false);} catch (Throwable e) {Log.logException(e);} try {super.close();} catch (Throwable e) {Log.logException(e);} try {this.defaultCore.close();} catch (Throwable e) {Log.logException(e);} try {this.cores.shutdown();} catch (Throwable e) {Log.logException(e);} @@ -213,7 +213,6 @@ public class EmbeddedSolrConnector extends SolrServerConnector implements SolrCo storage.mkdirs(); try { EmbeddedSolrConnector solr = new EmbeddedSolrConnector(storage, solr_config); - solr.setCommitWithinMs(100); SolrInputDocument doc = new SolrInputDocument(); doc.addField(YaCySchema.id.name(), "ABCD0000abcd"); doc.addField(YaCySchema.title.name(), "Lorem ipsum"); diff --git a/source/net/yacy/cora/federate/solr/connector/MirrorSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/MirrorSolrConnector.java index e38a8bb3c..c5b95db64 100644 --- a/source/net/yacy/cora/federate/solr/connector/MirrorSolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/MirrorSolrConnector.java @@ -171,9 +171,18 @@ public class MirrorSolrConnector extends AbstractSolrConnector implements SolrCo } @Override - public void commit() { - if (this.solr0 != null) this.solr0.commit(); - if (this.solr1 != null) this.solr1.commit(); + public void commit(boolean softCommit) { + if (this.solr0 != null) this.solr0.commit(softCommit); + if (this.solr1 != null) this.solr1.commit(softCommit); + } + + /** + * force an 
explicit merge of segments + * @param maxSegments the maximum number of segments. Set to 1 for maximum optimization + */ + public void optimize(int maxSegments) { + if (this.solr0 != null) this.solr0.optimize(maxSegments); + if (this.solr1 != null) this.solr1.optimize(maxSegments); } @Override @@ -320,7 +329,7 @@ public class MirrorSolrConnector extends AbstractSolrConnector implements SolrCo // check if there is a autocommit problem if (c.hitCache.containsKey(key)) { // the document should be there, therefore make a commit and check again - this.commit(); + this.commit(true); if ((solr0 != null && ((doc = solr0.getById(key, fields)) != null)) || (solr1 != null && ((doc = solr1.getById(key, fields)) != null))) { addToCache(doc, fields.length == 0); return doc; diff --git a/source/net/yacy/cora/federate/solr/connector/MultipleSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/MultipleSolrConnector.java index 2ee21794a..d34518901 100644 --- a/source/net/yacy/cora/federate/solr/connector/MultipleSolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/MultipleSolrConnector.java @@ -47,7 +47,7 @@ public class MultipleSolrConnector extends AbstractSolrConnector implements Solr this.solr = new RemoteSolrConnector(url); this.queue = new ArrayBlockingQueue(1000); this.worker = new AddWorker[connections]; - this.commitWithinMs = 180000; + this.commitWithinMs = -1; for (int i = 0; i < connections; i++) { this.worker[i] = new AddWorker(url); this.worker[i].start(); @@ -58,7 +58,7 @@ public class MultipleSolrConnector extends AbstractSolrConnector implements Solr private final SolrConnector solr; public AddWorker(final String url) throws IOException { this.solr = new RemoteSolrConnector(url); - this.solr.setCommitWithinMs(MultipleSolrConnector.this.commitWithinMs); + if (MultipleSolrConnector.this.commitWithinMs >= 0 ) this.solr.setCommitWithinMs(MultipleSolrConnector.this.commitWithinMs); } @Override public void run() { @@ -97,8 +97,16 @@ public 
class MultipleSolrConnector extends AbstractSolrConnector implements Solr } @Override - public void commit() { - this.solr.commit(); + public void commit(boolean softCommit) { + this.solr.commit(softCommit); + } + + /** + * force an explicit merge of segments + * @param maxSegments the maximum number of segments. Set to 1 for maximum optimization + */ + public void optimize(int maxSegments) { + this.solr.optimize(maxSegments); } @Override diff --git a/source/net/yacy/cora/federate/solr/connector/RetrySolrConnector.java b/source/net/yacy/cora/federate/solr/connector/RetrySolrConnector.java index 6016e1f8d..f9de62b97 100644 --- a/source/net/yacy/cora/federate/solr/connector/RetrySolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/RetrySolrConnector.java @@ -59,8 +59,16 @@ public class RetrySolrConnector extends AbstractSolrConnector implements SolrCon } @Override - public void commit() { - this.solrConnector.commit(); + public void commit(boolean softCommit) { + this.solrConnector.commit(softCommit); + } + + /** + * force an explicit merge of segments + * @param maxSegments the maximum number of segments. 
Set to 1 for maximum optimization + */ + public void optimize(int maxSegments) { + this.solrConnector.optimize(maxSegments); } @Override diff --git a/source/net/yacy/cora/federate/solr/connector/ShardSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/ShardSolrConnector.java index 1b7c5089f..386a1c34b 100644 --- a/source/net/yacy/cora/federate/solr/connector/ShardSolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/ShardSolrConnector.java @@ -73,8 +73,16 @@ public class ShardSolrConnector extends AbstractSolrConnector implements SolrCon } @Override - public void commit() { - for (final SolrConnector connector: this.connectors) connector.commit(); + public void commit(boolean softCommit) { + for (final SolrConnector connector: this.connectors) connector.commit(softCommit); + } + + /** + * force an explicit merge of segments + * @param maxSegments the maximum number of segments. Set to 1 for maximum optimization + */ + public void optimize(int maxSegments) { + for (final SolrConnector connector: this.connectors) connector.optimize(maxSegments); } @Override diff --git a/source/net/yacy/cora/federate/solr/connector/SolrConnector.java b/source/net/yacy/cora/federate/solr/connector/SolrConnector.java index 8a7ae6ba6..b05094939 100644 --- a/source/net/yacy/cora/federate/solr/connector/SolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/SolrConnector.java @@ -51,7 +51,13 @@ public interface SolrConnector extends Iterable /* Iterable of document /** * force a commit */ - public void commit(); + public void commit(boolean softCommit); + + /** + * force an explicit merge of segments + * @param maxSegments the maximum number of segments. 
Set to 1 for maximum optimization + */ + public void optimize(int maxSegments); /** * close the server connection diff --git a/source/net/yacy/cora/federate/solr/connector/SolrServerConnector.java b/source/net/yacy/cora/federate/solr/connector/SolrServerConnector.java index 74bcfb241..a0f65a5df 100644 --- a/source/net/yacy/cora/federate/solr/connector/SolrServerConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/SolrServerConnector.java @@ -91,6 +91,7 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen /** * set the solr autocommit delay + * when doing continuous inserts, don't set this value because it would cause continuous commits * @param c the maximum waiting time after a solr command until it is transported to the server */ @Override @@ -99,9 +100,21 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen } @Override - public synchronized void commit() { + public synchronized void commit(final boolean softCommit) { try { - this.server.commit(); + this.server.commit(true, true, softCommit); + } catch (SolrServerException e) { + } catch (IOException e) { + } + } + + /** + * force an explicit merge of segments + * @param maxSegments the maximum number of segments. 
Set to 1 for maximum optimization + */ + public void optimize(int maxSegments) { + try { + this.server.optimize(true, true, maxSegments); } catch (SolrServerException e) { } catch (IOException e) { } @@ -110,7 +123,7 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen @Override public synchronized void close() { try { - if (this.server != null) synchronized (this.server) {this.server.commit();} + if (this.server != null) synchronized (this.server) {this.server.commit(true, true, false);} this.server = null; } catch (SolrServerException e) { log.warn(e); @@ -194,7 +207,7 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen try { synchronized (this.server) { this.server.deleteByQuery("*:*"); - this.server.commit(); + this.server.commit(true, true, false); } } catch (final Throwable e) { throw new IOException(e); @@ -234,7 +247,7 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen synchronized (this.server) { long c0 = this.getQueryCount(querystring); this.server.deleteByQuery(querystring, this.commitWithinMs); - this.commit(); + this.commit(true); long c1 = this.getQueryCount(querystring); return (int) (c1 - c0); } @@ -254,7 +267,6 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen try { synchronized (this.server) { this.server.request(up); - //this.server.commit(); } } catch (final Throwable e) { throw new IOException(e); @@ -273,7 +285,6 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen // catches "version conflict for": try this again and delete the document in advance try { this.server.deleteById((String) solrdoc.getFieldValue(YaCySchema.id.getSolrFieldName())); - //this.server.commit(); } catch (SolrServerException e1) {} try { synchronized (this.server) { diff --git a/source/net/yacy/migration.java b/source/net/yacy/migration.java index f7c446356..e7333246e 100644 --- a/source/net/yacy/migration.java 
+++ b/source/net/yacy/migration.java @@ -319,7 +319,7 @@ public class migration { } Log.logInfo("migrateUrldbtoSolr", Integer.toString(i) + " entries left (convert next chunk of 1000 entries)"); } - ft.commit(); + ft.commit(true); } catch (IOException ex) { Log.logInfo("migrateUrldbtoSolr", "error reading old urldb index"); diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 56940be57..1f78ba0d2 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -431,7 +431,7 @@ public final class Switchboard extends serverSwitch { ReferenceContainer.maxReferences = getConfigInt("index.maxReferences", 0); final File segmentsPath = new File(new File(indexPath, networkName), "SEGMENTS"); this.index = new Segment(this.log, new File(segmentsPath, "default"), solrScheme); - final int connectWithinMs = this.getConfigInt(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_COMMITWITHINMS, 180000); + final int connectWithinMs = this.getConfigInt(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_COMMITWITHINMS, -1); if (this.getConfigBool(SwitchboardConstants.CORE_SERVICE_RWI, true)) this.index.connectRWI(wordCacheMaxCount, fileSizeMax); if (this.getConfigBool(SwitchboardConstants.CORE_SERVICE_CITATION, true)) this.index.connectCitation(wordCacheMaxCount, fileSizeMax); if (this.getConfigBool(SwitchboardConstants.CORE_SERVICE_FULLTEXT, true)) { @@ -1288,7 +1288,7 @@ public final class Switchboard extends serverSwitch { this.useTailCache, this.exceed134217727); this.index = new Segment(this.log, new File(new File(new File(indexPrimaryPath, networkName), "SEGMENTS"), "default"), solrScheme); - final int connectWithinMs = this.getConfigInt(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_COMMITWITHINMS, 180000); + final int connectWithinMs = this.getConfigInt(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_COMMITWITHINMS, -1); if 
(this.getConfigBool(SwitchboardConstants.CORE_SERVICE_RWI, true)) this.index.connectRWI(wordCacheMaxCount, fileSizeMax); if (this.getConfigBool(SwitchboardConstants.CORE_SERVICE_CITATION, true)) this.index.connectCitation(wordCacheMaxCount, fileSizeMax); if (this.getConfigBool(SwitchboardConstants.CORE_SERVICE_FULLTEXT, true)) { @@ -1306,7 +1306,7 @@ public final class Switchboard extends serverSwitch { solrurls, ShardSelection.Method.MODULO_HOST_MD5, 10000, true); - solr.setCommitWithinMs(connectWithinMs); + if (connectWithinMs >= 0) solr.setCommitWithinMs(connectWithinMs); this.index.fulltext().connectRemoteSolr(solr); } catch ( final IOException e ) { Log.logException(e); @@ -2232,7 +2232,7 @@ public final class Switchboard extends serverSwitch { // execute the (post-) processing steps for all entries that have a process tag assigned if (this.crawlQueues.coreCrawlJobSize() == 0 && index.connectedCitation() && index.fulltext().getSolrScheme().contains(YaCySchema.process_sxt)) { // that means we must search for those entries. 
- index.fulltext().getSolr().commit(); // make sure that we have latest information that can be found + index.fulltext().getSolr().commit(true); // make sure that we have latest information that can be found BlockingQueue docs = index.fulltext().getSolr().concurrentQuery(YaCySchema.process_sxt.getSolrFieldName() + ":[* TO *]", 0, 1000, 60000, 10); SolrDocument doc; int proccount_clickdepth = 0; diff --git a/source/net/yacy/search/index/DocumentIndex.java b/source/net/yacy/search/index/DocumentIndex.java index 518bf21a7..bdebf8c30 100644 --- a/source/net/yacy/search/index/DocumentIndex.java +++ b/source/net/yacy/search/index/DocumentIndex.java @@ -70,7 +70,7 @@ public class DocumentIndex extends Segment { false, // useTailCache false // exceed134217727 ); - super.fulltext().connectLocalSolr(1000); + super.fulltext().connectLocalSolr(-1); final int cores = Runtime.getRuntime().availableProcessors() + 1; this.callback = callback; this.queue = new LinkedBlockingQueue(cores * 300); diff --git a/source/net/yacy/search/index/Fulltext.java b/source/net/yacy/search/index/Fulltext.java index 036269e1a..126536ed3 100644 --- a/source/net/yacy/search/index/Fulltext.java +++ b/source/net/yacy/search/index/Fulltext.java @@ -77,8 +77,6 @@ public final class Fulltext implements Iterable { private static final String SOLR_PATH = "solr_40"; // the number should be identical to the number in the property luceneMatchVersion in solrconfig.xml private static final String SOLR_OLD_PATH[] = new String[]{"solr_36"}; - private static final long forcedCommitTimeout = 3000; // wait this time until a next forced commit is executed - // class objects private final File location; private Index urlIndexFile; @@ -87,7 +85,6 @@ public final class Fulltext implements Iterable { private ArrayList statsDump; private final MirrorSolrConnector solr; private final SolrConfiguration solrScheme; - private long forcedCommitTime; protected Fulltext(final File path, final SolrConfiguration solrScheme) { 
this.location = path; @@ -97,7 +94,6 @@ public final class Fulltext implements Iterable { this.statsDump = null; this.solr = new MirrorSolrConnector(10000, 10000, 100); this.solrScheme = solrScheme; - this.forcedCommitTime = 0; } /** @@ -158,7 +154,7 @@ public final class Fulltext implements Iterable { if (oldLocation.exists()) oldLocation.renameTo(solrLocation); } EmbeddedSolrConnector esc = new EmbeddedSolrConnector(solrLocation, new File(new File(Switchboard.getSwitchboard().appPath, "defaults"), "solr")); - esc.setCommitWithinMs(commitWithin); + if (commitWithin >= 0) esc.setCommitWithinMs(commitWithin); Version luceneVersion = esc.getConfig().getLuceneVersion("luceneMatchVersion"); String lvn = luceneVersion.name(); int p = lvn.indexOf('_'); @@ -239,11 +235,8 @@ public final class Fulltext implements Iterable { return this.solr.getCommitWithinMs(); } - public void commit() { - if (this.forcedCommitTime + forcedCommitTimeout > System.currentTimeMillis()) return; - this.forcedCommitTime = Long.MAX_VALUE - forcedCommitTimeout; // set the time high to prevent that other processes get to this point meanwhile - this.solr.commit(); - this.forcedCommitTime = System.currentTimeMillis(); // set the exact time + public void commit(boolean softCommit) { + this.solr.commit(softCommit); } public Date getLoadDate(final String urlHash) { @@ -378,7 +371,7 @@ public final class Fulltext implements Iterable { synchronized (Fulltext.this.solr) { try { count.addAndGet(Fulltext.this.solr.deleteByQuery(q)); - if (count.get() > 0) Fulltext.this.solr.commit(); + if (count.get() > 0) Fulltext.this.solr.commit(true); } catch (IOException e) {} } @@ -444,7 +437,7 @@ public final class Fulltext implements Iterable { count.incrementAndGet(); } } - if (count.get() > 0) Fulltext.this.solr.commit(); + if (count.get() > 0) Fulltext.this.solr.commit(true); } catch (InterruptedException e) {} } }; @@ -466,7 +459,7 @@ public final class Fulltext implements Iterable { for (byte[] urlHash: 
deleteIDs) { Fulltext.this.solr.delete(ASCII.String(urlHash)); } - Fulltext.this.solr.commit(); + Fulltext.this.solr.commit(true); } } catch (final Throwable e) { Log.logException(e); diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index fe30c713e..d9a0709b2 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -35,6 +35,9 @@ import java.util.Properties; import java.util.Set; import java.util.concurrent.BlockingQueue; +import org.apache.solr.client.solrj.util.ClientUtils; +import org.apache.solr.common.SolrDocument; +import org.apache.solr.common.SolrDocumentList; import org.apache.solr.common.SolrInputDocument; import net.yacy.cora.document.ASCII; @@ -68,6 +71,7 @@ import net.yacy.kelondro.rwi.ReferenceContainer; import net.yacy.kelondro.rwi.ReferenceFactory; import net.yacy.kelondro.util.Bitfield; import net.yacy.kelondro.util.ISO639; +import net.yacy.kelondro.util.MemoryControl; import net.yacy.repository.LoaderDispatcher; import net.yacy.search.Switchboard; import net.yacy.search.SwitchboardConstants; @@ -347,6 +351,21 @@ public class Segment { ) { final long startTime = System.currentTimeMillis(); + // DO A SOFT/HARD COMMIT IF NEEDED + if (MemoryControl.shortStatus()) { + // do a 'hard' commit to flush index caches + this.fulltext.getSolr().commit(false); + } else { + if ( + (this.fulltext.getSolrScheme().contains(YaCySchema.exact_signature_l) && this.fulltext.getSolrScheme().contains(YaCySchema.exact_signature_unique_b)) || + (this.fulltext.getSolrScheme().contains(YaCySchema.fuzzy_signature_l) && this.fulltext.getSolrScheme().contains(YaCySchema.fuzzy_signature_unique_b)) || + this.fulltext.getSolrScheme().contains(YaCySchema.title_unique_b) || + this.fulltext.getSolrScheme().contains(YaCySchema.description_unique_b) + ) { + this.fulltext.getSolr().commit(true); // make sure that we have latest information for the postprocessing steps + } + } + // CREATE 
INDEX // load some document metadata @@ -368,13 +387,13 @@ public class Segment { for (YaCySchema[] checkfields: new YaCySchema[][]{ {YaCySchema.exact_signature_l, YaCySchema.exact_signature_unique_b}, {YaCySchema.fuzzy_signature_l, YaCySchema.fuzzy_signature_unique_b}}) { - YaCySchema hashfield = checkfields[0]; + YaCySchema checkfield = checkfields[0]; YaCySchema uniquefield = checkfields[1]; - if (this.fulltext.getSolrScheme().contains(hashfield) && this.fulltext.getSolrScheme().contains(uniquefield)) { + if (this.fulltext.getSolrScheme().contains(checkfield) && this.fulltext.getSolrScheme().contains(uniquefield)) { // lookup the document with the same signature - long signature = ((Long) solrInputDoc.getField(hashfield.getSolrFieldName()).getValue()).longValue(); + long signature = ((Long) solrInputDoc.getField(checkfield.getSolrFieldName()).getValue()).longValue(); try { - if (this.fulltext.getSolr().exists(hashfield.getSolrFieldName(), Long.toString(signature))) { + if (this.fulltext.getSolr().exists(checkfield.getSolrFieldName(), Long.toString(signature))) { // change unique attribut in content solrInputDoc.setField(uniquefield.getSolrFieldName(), false); } @@ -382,13 +401,44 @@ public class Segment { } } + // CHECK IF TITLE AND DESCRIPTION IS UNIQUE (this is by default not switched on) + uniquecheck: for (YaCySchema[] checkfields: new YaCySchema[][]{ + {YaCySchema.title, YaCySchema.title_unique_b}, + {YaCySchema.description, YaCySchema.description_unique_b}}) { + YaCySchema checkfield = checkfields[0]; + YaCySchema uniquefield = checkfields[1]; + if (this.fulltext.getSolrScheme().contains(checkfield) && this.fulltext.getSolrScheme().contains(uniquefield)) { + // lookup in the index for the same title + String checkstring = checkfield == YaCySchema.title ? 
document.dc_title() : document.dc_description(); + if (checkstring.length() == 0) { + solrInputDoc.setField(uniquefield.getSolrFieldName(), false); + continue uniquecheck; + } + checkstring = ClientUtils.escapeQueryChars("\"" + checkstring + "\""); + try { + if (this.fulltext.getSolr().exists(checkfield.getSolrFieldName(), checkstring)) { + // switch unique attribute in new document + solrInputDoc.setField(uniquefield.getSolrFieldName(), false); + // switch attribute also in all existing documents (which should be exactly only one!) + SolrDocumentList docs = this.fulltext.getSolr().query(checkfield.getSolrFieldName() + ":" + checkstring + " AND " + uniquefield.getSolrFieldName() + ":true", 0, 1000, YaCySchema.id.getSolrFieldName()); + for (SolrDocument doc: docs) { + SolrInputDocument sid = ClientUtils.toSolrInputDocument(doc); + sid.setField(uniquefield.getSolrFieldName(), false); + this.fulltext.getSolr().add(sid); + } + } else { + solrInputDoc.setField(uniquefield.getSolrFieldName(), true); + } + } catch (IOException e) {} + } + } + // ENRICH DOCUMENT WITH RANKING INFORMATION if (this.urlCitationIndex != null && this.fulltext.getSolrScheme().contains(YaCySchema.references_i)) { int references = this.urlCitationIndex.count(url.hash()); if (references > 0) solrInputDoc.setField(YaCySchema.references_i.getSolrFieldName(), references); } - // STORE TO SOLR String error = null; tryloop: for (int i = 0; i < 20; i++) { @@ -399,7 +449,7 @@ public class Segment { } catch ( final IOException e ) { error = "failed to send " + urlNormalform + " to solr"; Log.logWarning("SOLR", error + e.getMessage()); - if (i == 10) this.fulltext.commit(); + if (i == 10) this.fulltext.commit(false); try {Thread.sleep(1000);} catch (InterruptedException e1) {} continue tryloop; } diff --git a/source/net/yacy/search/index/SolrConfiguration.java b/source/net/yacy/search/index/SolrConfiguration.java index 90be0c28b..076da8d48 100644 --- a/source/net/yacy/search/index/SolrConfiguration.java +++ 
b/source/net/yacy/search/index/SolrConfiguration.java @@ -645,6 +645,10 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable inboundLinks.remove(canonical); outboundLinks.remove(canonical); add(doc, YaCySchema.canonical_t, canonical.toNormalform(false)); + // set a flag if this is equal to sku + if (contains(YaCySchema.canonical_equal_sku_b) && canonical.equals(docurl)) { + add(doc, YaCySchema.canonical_equal_sku_b, true); + } } } diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java index 896afe5e5..95b81235d 100644 --- a/source/net/yacy/search/query/SearchEvent.java +++ b/source/net/yacy/search/query/SearchEvent.java @@ -210,6 +210,9 @@ public final class SearchEvent { .getFlagAcceptRemoteIndex())); final long start = System.currentTimeMillis(); + // do a soft commit for fresh results + query.getSegment().fulltext().commit(true); + // prepare a local RWI search // initialize a ranking process that is the target for data // that is generated concurrently from local and global search threads