From f5ca5cea447b08f582b8bb44bcb33973d16fe7f5 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Mon, 19 Nov 2012 17:24:34 +0100 Subject: [PATCH] - added field options to all solr queries. This can be used to restrict the actual data which is fetched from solr. - used the new field options to reduce generic options like getting the load date or the count of search results. should increase overall speed - used the new field options to reduce overhead in the host browser during acquisition of links. - used the field options to make checking of links in crawler faster - if the crawler is paused, the crawl queue is not cleaned --- htroot/HostBrowser.java | 23 +++++++--- .../solr/connector/AbstractSolrConnector.java | 12 ++--- .../solr/connector/MirrorSolrConnector.java | 46 +++++++++---------- .../solr/connector/MultipleSolrConnector.java | 22 ++++----- .../solr/connector/RetrySolrConnector.java | 14 +++--- .../solr/connector/ShardSolrConnector.java | 12 ++--- .../solr/connector/SolrConnector.java | 17 ++++--- .../solr/connector/SolrServerConnector.java | 21 ++++++--- source/net/yacy/crawler/CrawlStacker.java | 9 ++-- source/net/yacy/data/DidYouMean.java | 3 +- .../parser/augment/AugmentParser.java | 2 +- .../kelondro/data/meta/URIMetadataRow.java | 6 +-- source/net/yacy/search/Switchboard.java | 2 +- source/net/yacy/search/index/Fulltext.java | 22 +++++++-- 14 files changed, 123 insertions(+), 88 deletions(-) diff --git a/htroot/HostBrowser.java b/htroot/HostBrowser.java index abffd9a06..16180c5d1 100644 --- a/htroot/HostBrowser.java +++ b/htroot/HostBrowser.java @@ -53,7 +53,9 @@ import net.yacy.server.serverObjects; import net.yacy.server.serverSwitch; public class HostBrowser { - + + final static long TIMEOUT = 10000L; + public static enum StoreType { LINK, INDEX, ERROR; } @@ -141,7 +143,7 @@ public class HostBrowser { int maxcount = admin ? 
2 * 3 * 2 * 5 * 7 * 2 * 3 : 360; // which makes nice matrixes for 2, 3, 4, 5, 6, 7, 8, 9 rows/colums // collect hosts from index - ReversibleScoreMap hostscore = fulltext.getSolr().getFacets("*:*", new String[]{YaCySchema.host_s.getSolrFieldName()}, maxcount).get(YaCySchema.host_s.getSolrFieldName()); + ReversibleScoreMap hostscore = fulltext.getSolr().getFacets("*:*", maxcount, YaCySchema.host_s.getSolrFieldName()).get(YaCySchema.host_s.getSolrFieldName()); if (hostscore == null) hostscore = new ClusteredScoreMap(); // collect hosts from crawler @@ -151,7 +153,7 @@ public class HostBrowser { } // collect the errorurls - ReversibleScoreMap errorscore = admin ? fulltext.getSolr().getFacets(YaCySchema.failreason_t.getSolrFieldName() + ":[* TO *]", new String[]{YaCySchema.host_s.getSolrFieldName()}, maxcount).get(YaCySchema.host_s.getSolrFieldName()) : null; + ReversibleScoreMap errorscore = admin ? fulltext.getSolr().getFacets(YaCySchema.failreason_t.getSolrFieldName() + ":[* TO *]", maxcount, YaCySchema.host_s.getSolrFieldName()).get(YaCySchema.host_s.getSolrFieldName()) : null; if (errorscore == null) errorscore = new ClusteredScoreMap(); int c = 0; @@ -225,7 +227,15 @@ public class HostBrowser { q.append(" AND ").append(YaCySchema.url_paths_sxt.getSolrFieldName()).append(":[* TO *]"); } } - BlockingQueue docs = fulltext.getSolr().concurrentQuery(q.toString(), 0, 100000, 10000, 100); + BlockingQueue docs = fulltext.getSolr().concurrentQuery(q.toString(), 0, 100000, TIMEOUT, 100, + YaCySchema.id.getSolrFieldName(), + YaCySchema.sku.getSolrFieldName(), + YaCySchema.failreason_t.getSolrFieldName(), + YaCySchema.inboundlinks_protocol_sxt.getSolrFieldName(), + YaCySchema.inboundlinks_urlstub_txt.getSolrFieldName(), + YaCySchema.outboundlinks_protocol_sxt.getSolrFieldName(), + YaCySchema.outboundlinks_urlstub_txt.getSolrFieldName() + ); SolrDocument doc; Set storedDocs = new HashSet(); Map errorDocs = new HashMap(); @@ -233,19 +243,20 @@ public class HostBrowser { Map> 
outboundHosts = new HashMap>(); int hostsize = 0; final List deleteIDs = new ArrayList(); - long timeout = System.currentTimeMillis() + 10000; + long timeout = System.currentTimeMillis() + TIMEOUT; while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) { String u = (String) doc.getFieldValue(YaCySchema.sku.getSolrFieldName()); String error = (String) doc.getFieldValue(YaCySchema.failreason_t.getSolrFieldName()); if (u.startsWith(path)) { if (delete) { - deleteIDs.add(ASCII.getBytes((String) doc.getFieldValue(YaCySchema.id.name()))); + deleteIDs.add(ASCII.getBytes((String) doc.getFieldValue(YaCySchema.id.getSolrFieldName()))); } else { if (error == null) storedDocs.add(u); else if (admin) errorDocs.put(u, error); } } else if (complete) { if (error == null) storedDocs.add(u); else if (admin) errorDocs.put(u, error); } + if ((complete || u.startsWith(path)) && !storedDocs.contains(u)) inboundLinks.add(u); // add the current link if (error == null) { hostsize++; // collect inboundlinks to browse the host diff --git a/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java index a5222bcc5..443113d68 100644 --- a/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/AbstractSolrConnector.java @@ -45,14 +45,14 @@ public abstract class AbstractSolrConnector implements SolrConnector { public final static SolrQuery catchallQuery = new SolrQuery(); static { catchallQuery.setQuery("*:*"); - catchallQuery.setFields(YaCySchema.id.name()); + catchallQuery.setFields(YaCySchema.id.getSolrFieldName()); catchallQuery.setRows(1); catchallQuery.setStart(0); } public final static SolrQuery catchSuccessQuery = new SolrQuery(); static { catchSuccessQuery.setQuery("-" + YaCySchema.failreason_t.name() + ":[* TO *]"); - catchSuccessQuery.setFields(YaCySchema.id.name()); + 
catchSuccessQuery.setFields(YaCySchema.id.getSolrFieldName()); catchSuccessQuery.setRows(1); catchSuccessQuery.setStart(0); } @@ -61,7 +61,7 @@ public abstract class AbstractSolrConnector implements SolrConnector { @Override public boolean exists(final String id) throws IOException { try { - final SolrDocument doc = get(id); + final SolrDocument doc = get(id, YaCySchema.id.getSolrFieldName()); return doc != null; } catch (final Throwable e) { log.warn(e); @@ -81,7 +81,7 @@ public abstract class AbstractSolrConnector implements SolrConnector { * @return a blocking queue which is terminated with AbstractSolrConnector.POISON_DOCUMENT as last element */ @Override - public BlockingQueue concurrentQuery(final String querystring, final int offset, final int maxcount, final long maxtime, final int buffersize) { + public BlockingQueue concurrentQuery(final String querystring, final int offset, final int maxcount, final long maxtime, final int buffersize, final String ... fields) { final BlockingQueue queue = buffersize <= 0 ? 
new LinkedBlockingQueue() : new ArrayBlockingQueue(buffersize); final long endtime = System.currentTimeMillis() + maxtime; final Thread t = new Thread() { @@ -90,7 +90,7 @@ public abstract class AbstractSolrConnector implements SolrConnector { int o = offset; while (System.currentTimeMillis() < endtime) { try { - SolrDocumentList sdl = query(querystring, o, pagesize); + SolrDocumentList sdl = query(querystring, o, pagesize, fields); for (SolrDocument d: sdl) { try {queue.put(d);} catch (InterruptedException e) {break;} } @@ -119,7 +119,7 @@ public abstract class AbstractSolrConnector implements SolrConnector { int o = offset; while (System.currentTimeMillis() < endtime) { try { - SolrDocumentList sdl = query(querystring, o, pagesize); + SolrDocumentList sdl = query(querystring, o, pagesize, YaCySchema.id.getSolrFieldName()); for (SolrDocument d: sdl) { try {queue.put((String) d.getFieldValue(YaCySchema.id.getSolrFieldName()));} catch (InterruptedException e) {break;} } diff --git a/source/net/yacy/cora/federate/solr/connector/MirrorSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/MirrorSolrConnector.java index 51f25c44d..ac3031011 100644 --- a/source/net/yacy/cora/federate/solr/connector/MirrorSolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/MirrorSolrConnector.java @@ -231,8 +231,8 @@ public class MirrorSolrConnector extends AbstractSolrConnector implements SolrCo } @Override - public SolrDocument get(String id) throws IOException { - SolrDocument doc = this.documentCache.get(id); + public SolrDocument get(final String id, final String ... fields) throws IOException { + SolrDocument doc = fields.length == 0 ? 
this.documentCache.get(id) : null; if (doc != null) { this.documentCache_Hit++; return doc; @@ -243,24 +243,22 @@ public class MirrorSolrConnector extends AbstractSolrConnector implements SolrCo return null; } missCache_Miss++; - if ((solr0 != null && ((doc = solr0.get(id)) != null)) || (solr1 != null && ((doc = solr1.get(id)) != null))) { + if ((solr0 != null && ((doc = solr0.get(id, fields)) != null)) || (solr1 != null && ((doc = solr1.get(id, fields)) != null))) { this.missCache.remove(id); this.hitCache.put(id, EXIST); this.hitCache_Insert++; - this.documentCache.put(id, doc); - this.documentCache_Insert++; + if (fields.length == 0) {this.documentCache.put(id, doc); this.documentCache_Insert++;} return doc; } // check if there is a autocommit problem if (this.hitCache.containsKey(id)) { // the document should be there, therefore make a commit and check again this.commit(); - if ((solr0 != null && ((doc = solr0.get(id)) != null)) || (solr1 != null && ((doc = solr1.get(id)) != null))) { + if ((solr0 != null && ((doc = solr0.get(id, fields)) != null)) || (solr1 != null && ((doc = solr1.get(id, fields)) != null))) { this.missCache.remove(id); this.hitCache.put(id, EXIST); this.hitCache_Insert++; - this.documentCache.put(id, doc); - this.documentCache_Insert++; + if (fields.length == 0) {this.documentCache.put(id, doc); this.documentCache_Insert++;} return doc; } } @@ -276,7 +274,7 @@ public class MirrorSolrConnector extends AbstractSolrConnector implements SolrCo */ @Override public void add(final SolrInputDocument solrdoc) throws IOException { - String id = (String) solrdoc.getFieldValue(YaCySchema.id.name()); + String id = (String) solrdoc.getFieldValue(YaCySchema.id.getSolrFieldName()); assert id != null; if (id == null) return; this.missCache.remove(id); @@ -300,47 +298,47 @@ public class MirrorSolrConnector extends AbstractSolrConnector implements SolrCo * @throws IOException */ @Override - public SolrDocumentList query(final String querystring, final int 
offset, final int count) throws IOException { + public SolrDocumentList query(final String querystring, final int offset, final int count, final String ... fields) throws IOException { if (this.solr0 == null && this.solr1 == null) return new SolrDocumentList(); if (offset == 0 && count == 1 && querystring.startsWith("id:")) { final SolrDocumentList list = new SolrDocumentList(); - SolrDocument doc = get(querystring.charAt(3) == '"' ? querystring.substring(4, querystring.length() - 1) : querystring.substring(3)); + SolrDocument doc = get(querystring.charAt(3) == '"' ? querystring.substring(4, querystring.length() - 1) : querystring.substring(3), fields); list.add(doc); // no addToCache(list) here because that was already handlet in get(); return list; } if (this.solr0 != null && this.solr1 == null) { - SolrDocumentList list = this.solr0.query(querystring, offset, count); - addToCache(list); + SolrDocumentList list = this.solr0.query(querystring, offset, count, fields); + if (fields.length == 0) addToCache(list); return list; } if (this.solr1 != null && this.solr0 == null) { - SolrDocumentList list = this.solr1.query(querystring, offset, count); - addToCache(list); + SolrDocumentList list = this.solr1.query(querystring, offset, count, fields); + if (fields.length == 0) addToCache(list); return list; } // combine both lists SolrDocumentList l; - l = this.solr0.query(querystring, offset, count); + l = this.solr0.query(querystring, offset, count, fields); if (l.size() >= count) return l; // at this point we need to know how many results are in solr0 // compute this with a very bad hack; replace with better method later int size0 = 0; { //bad hack - TODO: replace - SolrDocumentList lHack = this.solr0.query(querystring, 0, Integer.MAX_VALUE); + SolrDocumentList lHack = this.solr0.query(querystring, 0, Integer.MAX_VALUE, fields); size0 = lHack.size(); } // now use the size of the first query to do a second query final SolrDocumentList list = new SolrDocumentList(); for 
(final SolrDocument d: l) list.add(d); - l = this.solr1.query(querystring, offset + l.size() - size0, count - l.size()); + l = this.solr1.query(querystring, offset + l.size() - size0, count - l.size(), fields); for (final SolrDocument d: l) list.add(d); // add caching - addToCache(list); + if (fields.length == 0) addToCache(list); return list; } @@ -422,16 +420,16 @@ public class MirrorSolrConnector extends AbstractSolrConnector implements SolrCo } @Override - public Map> getFacets(String query, String[] fields, int maxresults) throws IOException { + public Map> getFacets(final String query, final int maxresults, final String ... fields) throws IOException { if (this.solr0 == null && this.solr1 == null) return new HashMap>(0); if (this.solr0 != null && this.solr1 == null) { - return this.solr0.getFacets(query, fields, maxresults); + return this.solr0.getFacets(query, maxresults, fields); } if (this.solr1 != null && this.solr0 == null) { - return this.solr1.getFacets(query, fields, maxresults); + return this.solr1.getFacets(query, maxresults, fields); } - Map> facets0 = this.solr0.getFacets(query, fields, maxresults); - Map> facets1 = this.solr1.getFacets(query, fields, maxresults); + Map> facets0 = this.solr0.getFacets(query, maxresults, fields); + Map> facets1 = this.solr1.getFacets(query, maxresults, fields); for (Map.Entry> facet0: facets0.entrySet()) { ReversibleScoreMap facet1 = facets1.remove(facet0.getKey()); if (facet1 == null) continue; diff --git a/source/net/yacy/cora/federate/solr/connector/MultipleSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/MultipleSolrConnector.java index c6efa96cf..a85ea485b 100644 --- a/source/net/yacy/cora/federate/solr/connector/MultipleSolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/MultipleSolrConnector.java @@ -44,7 +44,7 @@ public class MultipleSolrConnector extends AbstractSolrConnector implements Solr private final SolrConnector solr; private int commitWithinMs; - public 
MultipleSolrConnector(final String url, int connections) throws IOException { + public MultipleSolrConnector(final String url, final int connections) throws IOException { this.solr = new RemoteSolrConnector(url); this.queue = new ArrayBlockingQueue(1000); this.worker = new AddWorker[connections]; @@ -91,7 +91,7 @@ public class MultipleSolrConnector extends AbstractSolrConnector implements Solr * @param c the maximum waiting time after a solr command until it is transported to the server */ @Override - public void setCommitWithinMs(int c) { + public void setCommitWithinMs(final int c) { this.commitWithinMs = c; this.solr.setCommitWithinMs(c); for (AddWorker w: this.worker) w.solr.setCommitWithinMs(c); @@ -129,12 +129,12 @@ public class MultipleSolrConnector extends AbstractSolrConnector implements Solr } @Override - public void delete(String id) throws IOException { + public void delete(final String id) throws IOException { this.solr.delete(id); } @Override - public void delete(List ids) throws IOException { + public void delete(final List ids) throws IOException { this.solr.delete(ids); } @@ -144,8 +144,8 @@ public class MultipleSolrConnector extends AbstractSolrConnector implements Solr } @Override - public SolrDocument get(String id) throws IOException { - return this.solr.get(id); + public SolrDocument get(final String id, final String ... fields) throws IOException { + return this.solr.get(id, fields); } @Override @@ -169,12 +169,12 @@ public class MultipleSolrConnector extends AbstractSolrConnector implements Solr } @Override - public SolrDocumentList query(String querystring, int offset, int count) throws IOException { - return this.solr.query(querystring, offset, count); + public SolrDocumentList query(final String querystring, final int offset, final int count, final String ... 
fields) throws IOException { + return this.solr.query(querystring, offset, count, fields); } @Override - public QueryResponse query(ModifiableSolrParams query) throws IOException, SolrException { + public QueryResponse query(final ModifiableSolrParams query) throws IOException, SolrException { return this.solr.query(query); } @@ -184,8 +184,8 @@ public class MultipleSolrConnector extends AbstractSolrConnector implements Solr } @Override - public Map> getFacets(String query, String[] fields, int maxresults) throws IOException { - return this.solr.getFacets(query, fields, maxresults); + public Map> getFacets(final String query, final int maxresults, final String ... fields) throws IOException { + return this.solr.getFacets(query, maxresults, fields); } @Override diff --git a/source/net/yacy/cora/federate/solr/connector/RetrySolrConnector.java b/source/net/yacy/cora/federate/solr/connector/RetrySolrConnector.java index 9ecbf2fe1..d0d0106f7 100644 --- a/source/net/yacy/cora/federate/solr/connector/RetrySolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/RetrySolrConnector.java @@ -145,11 +145,11 @@ public class RetrySolrConnector extends AbstractSolrConnector implements SolrCon } @Override - public SolrDocument get(String id) throws IOException { + public SolrDocument get(final String id, final String ... 
fields) throws IOException { final long t = System.currentTimeMillis() + this.retryMaxTime; Throwable ee = null; while (System.currentTimeMillis() < t) try { - return this.solrConnector.get(id); + return this.solrConnector.get(id, fields); } catch (final Throwable e) { ee = e; try {Thread.sleep(10);} catch (final InterruptedException e1) {} @@ -180,11 +180,11 @@ public class RetrySolrConnector extends AbstractSolrConnector implements SolrCon } @Override - public SolrDocumentList query(final String querystring, final int offset, final int count) throws IOException { + public SolrDocumentList query(final String querystring, final int offset, final int count, final String ... fields) throws IOException { final long t = System.currentTimeMillis() + this.retryMaxTime; Throwable ee = null; while (System.currentTimeMillis() < t) try { - return this.solrConnector.query(querystring, offset, count); + return this.solrConnector.query(querystring, offset, count, fields); } catch (final Throwable e) { ee = e; try {Thread.sleep(10);} catch (final InterruptedException e1) {} @@ -195,7 +195,7 @@ public class RetrySolrConnector extends AbstractSolrConnector implements SolrCon } @Override - public QueryResponse query(ModifiableSolrParams query) throws IOException, SolrException { + public QueryResponse query(final ModifiableSolrParams query) throws IOException, SolrException { final long t = System.currentTimeMillis() + this.retryMaxTime; Throwable ee = null; while (System.currentTimeMillis() < t) try { @@ -225,11 +225,11 @@ public class RetrySolrConnector extends AbstractSolrConnector implements SolrCon } @Override - public Map> getFacets(String query, String[] fields, int maxresults) throws IOException { + public Map> getFacets(final String query, final int maxresults, final String ... 
fields) throws IOException { final long t = System.currentTimeMillis() + this.retryMaxTime; Throwable ee = null; while (System.currentTimeMillis() < t) try { - return this.solrConnector.getFacets(query, fields, maxresults); + return this.solrConnector.getFacets(query, maxresults, fields); } catch (final Throwable e) { ee = e; try {Thread.sleep(10);} catch (final InterruptedException e1) {} diff --git a/source/net/yacy/cora/federate/solr/connector/ShardSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/ShardSolrConnector.java index 51ff3e18d..9d5cc0216 100644 --- a/source/net/yacy/cora/federate/solr/connector/ShardSolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/ShardSolrConnector.java @@ -133,9 +133,9 @@ public class ShardSolrConnector extends AbstractSolrConnector implements SolrCon } @Override - public SolrDocument get(String id) throws IOException { + public SolrDocument get(String id, final String ... fields) throws IOException { for (final SolrConnector connector: this.connectors) { - SolrDocument doc = connector.get(id); + SolrDocument doc = connector.get(id, fields); if (doc != null) return doc; } return null; @@ -172,7 +172,7 @@ public class ShardSolrConnector extends AbstractSolrConnector implements SolrCon * @throws IOException */ @Override - public SolrDocumentList query(final String querystring, final int offset, final int count) throws IOException { + public SolrDocumentList query(final String querystring, final int offset, final int count, final String ... 
fields) throws IOException { final SolrDocumentList list = new SolrDocumentList(); List t = new ArrayList(); for (final SolrConnector connector: this.connectors) { @@ -180,7 +180,7 @@ public class ShardSolrConnector extends AbstractSolrConnector implements SolrCon @Override public void run() { try { - final SolrDocumentList l = connector.query(querystring, offset, count); + final SolrDocumentList l = connector.query(querystring, offset, count, fields); for (final SolrDocument d: l) { list.add(d); } @@ -228,10 +228,10 @@ public class ShardSolrConnector extends AbstractSolrConnector implements SolrCon } @Override - public Map> getFacets(String query, String[] fields, int maxresults) throws IOException { + public Map> getFacets(String query, int maxresults, final String ... fields) throws IOException { Map> facets = new HashMap>(); for (final SolrConnector connector: this.connectors) { - Map> peer = connector.getFacets(query, fields, maxresults); + Map> peer = connector.getFacets(query, maxresults, fields); innerloop: for (Map.Entry> facet: facets.entrySet()) { ReversibleScoreMap peerfacet = peer.remove(facet.getKey()); if (peerfacet == null) continue innerloop; diff --git a/source/net/yacy/cora/federate/solr/connector/SolrConnector.java b/source/net/yacy/cora/federate/solr/connector/SolrConnector.java index e082436b5..8f05a7e50 100644 --- a/source/net/yacy/cora/federate/solr/connector/SolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/SolrConnector.java @@ -107,10 +107,11 @@ public interface SolrConnector extends Iterable /* Iterable of document /** * get a document from solr by given id * @param id + * @param fields list of fields * @return one result or null if no result exists * @throws IOException */ - public SolrDocument get(final String id) throws IOException; + public SolrDocument get(final String id, final String ... 
fields) throws IOException; /** * get a query result from solr @@ -122,10 +123,13 @@ public interface SolrConnector extends Iterable /* Iterable of document /** * get a query result from solr * to get all results set the query String to "*:*" - * @param querystring + * @param querystring the solr query string + * @param offset the first result offset + * @param count number of wanted results + * @param fields list of fields * @throws IOException */ - public SolrDocumentList query(final String querystring, final int offset, final int count) throws IOException, SolrException; + public SolrDocumentList query(final String querystring, final int offset, final int count, final String ... fields) throws IOException, SolrException; /** * get the number of results when this query is done. @@ -138,12 +142,12 @@ public interface SolrConnector extends Iterable /* Iterable of document /** * get facets of the index: a list of lists with values that are most common in a specific field * @param query a query which is performed to get the facets - * @param fields the field names which are selected as facet * @param maxresults the maximum size of the resulting maps + * @param fields the field names which are selected as facet * @return a map with key = facet field name, value = an ordered map of field values for that field * @throws IOException */ - public Map> getFacets(String query, String[] fields, int maxresults) throws IOException; + public Map> getFacets(String query, int maxresults, final String ... fields) throws IOException; /** * Get a query result from solr as a stream of documents. 
@@ -154,9 +158,10 @@ public interface SolrConnector extends Iterable /* Iterable of document * @param maxcount the maximum number of results * @param maxtime the maximum time in milliseconds * @param buffersize the size of an ArrayBlockingQueue; if <= 0 then a LinkedBlockingQueue is used + * @param fields list of fields * @return a blocking queue which is terminated with AbstractSolrConnector.POISON_DOCUMENT as last element */ - public BlockingQueue concurrentQuery(final String querystring, final int offset, final int maxcount, final long maxtime, final int buffersize); + public BlockingQueue concurrentQuery(final String querystring, final int offset, final int maxcount, final long maxtime, final int buffersize, final String ... fields); /** * get a document id result stream from a solr query. diff --git a/source/net/yacy/cora/federate/solr/connector/SolrServerConnector.java b/source/net/yacy/cora/federate/solr/connector/SolrServerConnector.java index 8e16fa3fe..cd0e443c5 100644 --- a/source/net/yacy/cora/federate/solr/connector/SolrServerConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/SolrServerConnector.java @@ -29,6 +29,7 @@ import java.util.List; import java.util.Map; import net.yacy.cora.document.UTF8; +import net.yacy.cora.federate.solr.YaCySchema; import net.yacy.cora.sorting.ClusteredScoreMap; import net.yacy.cora.sorting.ReversibleScoreMap; @@ -40,14 +41,12 @@ import org.apache.solr.client.solrj.request.ContentStreamUpdateRequest; import org.apache.solr.client.solrj.response.FacetField; import org.apache.solr.client.solrj.response.FacetField.Count; import org.apache.solr.client.solrj.response.QueryResponse; -import org.apache.solr.client.solrj.response.UpdateResponse; import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrDocumentList; import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.params.FacetParams; import 
org.apache.solr.common.params.ModifiableSolrParams; -import org.apache.solr.common.util.NamedList; public abstract class SolrServerConnector extends AbstractSolrConnector implements SolrConnector { @@ -203,6 +202,7 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen if (this.server == null) return; try { synchronized (this.server) { + //this.server.deleteById((String) solrdoc.getFieldValue(YaCySchema.id.getSolrFieldName())); this.server.add(solrdoc, this.commitWithinMs); //this.server.commit(); } @@ -215,9 +215,12 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen @Override public void add(final Collection solrdocs) throws IOException, SolrException { ArrayList l = new ArrayList(); - for (SolrInputDocument d: solrdocs) l.add(d); try { synchronized (this.server) { + for (SolrInputDocument d: solrdocs) { + //this.server.deleteById((String) d.getFieldValue(YaCySchema.id.getSolrFieldName())); + l.add(d); + } this.server.add(l, this.commitWithinMs); //this.server.commit(); } @@ -234,7 +237,7 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen * @throws IOException */ @Override - public SolrDocumentList query(final String querystring, final int offset, final int count) throws IOException { + public SolrDocumentList query(final String querystring, final int offset, final int count, final String ... 
fields) throws IOException { // construct query final SolrQuery params = new SolrQuery(); params.setQuery(querystring); @@ -243,6 +246,8 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen params.setFacet(false); //params.addSortField( "price", SolrQuery.ORDER.asc ); + if (fields.length > 0) params.setFields(fields); + // query the server QueryResponse rsp = query(params); final SolrDocumentList docs = rsp.getResults(); @@ -263,6 +268,7 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen params.setRows(0); params.setStart(0); params.setFacet(false); + params.setFields(YaCySchema.id.getSolrFieldName()); // query the server QueryResponse rsp = query(params); @@ -278,8 +284,9 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen * @return a map with key = facet field name, value = an ordered map of field values for that field * @throws IOException */ - public Map> getFacets(String query, String[] fields, int maxresults) throws IOException { + public Map> getFacets(String query, int maxresults, final String ... fields) throws IOException { // construct query + assert fields.length > 0; final SolrQuery params = new SolrQuery(); params.setQuery(query); params.setRows(0); @@ -287,6 +294,7 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen params.setFacet(true); params.setFacetLimit(maxresults); params.setFacetSort(FacetParams.FACET_SORT_COUNT); + params.setFields(fields); for (String field: fields) params.addFacetField(field); // query the server @@ -315,7 +323,7 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen * @throws IOException */ @Override - public SolrDocument get(final String id) throws IOException { + public SolrDocument get(final String id, final String ... 
fields) throws IOException { assert id.length() == 12; // construct query char[] q = new char[17]; @@ -325,6 +333,7 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen query.setQuery(new String(q)); query.setRows(1); query.setStart(0); + if (fields.length > 0) query.setFields(fields); // query the server try { diff --git a/source/net/yacy/crawler/CrawlStacker.java b/source/net/yacy/crawler/CrawlStacker.java index fee09e310..b97b49e97 100644 --- a/source/net/yacy/crawler/CrawlStacker.java +++ b/source/net/yacy/crawler/CrawlStacker.java @@ -60,7 +60,6 @@ import net.yacy.crawler.retrieval.Request; import net.yacy.crawler.retrieval.SMBLoader; import net.yacy.crawler.robots.RobotsTxt; import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.workflow.WorkflowProcessor; import net.yacy.peers.SeedDB; @@ -448,8 +447,8 @@ public final class CrawlStacker { // check if the url is double registered final String dbocc = this.nextQueue.urlExists(url.hash()); // returns the name of the queue if entry exists - final URIMetadataNode oldEntry = this.indexSegment.fulltext().getMetadata(url.hash()); - if (oldEntry == null) { + final Date oldDate = this.indexSegment.fulltext().getLoadDate(ASCII.String(url.hash())); + if (oldDate == null) { if (dbocc != null) { // do double-check if (dbocc.equals("errors")) { @@ -459,11 +458,11 @@ public final class CrawlStacker { return "double in: " + dbocc; } } else { - final boolean recrawl = profile.recrawlIfOlder() > oldEntry.loaddate().getTime(); + final boolean recrawl = profile.recrawlIfOlder() > oldDate.getTime(); if (recrawl) { if (this.log.isInfo()) this.log.logInfo("RE-CRAWL of URL '" + urlstring + "': this url was crawled " + - ((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000 / 60 / 24) + " days ago."); + ((System.currentTimeMillis() - oldDate.getTime()) / 60000 / 60 / 24) + " days 
ago."); } else { if (dbocc == null) { return "double in: LURL-DB"; diff --git a/source/net/yacy/data/DidYouMean.java b/source/net/yacy/data/DidYouMean.java index 4aba38521..165334db1 100644 --- a/source/net/yacy/data/DidYouMean.java +++ b/source/net/yacy/data/DidYouMean.java @@ -134,8 +134,7 @@ public class DidYouMean { * @return */ public SortedSet getSuggestions(final long timeout, final int preSortSelection) { - if (this.word.length() < MinimumInputWordLength) - { + if (this.word.length() < MinimumInputWordLength) { return this.resultSet; // return nothing if input is too short } final long startTime = System.currentTimeMillis(); diff --git a/source/net/yacy/document/parser/augment/AugmentParser.java b/source/net/yacy/document/parser/augment/AugmentParser.java index 872dfce73..3b1a26cf6 100644 --- a/source/net/yacy/document/parser/augment/AugmentParser.java +++ b/source/net/yacy/document/parser/augment/AugmentParser.java @@ -63,7 +63,7 @@ public class AugmentParser extends AbstractParser implements Parser { } } */ - private void parseAndAugment(Document origDoc, DigestURI url, String mimeType, String charset) { + private void parseAndAugment(Document origDoc, DigestURI url, @SuppressWarnings("unused") String mimeType, @SuppressWarnings("unused") String charset) { Iterator it; try { diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataRow.java b/source/net/yacy/kelondro/data/meta/URIMetadataRow.java index f2f29b241..185868414 100644 --- a/source/net/yacy/kelondro/data/meta/URIMetadataRow.java +++ b/source/net/yacy/kelondro/data/meta/URIMetadataRow.java @@ -513,18 +513,16 @@ public class URIMetadataRow { final int p = this.latlon.indexOf(','); if (p < 0) { return 0.0d; - } else { // old index entries might contain text "NaN,NaN" - return this.latlon.charAt(0) > '9' ? 0.0d : Double.parseDouble(this.latlon.substring(0, p)); } + return this.latlon.charAt(0) > '9' ? 
0.0d : Double.parseDouble(this.latlon.substring(0, p)); } public double lon() { if (this.latlon == null || this.latlon.isEmpty()) return 0.0d; final int p = this.latlon.indexOf(','); if (p < 0) { return 0.0d; - } else { // old index entries might contain text "NaN,NaN" - return this.latlon.charAt(p + 1) > '9' ? 0.0d : Double.parseDouble(this.latlon.substring(p + 1)); } + return this.latlon.charAt(p + 1) > '9' ? 0.0d : Double.parseDouble(this.latlon.substring(p + 1)); } } diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index a0812cdd6..35fed436c 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -2126,7 +2126,7 @@ public final class Switchboard extends serverSwitch { // clean up profiles checkInterruption(); //cleanProfiles(); - int cleanup = this.crawler.cleanFinishesProfiles(this.crawlQueues); + int cleanup = this.crawlJobIsPaused(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL) ? 0 : this.crawler.cleanFinishesProfiles(this.crawlQueues); if (cleanup > 0) log.logInfo("cleanup removed " + cleanup + " crawl profiles"); // clean up news diff --git a/source/net/yacy/search/index/Fulltext.java b/source/net/yacy/search/index/Fulltext.java index 1404432f8..98cacb8f6 100644 --- a/source/net/yacy/search/index/Fulltext.java +++ b/source/net/yacy/search/index/Fulltext.java @@ -42,6 +42,7 @@ import net.yacy.cora.date.GenericFormatter; import net.yacy.cora.date.ISO8601Formatter; import net.yacy.cora.document.ASCII; import net.yacy.cora.document.MultiProtocolURI; +import net.yacy.cora.federate.solr.SolrType; import net.yacy.cora.federate.solr.YaCySchema; import net.yacy.cora.federate.solr.connector.AbstractSolrConnector; import net.yacy.cora.federate.solr.connector.EmbeddedSolrConnector; @@ -213,6 +214,21 @@ public final class Fulltext implements Iterable { this.forcedCommitTime = System.currentTimeMillis(); // set the exact time } + public Date getLoadDate(final String urlHash) { + if 
(urlHash == null) return null; + SolrDocument doc; + try { + doc = this.solr.get(urlHash, YaCySchema.load_date_dt.getSolrFieldName()); + } catch (IOException e) { + return null; + } + if (doc == null) return null; + Date x = (Date) doc.getFieldValue(YaCySchema.load_date_dt.getSolrFieldName()); + if (x == null) return new Date(0); + Date now = new Date(); + return x.after(now) ? now : x; + } + /** * generates an plasmaLURLEntry using the url hash * if the url cannot be found, this returns null @@ -259,7 +275,7 @@ public final class Fulltext implements Iterable { } public void putDocument(final SolrInputDocument doc) throws IOException { - String id = (String) doc.getFieldValue(YaCySchema.id.name()); + String id = (String) doc.getFieldValue(YaCySchema.id.getSolrFieldName()); byte[] idb = ASCII.getBytes(id); try { if (this.urlIndexFile != null) this.urlIndexFile.remove(idb); @@ -385,7 +401,7 @@ public final class Fulltext implements Iterable { final AtomicInteger count = new AtomicInteger(0); Thread t = new Thread(){ public void run() { - final BlockingQueue docs = getSolr().concurrentQuery(q, 0, 1000000, 600000, -1); + final BlockingQueue docs = getSolr().concurrentQuery(q, 0, 1000000, 600000, -1, YaCySchema.id.getSolrFieldName(), YaCySchema.sku.getSolrFieldName()); try { SolrDocument doc; while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) { @@ -464,7 +480,7 @@ public final class Fulltext implements Iterable { public String failReason(final String urlHash) throws IOException { if (urlHash == null) return null; - SolrDocument doc = this.solr.get(urlHash); + SolrDocument doc = this.solr.get(urlHash, YaCySchema.failreason_t.getSolrFieldName()); if (doc == null) return null; String reason = (String) doc.getFieldValue(YaCySchema.failreason_t.getSolrFieldName()); return reason == null ? null : reason.length() == 0 ? null : reason;