diff --git a/defaults/yacy.init b/defaults/yacy.init
index 4671e41b7..3dea18852 100644
--- a/defaults/yacy.init
+++ b/defaults/yacy.init
@@ -1095,7 +1095,7 @@ federated.service.solr.indexing.url = http://127.0.0.1:8983/solr
 federated.service.solr.indexing.sharding = MODULO_HOST_MD5
 # the lazy attribute causes that fields containing "" or 0 are not added and not written
 federated.service.solr.indexing.lazy = true
-federated.service.solr.indexing.timeout = 10000
+federated.service.solr.indexing.timeout = 6000
 
 # temporary definition of backend services to use.
 # After the migration a rwi+solr combination is used, the solr contains the content of the previously used metadata-db.
diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java
index 01027ad47..814d3fa91 100644
--- a/htroot/ViewFile.java
+++ b/htroot/ViewFile.java
@@ -31,7 +31,6 @@ import java.io.IOException;
 import java.net.MalformedURLException;
 import java.util.Arrays;
 import java.util.Collection;
-import java.util.Enumeration;
 import java.util.Iterator;
 import java.util.Map;
 import net.yacy.cora.document.encoding.ASCII;
@@ -290,17 +289,21 @@ public class ViewFile {
                 // Search word highlighting
                 for (final StringBuilder s: sentences) {
                     sentence = s.toString();
-                    Enumeration<StringBuilder> tokens = null;
-                    tokens = new WordTokenizer(new SentenceReader(sentence), LibraryProvider.dymLib);
-                    while (tokens.hasMoreElements()) {
-                        token = tokens.nextElement();
-                        if (token.length() > 0) {
-                            prop.put("viewMode_words_" + i + "_nr", i + 1);
-                            prop.put("viewMode_words_" + i + "_word", token.toString());
-                            prop.put("viewMode_words_" + i + "_dark", dark ? "1" : "0");
-                            dark = !dark;
-                            i++;
+                    WordTokenizer tokens = new WordTokenizer(new SentenceReader(sentence), LibraryProvider.dymLib);
+                    try {
+                        while (tokens.hasMoreElements()) {
+                            token = tokens.nextElement();
+                            if (token.length() > 0) {
+                                prop.put("viewMode_words_" + i + "_nr", i + 1);
+                                prop.put("viewMode_words_" + i + "_word", token.toString());
+                                prop.put("viewMode_words_" + i + "_dark", dark ? "1" : "0");
+                                dark = !dark;
+                                i++;
+                            }
                         }
+                    } finally {
+                        tokens.close();
+                        tokens = null;
                     }
                 }
             }
diff --git a/source/net/yacy/cora/order/Digest.java b/source/net/yacy/cora/order/Digest.java
index 619beaab8..8ea4d8215 100644
--- a/source/net/yacy/cora/order/Digest.java
+++ b/source/net/yacy/cora/order/Digest.java
@@ -28,14 +28,15 @@ import java.io.InputStream;
 import java.io.RandomAccessFile;
 import java.security.MessageDigest;
 import java.security.NoSuchAlgorithmException;
+import java.util.Queue;
 import java.util.concurrent.ArrayBlockingQueue;
 import java.util.concurrent.BlockingQueue;
 import java.util.concurrent.Callable;
+import java.util.concurrent.ConcurrentLinkedQueue;
 import java.util.concurrent.ExecutionException;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
 import java.util.concurrent.Future;
-import java.util.concurrent.LinkedBlockingDeque;
 import java.util.concurrent.LinkedBlockingQueue;
 
 import net.yacy.cora.document.encoding.UTF8;
@@ -48,7 +49,7 @@ import net.yacy.cora.util.Memory;
 
 public class Digest {
 
-    public static BlockingQueue<MessageDigest> digestPool = new LinkedBlockingDeque<MessageDigest>();
+    public static Queue<MessageDigest> digestPool = new ConcurrentLinkedQueue<MessageDigest>();
 
     private static final int md5CacheSize = Math.max(1000, Math.min(1000000, (int) (Memory.available() / 50000L)));
     private static ARC<String, byte[]> md5Cache = null;
@@ -138,11 +139,8 @@ public class Digest {
         digest.update(keyBytes);
         final byte[] result = digest.digest();
         digest.reset(); // to be prepared for next
-        try {
-            digestPool.put(digest);
-            //System.out.println("Digest Pool size = " + digestPool.size());
-        } catch (final InterruptedException e ) {
-        }
+        digestPool.add(digest);
+        //System.out.println("Digest Pool size = " + digestPool.size());
 
         // update the cache
         md5Cache.insertIfAbsent(key, result); // prevent expensive MD5 computation and encoding
diff --git a/source/net/yacy/data/ymark/YMarkAutoTagger.java b/source/net/yacy/data/ymark/YMarkAutoTagger.java
index 2b35a08fa..25d9eb9a1 100644
--- a/source/net/yacy/data/ymark/YMarkAutoTagger.java
+++ b/source/net/yacy/data/ymark/YMarkAutoTagger.java
@@ -95,84 +95,85 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
         buffer.append(document.dc_title().toLowerCase());
         for (String s:document.dc_description()) buffer.append(s.toLowerCase());
         buffer.append(document.dc_subject(' ').toLowerCase());
-        final WordTokenizer tokens = new WordTokenizer(new SentenceReader(buffer.toString()), LibraryProvider.dymLib);
-        try {
-            int score = 0;
-
-            // get phrases
-            final TreeMap<String, YMarkTag> phrases = getPhrases(document, 2);
-            phrases.putAll(getPhrases(document, 3));
-            final Iterator<String> iter = phrases.keySet().iterator();
-            while(iter.hasNext()) {
-                score = 10;
-                final String phrase = iter.next();
-                if(phrases.get(phrase).size() > 3 && phrases.get(phrase).size() < 10) {
-                    score = phrases.get(phrase).size() * phrase.split(" ").length * 20;
-                }
-                if(isDigitSpace(phrase)) {
-                    score = 10;
-                }
-                if(phrases.get(phrase).size() > 2 && buffer.indexOf(phrase) > 1) {
-                    score = score * 10;
-                }
-                if (tags.containsKey(phrase)) {
-                    score = score * 20;
-                }
-                topwords.add(new YMarkTag(phrase, score));
-                pwords.append(phrase);
-                pwords.append(' ');
-            }
-
-            // loop through potential tag and rank them
-            while(tokens.hasMoreElements()) {
-                score = 0;
-                token = tokens.nextElement();
-
-                // check if the token appears in the text
-                if (words.containsKey(token.toString())) {
-                    final Word word = words.get(token.toString());
-                    // token appears in text and matches an existing bookmark tag
-                    if (tags.containsKey(token.toString())) {
-                        score = word.occurrences() * tags.get(token.toString()).size() * 200;
-                    }
-                    // token appears in text and has more than 3 characters
-                    else if (token.length()>3) {
-                        score = word.occurrences() * 100;
-                    }
-                    // if token is already part of a phrase, reduce score
-                    if(pwords.toString().indexOf(token.toString())>1) {
-                        score = score / 3;
-                    }
-                    topwords.add(new YMarkTag(token.toString(), score));
-                }
-            }
-            score = 0;
-            buffer.setLength(0);
-            for(final YMarkTag tag : topwords) {
-                if(score < max) {
-                    if(tag.size() > 100) {
-                        buffer.append(tag.name());
-                        buffer.append(YMarkUtil.TAGS_SEPARATOR);
-                        score++;
-                    }
-                } else {
-                    break;
-                }
-            }
-            final String clean = YMarkUtil.cleanTagsString(buffer.toString());
-            if(clean.equals(YMarkEntry.BOOKMARK.TAGS.deflt())) {
-                return MultiProtocolURL.getFileExtension(document.dc_source().getFileName());
-            }
-            return clean;
-        } finally {
-            tokens.close();
-        }
+        int score = 0;
+
+        // get phrases
+        final TreeMap<String, YMarkTag> phrases = getPhrases(document, 2);
+        phrases.putAll(getPhrases(document, 3));
+        final Iterator<String> iter = phrases.keySet().iterator();
+        while(iter.hasNext()) {
+            score = 10;
+            final String phrase = iter.next();
+            if(phrases.get(phrase).size() > 3 && phrases.get(phrase).size() < 10) {
+                score = phrases.get(phrase).size() * phrase.split(" ").length * 20;
+            }
+            if(isDigitSpace(phrase)) {
+                score = 10;
+            }
+            if(phrases.get(phrase).size() > 2 && buffer.indexOf(phrase) > 1) {
+                score = score * 10;
+            }
+            if (tags.containsKey(phrase)) {
+                score = score * 20;
+            }
+            topwords.add(new YMarkTag(phrase, score));
+            pwords.append(phrase);
+            pwords.append(' ');
+        }
+
+        // loop through potential tag and rank them
+        WordTokenizer tokens = new WordTokenizer(new SentenceReader(buffer.toString()), LibraryProvider.dymLib);
+        try {
+            while (tokens.hasMoreElements()) {
+                score = 0;
+                token = tokens.nextElement();
+
+                // check if the token appears in the text
+                if (words.containsKey(token.toString())) {
+                    final Word word = words.get(token.toString());
+                    // token appears in text and matches an existing bookmark tag
+                    if (tags.containsKey(token.toString())) {
+                        score = word.occurrences() * tags.get(token.toString()).size() * 200;
+                    }
+                    // token appears in text and has more than 3 characters
+                    else if (token.length()>3) {
+                        score = word.occurrences() * 100;
+                    }
+                    // if token is already part of a phrase, reduce score
+                    if(pwords.toString().indexOf(token.toString())>1) {
+                        score = score / 3;
+                    }
+                    topwords.add(new YMarkTag(token.toString(), score));
+                }
+            }
+        } finally {
+            tokens.close();
+            tokens = null;
+        }
+        score = 0;
+        buffer.setLength(0);
+        for(final YMarkTag tag : topwords) {
+            if(score < max) {
+                if(tag.size() > 100) {
+                    buffer.append(tag.name());
+                    buffer.append(YMarkUtil.TAGS_SEPARATOR);
+                    score++;
+                }
+            } else {
+                break;
+            }
+        }
+        final String clean = YMarkUtil.cleanTagsString(buffer.toString());
+        if(clean.equals(YMarkEntry.BOOKMARK.TAGS.deflt())) {
+            return MultiProtocolURL.getFileExtension(document.dc_source().getFileName());
+        }
+        return clean;
     }
 
     private static TreeMap<String, YMarkTag> getPhrases(final Document document, final int size) {
         final TreeMap<String, YMarkTag> phrases = new TreeMap<String, YMarkTag>();
         final StringBuilder phrase = new StringBuilder(128);
-        final WordTokenizer tokens = new WordTokenizer(new SentenceReader(document.getTextString()), LibraryProvider.dymLib);
+        WordTokenizer tokens = new WordTokenizer(new SentenceReader(document.getTextString()), LibraryProvider.dymLib);
         try {
             StringBuilder token;
             int count = 0;
@@ -206,6 +207,7 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
             return phrases;
         } finally {
             tokens.close();
+            tokens = null;
         }
     }
 
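NOTE: ViewFile, both methods of YMarkAutoTagger above, and the Condenser/WordTokenizer changes that follow all apply the same discipline: construct the tokenizer immediately before the only loop that consumes it, close it in a finally block (a WordTokenizer wraps a SentenceReader), and null the local reference so the object becomes collectable as soon as the loop ends. The shape, reduced to its skeleton (the loop body is illustrative only):

    WordTokenizer tokens = new WordTokenizer(new SentenceReader(text), LibraryProvider.dymLib);
    try {
        while (tokens.hasMoreElements()) {
            final StringBuilder token = tokens.nextElement();
            // ... consume token ...
        }
    } finally {
        tokens.close(); // releases the underlying SentenceReader
        tokens = null;  // drop the reference; do not touch the tokenizer after close()
    }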
diff --git a/source/net/yacy/document/Condenser.java b/source/net/yacy/document/Condenser.java
index 21e2ab26e..2d19abdbe 100644
--- a/source/net/yacy/document/Condenser.java
+++ b/source/net/yacy/document/Condenser.java
@@ -285,6 +285,7 @@ public final class Condenser {
             }
         } finally {
             wordenum.close();
+            wordenum = null;
         }
     }
 
@@ -345,7 +346,7 @@
         if (LibraryProvider.autotagging.isEmpty()) doAutotagging = false;
 
         // read source
-        final WordTokenizer wordenum = new WordTokenizer(new SentenceReader(text), meaningLib);
+        WordTokenizer wordenum = new WordTokenizer(new SentenceReader(text), meaningLib);
         try {
             while (wordenum.hasMoreElements()) {
                 word = wordenum.nextElement().toString().toLowerCase(Locale.ENGLISH);
@@ -420,6 +421,7 @@
             }
         } finally {
             wordenum.close();
+            wordenum = null;
         }
 
         if (pseudostemming) {
diff --git a/source/net/yacy/document/SentenceReader.java b/source/net/yacy/document/SentenceReader.java
index 91fd5e36e..18317bcaf 100644
--- a/source/net/yacy/document/SentenceReader.java
+++ b/source/net/yacy/document/SentenceReader.java
@@ -53,25 +53,12 @@ public class SentenceReader implements Iterator<StringBuilder>, Iterable<StringBuilder> {
-            if (this.pos >= this.text.length()) break;
-            nextChar = this.text.charAt(this.pos++);
-            //System.out.print((char) nextChar); // DEBUG
-            if (nextChar < 0) {
-                break;
-            }
+        while (this.pos < this.text.length() && (nextChar = this.text.charAt(this.pos++)) > 0) {
             c = (char) nextChar;
             if (this.pre && (nextChar == 10 || nextChar == 13)) break;
             if (c < ' ') c = ' ';
diff --git a/source/net/yacy/document/WordTokenizer.java b/source/net/yacy/document/WordTokenizer.java
index 0dc9351e7..8614c10ff 100644
--- a/source/net/yacy/document/WordTokenizer.java
+++ b/source/net/yacy/document/WordTokenizer.java
@@ -39,7 +39,7 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
     // this enumeration removes all words that contain either wrong characters or are too short
 
     private StringBuilder buffer = null;
-    private final unsievedWordsEnum e;
+    private unsievedWordsEnum e;
     private final WordCache meaningLib;
 
     public WordTokenizer(final SentenceReader sr, final WordCache meaningLib) {
@@ -82,13 +82,15 @@
     public synchronized void close() {
         this.e.close();
+        this.e = null;
+        this.buffer = null;
     }
 
     private static class unsievedWordsEnum implements Enumeration<StringBuilder> {
         // returns an enumeration of StringBuilder Objects
 
         private StringBuilder buffer = null;
-        private final SentenceReader sr;
-        private final List<StringBuilder> s;
+        private SentenceReader sr;
+        private List<StringBuilder> s;
         private int sIndex;
 
         public unsievedWordsEnum(final SentenceReader sr0) {
@@ -152,7 +154,11 @@
         }
 
         public synchronized void close() {
+            this.sIndex = 0;
+            this.s.clear();
+            this.s = null;
             this.sr.close();
+            this.sr = null;
         }
     }
 
@@ -181,7 +187,7 @@
      */
    public static SortedMap<byte[], Integer> hashSentence(final String sentence, final WordCache meaningLib, int maxlength) {
        final SortedMap<byte[], Integer> map = new TreeMap<byte[], Integer>(Base64Order.enhancedCoder);
-        final WordTokenizer words = new WordTokenizer(new SentenceReader(sentence), meaningLib);
+        WordTokenizer words = new WordTokenizer(new SentenceReader(sentence), meaningLib);
         try {
             int pos = 0;
             StringBuilder word;
@@ -202,6 +208,7 @@
             return map;
         } finally {
             words.close();
+            words = null;
         }
     }
 }
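NOTE: in SentenceReader the bounds check and the character read are folded into the loop condition. String.charAt() returns a char and can never be negative, so the removed nextChar < 0 test was dead code; the new "> 0" condition additionally terminates on a NUL character. Roughly (the old enclosing loop header is not visible in this hunk, so the "while (true)" below is an assumption):

    // before (sketch):
    while (true) {
        if (this.pos >= this.text.length()) break;
        nextChar = this.text.charAt(this.pos++);
        if (nextChar < 0) break; // dead: charAt() never returns a negative value
        // ...
    }
    // after: guard and read live in the condition; "> 0" also stops at NUL
    while (this.pos < this.text.length() && (nextChar = this.text.charAt(this.pos++)) > 0) {
        // ...
    }

Also note that WordTokenizer.close() and unsievedWordsEnum.close() now null their fields, so any call after close() would fail with a NullPointerException; close() must be the last use of the instance, as in the try/finally pattern above.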
diff --git a/source/net/yacy/peers/Protocol.java b/source/net/yacy/peers/Protocol.java
index 3075158a3..b084cf8fa 100644
--- a/source/net/yacy/peers/Protocol.java
+++ b/source/net/yacy/peers/Protocol.java
@@ -902,10 +902,12 @@ public final class Protocol {
         Map<String, String> resultMap = null;
         String key = "";
         final ContentBody keyBody = parts.get("key");
-        if ( keyBody != null ) {
-            final ByteArrayOutputStream baos = new ByteArrayOutputStream(20);
+        if (keyBody != null) {
+            ByteArrayOutputStream baos = new ByteArrayOutputStream(20);
             keyBody.writeTo(baos);
-            key = baos.toString();
+            key = UTF8.String(baos.toByteArray());
+            baos.close();
+            baos = null;
         }
 
         String filter = event.query.urlMask.pattern().toString();
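NOTE: ByteArrayOutputStream.toString() decodes with the platform default charset, so the same key bytes could decode differently on differently configured peers; decoding explicitly as UTF-8 is deterministic. Assuming YaCy's UTF8.String(byte[]) is a thin wrapper around the standard constructor, the change is equivalent to:

    final byte[] bytes = baos.toByteArray();
    final String key = new String(bytes, java.nio.charset.StandardCharsets.UTF_8);

(Closing a ByteArrayOutputStream is a no-op, so the added close() is harmless belt-and-braces.)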
@@ -1037,67 +1039,70 @@
         RemoteInstance instance = null;
         SolrConnector solrConnector = null;
         SolrDocumentList docList = null;
-        QueryResponse rsp = null;
-        if (localsearch) {
-            // search the local index
-            try {
-                rsp = event.getQuery().getSegment().fulltext().getDefaultConnector().getResponseByParams(solrQuery);
-                docList = rsp.getResults();
-            } catch (final Throwable e) {
-                Network.log.info("SEARCH failed (solr), localpeer (" + e.getMessage() + ")", e);
-                return -1;
-            }
-        } else {
-            try {
-                String address = target == event.peers.mySeed() ? "localhost:" + target.getPort() : target.getPublicAddress();
-                final int solrtimeout = Switchboard.getSwitchboard().getConfigInt(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_TIMEOUT, 10000);
-                instance = new RemoteInstance("http://" + address, null, "solr", solrtimeout); // this is a 'patch configuration' which considers 'solr' as default collection
-                solrConnector = new RemoteSolrConnector(instance, "solr");
-                rsp = solrConnector.getResponseByParams(solrQuery);
-                docList = rsp.getResults();
-                solrConnector.close();
-                instance.close();
-                // no need to close this here because that sends a commit to remote solr which is not wanted here
-            } catch (final Throwable e) {
-                Network.log.info("SEARCH failed (solr), remote Peer: " +target.getName() + "/" + target.getPublicAddress() + " (" + e.getMessage() + ")");
-                return -1;
-            }
-        }
-
-        // evaluate facets
         Map<String, ReversibleScoreMap<String>> facets = new HashMap<String, ReversibleScoreMap<String>>(event.query.facetfields.size());
-        for (String field: event.query.facetfields) {
-            FacetField facet = rsp.getFacetField(field);
-            ReversibleScoreMap<String> result = new ClusteredScoreMap<String>(UTF8.insensitiveUTF8Comparator);
-            List<Count> values = facet == null ? null : facet.getValues();
-            if (values == null) continue;
-            for (Count ff: values) {
-                int c = (int) ff.getCount();
-                if (c == 0) continue;
-                result.set(ff.getName(), c);
-            }
-            if (result.size() > 0) facets.put(field, result);
-        }
-
-        // evaluate snippets
-        Map<String, Map<String, List<String>>> rawsnippets = rsp.getHighlighting(); // a map from the urlhash to a map with key=field and value = list of snippets
         Map<String, String> snippets = new HashMap<String, String>(); // this will be a list of urlhash-snippet entries
-        if (rawsnippets != null) {
-            nextsnippet: for (Map.Entry<String, Map<String, List<String>>> re: rawsnippets.entrySet()) {
-                Map<String, List<String>> rs = re.getValue();
-                for (CollectionSchema field: snippetFields) {
-                    if (rs.containsKey(field.getSolrFieldName())) {
-                        List<String> s = rs.get(field.getSolrFieldName());
-                        if (s.size() > 0) {
-                            snippets.put(re.getKey(), s.get(0));
-                            continue nextsnippet;
-                        }
-                    }
-                }
-                // no snippet found :( --we don't assign a value here by default; that can be done as an evaluation outside this method
-            }
-        }
-
+        {// encapsulate expensive solr QueryResponse object
+            QueryResponse rsp = null;
+            if (localsearch) {
+                // search the local index
+                try {
+                    rsp = event.getQuery().getSegment().fulltext().getDefaultConnector().getResponseByParams(solrQuery);
+                    docList = rsp.getResults();
+                } catch (final Throwable e) {
+                    Network.log.info("SEARCH failed (solr), localpeer (" + e.getMessage() + ")", e);
+                    return -1;
+                }
+            } else {
+                try {
+                    String address = target == event.peers.mySeed() ? "localhost:" + target.getPort() : target.getPublicAddress();
+                    final int solrtimeout = Switchboard.getSwitchboard().getConfigInt(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_TIMEOUT, 6000);
+                    instance = new RemoteInstance("http://" + address, null, "solr", solrtimeout); // this is a 'patch configuration' which considers 'solr' as default collection
+                    solrConnector = new RemoteSolrConnector(instance, "solr");
+                    rsp = solrConnector.getResponseByParams(solrQuery);
+                    docList = rsp.getResults();
+                    solrConnector.close();
+                    instance.close();
+                    // no need to close this here because that sends a commit to remote solr which is not wanted here
+                } catch (final Throwable e) {
+                    Network.log.info("SEARCH failed (solr), remote Peer: " +target.getName() + "/" + target.getPublicAddress() + " (" + e.getMessage() + ")");
+                    return -1;
+                }
+            }
+
+            // evaluate facets
+            for (String field: event.query.facetfields) {
+                FacetField facet = rsp.getFacetField(field);
+                ReversibleScoreMap<String> result = new ClusteredScoreMap<String>(UTF8.insensitiveUTF8Comparator);
+                List<Count> values = facet == null ? null : facet.getValues();
+                if (values == null) continue;
+                for (Count ff: values) {
+                    int c = (int) ff.getCount();
+                    if (c == 0) continue;
+                    result.set(ff.getName(), c);
+                }
+                if (result.size() > 0) facets.put(field, result);
+            }
+
+            // evaluate snippets
+            Map<String, Map<String, List<String>>> rawsnippets = rsp.getHighlighting(); // a map from the urlhash to a map with key=field and value = list of snippets
+            if (rawsnippets != null) {
+                nextsnippet: for (Map.Entry<String, Map<String, List<String>>> re: rawsnippets.entrySet()) {
+                    Map<String, List<String>> rs = re.getValue();
+                    for (CollectionSchema field: snippetFields) {
+                        if (rs.containsKey(field.getSolrFieldName())) {
+                            List<String> s = rs.get(field.getSolrFieldName());
+                            if (s.size() > 0) {
+                                snippets.put(re.getKey(), s.get(0));
+                                continue nextsnippet;
+                            }
+                        }
+                    }
+                    // no snippet found :( --we don't assign a value here by default; that can be done as an evaluation outside this method
+                }
+            }
+            rsp = null;
+        }
+
         // evaluate result
         List<URIMetadataNode> container = new ArrayList<URIMetadataNode>();
         if (docList == null || docList.size() == 0) {
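NOTE: the anonymous { ... } block above is plain Java block scoping: rsp, which references the potentially large QueryResponse, goes out of scope as soon as the facets and snippets have been copied into the two maps declared before the block, instead of staying reachable until the method returns. A minimal illustration; fetchResponse and extractFacets are hypothetical helpers, not YaCy API:

    final Map<String, Integer> facets = new java.util.HashMap<String, Integer>(); // survives the block
    { // encapsulate the expensive response object
        QueryResponse rsp = fetchResponse(); // hypothetical
        facets.putAll(extractFacets(rsp));   // hypothetical
        rsp = null;                          // explicit, though the scope ends here anyway
    }
    // rsp is no longer in scope; only the extracted map stays reachable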
@@ -1164,24 +1169,25 @@
             // add the url entry to the word indexes
             container.add(urlEntry);
         }
+        final int dls = docList.size();
+        final int numFound = (int) docList.getNumFound();
+        docList.clear();
+        docList = null;
         if (localsearch) {
-            event.addNodes(container, facets, snippets, true, "localpeer", (int) docList.getNumFound());
+            event.addNodes(container, facets, snippets, true, "localpeer", numFound);
             event.addFinalize();
             event.addExpectedRemoteReferences(-count);
-            Network.log.info("local search (solr): localpeer sent " + container.size() + "/" + docList.getNumFound() + " references");
+            Network.log.info("local search (solr): localpeer sent " + container.size() + "/" + numFound + " references");
         } else {
             for (SolrInputDocument doc: docs) {
                 event.query.getSegment().putDocumentInQueue(doc);
             }
             docs.clear(); docs = null;
-            event.addNodes(container, facets, snippets, false, target.getName() + "/" + target.hash, (int) docList.getNumFound());
+            event.addNodes(container, facets, snippets, false, target.getName() + "/" + target.hash, numFound);
             event.addFinalize();
             event.addExpectedRemoteReferences(-count);
-            Network.log.info("remote search (solr): peer " + target.getName() + " sent " + (container.size() == 0 ? 0 : container.size()) + "/" + docList.getNumFound() + " references");
+            Network.log.info("remote search (solr): peer " + target.getName() + " sent " + (container.size() == 0 ? 0 : container.size()) + "/" + numFound + " references");
         }
-        final int dls = docList.size();
-        docList.clear();
-        docList = null;
         if (solrConnector != null) solrConnector.close();
         if (instance != null) instance.close();
         return dls;
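NOTE: the hunk above reorders the tail so docList can be released early: dls and numFound are captured first, docList is cleared and nulled, and both branches below report the captured numFound. In the old code dls was computed after the last docList.getNumFound() call, so the (possibly large) result list stayed reachable through the whole reporting block. The shape of the fix:

    final int dls = docList.size();                   // capture before releasing the list
    final int numFound = (int) docList.getNumFound();
    docList.clear();
    docList = null;                                   // collectable from here on
    // ... all later reporting uses dls / numFound, never docList ...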
diff --git a/source/net/yacy/search/snippet/TextSnippet.java b/source/net/yacy/search/snippet/TextSnippet.java
index 85e0bbc4f..071675708 100644
--- a/source/net/yacy/search/snippet/TextSnippet.java
+++ b/source/net/yacy/search/snippet/TextSnippet.java
@@ -191,14 +191,17 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnippet> {
             SentenceReader sr = new SentenceReader(solrText);
             sentences = new ArrayList<StringBuilder>();
             while (sr.hasNext()) {
                 sentences.add(sr.next());
             }
+            sr.close();
+            sr = null;
+            solrText = null;
         } else if (net.yacy.crawler.data.Cache.has(url.hash())) {
             // get the sentences from the cache
             final Request request = loader == null ? null : loader.request(url, true, reindexing);
@@ -213,6 +216,8 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnippet> {
                     textline.length() > 0 ? textline : this.line, false, ResultClass.SOURCE_METADATA, null);
                 return;
             }
+            sentences = null; // we don't need this here any more
 
             // try to load the resource from the cache
             Response response = null;
@@ -311,6 +317,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnippet> {