Hacks to prevent storage of data longer than necessary during search, plus
some speed enhancements. This should reduce the memory usage during
heavy-load search a bit.
pull/1/head
Michael Peter Christen 11 years ago
parent 3c3cb78555
commit 9bb7eab389

@ -1095,7 +1095,7 @@ federated.service.solr.indexing.url = http://127.0.0.1:8983/solr
federated.service.solr.indexing.sharding = MODULO_HOST_MD5
# the lazy attribute causes fields containing "" or 0 to be neither added nor written
federated.service.solr.indexing.lazy = true
federated.service.solr.indexing.timeout = 10000
federated.service.solr.indexing.timeout = 6000
# temporary definition of backend services to use.
# After the migration a rwi+solr combination is used, the solr contains the content of the previously used metadata-db.

@ -31,7 +31,6 @@ import java.io.IOException;
import java.net.MalformedURLException;
import java.util.Arrays;
import java.util.Collection;
import java.util.Enumeration;
import java.util.Iterator;
import java.util.Map;
import net.yacy.cora.document.encoding.ASCII;
@ -290,17 +289,21 @@ public class ViewFile {
// Search word highlighting
for (final StringBuilder s: sentences) {
sentence = s.toString();
Enumeration<StringBuilder> tokens = null;
tokens = new WordTokenizer(new SentenceReader(sentence), LibraryProvider.dymLib);
while (tokens.hasMoreElements()) {
token = tokens.nextElement();
if (token.length() > 0) {
prop.put("viewMode_words_" + i + "_nr", i + 1);
prop.put("viewMode_words_" + i + "_word", token.toString());
prop.put("viewMode_words_" + i + "_dark", dark ? "1" : "0");
dark = !dark;
i++;
WordTokenizer tokens = new WordTokenizer(new SentenceReader(sentence), LibraryProvider.dymLib);
try {
while (tokens.hasMoreElements()) {
token = tokens.nextElement();
if (token.length() > 0) {
prop.put("viewMode_words_" + i + "_nr", i + 1);
prop.put("viewMode_words_" + i + "_word", token.toString());
prop.put("viewMode_words_" + i + "_dark", dark ? "1" : "0");
dark = !dark;
i++;
}
}
} finally {
tokens.close();
tokens = null;
}
}
}

@ -28,14 +28,15 @@ import java.io.InputStream;
import java.io.RandomAccessFile;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Queue;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.Callable;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.LinkedBlockingDeque;
import java.util.concurrent.LinkedBlockingQueue;
import net.yacy.cora.document.encoding.UTF8;
@ -48,7 +49,7 @@ import net.yacy.cora.util.Memory;
public class Digest {
public static BlockingQueue<MessageDigest> digestPool = new LinkedBlockingDeque<MessageDigest>();
public static Queue<MessageDigest> digestPool = new ConcurrentLinkedQueue<MessageDigest>();
private static final int md5CacheSize = Math.max(1000, Math.min(1000000, (int) (Memory.available() / 50000L)));
private static ARC<String, byte[]> md5Cache = null;
@ -138,11 +139,8 @@ public class Digest {
digest.update(keyBytes);
final byte[] result = digest.digest();
digest.reset(); // to be prepared for next
try {
digestPool.put(digest);
//System.out.println("Digest Pool size = " + digestPool.size());
} catch (final InterruptedException e ) {
}
digestPool.add(digest);
//System.out.println("Digest Pool size = " + digestPool.size());
// update the cache
md5Cache.insertIfAbsent(key, result); // prevent expensive MD5 computation and encoding

@ -95,84 +95,85 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
buffer.append(document.dc_title().toLowerCase());
for (String s:document.dc_description()) buffer.append(s.toLowerCase());
buffer.append(document.dc_subject(' ').toLowerCase());
final WordTokenizer tokens = new WordTokenizer(new SentenceReader(buffer.toString()), LibraryProvider.dymLib);
try {
int score = 0;
int score = 0;
// get phrases
final TreeMap<String, YMarkTag> phrases = getPhrases(document, 2);
phrases.putAll(getPhrases(document, 3));
final Iterator<String> iter = phrases.keySet().iterator();
while(iter.hasNext()) {
// get phrases
final TreeMap<String, YMarkTag> phrases = getPhrases(document, 2);
phrases.putAll(getPhrases(document, 3));
final Iterator<String> iter = phrases.keySet().iterator();
while(iter.hasNext()) {
score = 10;
final String phrase = iter.next();
if(phrases.get(phrase).size() > 3 && phrases.get(phrase).size() < 10) {
score = phrases.get(phrase).size() * phrase.split(" ").length * 20;
}
if(isDigitSpace(phrase)) {
score = 10;
final String phrase = iter.next();
if(phrases.get(phrase).size() > 3 && phrases.get(phrase).size() < 10) {
score = phrases.get(phrase).size() * phrase.split(" ").length * 20;
}
if(isDigitSpace(phrase)) {
score = 10;
}
if(phrases.get(phrase).size() > 2 && buffer.indexOf(phrase) > 1) {
score = score * 10;
}
if (tags.containsKey(phrase)) {
score = score * 20;
}
topwords.add(new YMarkTag(phrase, score));
pwords.append(phrase);
pwords.append(' ');
}
// loop through potential tag and rank them
while(tokens.hasMoreElements()) {
score = 0;
token = tokens.nextElement();
// check if the token appears in the text
if (words.containsKey(token.toString())) {
final Word word = words.get(token.toString());
// token appears in text and matches an existing bookmark tag
if (tags.containsKey(token.toString())) {
score = word.occurrences() * tags.get(token.toString()).size() * 200;
}
// token appears in text and has more than 3 characters
else if (token.length()>3) {
score = word.occurrences() * 100;
}
// if token is already part of a phrase, reduce score
if(pwords.toString().indexOf(token.toString())>1) {
score = score / 3;
}
topwords.add(new YMarkTag(token.toString(), score));
}
if(phrases.get(phrase).size() > 2 && buffer.indexOf(phrase) > 1) {
score = score * 10;
}
score = 0;
buffer.setLength(0);
for(final YMarkTag tag : topwords) {
if(score < max) {
if(tag.size() > 100) {
buffer.append(tag.name());
buffer.append(YMarkUtil.TAGS_SEPARATOR);
score++;
}
} else {
break;
}
if (tags.containsKey(phrase)) {
score = score * 20;
}
final String clean = YMarkUtil.cleanTagsString(buffer.toString());
if(clean.equals(YMarkEntry.BOOKMARK.TAGS.deflt())) {
return MultiProtocolURL.getFileExtension(document.dc_source().getFileName());
topwords.add(new YMarkTag(phrase, score));
pwords.append(phrase);
pwords.append(' ');
}
// loop through potential tag and rank them
WordTokenizer tokens = new WordTokenizer(new SentenceReader(buffer.toString()), LibraryProvider.dymLib);
try {
while (tokens.hasMoreElements()) {
score = 0;
token = tokens.nextElement();
// check if the token appears in the text
if (words.containsKey(token.toString())) {
final Word word = words.get(token.toString());
// token appears in text and matches an existing bookmark tag
if (tags.containsKey(token.toString())) {
score = word.occurrences() * tags.get(token.toString()).size() * 200;
}
// token appears in text and has more than 3 characters
else if (token.length()>3) {
score = word.occurrences() * 100;
}
// if token is already part of a phrase, reduce score
if(pwords.toString().indexOf(token.toString())>1) {
score = score / 3;
}
topwords.add(new YMarkTag(token.toString(), score));
}
}
} finally {
tokens.close();
tokens = null;
}
score = 0;
buffer.setLength(0);
for(final YMarkTag tag : topwords) {
if(score < max) {
if(tag.size() > 100) {
buffer.append(tag.name());
buffer.append(YMarkUtil.TAGS_SEPARATOR);
score++;
}
} else {
break;
}
return clean;
} finally {
tokens.close();
}
final String clean = YMarkUtil.cleanTagsString(buffer.toString());
if(clean.equals(YMarkEntry.BOOKMARK.TAGS.deflt())) {
return MultiProtocolURL.getFileExtension(document.dc_source().getFileName());
}
return clean;
}
private static TreeMap<String, YMarkTag> getPhrases(final Document document, final int size) {
final TreeMap<String, YMarkTag> phrases = new TreeMap<String, YMarkTag>();
final StringBuilder phrase = new StringBuilder(128);
final WordTokenizer tokens = new WordTokenizer(new SentenceReader(document.getTextString()), LibraryProvider.dymLib);
WordTokenizer tokens = new WordTokenizer(new SentenceReader(document.getTextString()), LibraryProvider.dymLib);
try {
StringBuilder token;
int count = 0;
@ -206,6 +207,7 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
return phrases;
} finally {
tokens.close();
tokens = null;
}
}

@ -285,6 +285,7 @@ public final class Condenser {
}
} finally {
wordenum.close();
wordenum = null;
}
}
@ -345,7 +346,7 @@ public final class Condenser {
if (LibraryProvider.autotagging.isEmpty()) doAutotagging = false;
// read source
final WordTokenizer wordenum = new WordTokenizer(new SentenceReader(text), meaningLib);
WordTokenizer wordenum = new WordTokenizer(new SentenceReader(text), meaningLib);
try {
while (wordenum.hasMoreElements()) {
word = wordenum.nextElement().toString().toLowerCase(Locale.ENGLISH);
@ -420,6 +421,7 @@ public final class Condenser {
}
} finally {
wordenum.close();
wordenum = null;
}
if (pseudostemming) {

@ -53,25 +53,12 @@ public class SentenceReader implements Iterator<StringBuilder>, Iterable<StringB
}
private StringBuilder nextElement0() {
final StringBuilder s = readSentence();
//System.out.println(" SENTENCE='" + s + "'"); // DEBUG
if (s == null) return null;
return s;
}
private StringBuilder readSentence() {
final StringBuilder s = new StringBuilder(80);
int nextChar;
char c, lc = ' '; // starting with ' ' as last character prevents that the result string starts with a ' '
// find sentence end
while (true) {
if (this.pos >= this.text.length()) break;
nextChar = this.text.charAt(this.pos++);
//System.out.print((char) nextChar); // DEBUG
if (nextChar < 0) {
break;
}
while (this.pos < this.text.length() && (nextChar = this.text.charAt(this.pos++)) > 0) {
c = (char) nextChar;
if (this.pre && (nextChar == 10 || nextChar == 13)) break;
if (c < ' ') c = ' ';

@ -39,7 +39,7 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
// this enumeration removes all words that contain either wrong characters or are too short
private StringBuilder buffer = null;
private final unsievedWordsEnum e;
private unsievedWordsEnum e;
private final WordCache meaningLib;
public WordTokenizer(final SentenceReader sr, final WordCache meaningLib) {
@ -82,13 +82,15 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
/**
 * Closes the wrapped word enumeration and drops the references held by this
 * tokenizer so the referenced objects can be garbage-collected early.
 * <p>
 * The method is idempotent: because the fields are set to {@code null} on
 * the first call, later calls must not dereference them again.
 */
public synchronized void close() {
    if (this.e != null) {
        this.e.close();
        this.e = null;
    }
    this.buffer = null;
}
private static class unsievedWordsEnum implements Enumeration<StringBuilder> {
// returns an enumeration of StringBuilder Objects
private StringBuilder buffer = null;
private final SentenceReader sr;
private final List<StringBuilder> s;
private SentenceReader sr;
private List<StringBuilder> s;
private int sIndex;
public unsievedWordsEnum(final SentenceReader sr0) {
@ -152,7 +154,11 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
}
public synchronized void close() {
this.sIndex = 0;
this.s.clear();
this.s = null;
this.sr.close();
this.sr = null;
}
}
@ -181,7 +187,7 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
*/
public static SortedMap<byte[], Integer> hashSentence(final String sentence, final WordCache meaningLib, int maxlength) {
final SortedMap<byte[], Integer> map = new TreeMap<byte[], Integer>(Base64Order.enhancedCoder);
final WordTokenizer words = new WordTokenizer(new SentenceReader(sentence), meaningLib);
WordTokenizer words = new WordTokenizer(new SentenceReader(sentence), meaningLib);
try {
int pos = 0;
StringBuilder word;
@ -202,6 +208,7 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
return map;
} finally {
words.close();
words = null;
}
}
}

@ -902,10 +902,12 @@ public final class Protocol {
Map<String, String> resultMap = null;
String key = "";
final ContentBody keyBody = parts.get("key");
if ( keyBody != null ) {
final ByteArrayOutputStream baos = new ByteArrayOutputStream(20);
if (keyBody != null) {
ByteArrayOutputStream baos = new ByteArrayOutputStream(20);
keyBody.writeTo(baos);
key = baos.toString();
key = UTF8.String(baos.toByteArray());
baos.close();
baos = null;
}
String filter = event.query.urlMask.pattern().toString();
@ -1037,67 +1039,70 @@ public final class Protocol {
RemoteInstance instance = null;
SolrConnector solrConnector = null;
SolrDocumentList docList = null;
QueryResponse rsp = null;
if (localsearch) {
// search the local index
try {
rsp = event.getQuery().getSegment().fulltext().getDefaultConnector().getResponseByParams(solrQuery);
docList = rsp.getResults();
} catch (final Throwable e) {
Network.log.info("SEARCH failed (solr), localpeer (" + e.getMessage() + ")", e);
return -1;
}
} else {
try {
String address = target == event.peers.mySeed() ? "localhost:" + target.getPort() : target.getPublicAddress();
final int solrtimeout = Switchboard.getSwitchboard().getConfigInt(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_TIMEOUT, 10000);
instance = new RemoteInstance("http://" + address, null, "solr", solrtimeout); // this is a 'patch configuration' which considers 'solr' as default collection
solrConnector = new RemoteSolrConnector(instance, "solr");
rsp = solrConnector.getResponseByParams(solrQuery);
docList = rsp.getResults();
solrConnector.close();
instance.close();
// no need to close this here because that sends a commit to remote solr which is not wanted here
} catch (final Throwable e) {
Network.log.info("SEARCH failed (solr), remote Peer: " +target.getName() + "/" + target.getPublicAddress() + " (" + e.getMessage() + ")");
return -1;
}
}
// evaluate facets
Map<String, ReversibleScoreMap<String>> facets = new HashMap<String, ReversibleScoreMap<String>>(event.query.facetfields.size());
for (String field: event.query.facetfields) {
FacetField facet = rsp.getFacetField(field);
ReversibleScoreMap<String> result = new ClusteredScoreMap<String>(UTF8.insensitiveUTF8Comparator);
List<Count> values = facet == null ? null : facet.getValues();
if (values == null) continue;
for (Count ff: values) {
int c = (int) ff.getCount();
if (c == 0) continue;
result.set(ff.getName(), c);
}
if (result.size() > 0) facets.put(field, result);
}
// evaluate snippets
Map<String, Map<String, List<String>>> rawsnippets = rsp.getHighlighting(); // a map from the urlhash to a map with key=field and value = list of snippets
Map<String, String> snippets = new HashMap<String, String>(); // this will be a list of urlhash-snippet entries
if (rawsnippets != null) {
nextsnippet: for (Map.Entry<String, Map<String, List<String>>> re: rawsnippets.entrySet()) {
Map<String, List<String>> rs = re.getValue();
for (CollectionSchema field: snippetFields) {
if (rs.containsKey(field.getSolrFieldName())) {
List<String> s = rs.get(field.getSolrFieldName());
if (s.size() > 0) {
snippets.put(re.getKey(), s.get(0));
continue nextsnippet;
{// encapsulate expensive solr QueryResponse object
QueryResponse rsp = null;
if (localsearch) {
// search the local index
try {
rsp = event.getQuery().getSegment().fulltext().getDefaultConnector().getResponseByParams(solrQuery);
docList = rsp.getResults();
} catch (final Throwable e) {
Network.log.info("SEARCH failed (solr), localpeer (" + e.getMessage() + ")", e);
return -1;
}
} else {
try {
String address = target == event.peers.mySeed() ? "localhost:" + target.getPort() : target.getPublicAddress();
final int solrtimeout = Switchboard.getSwitchboard().getConfigInt(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_TIMEOUT, 6000);
instance = new RemoteInstance("http://" + address, null, "solr", solrtimeout); // this is a 'patch configuration' which considers 'solr' as default collection
solrConnector = new RemoteSolrConnector(instance, "solr");
rsp = solrConnector.getResponseByParams(solrQuery);
docList = rsp.getResults();
solrConnector.close();
instance.close();
// no need to close this here because that sends a commit to remote solr which is not wanted here
} catch (final Throwable e) {
Network.log.info("SEARCH failed (solr), remote Peer: " +target.getName() + "/" + target.getPublicAddress() + " (" + e.getMessage() + ")");
return -1;
}
}
// evaluate facets
for (String field: event.query.facetfields) {
FacetField facet = rsp.getFacetField(field);
ReversibleScoreMap<String> result = new ClusteredScoreMap<String>(UTF8.insensitiveUTF8Comparator);
List<Count> values = facet == null ? null : facet.getValues();
if (values == null) continue;
for (Count ff: values) {
int c = (int) ff.getCount();
if (c == 0) continue;
result.set(ff.getName(), c);
}
if (result.size() > 0) facets.put(field, result);
}
// evaluate snippets
Map<String, Map<String, List<String>>> rawsnippets = rsp.getHighlighting(); // a map from the urlhash to a map with key=field and value = list of snippets
if (rawsnippets != null) {
nextsnippet: for (Map.Entry<String, Map<String, List<String>>> re: rawsnippets.entrySet()) {
Map<String, List<String>> rs = re.getValue();
for (CollectionSchema field: snippetFields) {
if (rs.containsKey(field.getSolrFieldName())) {
List<String> s = rs.get(field.getSolrFieldName());
if (s.size() > 0) {
snippets.put(re.getKey(), s.get(0));
continue nextsnippet;
}
}
}
// no snippet found :( --we don't assign a value here by default; that can be done as an evaluation outside this method
}
// no snippet found :( --we don't assign a value here by default; that can be done as an evaluation outside this method
}
rsp = null;
}
// evaluate result
List<URIMetadataNode> container = new ArrayList<URIMetadataNode>();
if (docList == null || docList.size() == 0) {
@ -1164,24 +1169,25 @@ public final class Protocol {
// add the url entry to the word indexes
container.add(urlEntry);
}
final int dls = docList.size();
final int numFound = (int) docList.getNumFound();
docList.clear();
docList = null;
if (localsearch) {
event.addNodes(container, facets, snippets, true, "localpeer", (int) docList.getNumFound());
event.addNodes(container, facets, snippets, true, "localpeer", numFound);
event.addFinalize();
event.addExpectedRemoteReferences(-count);
Network.log.info("local search (solr): localpeer sent " + container.size() + "/" + docList.getNumFound() + " references");
Network.log.info("local search (solr): localpeer sent " + container.size() + "/" + numFound + " references");
} else {
for (SolrInputDocument doc: docs) {
event.query.getSegment().putDocumentInQueue(doc);
}
docs.clear(); docs = null;
event.addNodes(container, facets, snippets, false, target.getName() + "/" + target.hash, (int) docList.getNumFound());
event.addNodes(container, facets, snippets, false, target.getName() + "/" + target.hash, numFound);
event.addFinalize();
event.addExpectedRemoteReferences(-count);
Network.log.info("remote search (solr): peer " + target.getName() + " sent " + (container.size() == 0 ? 0 : container.size()) + "/" + docList.getNumFound() + " references");
Network.log.info("remote search (solr): peer " + target.getName() + " sent " + (container.size() == 0 ? 0 : container.size()) + "/" + numFound + " references");
}
final int dls = docList.size();
docList.clear();
docList = null;
if (solrConnector != null) solrConnector.close();
if (instance != null) instance.close();
return dls;

@ -191,14 +191,17 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
// we did not find everything in the metadata, look further into the document itself.
// first acquire the sentences:
final String solrText = row.getText();
String solrText = row.getText();
if (solrText != null) {
// compute sentences from solr query
final SentenceReader sr = new SentenceReader(solrText, pre);
SentenceReader sr = new SentenceReader(solrText, pre);
sentences = new ArrayList<StringBuilder>();
while (sr.hasNext()) {
sentences.add(sr.next());
}
sr.close();
sr = null;
solrText = null;
} else if (net.yacy.crawler.data.Cache.has(url.hash())) {
// get the sentences from the cache
final Request request = loader == null ? null : loader.request(url, true, reindexing);
@ -213,6 +216,8 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
try {
document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse());
sentences = document.getSentences(pre);
response = null;
document = null;
} catch (final Parser.Failure e) {
}
}
@ -254,6 +259,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
init(url.hash(), textline.length() > 0 ? textline : this.line, false, ResultClass.SOURCE_METADATA, null);
return;
}
sentences = null; // we don't need this here any more
// try to load the resource from the cache
Response response = null;
@ -311,6 +317,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
init(url.hash(), null, false, ResultClass.ERROR_NO_MATCH, "snippet extractor failed:" + e.getMessage());
return;
}
sentences = null;
} //encapsulate potential expensive sentences END
// compute snippet from media - attention document closed above!

Loading…
Cancel
Save