Reduced text snippet extraction processing time.

By no longer generating MD5 hashes for all words of the indexed texts,
processing time is reduced by 30 to 50% on indexed documents with more
than 1 MB of plain text.
pull/137/head
luccioman 7 years ago
parent 7525594315
commit e115e57cc7
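
The gist of the change, as a minimal standalone sketch (the class name and benchmark below are illustrative, not YaCy code; YaCy's real word hashing lives in Word.word2hash and is more involved): the old snippet path computed an MD5-based hash for every token of the document text just to compare it against the query word hashes, while the new path keys the position map by the lowercase token itself, so matching needs no digest at all.

// TokenCostSketch.java - hypothetical micro-benchmark, assumes Java 11+
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.Base64;
import java.util.Locale;
import java.util.SortedMap;
import java.util.TreeMap;

public class TokenCostSketch {
    public static void main(String[] args) throws NoSuchAlgorithmException {
        String text = "the quick brown fox jumps over the lazy dog ".repeat(20000);
        String[] tokens = text.split(" ");

        // old approach: digest every token (stand-in for YaCy's word hashing)
        MessageDigest md5 = MessageDigest.getInstance("MD5");
        long t0 = System.nanoTime();
        SortedMap<String, Integer> hashed = new TreeMap<>();
        int pos = 0;
        for (String token : tokens) {
            md5.reset();
            byte[] digest = md5.digest(token.toLowerCase(Locale.ENGLISH).getBytes(StandardCharsets.UTF_8));
            hashed.putIfAbsent(Base64.getEncoder().encodeToString(digest), pos);
            pos += token.length() + 1;
        }
        long t1 = System.nanoTime();

        // new approach: the lowercase token itself is the map key, no digest
        SortedMap<String, Integer> plain = new TreeMap<>();
        pos = 0;
        for (String token : tokens) {
            plain.putIfAbsent(token.toLowerCase(Locale.ENGLISH), pos);
            pos += token.length() + 1;
        }
        long t2 = System.nanoTime();

        System.out.println("with per-token MD5: " + (t1 - t0) / 1_000_000 + " ms");
        System.out.println("without hashing:    " + (t2 - t1) / 1_000_000 + " ms");
    }
}

On large documents the digest work dominates, which is where the 30 to 50% reduction comes from; the string-keyed position map also lets SnippetExtractor match query terms directly, as the diff below shows.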

@@ -350,6 +350,7 @@ public class ViewFile {
 TextSnippet snippet = new TextSnippet(
 sb.loader,
 urlEntry,
+goal.getIncludeWordsSet(),
 goal.getIncludeHashes(),
 CacheStrategy.CACHEONLY,
 false,

@@ -21,26 +21,24 @@
 package net.yacy.document;
 import java.util.Collection;
+import java.util.HashSet;
 import java.util.Iterator;
 import java.util.Map;
+import java.util.Set;
 import java.util.SortedMap;
 import java.util.TreeMap;
 import java.util.TreeSet;
-import net.yacy.cora.storage.HandleSet;
-import net.yacy.cora.util.ConcurrentLog;
-import net.yacy.cora.util.SpaceExceededException;
-import net.yacy.kelondro.index.RowHandleSet;
 public class SnippetExtractor {
-String snippetString;
-HandleSet remainingHashes;
+private String snippetString;
+private Set<String> remainingTerms;
-public SnippetExtractor(final Collection<StringBuilder> sentences, final HandleSet queryhashes, int maxLength) throws UnsupportedOperationException {
+public SnippetExtractor(final Collection<StringBuilder> sentences, final Set<String> queryTerms, int maxLength) throws UnsupportedOperationException {
 if (sentences == null) throw new UnsupportedOperationException("sentence == null");
-if (queryhashes == null || queryhashes.isEmpty()) throw new UnsupportedOperationException("queryhashes == null");
-SortedMap<byte[], Integer> hs;
+if (queryTerms == null || queryTerms.isEmpty()) throw new UnsupportedOperationException("queryTerms == null");
+SortedMap<String, Integer> hs;
 final TreeMap<Long, StringBuilder> order = new TreeMap<Long, StringBuilder>();
 long uniqCounter = 999L;
 Integer pos;
@@ -48,9 +46,9 @@ public class SnippetExtractor {
 int linenumber = 0;
 int fullmatchcounter = 0;
 lookup: for (final StringBuilder sentence: sentences) {
-hs = WordTokenizer.hashSentence(sentence.toString(), 100);
+hs = WordTokenizer.tokenizeSentence(sentence.toString(), 100);
 positions = new TreeSet<Integer>();
-for (final byte[] word: queryhashes) {
+for (final String word: queryTerms) {
 pos = hs.get(word);
 if (pos != null) {
 positions.add(pos);
@@ -65,7 +63,7 @@
 if (!positions.isEmpty()) {
 order.put(Long.valueOf(-100000000L * (linenumber == 0 ? 1 : 0) + 10000000L * positions.size() + 1000000L * worddistance + 100000L * linelengthKey(sentence.length(), maxLength) - 10000L * linenumber + uniqCounter--), sentence);
 if (order.size() > 5) order.remove(order.firstEntry().getKey());
-if (positions.size() == queryhashes.size()) fullmatchcounter++;
+if (positions.size() == queryTerms.size()) fullmatchcounter++;
 if (fullmatchcounter >= 3) break lookup;
 }
 linenumber++;
@@ -76,31 +74,31 @@
 while (!order.isEmpty()) {
 sentence = order.remove(order.lastKey()); // sentence with the biggest score
 try {
-tsr = new SnippetExtractor(sentence.toString(), queryhashes, maxLength);
+tsr = new SnippetExtractor(sentence.toString(), queryTerms, maxLength);
 } catch (final UnsupportedOperationException e) {
 continue;
 }
 this.snippetString = tsr.snippetString;
 if (this.snippetString != null && this.snippetString.length() > 0) {
-this.remainingHashes = tsr.remainingHashes;
-if (this.remainingHashes.isEmpty()) {
+this.remainingTerms = tsr.remainingTerms;
+if (this.remainingTerms.isEmpty()) {
 // we have found the snippet
 return; // finished!
-} else if (this.remainingHashes.size() < queryhashes.size()) {
+} else if (this.remainingTerms.size() < queryTerms.size()) {
 // the result has not all words in it.
 // find another sentence that represents the missing other words
 // and find recursively more sentences
 maxLength = maxLength - this.snippetString.length();
 if (maxLength < 20) maxLength = 20;
 try {
-tsr = new SnippetExtractor(order.values(), this.remainingHashes, maxLength);
+tsr = new SnippetExtractor(order.values(), this.remainingTerms, maxLength);
 } catch (final UnsupportedOperationException e) {
 throw e;
 }
 final String nextSnippet = tsr.snippetString;
 if (nextSnippet == null) return;
 this.snippetString = this.snippetString + (" / " + nextSnippet);
-this.remainingHashes = tsr.remainingHashes;
+this.remainingTerms = tsr.remainingTerms;
 return;
 } else {
 // error
@@ -120,27 +118,24 @@ public class SnippetExtractor {
 return 0;
 }
-private SnippetExtractor(String sentence, final HandleSet queryhashes, final int maxLength) throws UnsupportedOperationException {
+private SnippetExtractor(String sentence, final Set<String> queryTerms, final int maxLength) throws UnsupportedOperationException {
 try {
 if (sentence == null) throw new UnsupportedOperationException("no sentence given");
-if (queryhashes == null || queryhashes.isEmpty()) throw new UnsupportedOperationException("queryhashes == null");
-byte[] hash;
+if (queryTerms == null || queryTerms.isEmpty()) throw new UnsupportedOperationException("queryTerms == null");
+String term;
 // find all hashes that appear in the sentence
-final Map<byte[], Integer> hs = WordTokenizer.hashSentence(sentence, 100);
-final Iterator<byte[]> j = queryhashes.iterator();
+final Map<String, Integer> hs = WordTokenizer.tokenizeSentence(sentence, 100);
+final Iterator<String> j = queryTerms.iterator();
 Integer pos;
 int p, minpos = sentence.length(), maxpos = -1;
-final HandleSet remainingHashes = new RowHandleSet(queryhashes.keylen(), queryhashes.comparator(), 0);
+final Set<String> remainingTerms = new HashSet<>();
 while (j.hasNext()) {
-hash = j.next();
-pos = hs.get(hash);
+term = j.next();
+pos = hs.get(term);
 if (pos == null) {
-try {
-remainingHashes.put(hash);
-} catch (final SpaceExceededException e) {
-ConcurrentLog.logException(e);
-}
+remainingTerms.add(term);
 } else {
 p = pos.intValue();
 if (p > maxpos) maxpos = p;
@@ -185,7 +180,7 @@ public class SnippetExtractor {
 sentence = sentence.substring(6, 20).trim() + " [..] " + sentence.substring(sentence.length() - 26, sentence.length() - 6).trim();
 }
 this.snippetString = sentence;
-this.remainingHashes = remainingHashes;
+this.remainingTerms = remainingTerms;
 } catch (final IndexOutOfBoundsException e) {
 throw new UnsupportedOperationException(e.getMessage());
 }
@@ -195,7 +190,7 @@ public class SnippetExtractor {
 return this.snippetString;
 }
-public HandleSet getRemainingWords() {
-return this.remainingHashes;
-}
+public Set<String> getRemainingTerms() {
+return this.remainingTerms;
+}
 }

@@ -27,6 +27,7 @@ package net.yacy.document;
 import java.util.ArrayList;
 import java.util.Enumeration;
 import java.util.List;
+import java.util.Locale;
 import java.util.SortedMap;
 import java.util.TreeMap;
@@ -210,4 +211,34 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
 words = null;
 }
 }
+/**
+* Tokenize the given sentence and generate a word-wordPos mapping
+* @param sentence the sentence to be tokenized
+* @return an ordered map containing words as keys and positions as values. The map is ordered by words.
+*/
+public static SortedMap<String, Integer> tokenizeSentence(final String sentence, int maxlength) {
+final SortedMap<String, Integer> map = new TreeMap<String, Integer>();
+WordTokenizer words = new WordTokenizer(new SentenceReader(sentence), null);
+try {
+int pos = 0;
+String word;
+Integer oldpos;
+while (words.hasMoreElements() && maxlength-- > 0) {
+word = words.nextElement().toString().toLowerCase(Locale.ENGLISH);
+// don't overwrite old values, that leads to too far word distances
+oldpos = map.put(word, LargeNumberCache.valueOf(pos));
+if (oldpos != null) {
+map.put(word, oldpos);
+}
+pos += word.length() + 1;
+}
+return map;
+} finally {
+words.close();
+words = null;
+}
+}
 }
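
A quick usage sketch for the new method above (hypothetical caller; the exact position values depend on SentenceReader's tokenization, but are cumulative word lengths as computed in tokenizeSentence):

// mirrors how SnippetExtractor now matches query terms against a sentence
SortedMap<String, Integer> tokens = WordTokenizer.tokenizeSentence("The quick brown Fox", 100);
// tokens now maps lowercase words to positions, e.g. {brown=10, fox=16, quick=4, the=0}
boolean hit = tokens.containsKey("fox"); // query terms must be lowercased the same way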

@@ -251,18 +251,32 @@ public class QueryGoal {
 }
 /**
-* @return a set of words to be included in the search result
+* @return an iterator on the set of words to be included in the search result
 */
 public Iterator<String> getIncludeWords() {
 return this.include_words.iterator();
 }
+/**
+* @return a copy of the set of words to be included in the search result
+*/
+public Set<String> getIncludeWordsSet() {
+return new NormalizedWords(this.include_words);
+}
 /**
-* @return a set of words to be excluded in the search result
+* @return an iterator on the set of words to be excluded from the search result
 */
 public Iterator<String> getExcludeWords() {
 return this.exclude_words.iterator();
 }
+/**
+* @return a copy of the set of words to be excluded from the search result
+*/
+public Set<String> getExcludeWordsSet() {
+return new NormalizedWords(this.exclude_words);
+}
 /**
 * @return a list of include strings which reproduces the original order of the search words and quotation

@@ -183,6 +183,8 @@ public final class SearchEvent implements ScoreMapUpdatesListener {
 /** a set of word hashes that are used to match with the snippets */
 private final HandleSet snippetFetchWordHashes;
+/** a set of words that are used to match with the snippets */
+private final Set<String> snippetFetchWords;
 private final boolean deleteIfSnippetFail;
 private long urlRetrievalAllTime;
 private long snippetComputationAllTime;
@@ -531,7 +533,7 @@ public final class SearchEvent implements ScoreMapUpdatesListener {
 this.resultList = new WeakPriorityBlockingQueue<URIMetadataNode>(Math.max(max_results_node, 10 * query.itemsPerPage()), true); // this is the result, enriched with snippets, ranked and ordered by ranking
 // snippets do not need to match with the complete query hashes,
 // only with the query minus the stopwords which had not been used for the search
 boolean filtered = false;
 // check if query contains stopword
 if (Switchboard.stopwordHashes != null) {
@@ -547,6 +549,10 @@ public final class SearchEvent implements ScoreMapUpdatesListener {
 if (filtered) { // remove stopwords
 this.snippetFetchWordHashes.excludeDestructive(Switchboard.stopwordHashes);
 }
+this.snippetFetchWords = query.getQueryGoal().getIncludeWordsSet();
+// remove stopwords
+this.snippetFetchWords.removeAll(Switchboard.stopwords);
 // clean up events
 SearchEventCache.cleanupEvents(false);
@@ -1877,6 +1883,7 @@ public final class SearchEvent implements ScoreMapUpdatesListener {
 final TextSnippet solrsnippet = new TextSnippet(node.url(), OpensearchResponseWriter.getLargestSnippet(solrsnippetlines), true, ResultClass.SOURCE_SOLR, "");
 final TextSnippet yacysnippet = new TextSnippet(this.loader,
 node,
+this.query.getQueryGoal().getIncludeWordsSet(),
 this.query.getQueryGoal().getIncludeHashes(),
 CacheStrategy.CACHEONLY,
 false,
@@ -2000,6 +2007,7 @@ public final class SearchEvent implements ScoreMapUpdatesListener {
 final TextSnippet snippet = new TextSnippet(
 null,
 page,
+this.snippetFetchWords,
 this.snippetFetchWordHashes,
 null,
 ((this.query.constraint != null) && (this.query.constraint.get(Tokenizer.flag_cat_indexof))),
@@ -2016,6 +2024,7 @@ public final class SearchEvent implements ScoreMapUpdatesListener {
 final TextSnippet snippet = new TextSnippet(
 this.loader,
 page,
+this.snippetFetchWords,
 this.snippetFetchWordHashes,
 cacheStrategy,
 ((this.query.constraint != null) && (this.query.constraint.get(Tokenizer.flag_cat_indexof))),
@@ -2032,7 +2041,7 @@ public final class SearchEvent implements ScoreMapUpdatesListener {
 return page.makeResultEntry(this.query.getSegment(), this.peers, null); // result without snippet
 } else {
 // problems with snippet fetch
-if (this.snippetFetchWordHashes.has(Segment.catchallHash)) {
+if (this.snippetFetchWords.contains(Segment.catchallString)) {
 // we accept that because the word cannot be on the page
 return page.makeResultEntry(this.query.getSegment(), this.peers, null);
 }

@@ -126,7 +126,8 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnippet> {
 SOURCE_WEB(false),
 /** Snippet computed by YaCy from document metadata */
 SOURCE_METADATA(false),
-ERROR_NO_HASH_GIVEN(true),
+/** Could not extract a snippet because no search term was provided */
+ERROR_NO_TERM_GIVEN(true),
 ERROR_SOURCE_LOADING(true),
 ERROR_RESOURCE_LOADING(true),
 ERROR_PARSER_FAILED(true),
@@ -166,6 +167,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnippet> {
 public TextSnippet(
 final LoaderDispatcher loader,
 final URIMetadataNode row,
+final Set<String> queryTerms,
 final HandleSet queryhashes,
 final CacheStrategy cacheStrategy,
 final boolean pre,
@@ -175,36 +177,40 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnippet> {
 // heise = "0OQUNU3JSs05"
 final DigestURL url = row.url();
-if (queryhashes.isEmpty()) {
-//System.out.println("found no queryhashes for URL retrieve " + url);
-init(url, null, false, ResultClass.ERROR_NO_HASH_GIVEN, "no query hashes given", beginTime);
+if (queryTerms.isEmpty()) {
+init(url, null, false, ResultClass.ERROR_NO_TERM_GIVEN, "no query terms given", beginTime);
 return;
 }
 // try to get snippet from snippetCache
 ResultClass source = ResultClass.SOURCE_CACHE;
-final String wordhashes = RemoteSearch.set2string(queryhashes);
-final String urls = ASCII.String(url.hash());
-final String snippetLine = snippetsCache.get(wordhashes, urls);
-if (snippetLine != null) {
-// found the snippet
-init(url, snippetLine, false, source, null, beginTime);
-return;
+final String urlHash = ASCII.String(url.hash());
+final String wordhashes;
+if(queryhashes != null) {
+wordhashes = RemoteSearch.set2string(queryhashes);
+final String snippetLine = snippetsCache.get(wordhashes, urlHash);
+if (snippetLine != null) {
+// found the snippet
+init(url, snippetLine, false, source, null, beginTime);
+return;
+}
 }
+} else {
+wordhashes = null;
+}
 // try to get the snippet from a document at the cache (or in the web)
 // this requires that the document is parsed after loading
 String textline = null;
-HandleSet remainingHashes = queryhashes.clone();
+Set<String> remainingTerms = new HashSet<>(queryTerms);
 List<StringBuilder> sentences = null;
 // try to get the snippet from metadata
-removeMatchingHashes(row.url().toTokens(), remainingHashes);
-removeMatchingHashes(row.dc_title(), remainingHashes);
-removeMatchingHashes(row.dc_creator(), remainingHashes);
-removeMatchingHashes(row.dc_subject(), remainingHashes);
+removeMatchingTerms(row.url().toTokens(), remainingTerms);
+removeMatchingTerms(row.dc_title(), remainingTerms);
+removeMatchingTerms(row.dc_creator(), remainingTerms);
+removeMatchingTerms(row.dc_subject(), remainingTerms);
-if (!remainingHashes.isEmpty()) {
+if (!remainingTerms.isEmpty()) {
 // we did not find everything in the metadata, look further into the document itself.
 // first acquire the sentences (from description/abstract or text):
@@ -245,9 +251,9 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnippet> {
 if (sentences.size() > 0) {
 try {
-final SnippetExtractor tsr = new SnippetExtractor(sentences, remainingHashes, snippetMaxLength);
+final SnippetExtractor tsr = new SnippetExtractor(sentences, remainingTerms, snippetMaxLength);
 textline = tsr.getSnippet();
-remainingHashes = tsr.getRemainingWords();
+remainingTerms = tsr.getRemainingTerms();
 } catch (final UnsupportedOperationException e) {
 init(url, null, false, ResultClass.ERROR_NO_MATCH, "snippet extractor failed:" + e.getMessage(), beginTime);
 return;
@@ -255,7 +261,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnippet> {
 }
 }
-if (remainingHashes.isEmpty()) {
+if (remainingTerms.isEmpty()) {
 // we found the snippet or the query is fully included in the headline or url
 if (textline == null || textline.length() == 0) {
 // this is the case where we don't have a snippet because all search words are included in the headline or the url
@@ -347,23 +353,25 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnippet> {
 }
 try {
-final SnippetExtractor tsr = new SnippetExtractor(sentences, remainingHashes, snippetMaxLength);
+final SnippetExtractor tsr = new SnippetExtractor(sentences, remainingTerms, snippetMaxLength);
 textline = tsr.getSnippet();
-remainingHashes = tsr.getRemainingWords();
+remainingTerms = tsr.getRemainingTerms();
 } catch (final UnsupportedOperationException e) {
 init(url, null, false, ResultClass.ERROR_NO_MATCH, "snippet extractor failed:" + e.getMessage(), beginTime);
 return;
 }
 sentences = null;
-if (textline == null || !remainingHashes.isEmpty()) {
+if (textline == null || !remainingTerms.isEmpty()) {
 init(url, null, false, ResultClass.ERROR_NO_MATCH, "no matching snippet found", beginTime);
 return;
 }
 if (textline.length() > snippetMaxLength) textline = textline.substring(0, snippetMaxLength);
 // finally store this snippet in our own cache
-snippetsCache.put(wordhashes, urls, textline);
+if(wordhashes != null) {
+snippetsCache.put(wordhashes, urlHash, textline);
+}
 init(url, textline, false, source, null, beginTime);
 }
@@ -589,17 +597,18 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnippet> {
 CharacterCoding.unicode2html(prefix.toString(), false));
 theWord.append(CharacterCoding.unicode2html(postfix.toString(), false));
 return theWord.toString();
 }
-private static void removeMatchingHashes(final String sentence, final HandleSet queryhashes) {
-if (queryhashes.size() == 0) return;
-final Set<byte[]> m = WordTokenizer.hashSentence(sentence, 100).keySet();
-//for (byte[] b: m) System.out.println("sentence hash: " + ASCII.String(b));
-//for (byte[] b: queryhashes) System.out.println("queryhash: " + ASCII.String(b));
-ArrayList<byte[]> o = new ArrayList<byte[]>(queryhashes.size());
-for (final byte[] b : queryhashes) {
-if (m.contains(b)) o.add(b);
-}
+/**
+* Modify the queryTerms set: remove terms present in the given sentence.
+* @param sentence a sentence potentially matching some terms of queryTerms
+* @param queryTerms a set of normalized terms
+*/
+private static void removeMatchingTerms(final String sentence, final Set<String> queryTerms) {
+if (queryTerms.size() == 0) {
+return;
+}
-for (final byte[] b : o) queryhashes.remove(b);
+final Set<String> sentenceWords = WordTokenizer.tokenizeSentence(sentence, 100).keySet();
+queryTerms.removeAll(sentenceWords);
 }
 }

@@ -1,20 +1,31 @@
 package net.yacy.search.snippet;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+import java.io.IOException;
 import java.net.MalformedURLException;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import org.apache.solr.common.SolrDocument;
+import org.junit.Before;
+import org.junit.Test;
 import net.yacy.cora.document.encoding.ASCII;
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.federate.yacy.CacheStrategy;
 import net.yacy.cora.storage.HandleSet;
+import net.yacy.cora.protocol.Domains;
 import net.yacy.cora.util.CommonPattern;
+import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.kelondro.data.meta.URIMetadataNode;
 import net.yacy.search.query.QueryGoal;
+import net.yacy.search.query.SearchEvent;
 import net.yacy.search.schema.CollectionSchema;
-import org.apache.solr.common.SolrDocument;
-import static org.junit.Assert.*;
-import org.junit.Before;
-import org.junit.Test;
 public class TextSnippetTest {
@@ -55,12 +66,12 @@ public class TextSnippetTest {
 String querywords = "testcase line";
 QueryGoal qg = new QueryGoal(querywords);
-HandleSet queryhashes = qg.getIncludeHashes();
 TextSnippet ts = new TextSnippet(
 null,
 testpage,
-queryhashes,
+qg.getIncludeWordsSet(),
+qg.getIncludeHashes(),
 cacheStrategy,
 pre,
 snippetMaxLength,
@@ -95,12 +106,12 @@ public class TextSnippetTest {
 String querywords = "testcase line";
 QueryGoal qg = new QueryGoal(querywords);
-HandleSet queryhashes = qg.getIncludeHashes();
 TextSnippet ts = new TextSnippet(
 null,
 testpage,
-queryhashes,
+qg.getIncludeWordsSet(),
+qg.getIncludeHashes(),
 cacheStrategy,
 pre,
 snippetMaxLength,
@@ -166,4 +177,65 @@ public class TextSnippetTest {
 assertTrue ("number (.) broken up",sniptxt.contains("1.83"));
 assertTrue ("number (,) broken up",sniptxt.contains("3,14"));
 }
+/**
+* Run text snippet extraction from a given plain text file.
+* @param args <ol><li>first element: the plain text file path. When not specified, "test/parsertest/umlaute_linux.txt" is used as default.</li>
+* <li>other elements: the search terms. When not specified, "Maßkrügen" is used as default</li>
+* </ol>
+* @throws IOException when a read/write error occurs
+*/
+public static void main(final String args[]) throws IOException {
+try {
+final SolrDocument doc = new SolrDocument();
+final DigestURL url = new DigestURL("http://localhost/page.html");
+doc.addField(CollectionSchema.id.name(), ASCII.String(url.hash()));
+doc.addField(CollectionSchema.sku.name(), url.toNormalform(false));
+final URIMetadataNode urlEntry = new URIMetadataNode(doc);
+urlEntry.addField(CollectionSchema.title.name(), "New test case");
+urlEntry.addField(CollectionSchema.keywords.name(), "junit");
+urlEntry.addField(CollectionSchema.author.name(), "test author");
+final Path testFilePath;
+if(args.length > 0) {
+testFilePath = Paths.get(args[0]);
+} else {
+testFilePath = Paths.get("test/parsertest/umlaute_linux.txt");
+}
+urlEntry.addField(CollectionSchema.text_t.name(), new String(Files.readAllBytes(testFilePath),
+StandardCharsets.UTF_8));
+final StringBuilder queryWords = new StringBuilder();
+if(args.length > 1) {
+for(int i = 1; i < args.length; i++) {
+if(queryWords.length() > 0) {
+queryWords.append(" ");
+}
+queryWords.append(args[i]);
+}
+} else {
+queryWords.append("Maßkrügen");
+}
+final QueryGoal goal = new QueryGoal(queryWords.toString());
+System.out.println("Extracting text snippet for terms \"" + queryWords + "\" from file " + testFilePath);
+TextSnippet.statistics.setEnabled(true);
+final TextSnippet snippet = new TextSnippet(null, urlEntry, goal.getIncludeWordsSet(), goal.getIncludeHashes(),
+CacheStrategy.CACHEONLY, false, SearchEvent.SNIPPET_MAX_LENGTH, false);
+System.out.println("Snippet initialized in " + TextSnippet.statistics.getMaxInitTime() + "ms");
+System.out.println("Snippet status : " + snippet.getErrorCode());
+System.out.println("Snippet : " + snippet.descriptionline(goal));
+} finally {
+/* Shutdown running threads */
+try {
+Domains.close();
+} finally {
+ConcurrentLog.shutdown();
+}
+}
+}
 }
