Reduced text snippet extraction processing time.

By no longer generating MD5 hashes for every word of indexed texts, processing
time is reduced by 30 to 50% on indexed documents with more than 1 MB
of plain text.
pull/137/head
luccioman 7 years ago
parent 7525594315
commit e115e57cc7
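
In essence: snippet matching previously digested every token of each candidate sentence with MD5 before comparing it against the query word hashes; after this commit the comparison works on normalized token strings directly, so no per-word digest is computed. A minimal standalone sketch of the difference (the class and method names below are illustrative, not from the YaCy sources):

    import java.nio.charset.StandardCharsets;
    import java.security.MessageDigest;
    import java.security.NoSuchAlgorithmException;
    import java.util.Base64;
    import java.util.HashSet;
    import java.util.Locale;
    import java.util.Set;

    public class TokenMatchingSketch {

        /** Old style: one MD5 digest per word before any comparison is possible. */
        static Set<String> hashedTokens(final String text) throws NoSuchAlgorithmException {
            final MessageDigest md5 = MessageDigest.getInstance("MD5");
            final Set<String> hashes = new HashSet<>();
            for (final String word : text.toLowerCase(Locale.ENGLISH).split("\\W+")) {
                // digest() also resets the MessageDigest for the next word
                hashes.add(Base64.getEncoder()
                        .encodeToString(md5.digest(word.getBytes(StandardCharsets.UTF_8))));
            }
            return hashes;
        }

        /** New style: normalized token strings are compared directly, no digest at all. */
        static Set<String> plainTokens(final String text) {
            final Set<String> words = new HashSet<>();
            for (final String word : text.toLowerCase(Locale.ENGLISH).split("\\W+")) {
                words.add(word);
            }
            return words;
        }

        public static void main(final String[] args) throws NoSuchAlgorithmException {
            final String sentence = "YaCy extracts text snippets from large indexed documents";
            // both variants answer the same membership question, but the second
            // avoids an MD5 computation for every word of the document
            System.out.println(hashedTokens(sentence).size()); // 8
            System.out.println(plainTokens(sentence).contains("snippets")); // true
        }
    }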

@@ -350,6 +350,7 @@ public class ViewFile {
         TextSnippet snippet = new TextSnippet(
                 sb.loader,
                 urlEntry,
+                goal.getIncludeWordsSet(),
                 goal.getIncludeHashes(),
                 CacheStrategy.CACHEONLY,
                 false,

@@ -21,26 +21,24 @@
 package net.yacy.document;
 
 import java.util.Collection;
+import java.util.HashSet;
 import java.util.Iterator;
 import java.util.Map;
+import java.util.Set;
 import java.util.SortedMap;
 import java.util.TreeMap;
 import java.util.TreeSet;
 
-import net.yacy.cora.storage.HandleSet;
-import net.yacy.cora.util.ConcurrentLog;
-import net.yacy.cora.util.SpaceExceededException;
-import net.yacy.kelondro.index.RowHandleSet;
 
 public class SnippetExtractor {
 
-    String snippetString;
-    HandleSet remainingHashes;
+    private String snippetString;
+    private Set<String> remainingTerms;
 
-    public SnippetExtractor(final Collection<StringBuilder> sentences, final HandleSet queryhashes, int maxLength) throws UnsupportedOperationException {
+    public SnippetExtractor(final Collection<StringBuilder> sentences, final Set<String> queryTerms, int maxLength) throws UnsupportedOperationException {
         if (sentences == null) throw new UnsupportedOperationException("sentence == null");
-        if (queryhashes == null || queryhashes.isEmpty()) throw new UnsupportedOperationException("queryhashes == null");
-        SortedMap<byte[], Integer> hs;
+        if (queryTerms == null || queryTerms.isEmpty()) throw new UnsupportedOperationException("queryTerms == null");
+        SortedMap<String, Integer> hs;
         final TreeMap<Long, StringBuilder> order = new TreeMap<Long, StringBuilder>();
         long uniqCounter = 999L;
         Integer pos;
@@ -48,9 +46,9 @@ public class SnippetExtractor {
         int linenumber = 0;
         int fullmatchcounter = 0;
         lookup: for (final StringBuilder sentence: sentences) {
-            hs = WordTokenizer.hashSentence(sentence.toString(), 100);
+            hs = WordTokenizer.tokenizeSentence(sentence.toString(), 100);
             positions = new TreeSet<Integer>();
-            for (final byte[] word: queryhashes) {
+            for (final String word: queryTerms) {
                 pos = hs.get(word);
                 if (pos != null) {
                     positions.add(pos);
@@ -65,7 +63,7 @@
             if (!positions.isEmpty()) {
                 order.put(Long.valueOf(-100000000L * (linenumber == 0 ? 1 : 0) + 10000000L * positions.size() + 1000000L * worddistance + 100000L * linelengthKey(sentence.length(), maxLength) - 10000L * linenumber + uniqCounter--), sentence);
                 if (order.size() > 5) order.remove(order.firstEntry().getKey());
-                if (positions.size() == queryhashes.size()) fullmatchcounter++;
+                if (positions.size() == queryTerms.size()) fullmatchcounter++;
                 if (fullmatchcounter >= 3) break lookup;
             }
             linenumber++;
@@ -76,31 +74,31 @@
         while (!order.isEmpty()) {
             sentence = order.remove(order.lastKey()); // sentence with the biggest score
             try {
-                tsr = new SnippetExtractor(sentence.toString(), queryhashes, maxLength);
+                tsr = new SnippetExtractor(sentence.toString(), queryTerms, maxLength);
             } catch (final UnsupportedOperationException e) {
                 continue;
             }
             this.snippetString = tsr.snippetString;
             if (this.snippetString != null && this.snippetString.length() > 0) {
-                this.remainingHashes = tsr.remainingHashes;
-                if (this.remainingHashes.isEmpty()) {
+                this.remainingTerms = tsr.remainingTerms;
+                if (this.remainingTerms.isEmpty()) {
                     // we have found the snippet
                     return; // finished!
-                } else if (this.remainingHashes.size() < queryhashes.size()) {
+                } else if (this.remainingTerms.size() < queryTerms.size()) {
                     // the result has not all words in it.
                     // find another sentence that represents the missing other words
                     // and find recursively more sentences
                     maxLength = maxLength - this.snippetString.length();
                     if (maxLength < 20) maxLength = 20;
                     try {
-                        tsr = new SnippetExtractor(order.values(), this.remainingHashes, maxLength);
+                        tsr = new SnippetExtractor(order.values(), this.remainingTerms, maxLength);
                     } catch (final UnsupportedOperationException e) {
                         throw e;
                     }
                     final String nextSnippet = tsr.snippetString;
                     if (nextSnippet == null) return;
                     this.snippetString = this.snippetString + (" / " + nextSnippet);
-                    this.remainingHashes = tsr.remainingHashes;
+                    this.remainingTerms = tsr.remainingTerms;
                     return;
                 } else {
                     // error
@@ -120,27 +118,24 @@ public class SnippetExtractor {
         return 0;
     }
 
-    private SnippetExtractor(String sentence, final HandleSet queryhashes, final int maxLength) throws UnsupportedOperationException {
+    private SnippetExtractor(String sentence, final Set<String> queryTerms, final int maxLength) throws UnsupportedOperationException {
         try {
             if (sentence == null) throw new UnsupportedOperationException("no sentence given");
-            if (queryhashes == null || queryhashes.isEmpty()) throw new UnsupportedOperationException("queryhashes == null");
-            byte[] hash;
+            if (queryTerms == null || queryTerms.isEmpty()) throw new UnsupportedOperationException("queryTerms == null");
+            String term;
 
             // find all hashes that appear in the sentence
-            final Map<byte[], Integer> hs = WordTokenizer.hashSentence(sentence, 100);
-            final Iterator<byte[]> j = queryhashes.iterator();
+            final Map<String, Integer> hs = WordTokenizer.tokenizeSentence(sentence, 100);
+            final Iterator<String> j = queryTerms.iterator();
             Integer pos;
             int p, minpos = sentence.length(), maxpos = -1;
-            final HandleSet remainingHashes = new RowHandleSet(queryhashes.keylen(), queryhashes.comparator(), 0);
+            final Set<String> remainingTerms = new HashSet<>();
             while (j.hasNext()) {
-                hash = j.next();
-                pos = hs.get(hash);
+                term = j.next();
+                pos = hs.get(term);
                 if (pos == null) {
-                    try {
-                        remainingHashes.put(hash);
-                    } catch (final SpaceExceededException e) {
-                        ConcurrentLog.logException(e);
-                    }
+                    remainingTerms.add(term);
                 } else {
                     p = pos.intValue();
                     if (p > maxpos) maxpos = p;
@@ -185,7 +180,7 @@ public class SnippetExtractor {
                 sentence = sentence.substring(6, 20).trim() + " [..] " + sentence.substring(sentence.length() - 26, sentence.length() - 6).trim();
             }
             this.snippetString = sentence;
-            this.remainingHashes = remainingHashes;
+            this.remainingTerms = remainingTerms;
         } catch (final IndexOutOfBoundsException e) {
             throw new UnsupportedOperationException(e.getMessage());
         }
@@ -195,7 +190,7 @@ public class SnippetExtractor {
         return this.snippetString;
     }
 
-    public HandleSet getRemainingWords() {
-        return this.remainingHashes;
+    public Set<String> getRemainingTerms() {
+        return this.remainingTerms;
     }
 }

@@ -27,6 +27,7 @@ package net.yacy.document;
 import java.util.ArrayList;
 import java.util.Enumeration;
 import java.util.List;
+import java.util.Locale;
 import java.util.SortedMap;
 import java.util.TreeMap;
@@ -210,4 +211,34 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
             words = null;
         }
     }
+
+    /**
+     * Tokenize the given sentence and generate a word-position mapping
+     * @param sentence the sentence to be tokenized
+     * @return an ordered map containing each word as key and its position as value. The map is ordered by words.
+     */
+    public static SortedMap<String, Integer> tokenizeSentence(final String sentence, int maxlength) {
+        final SortedMap<String, Integer> map = new TreeMap<String, Integer>();
+        WordTokenizer words = new WordTokenizer(new SentenceReader(sentence), null);
+        try {
+            int pos = 0;
+            String word;
+            Integer oldpos;
+            while (words.hasMoreElements() && maxlength-- > 0) {
+                word = words.nextElement().toString().toLowerCase(Locale.ENGLISH);
+
+                // don't overwrite old values, that leads to too far word distances
+                oldpos = map.put(word, LargeNumberCache.valueOf(pos));
+                if (oldpos != null) {
+                    map.put(word, oldpos);
+                }
+
+                pos += word.length() + 1;
+            }
+            return map;
+        } finally {
+            words.close();
+            words = null;
+        }
+    }
 }
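
For orientation, a hedged usage sketch of the new method (the sample sentence is made up, and it assumes the tokenizer yields exactly these four words):

    // ordered by word: {brown=10, fox=16, quick=4, the=0}
    // values are cumulative word offsets (word length + 1 per token),
    // which is enough to compare word distances between candidate sentences
    final SortedMap<String, Integer> wordPositions =
            WordTokenizer.tokenizeSentence("The quick brown fox", 100);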

@@ -251,18 +251,32 @@ public class QueryGoal {
     }
 
     /**
-     * @return a set of words to be included in the search result
+     * @return an iterator on the set of words to be included in the search result
      */
     public Iterator<String> getIncludeWords() {
         return this.include_words.iterator();
     }
 
+    /**
+     * @return a copy of the set of words to be included in the search result
+     */
+    public Set<String> getIncludeWordsSet() {
+        return new NormalizedWords(this.include_words);
+    }
+
     /**
-     * @return a set of words to be excluded in the search result
+     * @return an iterator on the set of words to be excluded from the search result
      */
     public Iterator<String> getExcludeWords() {
         return this.exclude_words.iterator();
     }
 
+    /**
+     * @return a copy of the set of words to be excluded from the search result
+     */
+    public Set<String> getExcludeWordsSet() {
+        return new NormalizedWords(this.exclude_words);
+    }
+
     /**
      * @return a list of include strings which reproduces the original order of the search words and quotation

@@ -183,6 +183,8 @@ public final class SearchEvent implements ScoreMapUpdatesListener {
     /** a set of word hashes that are used to match with the snippets */
     private final HandleSet snippetFetchWordHashes;
+    /** a set of words that are used to match with the snippets */
+    private final Set<String> snippetFetchWords;
 
     private final boolean deleteIfSnippetFail;
     private long urlRetrievalAllTime;
     private long snippetComputationAllTime;
@@ -531,7 +533,7 @@ public final class SearchEvent implements ScoreMapUpdatesListener {
         this.resultList = new WeakPriorityBlockingQueue<URIMetadataNode>(Math.max(max_results_node, 10 * query.itemsPerPage()), true); // this is the result, enriched with snippets, ranked and ordered by ranking
 
         // snippets do not need to match with the complete query hashes,
         // only with the query minus the stopwords which had not been used for the search
         boolean filtered = false;
         // check if query contains stopword
         if (Switchboard.stopwordHashes != null) {
@@ -547,6 +549,10 @@ public final class SearchEvent implements ScoreMapUpdatesListener {
         if (filtered) { // remove stopwords
             this.snippetFetchWordHashes.excludeDestructive(Switchboard.stopwordHashes);
         }
+        this.snippetFetchWords = query.getQueryGoal().getIncludeWordsSet();
+        // remove stopwords
+        this.snippetFetchWords.removeAll(Switchboard.stopwords);
+
 
         // clean up events
         SearchEventCache.cleanupEvents(false);
@@ -1877,6 +1883,7 @@ public final class SearchEvent implements ScoreMapUpdatesListener {
         final TextSnippet solrsnippet = new TextSnippet(node.url(), OpensearchResponseWriter.getLargestSnippet(solrsnippetlines), true, ResultClass.SOURCE_SOLR, "");
         final TextSnippet yacysnippet = new TextSnippet(this.loader,
                 node,
+                this.query.getQueryGoal().getIncludeWordsSet(),
                 this.query.getQueryGoal().getIncludeHashes(),
                 CacheStrategy.CACHEONLY,
                 false,
@@ -2000,6 +2007,7 @@ public final class SearchEvent implements ScoreMapUpdatesListener {
         final TextSnippet snippet = new TextSnippet(
                 null,
                 page,
+                this.snippetFetchWords,
                 this.snippetFetchWordHashes,
                 null,
                 ((this.query.constraint != null) && (this.query.constraint.get(Tokenizer.flag_cat_indexof))),
@@ -2016,6 +2024,7 @@ public final class SearchEvent implements ScoreMapUpdatesListener {
         final TextSnippet snippet = new TextSnippet(
                 this.loader,
                 page,
+                this.snippetFetchWords,
                 this.snippetFetchWordHashes,
                 cacheStrategy,
                 ((this.query.constraint != null) && (this.query.constraint.get(Tokenizer.flag_cat_indexof))),
@@ -2032,7 +2041,7 @@ public final class SearchEvent implements ScoreMapUpdatesListener {
             return page.makeResultEntry(this.query.getSegment(), this.peers, null); // result without snippet
         } else {
             // problems with snippet fetch
-            if (this.snippetFetchWordHashes.has(Segment.catchallHash)) {
+            if (this.snippetFetchWords.contains(Segment.catchallString)) {
                 // we accept that because the word cannot be on the page
                 return page.makeResultEntry(this.query.getSegment(), this.peers, null);
             }

@@ -126,7 +126,8 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
         SOURCE_WEB(false),
         /** Snippet computed by YaCy from document metadata */
         SOURCE_METADATA(false),
-        ERROR_NO_HASH_GIVEN(true),
+        /** Could not extract a snippet because no search term was provided */
+        ERROR_NO_TERM_GIVEN(true),
         ERROR_SOURCE_LOADING(true),
         ERROR_RESOURCE_LOADING(true),
         ERROR_PARSER_FAILED(true),
@@ -166,6 +167,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
     public TextSnippet(
             final LoaderDispatcher loader,
             final URIMetadataNode row,
+            final Set<String> queryTerms,
             final HandleSet queryhashes,
             final CacheStrategy cacheStrategy,
             final boolean pre,
@@ -175,36 +177,40 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
         // heise = "0OQUNU3JSs05"
         final DigestURL url = row.url();
-        if (queryhashes.isEmpty()) {
-            //System.out.println("found no queryhashes for URL retrieve " + url);
-            init(url, null, false, ResultClass.ERROR_NO_HASH_GIVEN, "no query hashes given", beginTime);
+        if (queryTerms.isEmpty()) {
+            init(url, null, false, ResultClass.ERROR_NO_TERM_GIVEN, "no query terms given", beginTime);
             return;
         }
 
         // try to get snippet from snippetCache
         ResultClass source = ResultClass.SOURCE_CACHE;
-        final String wordhashes = RemoteSearch.set2string(queryhashes);
-        final String urls = ASCII.String(url.hash());
-        final String snippetLine = snippetsCache.get(wordhashes, urls);
-        if (snippetLine != null) {
-            // found the snippet
-            init(url, snippetLine, false, source, null, beginTime);
-            return;
+        final String urlHash = ASCII.String(url.hash());
+        final String wordhashes;
+        if(queryhashes != null) {
+            wordhashes = RemoteSearch.set2string(queryhashes);
+            final String snippetLine = snippetsCache.get(wordhashes, urlHash);
+            if (snippetLine != null) {
+                // found the snippet
+                init(url, snippetLine, false, source, null, beginTime);
+                return;
+            }
+        } else {
+            wordhashes = null;
         }
 
         // try to get the snippet from a document at the cache (or in the web)
         // this requires that the document is parsed after loading
         String textline = null;
-        HandleSet remainingHashes = queryhashes.clone();
+        Set<String> remainingTerms = new HashSet<>(queryTerms);
         List<StringBuilder> sentences = null;
 
         // try to get the snippet from metadata
-        removeMatchingHashes(row.url().toTokens(), remainingHashes);
-        removeMatchingHashes(row.dc_title(), remainingHashes);
-        removeMatchingHashes(row.dc_creator(), remainingHashes);
-        removeMatchingHashes(row.dc_subject(), remainingHashes);
+        removeMatchingTerms(row.url().toTokens(), remainingTerms);
+        removeMatchingTerms(row.dc_title(), remainingTerms);
+        removeMatchingTerms(row.dc_creator(), remainingTerms);
+        removeMatchingTerms(row.dc_subject(), remainingTerms);
 
-        if (!remainingHashes.isEmpty()) {
+        if (!remainingTerms.isEmpty()) {
             // we did not find everything in the metadata, look further into the document itself.
             // first acquire the sentences (from description/abstract or text):
@@ -245,9 +251,9 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
             if (sentences.size() > 0) {
                 try {
-                    final SnippetExtractor tsr = new SnippetExtractor(sentences, remainingHashes, snippetMaxLength);
+                    final SnippetExtractor tsr = new SnippetExtractor(sentences, remainingTerms, snippetMaxLength);
                     textline = tsr.getSnippet();
-                    remainingHashes = tsr.getRemainingWords();
+                    remainingTerms = tsr.getRemainingTerms();
                 } catch (final UnsupportedOperationException e) {
                     init(url, null, false, ResultClass.ERROR_NO_MATCH, "snippet extractor failed:" + e.getMessage(), beginTime);
                     return;
@@ -255,7 +261,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
             }
         }
-        if (remainingHashes.isEmpty()) {
+        if (remainingTerms.isEmpty()) {
             // we found the snippet or the query is fully included in the headline or url
             if (textline == null || textline.length() == 0) {
                 // this is the case where we don't have a snippet because all search words are included in the headline or the url
@@ -347,23 +353,25 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
         }
         try {
-            final SnippetExtractor tsr = new SnippetExtractor(sentences, remainingHashes, snippetMaxLength);
+            final SnippetExtractor tsr = new SnippetExtractor(sentences, remainingTerms, snippetMaxLength);
             textline = tsr.getSnippet();
-            remainingHashes = tsr.getRemainingWords();
+            remainingTerms = tsr.getRemainingTerms();
         } catch (final UnsupportedOperationException e) {
             init(url, null, false, ResultClass.ERROR_NO_MATCH, "snippet extractor failed:" + e.getMessage(), beginTime);
            return;
         }
         sentences = null;
 
-        if (textline == null || !remainingHashes.isEmpty()) {
+        if (textline == null || !remainingTerms.isEmpty()) {
             init(url, null, false, ResultClass.ERROR_NO_MATCH, "no matching snippet found", beginTime);
             return;
         }
         if (textline.length() > snippetMaxLength) textline = textline.substring(0, snippetMaxLength);
 
         // finally store this snippet in our own cache
-        snippetsCache.put(wordhashes, urls, textline);
+        if(wordhashes != null) {
+            snippetsCache.put(wordhashes, urlHash, textline);
+        }
 
         init(url, textline, false, source, null, beginTime);
     }
@@ -589,17 +597,18 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
                 CharacterCoding.unicode2html(prefix.toString(), false));
         theWord.append(CharacterCoding.unicode2html(postfix.toString(), false));
         return theWord.toString();
     }
 
-    private static void removeMatchingHashes(final String sentence, final HandleSet queryhashes) {
-        if (queryhashes.size() == 0) return;
-        final Set<byte[]> m = WordTokenizer.hashSentence(sentence, 100).keySet();
-        //for (byte[] b: m) System.out.println("sentence hash: " + ASCII.String(b));
-        //for (byte[] b: queryhashes) System.out.println("queryhash: " + ASCII.String(b));
-        ArrayList<byte[]> o = new ArrayList<byte[]>(queryhashes.size());
-        for (final byte[] b : queryhashes) {
-            if (m.contains(b)) o.add(b);
-        }
-        for (final byte[] b : o) queryhashes.remove(b);
+    /**
+     * Modify the queryTerms set: remove terms present in the given sentence.
+     * @param sentence a sentence potentially matching some terms of queryTerms
+     * @param queryTerms a set of normalized terms
+     */
+    private static void removeMatchingTerms(final String sentence, final Set<String> queryTerms) {
+        if (queryTerms.size() == 0) {
+            return;
+        }
+        final Set<String> sentenceWords = WordTokenizer.tokenizeSentence(sentence, 100).keySet();
+        queryTerms.removeAll(sentenceWords);
     }
 }

@@ -1,20 +1,31 @@
 package net.yacy.search.snippet;
 
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import java.io.IOException;
 import java.net.MalformedURLException;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+
+import org.apache.solr.common.SolrDocument;
+import org.junit.Before;
+import org.junit.Test;
+
 import net.yacy.cora.document.encoding.ASCII;
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.federate.yacy.CacheStrategy;
-import net.yacy.cora.storage.HandleSet;
+import net.yacy.cora.protocol.Domains;
 import net.yacy.cora.util.CommonPattern;
+import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.kelondro.data.meta.URIMetadataNode;
 import net.yacy.search.query.QueryGoal;
 import net.yacy.search.query.SearchEvent;
 import net.yacy.search.schema.CollectionSchema;
-import org.apache.solr.common.SolrDocument;
-import static org.junit.Assert.*;
-import org.junit.Before;
-import org.junit.Test;
 
 public class TextSnippetTest {
@@ -55,12 +66,12 @@ public class TextSnippetTest {
         String querywords = "testcase line";
         QueryGoal qg = new QueryGoal(querywords);
-        HandleSet queryhashes = qg.getIncludeHashes();
 
         TextSnippet ts = new TextSnippet(
                 null,
                 testpage,
-                queryhashes,
+                qg.getIncludeWordsSet(),
+                qg.getIncludeHashes(),
                 cacheStrategy,
                 pre,
                 snippetMaxLength,
@@ -95,12 +106,12 @@ public class TextSnippetTest {
         String querywords = "testcase line";
         QueryGoal qg = new QueryGoal(querywords);
-        HandleSet queryhashes = qg.getIncludeHashes();
 
         TextSnippet ts = new TextSnippet(
                 null,
                 testpage,
-                queryhashes,
+                qg.getIncludeWordsSet(),
+                qg.getIncludeHashes(),
                 cacheStrategy,
                 pre,
                 snippetMaxLength,
@@ -166,4 +177,65 @@ public class TextSnippetTest {
         assertTrue ("number (.) broken up",sniptxt.contains("1.83"));
         assertTrue ("number (,) broken up",sniptxt.contains("3,14"));
     }
+
+    /**
+     * Run text snippet extraction from a given plain text file.
+     * @param args <ol><li>first element: the plain text file path. When not specified, "test/parsertest/umlaute_linux.txt" is used as default.</li>
+     * <li>other elements: the search terms. When not specified, "Maßkrügen" is used as default.</li>
+     * </ol>
+     * @throws IOException when a read/write error occurred
+     */
+    public static void main(final String args[]) throws IOException {
+        try {
+            final SolrDocument doc = new SolrDocument();
+            final DigestURL url = new DigestURL("http://localhost/page.html");
+            doc.addField(CollectionSchema.id.name(), ASCII.String(url.hash()));
+            doc.addField(CollectionSchema.sku.name(), url.toNormalform(false));
+            final URIMetadataNode urlEntry = new URIMetadataNode(doc);
+            urlEntry.addField(CollectionSchema.title.name(), "New test case");
+            urlEntry.addField(CollectionSchema.keywords.name(), "junit");
+            urlEntry.addField(CollectionSchema.author.name(), "test author");
+
+            final Path testFilePath;
+            if(args.length > 0) {
+                testFilePath = Paths.get(args[0]);
+            } else {
+                testFilePath = Paths.get("test/parsertest/umlaute_linux.txt");
+            }
+            urlEntry.addField(CollectionSchema.text_t.name(), new String(Files.readAllBytes(testFilePath),
+                    StandardCharsets.UTF_8));
+
+            final StringBuilder queryWords = new StringBuilder();
+            if(args.length > 1) {
+                for(int i = 1; i < args.length; i++) {
+                    if(queryWords.length() > 0) {
+                        queryWords.append(" ");
+                    }
+                    queryWords.append(args[i]);
+                }
+            } else {
+                queryWords.append("Maßkrügen");
+            }
+
+            final QueryGoal goal = new QueryGoal(queryWords.toString());
+            System.out.println("Extracting text snippet for terms \"" + queryWords + "\" from file " + testFilePath);
+
+            TextSnippet.statistics.setEnabled(true);
+            final TextSnippet snippet = new TextSnippet(null, urlEntry, goal.getIncludeWordsSet(), goal.getIncludeHashes(),
+                    CacheStrategy.CACHEONLY, false, SearchEvent.SNIPPET_MAX_LENGTH, false);
+
+            System.out.println("Snippet initialized in " + TextSnippet.statistics.getMaxInitTime() + "ms");
+            System.out.println("Snippet status : " + snippet.getErrorCode());
+            System.out.println("Snippet : " + snippet.descriptionline(goal));
+        } finally {
+            /* Shutdown running threads */
+            try {
+                Domains.close();
+            } finally {
+                ConcurrentLog.shutdown();
+            }
+        }
+    }
 }
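
The new main() makes it possible to exercise snippet extraction outside JUnit; presumably it can be launched from a YaCy build with something like the following (the classpath is environment-specific and not part of the commit):

    java -cp classes:lib/* net.yacy.search.snippet.TextSnippetTest test/parsertest/umlaute_linux.txt Maßkrügen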
