From e115e57cc7a3bb34406656a701f33f013380691c Mon Sep 17 00:00:00 2001
From: luccioman
Date: Fri, 11 May 2018 15:42:53 +0200
Subject: [PATCH] Reduced text snippet extraction processing time.

By not generating MD5 hashes on all words of indexed texts, processing
time is reduced by 30 to 50% on indexed documents with more than 1 MB
of plain text.
---
 htroot/ViewFile.java                          |  1 +
 .../net/yacy/document/SnippetExtractor.java   | 67 +++++++-------
 source/net/yacy/document/WordTokenizer.java   | 31 +++++++
 source/net/yacy/search/query/QueryGoal.java   | 18 +++-
 source/net/yacy/search/query/SearchEvent.java | 13 ++-
 .../net/yacy/search/snippet/TextSnippet.java  | 79 ++++++++--------
 .../yacy/search/snippet/TextSnippetTest.java  | 90 +++++++++++++++++--
 7 files changed, 215 insertions(+), 84 deletions(-)

diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java
index d99ada761..60cc6cbea 100644
--- a/htroot/ViewFile.java
+++ b/htroot/ViewFile.java
@@ -350,6 +350,7 @@ public class ViewFile {
             TextSnippet snippet = new TextSnippet(
                     sb.loader,
                     urlEntry,
+                    goal.getIncludeWordsSet(),
                     goal.getIncludeHashes(),
                     CacheStrategy.CACHEONLY,
                     false,
diff --git a/source/net/yacy/document/SnippetExtractor.java b/source/net/yacy/document/SnippetExtractor.java
index 803fa35bf..4302b4cdd 100644
--- a/source/net/yacy/document/SnippetExtractor.java
+++ b/source/net/yacy/document/SnippetExtractor.java
@@ -21,26 +21,24 @@
 package net.yacy.document;
 
 import java.util.Collection;
+import java.util.HashSet;
 import java.util.Iterator;
 import java.util.Map;
+import java.util.Set;
 import java.util.SortedMap;
 import java.util.TreeMap;
 import java.util.TreeSet;
 
-import net.yacy.cora.storage.HandleSet;
-import net.yacy.cora.util.ConcurrentLog;
-import net.yacy.cora.util.SpaceExceededException;
-import net.yacy.kelondro.index.RowHandleSet;
-
 public class SnippetExtractor {
 
-    String snippetString;
-    HandleSet remainingHashes;
+    private String snippetString;
+    private Set<String> remainingTerms;
 
-    public SnippetExtractor(final Collection<StringBuilder> sentences, final HandleSet queryhashes, int maxLength) throws UnsupportedOperationException {
+
+    public SnippetExtractor(final Collection<StringBuilder> sentences, final Set<String> queryTerms, int maxLength) throws UnsupportedOperationException {
         if (sentences == null) throw new UnsupportedOperationException("sentence == null");
-        if (queryhashes == null || queryhashes.isEmpty()) throw new UnsupportedOperationException("queryhashes == null");
-        SortedMap<byte[], Integer> hs;
+        if (queryTerms == null || queryTerms.isEmpty()) throw new UnsupportedOperationException("queryTerms == null");
+        SortedMap<String, Integer> hs;
         final TreeMap<Long, StringBuilder> order = new TreeMap<Long, StringBuilder>();
         long uniqCounter = 999L;
         Integer pos;
@@ -48,9 +46,9 @@ public class SnippetExtractor {
         int linenumber = 0;
         int fullmatchcounter = 0;
         lookup: for (final StringBuilder sentence: sentences) {
-            hs = WordTokenizer.hashSentence(sentence.toString(), 100);
+            hs = WordTokenizer.tokenizeSentence(sentence.toString(), 100);
             positions = new TreeSet<Integer>();
-            for (final byte[] word: queryhashes) {
+            for (final String word: queryTerms) {
                 pos = hs.get(word);
                 if (pos != null) {
                     positions.add(pos);
@@ -65,7 +63,7 @@ public class SnippetExtractor {
             if (!positions.isEmpty()) {
                 order.put(Long.valueOf(-100000000L * (linenumber == 0 ? 1 : 0) + 10000000L * positions.size() + 1000000L * worddistance + 100000L * linelengthKey(sentence.length(), maxLength) - 10000L * linenumber + uniqCounter--), sentence);
                 if (order.size() > 5) order.remove(order.firstEntry().getKey());
-                if (positions.size() == queryhashes.size()) fullmatchcounter++;
+                if (positions.size() == queryTerms.size()) fullmatchcounter++;
                 if (fullmatchcounter >= 3) break lookup;
             }
             linenumber++;
@@ -76,31 +74,31 @@ public class SnippetExtractor {
         while (!order.isEmpty()) {
             sentence = order.remove(order.lastKey()); // sentence with the biggest score
             try {
-                tsr = new SnippetExtractor(sentence.toString(), queryhashes, maxLength);
+                tsr = new SnippetExtractor(sentence.toString(), queryTerms, maxLength);
             } catch (final UnsupportedOperationException e) {
                 continue;
             }
             this.snippetString = tsr.snippetString;
             if (this.snippetString != null && this.snippetString.length() > 0) {
-                this.remainingHashes = tsr.remainingHashes;
-                if (this.remainingHashes.isEmpty()) {
+                this.remainingTerms = tsr.remainingTerms;
+                if (this.remainingTerms.isEmpty()) {
                     // we have found the snippet
                     return; // finished!
-                } else if (this.remainingHashes.size() < queryhashes.size()) {
+                } else if (this.remainingTerms.size() < queryTerms.size()) {
                     // the result has not all words in it.
                     // find another sentence that represents the missing other words
                     // and find recursively more sentences
                     maxLength = maxLength - this.snippetString.length();
                     if (maxLength < 20) maxLength = 20;
                     try {
-                        tsr = new SnippetExtractor(order.values(), this.remainingHashes, maxLength);
+                        tsr = new SnippetExtractor(order.values(), this.remainingTerms, maxLength);
                     } catch (final UnsupportedOperationException e) {
                         throw e;
                     }
                     final String nextSnippet = tsr.snippetString;
                     if (nextSnippet == null) return;
                     this.snippetString = this.snippetString + (" / " + nextSnippet);
-                    this.remainingHashes = tsr.remainingHashes;
+                    this.remainingTerms = tsr.remainingTerms;
                     return;
                 } else {
                     // error
@@ -120,27 +118,24 @@ public class SnippetExtractor {
         return 0;
     }
 
-    private SnippetExtractor(String sentence, final HandleSet queryhashes, final int maxLength) throws UnsupportedOperationException {
+
+    private SnippetExtractor(String sentence, final Set<String> queryTerms, final int maxLength) throws UnsupportedOperationException {
         try {
             if (sentence == null) throw new UnsupportedOperationException("no sentence given");
-            if (queryhashes == null || queryhashes.isEmpty()) throw new UnsupportedOperationException("queryhashes == null");
-            byte[] hash;
+            if (queryTerms == null || queryTerms.isEmpty()) throw new UnsupportedOperationException("queryTerms == null");
+            String term;
             // find all hashes that appear in the sentence
-            final Map<byte[], Integer> hs = WordTokenizer.hashSentence(sentence, 100);
-            final Iterator<byte[]> j = queryhashes.iterator();
+            final Map<String, Integer> hs = WordTokenizer.tokenizeSentence(sentence, 100);
+            final Iterator<String> j = queryTerms.iterator();
             Integer pos;
             int p, minpos = sentence.length(), maxpos = -1;
-            final HandleSet remainingHashes = new RowHandleSet(queryhashes.keylen(), queryhashes.comparator(), 0);
+            final Set<String> remainingTerms = new HashSet<>();
             while (j.hasNext()) {
-                hash = j.next();
-                pos = hs.get(hash);
+                term = j.next();
+                pos = hs.get(term);
                 if (pos == null) {
-                    try {
-                        remainingHashes.put(hash);
-                    } catch (final SpaceExceededException e) {
-                        ConcurrentLog.logException(e);
-                    }
+                    remainingTerms.add(term);
                 } else {
                     p = pos.intValue();
                     if (p > maxpos) maxpos = p;
@@ -185,7 +180,7 @@ public class SnippetExtractor {
             sentence = sentence.substring(6, 20).trim() + " [..] " + sentence.substring(sentence.length() - 26, sentence.length() - 6).trim();
" + sentence.substring(sentence.length() - 26, sentence.length() - 6).trim(); } this.snippetString = sentence; - this.remainingHashes = remainingHashes; + this.remainingTerms = remainingTerms; } catch (final IndexOutOfBoundsException e) { throw new UnsupportedOperationException(e.getMessage()); } @@ -195,7 +190,7 @@ public class SnippetExtractor { return this.snippetString; } - public HandleSet getRemainingWords() { - return this.remainingHashes; - } + public Set getRemainingTerms() { + return this.remainingTerms; + } } diff --git a/source/net/yacy/document/WordTokenizer.java b/source/net/yacy/document/WordTokenizer.java index 25caf88ac..0ed51479f 100644 --- a/source/net/yacy/document/WordTokenizer.java +++ b/source/net/yacy/document/WordTokenizer.java @@ -27,6 +27,7 @@ package net.yacy.document; import java.util.ArrayList; import java.util.Enumeration; import java.util.List; +import java.util.Locale; import java.util.SortedMap; import java.util.TreeMap; @@ -210,4 +211,34 @@ public class WordTokenizer implements Enumeration { words = null; } } + + /** + * Tokenize the given sentence and generate a word-wordPos mapping + * @param sentence the sentence to be tokenized + * @return a ordered map containing word as key and position as value. The map is ordered by words. + */ + public static SortedMap tokenizeSentence(final String sentence, int maxlength) { + final SortedMap map = new TreeMap(); + WordTokenizer words = new WordTokenizer(new SentenceReader(sentence), null); + try { + int pos = 0; + String word; + Integer oldpos; + while (words.hasMoreElements() && maxlength-- > 0) { + word = words.nextElement().toString().toLowerCase(Locale.ENGLISH); + + // don't overwrite old values, that leads to too far word distances + oldpos = map.put(word, LargeNumberCache.valueOf(pos)); + if (oldpos != null) { + map.put(word, oldpos); + } + + pos += word.length() + 1; + } + return map; + } finally { + words.close(); + words = null; + } + } } diff --git a/source/net/yacy/search/query/QueryGoal.java b/source/net/yacy/search/query/QueryGoal.java index 9c94540fa..eb47f9ce5 100644 --- a/source/net/yacy/search/query/QueryGoal.java +++ b/source/net/yacy/search/query/QueryGoal.java @@ -251,18 +251,32 @@ public class QueryGoal { } /** - * @return a set of words to be included in the search result + * @return an iterator on the set of words to be included in the search result */ public Iterator getIncludeWords() { return this.include_words.iterator(); } + + /** + * @return a copy of the set of words to be included in the search result + */ + public Set getIncludeWordsSet() { + return new NormalizedWords(this.include_words); + } /** - * @return a set of words to be excluded in the search result + * @return an iterator on the set of words to be excluded from the search result */ public Iterator getExcludeWords() { return this.exclude_words.iterator(); } + + /** + * @return a copy of the set of words to be excluded from the search result + */ + public Set getExcludeWordsSet() { + return new NormalizedWords(this.exclude_words); + } /** * @return a list of include strings which reproduces the original order of the search words and quotation diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java index 49fe0ede4..7b3895d9b 100644 --- a/source/net/yacy/search/query/SearchEvent.java +++ b/source/net/yacy/search/query/SearchEvent.java @@ -183,6 +183,8 @@ public final class SearchEvent implements ScoreMapUpdatesListener { /** a set of word hashes that are used to match with 
     /** a set of word hashes that are used to match with the snippets */
     private final HandleSet snippetFetchWordHashes;
+    /** a set of words that are used to match with the snippets */
+    private final Set<String> snippetFetchWords;
     private final boolean deleteIfSnippetFail;
     private long urlRetrievalAllTime;
     private long snippetComputationAllTime;
@@ -531,7 +533,7 @@ public final class SearchEvent implements ScoreMapUpdatesListener {
         this.resultList = new WeakPriorityBlockingQueue<URIMetadataNode>(Math.max(max_results_node, 10 * query.itemsPerPage()), true); // this is the result, enriched with snippets, ranked and ordered by ranking
         // snippets do not need to match with the complete query hashes,
-        // only with the query minus the stopwords which had not been used for the search
+        // only with the query minus the stopwords which had not been used for the search
         boolean filtered = false;
         // check if query contains stopword
         if (Switchboard.stopwordHashes != null) {
@@ -547,6 +549,10 @@ public final class SearchEvent implements ScoreMapUpdatesListener {
         if (filtered) { // remove stopwords
             this.snippetFetchWordHashes.excludeDestructive(Switchboard.stopwordHashes);
         }
+
+        this.snippetFetchWords = query.getQueryGoal().getIncludeWordsSet();
+        // remove stopwords
+        this.snippetFetchWords.removeAll(Switchboard.stopwords);
 
         // clean up events
         SearchEventCache.cleanupEvents(false);
@@ -1877,6 +1883,7 @@ public final class SearchEvent implements ScoreMapUpdatesListener {
             final TextSnippet solrsnippet = new TextSnippet(node.url(), OpensearchResponseWriter.getLargestSnippet(solrsnippetlines), true, ResultClass.SOURCE_SOLR, "");
             final TextSnippet yacysnippet = new TextSnippet(this.loader,
                     node,
+                    this.query.getQueryGoal().getIncludeWordsSet(),
                     this.query.getQueryGoal().getIncludeHashes(),
                     CacheStrategy.CACHEONLY,
                     false,
@@ -2000,6 +2007,7 @@ public final class SearchEvent implements ScoreMapUpdatesListener {
             final TextSnippet snippet = new TextSnippet(
                     null,
                     page,
+                    this.snippetFetchWords,
                     this.snippetFetchWordHashes,
                     null,
                     ((this.query.constraint != null) && (this.query.constraint.get(Tokenizer.flag_cat_indexof))),
@@ -2016,6 +2024,7 @@ public final class SearchEvent implements ScoreMapUpdatesListener {
             final TextSnippet snippet = new TextSnippet(
                     this.loader,
                     page,
+                    this.snippetFetchWords,
                     this.snippetFetchWordHashes,
                     cacheStrategy,
                     ((this.query.constraint != null) && (this.query.constraint.get(Tokenizer.flag_cat_indexof))),
@@ -2032,7 +2041,7 @@ public final class SearchEvent implements ScoreMapUpdatesListener {
                 return page.makeResultEntry(this.query.getSegment(), this.peers, null); // result without snippet
             } else {
                 // problems with snippet fetch
-                if (this.snippetFetchWordHashes.has(Segment.catchallHash)) {
+                if (this.snippetFetchWords.contains(Segment.catchallString)) {
                     // we accept that because the word cannot be on the page
                     return page.makeResultEntry(this.query.getSegment(), this.peers, null);
                 }
diff --git a/source/net/yacy/search/snippet/TextSnippet.java b/source/net/yacy/search/snippet/TextSnippet.java
index 3b7fb9f36..b20f77623 100644
--- a/source/net/yacy/search/snippet/TextSnippet.java
+++ b/source/net/yacy/search/snippet/TextSnippet.java
@@ -126,7 +126,8 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnippet> {
     public TextSnippet(
             final LoaderDispatcher loader,
             final URIMetadataNode row,
+            final Set<String> queryTerms,
             final HandleSet queryhashes,
             final CacheStrategy cacheStrategy,
             final boolean pre,
@@ -175,36 +177,40 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnippet> {
+        Set<String> remainingTerms = new HashSet<>(queryTerms);
         List<StringBuilder> sentences = null;
 
         // try to get the snippet from metadata
-        removeMatchingHashes(row.url().toTokens(), remainingHashes);
-        removeMatchingHashes(row.dc_title(), remainingHashes);
-        removeMatchingHashes(row.dc_creator(), remainingHashes);
-        removeMatchingHashes(row.dc_subject(), remainingHashes);
+        removeMatchingTerms(row.url().toTokens(), remainingTerms);
+        removeMatchingTerms(row.dc_title(), remainingTerms);
+        removeMatchingTerms(row.dc_creator(), remainingTerms);
+        removeMatchingTerms(row.dc_subject(), remainingTerms);
 
-        if (!remainingHashes.isEmpty()) {
+        if (!remainingTerms.isEmpty()) {
             // we did not find everything in the metadata, look further into the document itself.
 
             // first acquire the sentences (from description/abstract or text):
@@ -245,9 +251,9 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnippet> {
             if (sentences.size() > 0) {
                 try {
-                    final SnippetExtractor tsr = new SnippetExtractor(sentences, remainingHashes, snippetMaxLength);
+                    final SnippetExtractor tsr = new SnippetExtractor(sentences, remainingTerms, snippetMaxLength);
                     textline = tsr.getSnippet();
-                    remainingHashes = tsr.getRemainingWords();
+                    remainingTerms = tsr.getRemainingTerms();
                 } catch (final UnsupportedOperationException e) {
                     init(url, null, false, ResultClass.ERROR_NO_MATCH, "snippet extractor failed:" + e.getMessage(), beginTime);
                     return;
@@ -255,7 +261,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnippet> {
         if (textline.length() > snippetMaxLength) textline = textline.substring(0, snippetMaxLength);
 
         // finally store this snippet in our own cache
-        snippetsCache.put(wordhashes, urls, textline);
+        if(wordhashes != null) {
+            snippetsCache.put(wordhashes, urlHash, textline);
+        }
 
         init(url, textline, false, source, null, beginTime);
     }
@@ -589,17 +597,18 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnippet> {
-    private static void removeMatchingHashes(final String sentence, final HandleSet queryhashes) {
-        if (queryhashes.size() == 0) return;
-        final Set<byte[]> m = WordTokenizer.hashSentence(sentence, 100).keySet();
-        //for (byte[] b: m) System.out.println("sentence hash: " + ASCII.String(b));
-        //for (byte[] b: queryhashes) System.out.println("queryhash: " + ASCII.String(b));
-        ArrayList<byte[]> o = new ArrayList<byte[]>(queryhashes.size());
-        for (final byte[] b : queryhashes) {
-            if (m.contains(b)) o.add(b);
+    }
+
+    /**
+     * Modify the queryTerms set: remove terms present in the given sentence.
+     * @param sentence a sentence potentially matching some terms of queryTerms
+     * @param queryTerms a set of normalized terms
+     */
+    private static void removeMatchingTerms(final String sentence, final Set<String> queryTerms) {
+        if (queryTerms.size() == 0) {
+            return;
         }
-        for (final byte[] b : o) queryhashes.remove(b);
+        final Set<String> sentenceWords = WordTokenizer.tokenizeSentence(sentence, 100).keySet();
+        queryTerms.removeAll(sentenceWords);
     }
 }
diff --git a/test/java/net/yacy/search/snippet/TextSnippetTest.java b/test/java/net/yacy/search/snippet/TextSnippetTest.java
index 0a708e186..f572ee0e2 100644
--- a/test/java/net/yacy/search/snippet/TextSnippetTest.java
+++ b/test/java/net/yacy/search/snippet/TextSnippetTest.java
@@ -1,20 +1,31 @@
 package net.yacy.search.snippet;
 
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import java.io.IOException;
 import java.net.MalformedURLException;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+
+import org.apache.solr.common.SolrDocument;
+import org.junit.Before;
+import org.junit.Test;
+
 import net.yacy.cora.document.encoding.ASCII;
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.federate.yacy.CacheStrategy;
-import net.yacy.cora.storage.HandleSet;
+import net.yacy.cora.protocol.Domains;
 import net.yacy.cora.util.CommonPattern;
+import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.kelondro.data.meta.URIMetadataNode;
 import net.yacy.search.query.QueryGoal;
 import net.yacy.search.query.SearchEvent;
 import net.yacy.search.schema.CollectionSchema;
-import org.apache.solr.common.SolrDocument;
-import static org.junit.Assert.*;
-import org.junit.Before;
-import org.junit.Test;
 
 public class TextSnippetTest {
 
@@ -55,12 +66,12 @@ public class TextSnippetTest {
         String querywords = "testcase line";
         QueryGoal qg = new QueryGoal(querywords);
-        HandleSet queryhashes = qg.getIncludeHashes();
 
         TextSnippet ts = new TextSnippet(
                 null,
                 testpage,
-                queryhashes,
+                qg.getIncludeWordsSet(),
+                qg.getIncludeHashes(),
                 cacheStrategy,
                 pre,
                 snippetMaxLength,
@@ -95,12 +106,12 @@ public class TextSnippetTest {
         String querywords = "testcase line";
         QueryGoal qg = new QueryGoal(querywords);
-        HandleSet queryhashes = qg.getIncludeHashes();
 
         TextSnippet ts = new TextSnippet(
                 null,
                 testpage,
-                queryhashes,
+                qg.getIncludeWordsSet(),
+                qg.getIncludeHashes(),
                 cacheStrategy,
                 pre,
                 snippetMaxLength,
@@ -166,4 +177,65 @@ public class TextSnippetTest {
         assertTrue ("number (.) broken up",sniptxt.contains("1.83"));
         assertTrue ("number (,) broken up",sniptxt.contains("3,14"));
     }
+
+    /**
+     * Run text snippet extraction from a given plain text file.
+     * @param args
+     *            first element: the plain text file path. When not specified,
+     *            "test/parsertest/umlaute_linux.txt" is used as default.
+     *            other elements: the search terms. When not specified,
+     *            "Maßkrügen" is used as default.
+     * @throws IOException when a read/write error occurred
+     */
+    public static void main(final String args[]) throws IOException {
+        try {
+            final SolrDocument doc = new SolrDocument();
+            final DigestURL url = new DigestURL("http://localhost/page.html");
+            doc.addField(CollectionSchema.id.name(), ASCII.String(url.hash()));
+            doc.addField(CollectionSchema.sku.name(), url.toNormalform(false));
+
+            final URIMetadataNode urlEntry = new URIMetadataNode(doc);
+            urlEntry.addField(CollectionSchema.title.name(), "New test case");
+            urlEntry.addField(CollectionSchema.keywords.name(), "junit");
+            urlEntry.addField(CollectionSchema.author.name(), "test author");
+
+            final Path testFilePath;
+            if(args.length > 0) {
+                testFilePath = Paths.get(args[0]);
+            } else {
+                testFilePath = Paths.get("test/parsertest/umlaute_linux.txt");
+            }
+
+            urlEntry.addField(CollectionSchema.text_t.name(), new String(Files.readAllBytes(testFilePath),
+                    StandardCharsets.UTF_8));
+
+            final StringBuilder queryWords = new StringBuilder();
+            if(args.length > 1) {
+                for(int i = 1; i < args.length; i++) {
+                    if(queryWords.length() > 0) {
+                        queryWords.append(" ");
+                    }
+                    queryWords.append(args[i]);
+                }
+            } else {
+                queryWords.append("Maßkrügen");
+            }
+
+            final QueryGoal goal = new QueryGoal(queryWords.toString());
+
+            System.out.println("Extracting text snippet for terms \"" + queryWords + "\" from file " + testFilePath);
+
+            TextSnippet.statistics.setEnabled(true);
+            final TextSnippet snippet = new TextSnippet(null, urlEntry, goal.getIncludeWordsSet(), goal.getIncludeHashes(),
+                    CacheStrategy.CACHEONLY, false, SearchEvent.SNIPPET_MAX_LENGTH, false);
+            System.out.println("Snippet initialized in " + TextSnippet.statistics.getMaxInitTime() + "ms");
+            System.out.println("Snippet status : " + snippet.getErrorCode());
+            System.out.println("Snippet : " + snippet.descriptionline(goal));
+        } finally {
+            /* Shutdown running threads */
+            try {
+                Domains.close();
+            } finally {
+                ConcurrentLog.shutdown();
+            }
+        }
+    }
+}
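
Note (not part of the patch): the standalone sketch below illustrates the
code path this change enables. Query words are now matched as plain,
lower-cased strings through WordTokenizer.tokenizeSentence() and the
reworked SnippetExtractor, so no per-word MD5 hash is computed during
snippet extraction. The two sample sentences and the query terms are made
up for illustration; only APIs added or kept by this patch are used, and
in production code the normalized term set would come from
QueryGoal.getIncludeWordsSet() rather than being built by hand.

    import java.util.Arrays;
    import java.util.Collection;
    import java.util.HashSet;
    import java.util.Set;

    import net.yacy.document.SnippetExtractor;
    import net.yacy.document.WordTokenizer;

    public class SnippetSketch {
        public static void main(final String[] args) {
            // Sentences as a document parser would deliver them, one StringBuilder each.
            final Collection<StringBuilder> sentences = Arrays.asList(
                    new StringBuilder("YaCy is a free distributed search engine."),
                    new StringBuilder("Text snippets are extracted from the indexed plain text."));

            // Query terms must be normalized (lower-cased) the same way
            // WordTokenizer.tokenizeSentence() lower-cases sentence words.
            final Set<String> queryTerms = new HashSet<>(Arrays.asList("snippets", "indexed"));

            // Word -> position map used for matching; keys are plain strings, not hashes.
            System.out.println(WordTokenizer.tokenizeSentence(sentences.iterator().next().toString(), 100));

            // Extract a snippet of at most 100 characters covering the query terms.
            final SnippetExtractor extractor = new SnippetExtractor(sentences, queryTerms, 100);
            System.out.println("snippet: " + extractor.getSnippet());
            System.out.println("unmatched terms: " + extractor.getRemainingTerms());
        }
    }

To reproduce the reported 30 to 50% timing improvement, the new
TextSnippetTest.main() added by this patch can be run against a large
plain text file, passing the file path as the first argument and the
search terms as the remaining arguments.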