From e115e57cc7a3bb34406656a701f33f013380691c Mon Sep 17 00:00:00 2001
From: luccioman
Date: Fri, 11 May 2018 15:42:53 +0200
Subject: [PATCH] Reduced text snippet extraction processing time.

By not generating MD5 hashes on all words of indexed texts, processing
time is reduced by 30 to 50% on indexed documents with more than 1 MB
of plain text.
---
 htroot/ViewFile.java                          |  1 +
 .../net/yacy/document/SnippetExtractor.java   | 67 +++++++-------
 source/net/yacy/document/WordTokenizer.java   | 31 +++++++
 source/net/yacy/search/query/QueryGoal.java   | 18 +++-
 source/net/yacy/search/query/SearchEvent.java | 13 ++-
 .../net/yacy/search/snippet/TextSnippet.java  | 79 ++++++++--------
 .../yacy/search/snippet/TextSnippetTest.java  | 90 +++++++++++++++++--
 7 files changed, 215 insertions(+), 84 deletions(-)

diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java
index d99ada761..60cc6cbea 100644
--- a/htroot/ViewFile.java
+++ b/htroot/ViewFile.java
@@ -350,6 +350,7 @@ public class ViewFile {
             TextSnippet snippet = new TextSnippet(
                     sb.loader,
                     urlEntry,
+                    goal.getIncludeWordsSet(),
                     goal.getIncludeHashes(),
                     CacheStrategy.CACHEONLY,
                     false,
diff --git a/source/net/yacy/document/SnippetExtractor.java b/source/net/yacy/document/SnippetExtractor.java
index 803fa35bf..4302b4cdd 100644
--- a/source/net/yacy/document/SnippetExtractor.java
+++ b/source/net/yacy/document/SnippetExtractor.java
@@ -21,26 +21,24 @@
 package net.yacy.document;
 
 import java.util.Collection;
+import java.util.HashSet;
 import java.util.Iterator;
 import java.util.Map;
+import java.util.Set;
 import java.util.SortedMap;
 import java.util.TreeMap;
 import java.util.TreeSet;
 
-import net.yacy.cora.storage.HandleSet;
-import net.yacy.cora.util.ConcurrentLog;
-import net.yacy.cora.util.SpaceExceededException;
-import net.yacy.kelondro.index.RowHandleSet;
-
 public class SnippetExtractor {
 
-    String snippetString;
-    HandleSet remainingHashes;
+    private String snippetString;
+    private Set<String> remainingTerms;
 
-    public SnippetExtractor(final Collection<StringBuilder> sentences, final HandleSet queryhashes, int maxLength) throws UnsupportedOperationException {
+
+    public SnippetExtractor(final Collection<StringBuilder> sentences, final Set<String> queryTerms, int maxLength) throws UnsupportedOperationException {
         if (sentences == null) throw new UnsupportedOperationException("sentence == null");
-        if (queryhashes == null || queryhashes.isEmpty()) throw new UnsupportedOperationException("queryhashes == null");
-        SortedMap<byte[], Integer> hs;
+        if (queryTerms == null || queryTerms.isEmpty()) throw new UnsupportedOperationException("queryTerms == null");
+        SortedMap<String, Integer> hs;
         final TreeMap<Long, StringBuilder> order = new TreeMap<Long, StringBuilder>();
         long uniqCounter = 999L;
         Integer pos;
@@ -48,9 +46,9 @@ public class SnippetExtractor {
         int linenumber = 0;
         int fullmatchcounter = 0;
         lookup: for (final StringBuilder sentence: sentences) {
-            hs = WordTokenizer.hashSentence(sentence.toString(), 100);
+            hs = WordTokenizer.tokenizeSentence(sentence.toString(), 100);
             positions = new TreeSet<Integer>();
-            for (final byte[] word: queryhashes) {
+            for (final String word: queryTerms) {
                 pos = hs.get(word);
                 if (pos != null) {
                     positions.add(pos);
@@ -65,7 +63,7 @@ public class SnippetExtractor {
             if (!positions.isEmpty()) {
                 order.put(Long.valueOf(-100000000L * (linenumber == 0 ? 1 : 0) + 10000000L * positions.size() + 1000000L * worddistance + 100000L * linelengthKey(sentence.length(), maxLength) - 10000L * linenumber + uniqCounter--), sentence);
                 if (order.size() > 5) order.remove(order.firstEntry().getKey());
-                if (positions.size() == queryhashes.size()) fullmatchcounter++;
+                if (positions.size() == queryTerms.size()) fullmatchcounter++;
                 if (fullmatchcounter >= 3) break lookup;
             }
             linenumber++;
@@ -76,31 +74,31 @@ public class SnippetExtractor {
         while (!order.isEmpty()) {
             sentence = order.remove(order.lastKey()); // sentence with the biggest score
             try {
-                tsr = new SnippetExtractor(sentence.toString(), queryhashes, maxLength);
+                tsr = new SnippetExtractor(sentence.toString(), queryTerms, maxLength);
             } catch (final UnsupportedOperationException e) {
                 continue;
             }
             this.snippetString = tsr.snippetString;
             if (this.snippetString != null && this.snippetString.length() > 0) {
-                this.remainingHashes = tsr.remainingHashes;
-                if (this.remainingHashes.isEmpty()) {
+                this.remainingTerms = tsr.remainingTerms;
+                if (this.remainingTerms.isEmpty()) {
                     // we have found the snippet
                     return; // finished!
-                } else if (this.remainingHashes.size() < queryhashes.size()) {
+                } else if (this.remainingTerms.size() < queryTerms.size()) {
                     // the result has not all words in it.
                     // find another sentence that represents the missing other words
                     // and find recursively more sentences
                     maxLength = maxLength - this.snippetString.length();
                     if (maxLength < 20) maxLength = 20;
                     try {
-                        tsr = new SnippetExtractor(order.values(), this.remainingHashes, maxLength);
+                        tsr = new SnippetExtractor(order.values(), this.remainingTerms, maxLength);
                     } catch (final UnsupportedOperationException e) {
                         throw e;
                     }
                     final String nextSnippet = tsr.snippetString;
                     if (nextSnippet == null) return;
                     this.snippetString = this.snippetString + (" / " + nextSnippet);
-                    this.remainingHashes = tsr.remainingHashes;
+                    this.remainingTerms = tsr.remainingTerms;
                     return;
                 } else {
                     // error
@@ -120,27 +118,24 @@ public class SnippetExtractor {
         return 0;
     }
 
-    private SnippetExtractor(String sentence, final HandleSet queryhashes, final int maxLength) throws UnsupportedOperationException {
+
+    private SnippetExtractor(String sentence, final Set<String> queryTerms, final int maxLength) throws UnsupportedOperationException {
         try {
             if (sentence == null) throw new UnsupportedOperationException("no sentence given");
-            if (queryhashes == null || queryhashes.isEmpty()) throw new UnsupportedOperationException("queryhashes == null");
-            byte[] hash;
+            if (queryTerms == null || queryTerms.isEmpty()) throw new UnsupportedOperationException("queryTerms == null");
+            String term;
             // find all hashes that appear in the sentence
-            final Map<byte[], Integer> hs = WordTokenizer.hashSentence(sentence, 100);
-            final Iterator<byte[]> j = queryhashes.iterator();
+            final Map<String, Integer> hs = WordTokenizer.tokenizeSentence(sentence, 100);
+            final Iterator<String> j = queryTerms.iterator();
             Integer pos;
             int p, minpos = sentence.length(), maxpos = -1;
-            final HandleSet remainingHashes = new RowHandleSet(queryhashes.keylen(), queryhashes.comparator(), 0);
+            final Set<String> remainingTerms = new HashSet<>();
             while (j.hasNext()) {
-                hash = j.next();
-                pos = hs.get(hash);
+                term = j.next();
+                pos = hs.get(term);
                 if (pos == null) {
-                    try {
-                        remainingHashes.put(hash);
-                    } catch (final SpaceExceededException e) {
-                        ConcurrentLog.logException(e);
-                    }
+                    remainingTerms.add(term);
                 } else {
                     p = pos.intValue();
                     if (p > maxpos) maxpos = p;
@@ -185,7 +180,7 @@ public class SnippetExtractor {
             sentence = sentence.substring(6, 20).trim() + " [..] " + sentence.substring(sentence.length() - 26, sentence.length() - 6).trim();
" + sentence.substring(sentence.length() - 26, sentence.length() - 6).trim(); } this.snippetString = sentence; - this.remainingHashes = remainingHashes; + this.remainingTerms = remainingTerms; } catch (final IndexOutOfBoundsException e) { throw new UnsupportedOperationException(e.getMessage()); } @@ -195,7 +190,7 @@ public class SnippetExtractor { return this.snippetString; } - public HandleSet getRemainingWords() { - return this.remainingHashes; - } + public Set getRemainingTerms() { + return this.remainingTerms; + } } diff --git a/source/net/yacy/document/WordTokenizer.java b/source/net/yacy/document/WordTokenizer.java index 25caf88ac..0ed51479f 100644 --- a/source/net/yacy/document/WordTokenizer.java +++ b/source/net/yacy/document/WordTokenizer.java @@ -27,6 +27,7 @@ package net.yacy.document; import java.util.ArrayList; import java.util.Enumeration; import java.util.List; +import java.util.Locale; import java.util.SortedMap; import java.util.TreeMap; @@ -210,4 +211,34 @@ public class WordTokenizer implements Enumeration { words = null; } } + + /** + * Tokenize the given sentence and generate a word-wordPos mapping + * @param sentence the sentence to be tokenized + * @return a ordered map containing word as key and position as value. The map is ordered by words. + */ + public static SortedMap tokenizeSentence(final String sentence, int maxlength) { + final SortedMap map = new TreeMap(); + WordTokenizer words = new WordTokenizer(new SentenceReader(sentence), null); + try { + int pos = 0; + String word; + Integer oldpos; + while (words.hasMoreElements() && maxlength-- > 0) { + word = words.nextElement().toString().toLowerCase(Locale.ENGLISH); + + // don't overwrite old values, that leads to too far word distances + oldpos = map.put(word, LargeNumberCache.valueOf(pos)); + if (oldpos != null) { + map.put(word, oldpos); + } + + pos += word.length() + 1; + } + return map; + } finally { + words.close(); + words = null; + } + } } diff --git a/source/net/yacy/search/query/QueryGoal.java b/source/net/yacy/search/query/QueryGoal.java index 9c94540fa..eb47f9ce5 100644 --- a/source/net/yacy/search/query/QueryGoal.java +++ b/source/net/yacy/search/query/QueryGoal.java @@ -251,18 +251,32 @@ public class QueryGoal { } /** - * @return a set of words to be included in the search result + * @return an iterator on the set of words to be included in the search result */ public Iterator getIncludeWords() { return this.include_words.iterator(); } + + /** + * @return a copy of the set of words to be included in the search result + */ + public Set getIncludeWordsSet() { + return new NormalizedWords(this.include_words); + } /** - * @return a set of words to be excluded in the search result + * @return an iterator on the set of words to be excluded from the search result */ public Iterator getExcludeWords() { return this.exclude_words.iterator(); } + + /** + * @return a copy of the set of words to be excluded from the search result + */ + public Set getExcludeWordsSet() { + return new NormalizedWords(this.exclude_words); + } /** * @return a list of include strings which reproduces the original order of the search words and quotation diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java index 49fe0ede4..7b3895d9b 100644 --- a/source/net/yacy/search/query/SearchEvent.java +++ b/source/net/yacy/search/query/SearchEvent.java @@ -183,6 +183,8 @@ public final class SearchEvent implements ScoreMapUpdatesListener { /** a set of word hashes that are used to match with 
     /** a set of word hashes that are used to match with the snippets */
     private final HandleSet snippetFetchWordHashes;
+    /** a set of words that are used to match with the snippets */
+    private final Set<String> snippetFetchWords;
     private final boolean deleteIfSnippetFail;
     private long urlRetrievalAllTime;
     private long snippetComputationAllTime;
@@ -531,7 +533,7 @@ public final class SearchEvent implements ScoreMapUpdatesListener {
         this.resultList = new WeakPriorityBlockingQueue<URIMetadataNode>(Math.max(max_results_node, 10 * query.itemsPerPage()), true); // this is the result, enriched with snippets, ranked and ordered by ranking
         // snippets do not need to match with the complete query hashes,
-        // only with the query minus the stopwords which had not been used for the search
+        // only with the query minus the stopwords which had not been used for the search
         boolean filtered = false;
         // check if query contains stopword
         if (Switchboard.stopwordHashes != null) {
@@ -547,6 +549,10 @@ public final class SearchEvent implements ScoreMapUpdatesListener {
         if (filtered) { // remove stopwords
             this.snippetFetchWordHashes.excludeDestructive(Switchboard.stopwordHashes);
         }
+
+        this.snippetFetchWords = query.getQueryGoal().getIncludeWordsSet();
+        // remove stopwords
+        this.snippetFetchWords.removeAll(Switchboard.stopwords);
 
         // clean up events
         SearchEventCache.cleanupEvents(false);
@@ -1877,6 +1883,7 @@ public final class SearchEvent implements ScoreMapUpdatesListener {
             final TextSnippet solrsnippet = new TextSnippet(node.url(), OpensearchResponseWriter.getLargestSnippet(solrsnippetlines), true, ResultClass.SOURCE_SOLR, "");
             final TextSnippet yacysnippet = new TextSnippet(this.loader,
                     node,
+                    this.query.getQueryGoal().getIncludeWordsSet(),
                     this.query.getQueryGoal().getIncludeHashes(),
                     CacheStrategy.CACHEONLY,
                     false,
@@ -2000,6 +2007,7 @@ public final class SearchEvent implements ScoreMapUpdatesListener {
             final TextSnippet snippet = new TextSnippet(
                     null,
                     page,
+                    this.snippetFetchWords,
                     this.snippetFetchWordHashes,
                     null,
                     ((this.query.constraint != null) && (this.query.constraint.get(Tokenizer.flag_cat_indexof))),
@@ -2016,6 +2024,7 @@ public final class SearchEvent implements ScoreMapUpdatesListener {
             final TextSnippet snippet = new TextSnippet(
                     this.loader,
                     page,
+                    this.snippetFetchWords,
                     this.snippetFetchWordHashes,
                     cacheStrategy,
                     ((this.query.constraint != null) && (this.query.constraint.get(Tokenizer.flag_cat_indexof))),
@@ -2032,7 +2041,7 @@ public final class SearchEvent implements ScoreMapUpdatesListener {
                 return page.makeResultEntry(this.query.getSegment(), this.peers, null); // result without snippet
             } else {
                 // problems with snippet fetch
-                if (this.snippetFetchWordHashes.has(Segment.catchallHash)) {
+                if (this.snippetFetchWords.contains(Segment.catchallString)) {
                     // we accept that because the word cannot be on the page
                     return page.makeResultEntry(this.query.getSegment(), this.peers, null);
                 }
diff --git a/source/net/yacy/search/snippet/TextSnippet.java b/source/net/yacy/search/snippet/TextSnippet.java
index 3b7fb9f36..b20f77623 100644
--- a/source/net/yacy/search/snippet/TextSnippet.java
+++ b/source/net/yacy/search/snippet/TextSnippet.java
@@ -126,7 +126,8 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnippet> {
     public TextSnippet(
             final LoaderDispatcher loader,
             final URIMetadataNode row,
+            final Set<String> queryTerms,
             final HandleSet queryhashes,
             final CacheStrategy cacheStrategy,
             final boolean pre,
@@ -175,36 +177,40 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnippet> {
+        Set<String> remainingTerms = new HashSet<>(queryTerms);
         List<StringBuilder> sentences = null;
 
         // try to get the snippet from metadata
-        removeMatchingHashes(row.url().toTokens(), remainingHashes);
-        removeMatchingHashes(row.dc_title(), remainingHashes);
-        removeMatchingHashes(row.dc_creator(), remainingHashes);
-        removeMatchingHashes(row.dc_subject(), remainingHashes);
+        removeMatchingTerms(row.url().toTokens(), remainingTerms);
+        removeMatchingTerms(row.dc_title(), remainingTerms);
+        removeMatchingTerms(row.dc_creator(), remainingTerms);
+        removeMatchingTerms(row.dc_subject(), remainingTerms);
 
-        if (!remainingHashes.isEmpty()) {
+        if (!remainingTerms.isEmpty()) {
             // we did not find everything in the metadata, look further into the document itself.
 
             // first acquire the sentences (from description/abstract or text):
@@ -245,9 +251,9 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnippet> {
             if (sentences.size() > 0) {
                 try {
-                    final SnippetExtractor tsr = new SnippetExtractor(sentences, remainingHashes, snippetMaxLength);
+                    final SnippetExtractor tsr = new SnippetExtractor(sentences, remainingTerms, snippetMaxLength);
                     textline = tsr.getSnippet();
-                    remainingHashes = tsr.getRemainingWords();
+                    remainingTerms = tsr.getRemainingTerms();
                 } catch (final UnsupportedOperationException e) {
                     init(url, null, false, ResultClass.ERROR_NO_MATCH, "snippet extractor failed:" + e.getMessage(), beginTime);
                     return;
@@ -255,7 +261,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnippet> {
         if (textline.length() > snippetMaxLength) textline = textline.substring(0, snippetMaxLength);
 
         // finally store this snippet in our own cache
-        snippetsCache.put(wordhashes, urls, textline);
+        if(wordhashes != null) {
+            snippetsCache.put(wordhashes, urlHash, textline);
+        }
 
         init(url, textline, false, source, null, beginTime);
     }
@@ -589,17 +597,18 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnippet> {
-    private static void removeMatchingHashes(final String sentence, final HandleSet queryhashes) {
-        if (queryhashes.size() == 0) return;
-        final Set<byte[]> m = WordTokenizer.hashSentence(sentence, 100).keySet();
-        //for (byte[] b: m) System.out.println("sentence hash: " + ASCII.String(b));
-        //for (byte[] b: queryhashes) System.out.println("queryhash: " + ASCII.String(b));
-        ArrayList<byte[]> o = new ArrayList<byte[]>(queryhashes.size());
-        for (final byte[] b : queryhashes) {
-            if (m.contains(b)) o.add(b);
+    }
+
+    /**
+     * Modify the queryTerms set: remove terms present in the given sentence.
+     * @param sentence a sentence potentially matching some terms of queryTerms
+     * @param queryTerms a set of normalized terms
+     */
+    private static void removeMatchingTerms(final String sentence, final Set<String> queryTerms) {
+        if (queryTerms.size() == 0) {
+            return;
         }
-        for (final byte[] b : o) queryhashes.remove(b);
+        final Set<String> sentenceWords = WordTokenizer.tokenizeSentence(sentence, 100).keySet();
+        queryTerms.removeAll(sentenceWords);
     }
 }
diff --git a/test/java/net/yacy/search/snippet/TextSnippetTest.java b/test/java/net/yacy/search/snippet/TextSnippetTest.java
index 0a708e186..f572ee0e2 100644
--- a/test/java/net/yacy/search/snippet/TextSnippetTest.java
+++ b/test/java/net/yacy/search/snippet/TextSnippetTest.java
@@ -1,20 +1,31 @@
 package net.yacy.search.snippet;
 
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import java.io.IOException;
 import java.net.MalformedURLException;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+
+import org.apache.solr.common.SolrDocument;
+import org.junit.Before;
+import org.junit.Test;
+
 import net.yacy.cora.document.encoding.ASCII;
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.federate.yacy.CacheStrategy;
-import net.yacy.cora.storage.HandleSet;
+import net.yacy.cora.protocol.Domains;
 import net.yacy.cora.util.CommonPattern;
+import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.kelondro.data.meta.URIMetadataNode;
 import net.yacy.search.query.QueryGoal;
 import net.yacy.search.query.SearchEvent;
 import net.yacy.search.schema.CollectionSchema;
-import org.apache.solr.common.SolrDocument;
-import static org.junit.Assert.*;
-import org.junit.Before;
-import org.junit.Test;
 
 public class TextSnippetTest {
 
@@ -55,12 +66,12 @@ public class TextSnippetTest {
         String querywords = "testcase line";
         QueryGoal qg = new QueryGoal(querywords);
-        HandleSet queryhashes = qg.getIncludeHashes();
 
         TextSnippet ts = new TextSnippet(
                 null,
                 testpage,
-                queryhashes,
+                qg.getIncludeWordsSet(),
+                qg.getIncludeHashes(),
                 cacheStrategy,
                 pre,
                 snippetMaxLength,
@@ -95,12 +106,12 @@ public class TextSnippetTest {
         String querywords = "testcase line";
         QueryGoal qg = new QueryGoal(querywords);
-        HandleSet queryhashes = qg.getIncludeHashes();
 
         TextSnippet ts = new TextSnippet(
                 null,
                 testpage,
-                queryhashes,
+                qg.getIncludeWordsSet(),
+                qg.getIncludeHashes(),
                 cacheStrategy,
                 pre,
                 snippetMaxLength,
@@ -166,4 +177,65 @@ public class TextSnippetTest {
         assertTrue ("number (.) broken up",sniptxt.contains("1.83"));
         assertTrue ("number (,) broken up",sniptxt.contains("3,14"));
     }
+
+    /**
+     * Run text snippet extraction from a given plain text file.
+     * @param args
+     *            first element: the plain text file path. When not specified,
+     *            "test/parsertest/umlaute_linux.txt" is used as default.
+     *            other elements: the search terms. When not specified,
+     *            "Maßkrügen" is used as default.
+     * @throws IOException when a read/write error occurred
+     */
+    public static void main(final String args[]) throws IOException {
+        try {
+            final SolrDocument doc = new SolrDocument();
+            final DigestURL url = new DigestURL("http://localhost/page.html");
+            doc.addField(CollectionSchema.id.name(), ASCII.String(url.hash()));
+            doc.addField(CollectionSchema.sku.name(), url.toNormalform(false));
+
+            final URIMetadataNode urlEntry = new URIMetadataNode(doc);
+            urlEntry.addField(CollectionSchema.title.name(), "New test case");
+            urlEntry.addField(CollectionSchema.keywords.name(), "junit");
+            urlEntry.addField(CollectionSchema.author.name(), "test author");
+
+            final Path testFilePath;
+            if(args.length > 0) {
+                testFilePath = Paths.get(args[0]);
+            } else {
+                testFilePath = Paths.get("test/parsertest/umlaute_linux.txt");
+            }
+
+            urlEntry.addField(CollectionSchema.text_t.name(), new String(Files.readAllBytes(testFilePath),
+                    StandardCharsets.UTF_8));
+
+            final StringBuilder queryWords = new StringBuilder();
+            if(args.length > 1) {
+                for(int i = 1; i < args.length; i++) {
+                    if(queryWords.length() > 0) {
+                        queryWords.append(" ");
+                    }
+                    queryWords.append(args[i]);
+                }
+            } else {
+                queryWords.append("Maßkrügen");
+            }
+
+            final QueryGoal goal = new QueryGoal(queryWords.toString());
+
+            System.out.println("Extracting text snippet for terms \"" + queryWords + "\" from file " + testFilePath);
+
+            TextSnippet.statistics.setEnabled(true);
+            final TextSnippet snippet = new TextSnippet(null, urlEntry, goal.getIncludeWordsSet(), goal.getIncludeHashes(),
+                    CacheStrategy.CACHEONLY, false, SearchEvent.SNIPPET_MAX_LENGTH, false);
+            System.out.println("Snippet initialized in " + TextSnippet.statistics.getMaxInitTime() + "ms");
+            System.out.println("Snippet status : " + snippet.getErrorCode());
+            System.out.println("Snippet : " + snippet.descriptionline(goal));
+        } finally {
+            /* Shutdown running threads */
+            try {
+                Domains.close();
+            } finally {
+                ConcurrentLog.shutdown();
+            }
+        }
+    }
+}
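
Note (not part of the patch): the standalone sketch below illustrates the
code path this change enables. Query words are now matched as plain,
lower-cased strings through WordTokenizer.tokenizeSentence() and the
reworked SnippetExtractor, so no per-word MD5 hash is computed during
snippet extraction. The two sample sentences and the query terms are made
up for illustration; only APIs added or kept by this patch are used, and
in production code the normalized term set would come from
QueryGoal.getIncludeWordsSet() rather than being built by hand.

    import java.util.Arrays;
    import java.util.Collection;
    import java.util.HashSet;
    import java.util.Set;

    import net.yacy.document.SnippetExtractor;
    import net.yacy.document.WordTokenizer;

    public class SnippetSketch {
        public static void main(final String[] args) {
            // Sentences as a document parser would deliver them, one StringBuilder each.
            final Collection<StringBuilder> sentences = Arrays.asList(
                    new StringBuilder("YaCy is a free distributed search engine."),
                    new StringBuilder("Text snippets are extracted from the indexed plain text."));

            // Query terms must be normalized (lower-cased) the same way
            // WordTokenizer.tokenizeSentence() lower-cases sentence words.
            final Set<String> queryTerms = new HashSet<>(Arrays.asList("snippets", "indexed"));

            // Word -> position map used for matching; keys are plain strings, not hashes.
            System.out.println(WordTokenizer.tokenizeSentence(sentences.iterator().next().toString(), 100));

            // Extract a snippet of at most 100 characters covering the query terms.
            final SnippetExtractor extractor = new SnippetExtractor(sentences, queryTerms, 100);
            System.out.println("snippet: " + extractor.getSnippet());
            System.out.println("unmatched terms: " + extractor.getRemainingTerms());
        }
    }

To reproduce the reported 30 to 50% timing improvement, the new
TextSnippetTest.main() added by this patch can be run against a large
plain text file, passing the file path as the first argument and the
search terms as the remaining arguments.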