Reduced text snippet extraction processing time.

By no longer generating MD5 hashes for every word of indexed texts, processing
time is reduced by 30 to 50% on indexed documents with more than 1 MB
of plain text.
pull/137/head
luccioman 7 years ago
parent 7525594315
commit e115e57cc7
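
In essence: snippet matching previously digested every token of each candidate sentence with MD5 before comparing it against the query word hashes; after this commit the comparison works on normalized token strings directly, so no per-word digest is computed. A minimal standalone sketch of the difference (the class and method names below are illustrative, not from the YaCy sources):

    import java.nio.charset.StandardCharsets;
    import java.security.MessageDigest;
    import java.security.NoSuchAlgorithmException;
    import java.util.Base64;
    import java.util.HashSet;
    import java.util.Locale;
    import java.util.Set;

    public class TokenMatchingSketch {

        /** Old style: one MD5 digest per word before any comparison is possible. */
        static Set<String> hashedTokens(final String text) throws NoSuchAlgorithmException {
            final MessageDigest md5 = MessageDigest.getInstance("MD5");
            final Set<String> hashes = new HashSet<>();
            for (final String word : text.toLowerCase(Locale.ENGLISH).split("\\W+")) {
                // digest() also resets the MessageDigest for the next word
                hashes.add(Base64.getEncoder()
                        .encodeToString(md5.digest(word.getBytes(StandardCharsets.UTF_8))));
            }
            return hashes;
        }

        /** New style: normalized token strings are compared directly, no digest at all. */
        static Set<String> plainTokens(final String text) {
            final Set<String> words = new HashSet<>();
            for (final String word : text.toLowerCase(Locale.ENGLISH).split("\\W+")) {
                words.add(word);
            }
            return words;
        }

        public static void main(final String[] args) throws NoSuchAlgorithmException {
            final String sentence = "YaCy extracts text snippets from large indexed documents";
            // both variants answer the same membership question, but the second
            // avoids an MD5 computation for every word of the document
            System.out.println(hashedTokens(sentence).size()); // 8
            System.out.println(plainTokens(sentence).contains("snippets")); // true
        }
    }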

@@ -350,6 +350,7 @@ public class ViewFile {
         TextSnippet snippet = new TextSnippet(
                 sb.loader,
                 urlEntry,
+                goal.getIncludeWordsSet(),
                 goal.getIncludeHashes(),
                 CacheStrategy.CACHEONLY,
                 false,

@@ -21,26 +21,24 @@
 package net.yacy.document;
 
 import java.util.Collection;
+import java.util.HashSet;
 import java.util.Iterator;
 import java.util.Map;
+import java.util.Set;
 import java.util.SortedMap;
 import java.util.TreeMap;
 import java.util.TreeSet;
 
-import net.yacy.cora.storage.HandleSet;
-import net.yacy.cora.util.ConcurrentLog;
-import net.yacy.cora.util.SpaceExceededException;
-import net.yacy.kelondro.index.RowHandleSet;
 
 public class SnippetExtractor {
 
-    String snippetString;
-    HandleSet remainingHashes;
+    private String snippetString;
+    private Set<String> remainingTerms;
 
-    public SnippetExtractor(final Collection<StringBuilder> sentences, final HandleSet queryhashes, int maxLength) throws UnsupportedOperationException {
+    public SnippetExtractor(final Collection<StringBuilder> sentences, final Set<String> queryTerms, int maxLength) throws UnsupportedOperationException {
         if (sentences == null) throw new UnsupportedOperationException("sentence == null");
-        if (queryhashes == null || queryhashes.isEmpty()) throw new UnsupportedOperationException("queryhashes == null");
-        SortedMap<byte[], Integer> hs;
+        if (queryTerms == null || queryTerms.isEmpty()) throw new UnsupportedOperationException("queryTerms == null");
+        SortedMap<String, Integer> hs;
         final TreeMap<Long, StringBuilder> order = new TreeMap<Long, StringBuilder>();
         long uniqCounter = 999L;
         Integer pos;
@@ -48,9 +46,9 @@ public class SnippetExtractor {
         int linenumber = 0;
         int fullmatchcounter = 0;
         lookup: for (final StringBuilder sentence: sentences) {
-            hs = WordTokenizer.hashSentence(sentence.toString(), 100);
+            hs = WordTokenizer.tokenizeSentence(sentence.toString(), 100);
             positions = new TreeSet<Integer>();
-            for (final byte[] word: queryhashes) {
+            for (final String word: queryTerms) {
                 pos = hs.get(word);
                 if (pos != null) {
                     positions.add(pos);
@@ -65,7 +63,7 @@
             if (!positions.isEmpty()) {
                 order.put(Long.valueOf(-100000000L * (linenumber == 0 ? 1 : 0) + 10000000L * positions.size() + 1000000L * worddistance + 100000L * linelengthKey(sentence.length(), maxLength) - 10000L * linenumber + uniqCounter--), sentence);
                 if (order.size() > 5) order.remove(order.firstEntry().getKey());
-                if (positions.size() == queryhashes.size()) fullmatchcounter++;
+                if (positions.size() == queryTerms.size()) fullmatchcounter++;
                 if (fullmatchcounter >= 3) break lookup;
             }
             linenumber++;
@@ -76,31 +74,31 @@
         while (!order.isEmpty()) {
             sentence = order.remove(order.lastKey()); // sentence with the biggest score
             try {
-                tsr = new SnippetExtractor(sentence.toString(), queryhashes, maxLength);
+                tsr = new SnippetExtractor(sentence.toString(), queryTerms, maxLength);
             } catch (final UnsupportedOperationException e) {
                 continue;
             }
             this.snippetString = tsr.snippetString;
             if (this.snippetString != null && this.snippetString.length() > 0) {
-                this.remainingHashes = tsr.remainingHashes;
-                if (this.remainingHashes.isEmpty()) {
+                this.remainingTerms = tsr.remainingTerms;
+                if (this.remainingTerms.isEmpty()) {
                     // we have found the snippet
                     return; // finished!
-                } else if (this.remainingHashes.size() < queryhashes.size()) {
+                } else if (this.remainingTerms.size() < queryTerms.size()) {
                     // the result has not all words in it.
                     // find another sentence that represents the missing other words
                     // and find recursively more sentences
                     maxLength = maxLength - this.snippetString.length();
                     if (maxLength < 20) maxLength = 20;
                     try {
-                        tsr = new SnippetExtractor(order.values(), this.remainingHashes, maxLength);
+                        tsr = new SnippetExtractor(order.values(), this.remainingTerms, maxLength);
                     } catch (final UnsupportedOperationException e) {
                         throw e;
                     }
                     final String nextSnippet = tsr.snippetString;
                     if (nextSnippet == null) return;
                     this.snippetString = this.snippetString + (" / " + nextSnippet);
-                    this.remainingHashes = tsr.remainingHashes;
+                    this.remainingTerms = tsr.remainingTerms;
                     return;
                 } else {
                     // error
@@ -120,27 +118,24 @@ public class SnippetExtractor {
         return 0;
     }
 
-    private SnippetExtractor(String sentence, final HandleSet queryhashes, final int maxLength) throws UnsupportedOperationException {
+    private SnippetExtractor(String sentence, final Set<String> queryTerms, final int maxLength) throws UnsupportedOperationException {
         try {
             if (sentence == null) throw new UnsupportedOperationException("no sentence given");
-            if (queryhashes == null || queryhashes.isEmpty()) throw new UnsupportedOperationException("queryhashes == null");
-            byte[] hash;
+            if (queryTerms == null || queryTerms.isEmpty()) throw new UnsupportedOperationException("queryTerms == null");
+            String term;
 
             // find all hashes that appear in the sentence
-            final Map<byte[], Integer> hs = WordTokenizer.hashSentence(sentence, 100);
-            final Iterator<byte[]> j = queryhashes.iterator();
+            final Map<String, Integer> hs = WordTokenizer.tokenizeSentence(sentence, 100);
+            final Iterator<String> j = queryTerms.iterator();
             Integer pos;
             int p, minpos = sentence.length(), maxpos = -1;
-            final HandleSet remainingHashes = new RowHandleSet(queryhashes.keylen(), queryhashes.comparator(), 0);
+            final Set<String> remainingTerms = new HashSet<>();
             while (j.hasNext()) {
-                hash = j.next();
-                pos = hs.get(hash);
+                term = j.next();
+                pos = hs.get(term);
                 if (pos == null) {
-                    try {
-                        remainingHashes.put(hash);
-                    } catch (final SpaceExceededException e) {
-                        ConcurrentLog.logException(e);
-                    }
+                    remainingTerms.add(term);
                 } else {
                     p = pos.intValue();
                     if (p > maxpos) maxpos = p;
@@ -185,7 +180,7 @@ public class SnippetExtractor {
                 sentence = sentence.substring(6, 20).trim() + " [..] " + sentence.substring(sentence.length() - 26, sentence.length() - 6).trim();
             }
             this.snippetString = sentence;
-            this.remainingHashes = remainingHashes;
+            this.remainingTerms = remainingTerms;
         } catch (final IndexOutOfBoundsException e) {
             throw new UnsupportedOperationException(e.getMessage());
         }
@@ -195,7 +190,7 @@ public class SnippetExtractor {
         return this.snippetString;
     }
 
-    public HandleSet getRemainingWords() {
-        return this.remainingHashes;
+    public Set<String> getRemainingTerms() {
+        return this.remainingTerms;
     }
 }

@@ -27,6 +27,7 @@ package net.yacy.document;
 import java.util.ArrayList;
 import java.util.Enumeration;
 import java.util.List;
+import java.util.Locale;
 import java.util.SortedMap;
 import java.util.TreeMap;
@@ -210,4 +211,34 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
             words = null;
         }
     }
+
+    /**
+     * Tokenize the given sentence and generate a word-position mapping
+     * @param sentence the sentence to be tokenized
+     * @return an ordered map containing each word as key and its position as value. The map is ordered by words.
+     */
+    public static SortedMap<String, Integer> tokenizeSentence(final String sentence, int maxlength) {
+        final SortedMap<String, Integer> map = new TreeMap<String, Integer>();
+        WordTokenizer words = new WordTokenizer(new SentenceReader(sentence), null);
+        try {
+            int pos = 0;
+            String word;
+            Integer oldpos;
+            while (words.hasMoreElements() && maxlength-- > 0) {
+                word = words.nextElement().toString().toLowerCase(Locale.ENGLISH);
+
+                // don't overwrite old values, that leads to too far word distances
+                oldpos = map.put(word, LargeNumberCache.valueOf(pos));
+                if (oldpos != null) {
+                    map.put(word, oldpos);
+                }
+
+                pos += word.length() + 1;
+            }
+            return map;
+        } finally {
+            words.close();
+            words = null;
+        }
+    }
 }
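
For orientation, a hedged usage sketch of the new method (the sample sentence is made up, and it assumes the tokenizer yields exactly these four words):

    // ordered by word: {brown=10, fox=16, quick=4, the=0}
    // values are cumulative word offsets (word length + 1 per token),
    // which is enough to compare word distances between candidate sentences
    final SortedMap<String, Integer> wordPositions =
            WordTokenizer.tokenizeSentence("The quick brown fox", 100);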

@@ -251,18 +251,32 @@ public class QueryGoal {
     }
 
     /**
-     * @return a set of words to be included in the search result
+     * @return an iterator on the set of words to be included in the search result
      */
     public Iterator<String> getIncludeWords() {
         return this.include_words.iterator();
     }
 
+    /**
+     * @return a copy of the set of words to be included in the search result
+     */
+    public Set<String> getIncludeWordsSet() {
+        return new NormalizedWords(this.include_words);
+    }
+
     /**
-     * @return a set of words to be excluded in the search result
+     * @return an iterator on the set of words to be excluded from the search result
      */
     public Iterator<String> getExcludeWords() {
         return this.exclude_words.iterator();
     }
 
+    /**
+     * @return a copy of the set of words to be excluded from the search result
+     */
+    public Set<String> getExcludeWordsSet() {
+        return new NormalizedWords(this.exclude_words);
+    }
+
     /**
      * @return a list of include strings which reproduces the original order of the search words and quotation

@@ -183,6 +183,8 @@ public final class SearchEvent implements ScoreMapUpdatesListener {
     /** a set of word hashes that are used to match with the snippets */
     private final HandleSet snippetFetchWordHashes;
+    /** a set of words that are used to match with the snippets */
+    private final Set<String> snippetFetchWords;
 
     private final boolean deleteIfSnippetFail;
     private long urlRetrievalAllTime;
     private long snippetComputationAllTime;
@@ -531,7 +533,7 @@ public final class SearchEvent implements ScoreMapUpdatesListener {
         this.resultList = new WeakPriorityBlockingQueue<URIMetadataNode>(Math.max(max_results_node, 10 * query.itemsPerPage()), true); // this is the result, enriched with snippets, ranked and ordered by ranking
 
         // snippets do not need to match with the complete query hashes,
         // only with the query minus the stopwords which had not been used for the search
         boolean filtered = false;
         // check if query contains stopword
         if (Switchboard.stopwordHashes != null) {
@@ -547,6 +549,10 @@ public final class SearchEvent implements ScoreMapUpdatesListener {
         if (filtered) { // remove stopwords
             this.snippetFetchWordHashes.excludeDestructive(Switchboard.stopwordHashes);
         }
+        this.snippetFetchWords = query.getQueryGoal().getIncludeWordsSet();
+        // remove stopwords
+        this.snippetFetchWords.removeAll(Switchboard.stopwords);
+
 
         // clean up events
         SearchEventCache.cleanupEvents(false);
@@ -1877,6 +1883,7 @@ public final class SearchEvent implements ScoreMapUpdatesListener {
         final TextSnippet solrsnippet = new TextSnippet(node.url(), OpensearchResponseWriter.getLargestSnippet(solrsnippetlines), true, ResultClass.SOURCE_SOLR, "");
         final TextSnippet yacysnippet = new TextSnippet(this.loader,
                 node,
+                this.query.getQueryGoal().getIncludeWordsSet(),
                 this.query.getQueryGoal().getIncludeHashes(),
                 CacheStrategy.CACHEONLY,
                 false,
@@ -2000,6 +2007,7 @@ public final class SearchEvent implements ScoreMapUpdatesListener {
         final TextSnippet snippet = new TextSnippet(
                 null,
                 page,
+                this.snippetFetchWords,
                 this.snippetFetchWordHashes,
                 null,
                 ((this.query.constraint != null) && (this.query.constraint.get(Tokenizer.flag_cat_indexof))),
@@ -2016,6 +2024,7 @@ public final class SearchEvent implements ScoreMapUpdatesListener {
         final TextSnippet snippet = new TextSnippet(
                 this.loader,
                 page,
+                this.snippetFetchWords,
                 this.snippetFetchWordHashes,
                 cacheStrategy,
                 ((this.query.constraint != null) && (this.query.constraint.get(Tokenizer.flag_cat_indexof))),
@@ -2032,7 +2041,7 @@ public final class SearchEvent implements ScoreMapUpdatesListener {
             return page.makeResultEntry(this.query.getSegment(), this.peers, null); // result without snippet
         } else {
             // problems with snippet fetch
-            if (this.snippetFetchWordHashes.has(Segment.catchallHash)) {
+            if (this.snippetFetchWords.contains(Segment.catchallString)) {
                 // we accept that because the word cannot be on the page
                 return page.makeResultEntry(this.query.getSegment(), this.peers, null);
             }

@@ -126,7 +126,8 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
         SOURCE_WEB(false),
         /** Snippet computed by YaCy from document metadata */
         SOURCE_METADATA(false),
-        ERROR_NO_HASH_GIVEN(true),
+        /** Could not extract a snippet because no search term was provided */
+        ERROR_NO_TERM_GIVEN(true),
         ERROR_SOURCE_LOADING(true),
         ERROR_RESOURCE_LOADING(true),
         ERROR_PARSER_FAILED(true),
@@ -166,6 +167,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
     public TextSnippet(
             final LoaderDispatcher loader,
             final URIMetadataNode row,
+            final Set<String> queryTerms,
             final HandleSet queryhashes,
             final CacheStrategy cacheStrategy,
             final boolean pre,
@@ -175,36 +177,40 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
         // heise = "0OQUNU3JSs05"
         final DigestURL url = row.url();
-        if (queryhashes.isEmpty()) {
-            //System.out.println("found no queryhashes for URL retrieve " + url);
-            init(url, null, false, ResultClass.ERROR_NO_HASH_GIVEN, "no query hashes given", beginTime);
+        if (queryTerms.isEmpty()) {
+            init(url, null, false, ResultClass.ERROR_NO_TERM_GIVEN, "no query terms given", beginTime);
             return;
         }
 
         // try to get snippet from snippetCache
         ResultClass source = ResultClass.SOURCE_CACHE;
-        final String wordhashes = RemoteSearch.set2string(queryhashes);
-        final String urls = ASCII.String(url.hash());
-        final String snippetLine = snippetsCache.get(wordhashes, urls);
-        if (snippetLine != null) {
-            // found the snippet
-            init(url, snippetLine, false, source, null, beginTime);
-            return;
+        final String urlHash = ASCII.String(url.hash());
+        final String wordhashes;
+        if(queryhashes != null) {
+            wordhashes = RemoteSearch.set2string(queryhashes);
+            final String snippetLine = snippetsCache.get(wordhashes, urlHash);
+            if (snippetLine != null) {
+                // found the snippet
+                init(url, snippetLine, false, source, null, beginTime);
+                return;
+            }
+        } else {
+            wordhashes = null;
         }
 
         // try to get the snippet from a document at the cache (or in the web)
         // this requires that the document is parsed after loading
         String textline = null;
-        HandleSet remainingHashes = queryhashes.clone();
+        Set<String> remainingTerms = new HashSet<>(queryTerms);
         List<StringBuilder> sentences = null;
 
         // try to get the snippet from metadata
-        removeMatchingHashes(row.url().toTokens(), remainingHashes);
-        removeMatchingHashes(row.dc_title(), remainingHashes);
-        removeMatchingHashes(row.dc_creator(), remainingHashes);
-        removeMatchingHashes(row.dc_subject(), remainingHashes);
+        removeMatchingTerms(row.url().toTokens(), remainingTerms);
+        removeMatchingTerms(row.dc_title(), remainingTerms);
+        removeMatchingTerms(row.dc_creator(), remainingTerms);
+        removeMatchingTerms(row.dc_subject(), remainingTerms);
 
-        if (!remainingHashes.isEmpty()) {
+        if (!remainingTerms.isEmpty()) {
             // we did not find everything in the metadata, look further into the document itself.
             // first acquire the sentences (from description/abstract or text):
@@ -245,9 +251,9 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
             if (sentences.size() > 0) {
                 try {
-                    final SnippetExtractor tsr = new SnippetExtractor(sentences, remainingHashes, snippetMaxLength);
+                    final SnippetExtractor tsr = new SnippetExtractor(sentences, remainingTerms, snippetMaxLength);
                     textline = tsr.getSnippet();
-                    remainingHashes = tsr.getRemainingWords();
+                    remainingTerms = tsr.getRemainingTerms();
                 } catch (final UnsupportedOperationException e) {
                     init(url, null, false, ResultClass.ERROR_NO_MATCH, "snippet extractor failed:" + e.getMessage(), beginTime);
                     return;
@@ -255,7 +261,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
             }
         }
-        if (remainingHashes.isEmpty()) {
+        if (remainingTerms.isEmpty()) {
             // we found the snippet or the query is fully included in the headline or url
             if (textline == null || textline.length() == 0) {
                 // this is the case where we don't have a snippet because all search words are included in the headline or the url
@@ -347,23 +353,25 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
         }
         try {
-            final SnippetExtractor tsr = new SnippetExtractor(sentences, remainingHashes, snippetMaxLength);
+            final SnippetExtractor tsr = new SnippetExtractor(sentences, remainingTerms, snippetMaxLength);
             textline = tsr.getSnippet();
-            remainingHashes = tsr.getRemainingWords();
+            remainingTerms = tsr.getRemainingTerms();
         } catch (final UnsupportedOperationException e) {
             init(url, null, false, ResultClass.ERROR_NO_MATCH, "snippet extractor failed:" + e.getMessage(), beginTime);
            return;
         }
         sentences = null;
 
-        if (textline == null || !remainingHashes.isEmpty()) {
+        if (textline == null || !remainingTerms.isEmpty()) {
             init(url, null, false, ResultClass.ERROR_NO_MATCH, "no matching snippet found", beginTime);
             return;
         }
         if (textline.length() > snippetMaxLength) textline = textline.substring(0, snippetMaxLength);
 
         // finally store this snippet in our own cache
-        snippetsCache.put(wordhashes, urls, textline);
+        if(wordhashes != null) {
+            snippetsCache.put(wordhashes, urlHash, textline);
+        }
 
         init(url, textline, false, source, null, beginTime);
     }
@@ -589,17 +597,18 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
                 CharacterCoding.unicode2html(prefix.toString(), false));
         theWord.append(CharacterCoding.unicode2html(postfix.toString(), false));
         return theWord.toString();
     }
 
-    private static void removeMatchingHashes(final String sentence, final HandleSet queryhashes) {
-        if (queryhashes.size() == 0) return;
-        final Set<byte[]> m = WordTokenizer.hashSentence(sentence, 100).keySet();
-        //for (byte[] b: m) System.out.println("sentence hash: " + ASCII.String(b));
-        //for (byte[] b: queryhashes) System.out.println("queryhash: " + ASCII.String(b));
-        ArrayList<byte[]> o = new ArrayList<byte[]>(queryhashes.size());
-        for (final byte[] b : queryhashes) {
-            if (m.contains(b)) o.add(b);
-        }
-        for (final byte[] b : o) queryhashes.remove(b);
+    /**
+     * Modify the queryTerms set: remove terms present in the given sentence.
+     * @param sentence a sentence potentially matching some terms of queryTerms
+     * @param queryTerms a set of normalized terms
+     */
+    private static void removeMatchingTerms(final String sentence, final Set<String> queryTerms) {
+        if (queryTerms.size() == 0) {
+            return;
+        }
+        final Set<String> sentenceWords = WordTokenizer.tokenizeSentence(sentence, 100).keySet();
+        queryTerms.removeAll(sentenceWords);
     }
 }

@@ -1,20 +1,31 @@
 package net.yacy.search.snippet;
 
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
+import java.io.IOException;
 import java.net.MalformedURLException;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+
+import org.apache.solr.common.SolrDocument;
+import org.junit.Before;
+import org.junit.Test;
+
 import net.yacy.cora.document.encoding.ASCII;
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.federate.yacy.CacheStrategy;
-import net.yacy.cora.storage.HandleSet;
+import net.yacy.cora.protocol.Domains;
 import net.yacy.cora.util.CommonPattern;
+import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.kelondro.data.meta.URIMetadataNode;
 import net.yacy.search.query.QueryGoal;
 import net.yacy.search.query.SearchEvent;
 import net.yacy.search.schema.CollectionSchema;
-import org.apache.solr.common.SolrDocument;
-import static org.junit.Assert.*;
-import org.junit.Before;
-import org.junit.Test;
 
 public class TextSnippetTest {
@@ -55,12 +66,12 @@ public class TextSnippetTest {
         String querywords = "testcase line";
         QueryGoal qg = new QueryGoal(querywords);
-        HandleSet queryhashes = qg.getIncludeHashes();
 
         TextSnippet ts = new TextSnippet(
                 null,
                 testpage,
-                queryhashes,
+                qg.getIncludeWordsSet(),
+                qg.getIncludeHashes(),
                 cacheStrategy,
                 pre,
                 snippetMaxLength,
@@ -95,12 +106,12 @@ public class TextSnippetTest {
         String querywords = "testcase line";
         QueryGoal qg = new QueryGoal(querywords);
-        HandleSet queryhashes = qg.getIncludeHashes();
 
         TextSnippet ts = new TextSnippet(
                 null,
                 testpage,
-                queryhashes,
+                qg.getIncludeWordsSet(),
+                qg.getIncludeHashes(),
                 cacheStrategy,
                 pre,
                 snippetMaxLength,
@@ -166,4 +177,65 @@ public class TextSnippetTest {
         assertTrue ("number (.) broken up",sniptxt.contains("1.83"));
         assertTrue ("number (,) broken up",sniptxt.contains("3,14"));
     }
+
+    /**
+     * Run text snippet extraction from a given plain text file.
+     * @param args <ol><li>first element: the plain text file path. When not specified, "test/parsertest/umlaute_linux.txt" is used as default.</li>
+     * <li>other elements: the search terms. When not specified, "Maßkrügen" is used as default.</li>
+     * </ol>
+     * @throws IOException when a read/write error occurred
+     */
+    public static void main(final String args[]) throws IOException {
+        try {
+            final SolrDocument doc = new SolrDocument();
+            final DigestURL url = new DigestURL("http://localhost/page.html");
+            doc.addField(CollectionSchema.id.name(), ASCII.String(url.hash()));
+            doc.addField(CollectionSchema.sku.name(), url.toNormalform(false));
+            final URIMetadataNode urlEntry = new URIMetadataNode(doc);
+            urlEntry.addField(CollectionSchema.title.name(), "New test case");
+            urlEntry.addField(CollectionSchema.keywords.name(), "junit");
+            urlEntry.addField(CollectionSchema.author.name(), "test author");
+
+            final Path testFilePath;
+            if(args.length > 0) {
+                testFilePath = Paths.get(args[0]);
+            } else {
+                testFilePath = Paths.get("test/parsertest/umlaute_linux.txt");
+            }
+            urlEntry.addField(CollectionSchema.text_t.name(), new String(Files.readAllBytes(testFilePath),
+                    StandardCharsets.UTF_8));
+
+            final StringBuilder queryWords = new StringBuilder();
+            if(args.length > 1) {
+                for(int i = 1; i < args.length; i++) {
+                    if(queryWords.length() > 0) {
+                        queryWords.append(" ");
+                    }
+                    queryWords.append(args[i]);
+                }
+            } else {
+                queryWords.append("Maßkrügen");
+            }
+
+            final QueryGoal goal = new QueryGoal(queryWords.toString());
+            System.out.println("Extracting text snippet for terms \"" + queryWords + "\" from file " + testFilePath);
+
+            TextSnippet.statistics.setEnabled(true);
+            final TextSnippet snippet = new TextSnippet(null, urlEntry, goal.getIncludeWordsSet(), goal.getIncludeHashes(),
+                    CacheStrategy.CACHEONLY, false, SearchEvent.SNIPPET_MAX_LENGTH, false);
+
+            System.out.println("Snippet initialized in " + TextSnippet.statistics.getMaxInitTime() + "ms");
+            System.out.println("Snippet status : " + snippet.getErrorCode());
+            System.out.println("Snippet : " + snippet.descriptionline(goal));
+        } finally {
+            /* Shutdown running threads */
+            try {
+                Domains.close();
+            } finally {
+                ConcurrentLog.shutdown();
+            }
+        }
+    }
 }
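
The new main() makes it possible to exercise snippet extraction outside JUnit; presumably it can be launched from a YaCy build with something like the following (the classpath is environment-specific and not part of the commit):

    java -cp classes:lib/* net.yacy.search.snippet.TextSnippetTest test/parsertest/umlaute_linux.txt Maßkrügen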
