From e357ade47d116c9b65b413666e467f9826dbeb9b Mon Sep 17 00:00:00 2001
From: luccioman <luccioman@users.noreply.github.com>
Date: Sun, 13 May 2018 10:29:52 +0200
Subject: [PATCH] Reduced memory footprint of text snippet extraction

Instead of first parsing and storing all sentences of a document, parse
on the fly only the sentences needed to compute the snippet.
---
 source/net/yacy/document/SentenceReader.java  | 63 ++++++++++++++++---
 .../net/yacy/document/SnippetExtractor.java   |  7 +--
 .../net/yacy/search/snippet/TextSnippet.java  | 50 +++++++++------
 .../yacy/search/snippet/TextSnippetTest.java  | 24 ++++++-
 4 files changed, 111 insertions(+), 33 deletions(-)
diff --git a/source/net/yacy/document/SentenceReader.java b/source/net/yacy/document/SentenceReader.java
index 18317bcaf..a8af87d25 100644
--- a/source/net/yacy/document/SentenceReader.java
+++ b/source/net/yacy/document/SentenceReader.java
@@ -24,35 +24,67 @@
 
 package net.yacy.document;
 
+import java.util.ArrayList;
 import java.util.Iterator;
+import java.util.List;
 
+/**
+ * Read sentences from a given text.
+ * This enumerates StringBuilder objects. 
+ */
 public class SentenceReader implements Iterator<StringBuilder>, Iterable<StringBuilder> {
-    // read sentences from a given input stream
-    // this enumerates StringBuilder objects
 
+	/** Holds the next element */
     private StringBuilder buffer;
+    
+    /** List of already parsed sentences: those optionally supplied at construction, followed by the ones extracted so far from the main text. */
+    private List<StringBuilder> parsedSentences;
+    
+    /** Current position in the parsedSentences list. */
+    private int sentencesPos;
+    
+    /** The main text to parse for sentences */
     private String text;
+    
+    /** The current character position in the main text */
     private int pos;
+    
+    /** When true, sentences cannot include line break characters */
     private boolean pre = false;
 
     public SentenceReader(final String text) {
-    	assert text != null;
-        this.text = text;
-        this.pos = 0;
-        this.pre = false;
-        this.buffer = nextElement0();
+    	this(new ArrayList<>(), text, false);
     }
 
     public SentenceReader(final String text, final boolean pre) {
-    	this(text);
+    	this(new ArrayList<>(), text, pre);
+    }
+    
+    public SentenceReader(final List<StringBuilder> parsedSentences, final String text, final boolean pre) {
+    	assert text != null;
+        this.text = text;
+        this.pos = 0;
         this.pre = pre;
+        if(parsedSentences == null) {
+        	this.parsedSentences = new ArrayList<>();
+        } else {
+        	this.parsedSentences = parsedSentences;
+        }
+        this.sentencesPos = 0;
+        this.buffer = nextElement0();
     }
-
+    
     public void pre(final boolean x) {
         this.pre = x;
     }
 
     private StringBuilder nextElement0() {
+    	if(this.sentencesPos < this.parsedSentences.size()) {
+    		final StringBuilder element = this.parsedSentences.get(this.sentencesPos);
+    		this.sentencesPos++;
+    		return element;
+    	}
+    	
         final StringBuilder s = new StringBuilder(80);
         int nextChar;
         char c, lc = ' '; // starting with ' ' as last character prevents that the result string starts with a ' '
@@ -73,6 +105,9 @@ public class SentenceReader implements Iterator<StringBuilder>, Iterable<StringB
             s.trimToSize();
             s.deleteCharAt(s.length() - 1);
         }
+        /* Add to the parsed sentences list for possible reuse after a reset */
+        this.parsedSentences.add(s);
+        this.sentencesPos++;
         return s;
     }
 
@@ -118,9 +153,19 @@ public class SentenceReader implements Iterator<StringBuilder>, Iterable<StringB
     public Iterator<StringBuilder> iterator() {
         return this;
     }
+    
+    /**
+     * Reset the iterator position to zero
+     */
+    public void reset() {
+   		/* Reset only the sentences position to reuse already parsed sentences */
+   		this.sentencesPos = 0;
+   		this.buffer = nextElement0();
+    }
 
     public synchronized void close() {
     	this.text = null;
+    	this.parsedSentences = null;
     }
 
     public static void main(String[] args) {
diff --git a/source/net/yacy/document/SnippetExtractor.java b/source/net/yacy/document/SnippetExtractor.java
index 4302b4cdd..94a8ca5c1 100644
--- a/source/net/yacy/document/SnippetExtractor.java
+++ b/source/net/yacy/document/SnippetExtractor.java
@@ -20,7 +20,6 @@
 
 package net.yacy.document;
 
-import java.util.Collection;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.Map;
@@ -35,8 +34,8 @@ public class SnippetExtractor {
     private Set<String> remainingTerms;
 
     
-    public SnippetExtractor(final Collection<StringBuilder> sentences, final Set<String> queryTerms, int maxLength) throws UnsupportedOperationException {
-        if (sentences == null) throw new UnsupportedOperationException("sentence == null");
+    public SnippetExtractor(final Iterable<StringBuilder> sentences, final Set<String> queryTerms, int maxLength) throws UnsupportedOperationException {
+        if (sentences == null) throw new UnsupportedOperationException("sentences == null");
         if (queryTerms == null || queryTerms.isEmpty()) throw new UnsupportedOperationException("queryTerms == null");
         SortedMap<String, Integer> hs;
         final TreeMap<Long, StringBuilder> order = new TreeMap<Long, StringBuilder>();
@@ -45,7 +44,7 @@ public class SnippetExtractor {
         TreeSet<Integer> positions;
         int linenumber = 0;
         int fullmatchcounter = 0;
-        lookup: for (final StringBuilder sentence: sentences) {
+        lookup: for(final StringBuilder sentence : sentences) {
             hs = WordTokenizer.tokenizeSentence(sentence.toString(), 100);
             positions = new TreeSet<Integer>();
             for (final String word: queryTerms) {
diff --git a/source/net/yacy/search/snippet/TextSnippet.java b/source/net/yacy/search/snippet/TextSnippet.java
index b20f77623..192015faa 100644
--- a/source/net/yacy/search/snippet/TextSnippet.java
+++ b/source/net/yacy/search/snippet/TextSnippet.java
@@ -47,6 +47,7 @@ import net.yacy.crawler.retrieval.Request;
 import net.yacy.crawler.retrieval.Response;
 import net.yacy.document.Document;
 import net.yacy.document.Parser;
+import net.yacy.document.SentenceReader;
 import net.yacy.document.SnippetExtractor;
 import net.yacy.document.WordTokenizer;
 import net.yacy.document.parser.html.CharacterCoding;
@@ -202,7 +203,8 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
         // this requires that the document is parsed after loading
         String textline = null;
         Set<String> remainingTerms = new HashSet<>(queryTerms);
-        List<StringBuilder> sentences = null;
+        SentenceReader sentences = null;
+        List<StringBuilder> firstSentencesList = null;
         
         // try to get the snippet from metadata
         removeMatchingTerms(row.url().toTokens(), remainingTerms);
@@ -214,15 +216,17 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
             // we did not find everything in the metadata, look further into the document itself.
 
             // first acquire the sentences (from description/abstract or text):
-            ArrayList<String> solrdesc = row.getDescription();
+            final ArrayList<String> solrdesc = row.getDescription();
             if (!solrdesc.isEmpty()) { // include description_txt (similar to solr highlighting config)
-                sentences = new ArrayList<StringBuilder>();
-                for (String s:solrdesc) sentences.add(new StringBuilder(s));
+            	firstSentencesList = new ArrayList<>();
+                for (final String s : solrdesc) {
+                	firstSentencesList.add(new StringBuilder(s));
+                }
             }
             final String solrText = row.getText();
             if (solrText != null && solrText.length() > 0) { // TODO: instead of join with desc, we could check if snippet already complete and skip further computation
                 // compute sentences from solr query
-                if (sentences == null) sentences = row.getSentences(pre); else sentences.addAll(row.getSentences(pre));
+               	sentences = new SentenceReader(firstSentencesList, solrText, pre);
             } else if (net.yacy.crawler.data.Cache.has(url.hash())) {
                 // get the sentences from the cache
                 final Request request = loader == null ? null : loader.request(url, true, reindexing);
@@ -236,7 +240,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
                 if (response != null) {
                     try {
                         document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse());
-                        sentences = document.getSentences(pre);
+                        sentences = new SentenceReader(firstSentencesList, document.getTextString(), pre);
                         response = null;
                         document = null;
                     } catch (final Parser.Failure e) {
@@ -249,7 +253,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
                 return;
             }
 
-            if (sentences.size() > 0) {
+            if (sentences.iterator().hasNext()) {
                 try {
                     final SnippetExtractor tsr = new SnippetExtractor(sentences, remainingTerms, snippetMaxLength);
                     textline = tsr.getSnippet();
@@ -265,30 +269,38 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
             // we found the snippet or the query is fully included in the headline or url
             if (textline == null || textline.length() == 0) {
                 // this is the case where we don't have a snippet because all search words are included in the headline or the url
-                String solrText = row.getText();
-                if (solrText != null && solrText.length() > 0) {
-                    // compute sentences from solr query
-                    sentences = row.getSentences(pre);
-                }
-                if (sentences == null || sentences.size() == 0) {
+            	if(sentences == null) {
+            		String solrText = row.getText();
+            		if (solrText != null && solrText.length() > 0) {
+            			// compute sentences from solr query
+            			sentences = new SentenceReader(firstSentencesList, solrText, pre);
+            		}
+            	} else {
+                	sentences.reset();
+            	}
+                if (sentences == null || (!sentences.iterator().hasNext())) {
                     textline = row.dc_subject();
                 } else {
                     // use the first lines from the text after the h1 tag as snippet
                     // get first the h1 tag
                     List<String> h1 = row.h1();
-                    if (h1 != null && h1.size() > 0 && sentences.size() > 2) {
+                    if (h1 != null && h1.size() > 0) {
                         // find first appearance of first h1 in sentences and then take the next sentence
                         String h1s = h1.get(0);
                         if (h1s.length() > 0) {
-                            solrsearch: for (int i = 0; i < sentences.size() - 2; i++) {
-                                if (sentences.get(i).toString().startsWith(h1s)) {
-                                    textline = sentences.get(i + 1).toString();
+                        	String prevSentence = null, currentSentence;
+                            solrsearch: for (final StringBuilder sentence: sentences) {
+                            	currentSentence = sentence.toString();
+                                if (prevSentence != null && prevSentence.startsWith(h1s)) {
+                                    textline = currentSentence;
                                     break solrsearch;
                                 }
+                                prevSentence = currentSentence;
                             }
                         }
                     }
                     if (textline == null) {
+                    	sentences.reset();
                         final StringBuilder s = new StringBuilder(snippetMaxLength);
                         for (final StringBuilder t: sentences) {
                         	s.append(t).append(' ');
@@ -344,10 +356,10 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
         }
 
         // compute sentences from parsed document
-        sentences = document.getSentences(pre);
+        sentences = new SentenceReader(document.getTextString(), pre);
         document.close();
 
-        if (sentences == null) {
+        if (!sentences.hasNext()) {
             init(url, null, false, ResultClass.ERROR_PARSER_NO_LINES, "parser returned no sentences", beginTime);
             return;
         }
diff --git a/test/java/net/yacy/search/snippet/TextSnippetTest.java b/test/java/net/yacy/search/snippet/TextSnippetTest.java
index f572ee0e2..2d12c5d24 100644
--- a/test/java/net/yacy/search/snippet/TextSnippetTest.java
+++ b/test/java/net/yacy/search/snippet/TextSnippetTest.java
@@ -88,8 +88,30 @@ public class TextSnippetTest {
         for (String word : wordlist) {
             assertTrue("testTextSnippet word included " + word, rstr.contains(word));
         }
-
     }
+    
+    /**
+     * Test snippet extraction when only document title matches searched terms.
+     * @throws MalformedURLException when the test document URL is malformed. Should not happen.
+     */
+	@Test
+	public void testTextSnippetMatchTitle() throws MalformedURLException {
+		final URIMetadataNode testDoc = new URIMetadataNode(doc);
+		testDoc.addField(CollectionSchema.title.name(), "New test case title");
+		testDoc.addField(CollectionSchema.keywords.name(), "junit");
+		testDoc.addField(CollectionSchema.author.name(), "test author");
+		testDoc.addField(CollectionSchema.text_t.name(),
+				"A new testcase has been introduced. " + "It includes a few test lines but only title should match.");
+
+		final String querywords = "title";
+		final QueryGoal qg = new QueryGoal(querywords);
+
+		final TextSnippet ts = new TextSnippet(null, testDoc, qg.getIncludeWordsSet(), qg.getIncludeHashes(),
+				cacheStrategy, pre, snippetMaxLength, reindexing);
+		assertEquals("testTextSnippet Error Code: ", "", ts.getError());
+		assertTrue("Snippet line should be extracted from first text lines.",
+				ts.getLineRaw().startsWith("A new testcase has been introduced."));
+	}
 
     /**
      * Test of getLineMarked method, of class TextSnippet.