From e357ade47d116c9b65b413666e467f9826dbeb9b Mon Sep 17 00:00:00 2001 From: luccioman Date: Sun, 13 May 2018 10:29:52 +0200 Subject: [PATCH] Reduced memory footprint of text snippet extraction By not parsing and storing at first all sentences of a document, but only on the fly the ones necessary to compute the snippet. --- source/net/yacy/document/SentenceReader.java | 63 ++++++++++++++++--- .../net/yacy/document/SnippetExtractor.java | 7 +-- .../net/yacy/search/snippet/TextSnippet.java | 50 +++++++++------ .../yacy/search/snippet/TextSnippetTest.java | 24 ++++++- 4 files changed, 111 insertions(+), 33 deletions(-) diff --git a/source/net/yacy/document/SentenceReader.java b/source/net/yacy/document/SentenceReader.java index 18317bcaf..a8af87d25 100644 --- a/source/net/yacy/document/SentenceReader.java +++ b/source/net/yacy/document/SentenceReader.java @@ -24,35 +24,67 @@ package net.yacy.document; +import java.util.ArrayList; import java.util.Iterator; +import java.util.List; +/** + * Read sentences from a given text. + * This enumerates StringBuilder objects. + */ public class SentenceReader implements Iterator, Iterable { - // read sentences from a given input stream - // this enumerates StringBuilder objects + /** Holds the next element */ private StringBuilder buffer; + + /** List of already parsed sentences, eventually in addition to those extracted from the main text. */ + private List parsedSentences; + + /** Current position in the parsedSentences list. */ + private int sentencesPos; + + /** The main text to parse for sentences */ private String text; + + /** The current character position in the main text */ private int pos; + + /** When true sentences can not include line break characters */ private boolean pre = false; public SentenceReader(final String text) { - assert text != null; - this.text = text; - this.pos = 0; - this.pre = false; - this.buffer = nextElement0(); + this(new ArrayList<>(), text, false); } public SentenceReader(final String text, final boolean pre) { - this(text); + this(new ArrayList<>(), text, pre); + } + + public SentenceReader(final List parsedSentences, final String text, final boolean pre) { + assert text != null; + this.text = text; + this.pos = 0; this.pre = pre; + if(parsedSentences == null) { + this.parsedSentences = new ArrayList<>(); + } else { + this.parsedSentences = parsedSentences; + } + this.sentencesPos = 0; + this.buffer = nextElement0(); } - + public void pre(final boolean x) { this.pre = x; } private StringBuilder nextElement0() { + if(this.sentencesPos < this.parsedSentences.size()) { + final StringBuilder element = this.parsedSentences.get(this.sentencesPos); + this.sentencesPos++; + return element; + } + final StringBuilder s = new StringBuilder(80); int nextChar; char c, lc = ' '; // starting with ' ' as last character prevents that the result string starts with a ' ' @@ -73,6 +105,9 @@ public class SentenceReader implements Iterator, Iterable, Iterable iterator() { return this; } + + /** + * Reset the iterator position to zero + */ + public void reset() { + /* Reset only the sentences position to reuse already parsed sentences */ + this.sentencesPos = 0; + this.buffer = nextElement0(); + } public synchronized void close() { this.text = null; + this.parsedSentences = null; } public static void main(String[] args) { diff --git a/source/net/yacy/document/SnippetExtractor.java b/source/net/yacy/document/SnippetExtractor.java index 4302b4cdd..94a8ca5c1 100644 --- a/source/net/yacy/document/SnippetExtractor.java +++ b/source/net/yacy/document/SnippetExtractor.java @@ -20,7 +20,6 @@ package net.yacy.document; -import java.util.Collection; import java.util.HashSet; import java.util.Iterator; import java.util.Map; @@ -35,8 +34,8 @@ public class SnippetExtractor { private Set remainingTerms; - public SnippetExtractor(final Collection sentences, final Set queryTerms, int maxLength) throws UnsupportedOperationException { - if (sentences == null) throw new UnsupportedOperationException("sentence == null"); + public SnippetExtractor(final Iterable sentences, final Set queryTerms, int maxLength) throws UnsupportedOperationException { + if (sentences == null) throw new UnsupportedOperationException("sentences == null"); if (queryTerms == null || queryTerms.isEmpty()) throw new UnsupportedOperationException("queryTerms == null"); SortedMap hs; final TreeMap order = new TreeMap(); @@ -45,7 +44,7 @@ public class SnippetExtractor { TreeSet positions; int linenumber = 0; int fullmatchcounter = 0; - lookup: for (final StringBuilder sentence: sentences) { + lookup: for(final StringBuilder sentence : sentences) { hs = WordTokenizer.tokenizeSentence(sentence.toString(), 100); positions = new TreeSet(); for (final String word: queryTerms) { diff --git a/source/net/yacy/search/snippet/TextSnippet.java b/source/net/yacy/search/snippet/TextSnippet.java index b20f77623..192015faa 100644 --- a/source/net/yacy/search/snippet/TextSnippet.java +++ b/source/net/yacy/search/snippet/TextSnippet.java @@ -47,6 +47,7 @@ import net.yacy.crawler.retrieval.Request; import net.yacy.crawler.retrieval.Response; import net.yacy.document.Document; import net.yacy.document.Parser; +import net.yacy.document.SentenceReader; import net.yacy.document.SnippetExtractor; import net.yacy.document.WordTokenizer; import net.yacy.document.parser.html.CharacterCoding; @@ -202,7 +203,8 @@ public class TextSnippet implements Comparable, Comparator remainingTerms = new HashSet<>(queryTerms); - List sentences = null; + SentenceReader sentences = null; + List firstSentencesList = null; // try to get the snippet from metadata removeMatchingTerms(row.url().toTokens(), remainingTerms); @@ -214,15 +216,17 @@ public class TextSnippet implements Comparable, Comparator solrdesc = row.getDescription(); + final ArrayList solrdesc = row.getDescription(); if (!solrdesc.isEmpty()) { // include description_txt (similar to solr highlighting config) - sentences = new ArrayList(); - for (String s:solrdesc) sentences.add(new StringBuilder(s)); + firstSentencesList = new ArrayList<>(); + for (final String s : solrdesc) { + firstSentencesList.add(new StringBuilder(s)); + } } final String solrText = row.getText(); if (solrText != null && solrText.length() > 0) { // TODO: instead of join with desc, we could check if snippet already complete and skip further computation // compute sentences from solr query - if (sentences == null) sentences = row.getSentences(pre); else sentences.addAll(row.getSentences(pre)); + sentences = new SentenceReader(firstSentencesList, solrText, pre); } else if (net.yacy.crawler.data.Cache.has(url.hash())) { // get the sentences from the cache final Request request = loader == null ? null : loader.request(url, true, reindexing); @@ -236,7 +240,7 @@ public class TextSnippet implements Comparable, Comparator, Comparator 0) { + if (sentences.iterator().hasNext()) { try { final SnippetExtractor tsr = new SnippetExtractor(sentences, remainingTerms, snippetMaxLength); textline = tsr.getSnippet(); @@ -265,30 +269,38 @@ public class TextSnippet implements Comparable, Comparator 0) { - // compute sentences from solr query - sentences = row.getSentences(pre); - } - if (sentences == null || sentences.size() == 0) { + if(sentences == null) { + String solrText = row.getText(); + if (solrText != null && solrText.length() > 0) { + // compute sentences from solr query + sentences = new SentenceReader(firstSentencesList, solrText, pre); + } + } else { + sentences.reset(); + } + if (sentences == null || (!sentences.iterator().hasNext())) { textline = row.dc_subject(); } else { // use the first lines from the text after the h1 tag as snippet // get first the h1 tag List h1 = row.h1(); - if (h1 != null && h1.size() > 0 && sentences.size() > 2) { + if (h1 != null && h1.size() > 0) { // find first appearance of first h1 in sentences and then take the next sentence String h1s = h1.get(0); if (h1s.length() > 0) { - solrsearch: for (int i = 0; i < sentences.size() - 2; i++) { - if (sentences.get(i).toString().startsWith(h1s)) { - textline = sentences.get(i + 1).toString(); + String prevSentence = null, currentSentence; + solrsearch: for (final StringBuilder sentence: sentences) { + currentSentence = sentence.toString(); + if (prevSentence != null && prevSentence.startsWith(h1s)) { + textline = currentSentence; break solrsearch; } + prevSentence = currentSentence; } } } if (textline == null) { + sentences.reset(); final StringBuilder s = new StringBuilder(snippetMaxLength); for (final StringBuilder t: sentences) { s.append(t).append(' '); @@ -344,10 +356,10 @@ public class TextSnippet implements Comparable, Comparator