Reduced memory footprint of text snippet extraction

Instead of parsing and storing all sentences of a document up front, only
the sentences actually needed to compute the snippet are parsed, on the fly.
pull/137/head
luccioman 7 years ago
parent e115e57cc7
commit e357ade47d

@ -24,28 +24,54 @@
package net.yacy.document; package net.yacy.document;
import java.util.ArrayList;
import java.util.Iterator; import java.util.Iterator;
import java.util.List;
/**
* Read sentences from a given text.
* This enumerates StringBuilder objects.
*/
public class SentenceReader implements Iterator<StringBuilder>, Iterable<StringBuilder> { public class SentenceReader implements Iterator<StringBuilder>, Iterable<StringBuilder> {
// read sentences from a given input stream
// this enumerates StringBuilder objects
/** Holds the next element */
private StringBuilder buffer; private StringBuilder buffer;
/** List of already parsed sentences, eventually in addition to those extracted from the main text. */
private List<StringBuilder> parsedSentences;
/** Current position in the parsedSentences list. */
private int sentencesPos;
/** The main text to parse for sentences */
private String text; private String text;
/** The current character position in the main text */
private int pos; private int pos;
/** When true sentences can not include line break characters */
private boolean pre = false; private boolean pre = false;
public SentenceReader(final String text) { public SentenceReader(final String text) {
assert text != null; this(new ArrayList<>(), text, false);
this.text = text;
this.pos = 0;
this.pre = false;
this.buffer = nextElement0();
} }
public SentenceReader(final String text, final boolean pre) { public SentenceReader(final String text, final boolean pre) {
this(text); this(new ArrayList<>(), text, pre);
}
public SentenceReader(final List<StringBuilder> parsedSentences, final String text, final boolean pre) {
assert text != null;
this.text = text;
this.pos = 0;
this.pre = pre; this.pre = pre;
if(parsedSentences == null) {
this.parsedSentences = new ArrayList<>();
} else {
this.parsedSentences = parsedSentences;
}
this.sentencesPos = 0;
this.buffer = nextElement0();
} }
public void pre(final boolean x) { public void pre(final boolean x) {
@ -53,6 +79,12 @@ public class SentenceReader implements Iterator<StringBuilder>, Iterable<StringB
} }
private StringBuilder nextElement0() { private StringBuilder nextElement0() {
if(this.sentencesPos < this.parsedSentences.size()) {
final StringBuilder element = this.parsedSentences.get(this.sentencesPos);
this.sentencesPos++;
return element;
}
final StringBuilder s = new StringBuilder(80); final StringBuilder s = new StringBuilder(80);
int nextChar; int nextChar;
char c, lc = ' '; // starting with ' ' as last character prevents that the result string starts with a ' ' char c, lc = ' '; // starting with ' ' as last character prevents that the result string starts with a ' '
@ -73,6 +105,9 @@ public class SentenceReader implements Iterator<StringBuilder>, Iterable<StringB
s.trimToSize(); s.trimToSize();
s.deleteCharAt(s.length() - 1); s.deleteCharAt(s.length() - 1);
} }
/* Add to parsed sentences list for eventual reuse after a reset */
this.parsedSentences.add(s);
this.sentencesPos++;
return s; return s;
} }
@ -119,8 +154,18 @@ public class SentenceReader implements Iterator<StringBuilder>, Iterable<StringB
return this; return this;
} }
/**
 * Resets the iterator back to the first sentence. Only the position in the
 * internal list of already parsed sentences is rewound, so sentences that
 * were extracted before are served again from that cache instead of being
 * re-parsed from the main text.
 */
public void reset() {
/* Reset only the sentences position to reuse already parsed sentences */
this.sentencesPos = 0;
this.buffer = nextElement0();
}
public synchronized void close() { public synchronized void close() {
this.text = null; this.text = null;
this.parsedSentences = null;
} }
public static void main(String[] args) { public static void main(String[] args) {

@ -20,7 +20,6 @@
package net.yacy.document; package net.yacy.document;
import java.util.Collection;
import java.util.HashSet; import java.util.HashSet;
import java.util.Iterator; import java.util.Iterator;
import java.util.Map; import java.util.Map;
@ -35,8 +34,8 @@ public class SnippetExtractor {
private Set<String> remainingTerms; private Set<String> remainingTerms;
public SnippetExtractor(final Collection<StringBuilder> sentences, final Set<String> queryTerms, int maxLength) throws UnsupportedOperationException { public SnippetExtractor(final Iterable<StringBuilder> sentences, final Set<String> queryTerms, int maxLength) throws UnsupportedOperationException {
if (sentences == null) throw new UnsupportedOperationException("sentence == null"); if (sentences == null) throw new UnsupportedOperationException("sentences == null");
if (queryTerms == null || queryTerms.isEmpty()) throw new UnsupportedOperationException("queryTerms == null"); if (queryTerms == null || queryTerms.isEmpty()) throw new UnsupportedOperationException("queryTerms == null");
SortedMap<String, Integer> hs; SortedMap<String, Integer> hs;
final TreeMap<Long, StringBuilder> order = new TreeMap<Long, StringBuilder>(); final TreeMap<Long, StringBuilder> order = new TreeMap<Long, StringBuilder>();

@ -47,6 +47,7 @@ import net.yacy.crawler.retrieval.Request;
import net.yacy.crawler.retrieval.Response; import net.yacy.crawler.retrieval.Response;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
import net.yacy.document.SentenceReader;
import net.yacy.document.SnippetExtractor; import net.yacy.document.SnippetExtractor;
import net.yacy.document.WordTokenizer; import net.yacy.document.WordTokenizer;
import net.yacy.document.parser.html.CharacterCoding; import net.yacy.document.parser.html.CharacterCoding;
@ -202,7 +203,8 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
// this requires that the document is parsed after loading // this requires that the document is parsed after loading
String textline = null; String textline = null;
Set<String> remainingTerms = new HashSet<>(queryTerms); Set<String> remainingTerms = new HashSet<>(queryTerms);
List<StringBuilder> sentences = null; SentenceReader sentences = null;
List<StringBuilder> firstSentencesList = null;
// try to get the snippet from metadata // try to get the snippet from metadata
removeMatchingTerms(row.url().toTokens(), remainingTerms); removeMatchingTerms(row.url().toTokens(), remainingTerms);
@ -214,15 +216,17 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
// we did not find everything in the metadata, look further into the document itself. // we did not find everything in the metadata, look further into the document itself.
// first acquire the sentences (from description/abstract or text): // first acquire the sentences (from description/abstract or text):
ArrayList<String> solrdesc = row.getDescription(); final ArrayList<String> solrdesc = row.getDescription();
if (!solrdesc.isEmpty()) { // include description_txt (similar to solr highlighting config) if (!solrdesc.isEmpty()) { // include description_txt (similar to solr highlighting config)
sentences = new ArrayList<StringBuilder>(); firstSentencesList = new ArrayList<>();
for (String s:solrdesc) sentences.add(new StringBuilder(s)); for (final String s : solrdesc) {
firstSentencesList.add(new StringBuilder(s));
}
} }
final String solrText = row.getText(); final String solrText = row.getText();
if (solrText != null && solrText.length() > 0) { // TODO: instead of join with desc, we could check if snippet already complete and skip further computation if (solrText != null && solrText.length() > 0) { // TODO: instead of join with desc, we could check if snippet already complete and skip further computation
// compute sentences from solr query // compute sentences from solr query
if (sentences == null) sentences = row.getSentences(pre); else sentences.addAll(row.getSentences(pre)); sentences = new SentenceReader(firstSentencesList, solrText, pre);
} else if (net.yacy.crawler.data.Cache.has(url.hash())) { } else if (net.yacy.crawler.data.Cache.has(url.hash())) {
// get the sentences from the cache // get the sentences from the cache
final Request request = loader == null ? null : loader.request(url, true, reindexing); final Request request = loader == null ? null : loader.request(url, true, reindexing);
@ -236,7 +240,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
if (response != null) { if (response != null) {
try { try {
document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse()); document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse());
sentences = document.getSentences(pre); sentences = new SentenceReader(firstSentencesList, document.getTextString(), pre);
response = null; response = null;
document = null; document = null;
} catch (final Parser.Failure e) { } catch (final Parser.Failure e) {
@ -249,7 +253,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
return; return;
} }
if (sentences.size() > 0) { if (sentences.iterator().hasNext()) {
try { try {
final SnippetExtractor tsr = new SnippetExtractor(sentences, remainingTerms, snippetMaxLength); final SnippetExtractor tsr = new SnippetExtractor(sentences, remainingTerms, snippetMaxLength);
textline = tsr.getSnippet(); textline = tsr.getSnippet();
@ -265,30 +269,38 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
// we found the snippet or the query is fully included in the headline or url // we found the snippet or the query is fully included in the headline or url
if (textline == null || textline.length() == 0) { if (textline == null || textline.length() == 0) {
// this is the case where we don't have a snippet because all search words are included in the headline or the url // this is the case where we don't have a snippet because all search words are included in the headline or the url
if(sentences == null) {
String solrText = row.getText(); String solrText = row.getText();
if (solrText != null && solrText.length() > 0) { if (solrText != null && solrText.length() > 0) {
// compute sentences from solr query // compute sentences from solr query
sentences = row.getSentences(pre); sentences = new SentenceReader(firstSentencesList, solrText, pre);
} }
if (sentences == null || sentences.size() == 0) { } else {
sentences.reset();
}
if (sentences == null || (!sentences.iterator().hasNext())) {
textline = row.dc_subject(); textline = row.dc_subject();
} else { } else {
// use the first lines from the text after the h1 tag as snippet // use the first lines from the text after the h1 tag as snippet
// get first the h1 tag // get first the h1 tag
List<String> h1 = row.h1(); List<String> h1 = row.h1();
if (h1 != null && h1.size() > 0 && sentences.size() > 2) { if (h1 != null && h1.size() > 0) {
// find first appearance of first h1 in sentences and then take the next sentence // find first appearance of first h1 in sentences and then take the next sentence
String h1s = h1.get(0); String h1s = h1.get(0);
if (h1s.length() > 0) { if (h1s.length() > 0) {
solrsearch: for (int i = 0; i < sentences.size() - 2; i++) { String prevSentence = null, currentSentence;
if (sentences.get(i).toString().startsWith(h1s)) { solrsearch: for (final StringBuilder sentence: sentences) {
textline = sentences.get(i + 1).toString(); currentSentence = sentence.toString();
if (prevSentence != null && prevSentence.startsWith(h1s)) {
textline = currentSentence;
break solrsearch; break solrsearch;
} }
prevSentence = currentSentence;
} }
} }
} }
if (textline == null) { if (textline == null) {
sentences.reset();
final StringBuilder s = new StringBuilder(snippetMaxLength); final StringBuilder s = new StringBuilder(snippetMaxLength);
for (final StringBuilder t: sentences) { for (final StringBuilder t: sentences) {
s.append(t).append(' '); s.append(t).append(' ');
@ -344,10 +356,10 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
} }
// compute sentences from parsed document // compute sentences from parsed document
sentences = document.getSentences(pre); sentences = new SentenceReader(document.getTextString(), pre);
document.close(); document.close();
if (sentences == null) { if (!sentences.hasNext()) {
init(url, null, false, ResultClass.ERROR_PARSER_NO_LINES, "parser returned no sentences", beginTime); init(url, null, false, ResultClass.ERROR_PARSER_NO_LINES, "parser returned no sentences", beginTime);
return; return;
} }

@ -88,7 +88,29 @@ public class TextSnippetTest {
for (String word : wordlist) { for (String word : wordlist) {
assertTrue("testTextSnippet word included " + word, rstr.contains(word)); assertTrue("testTextSnippet word included " + word, rstr.contains(word));
} }
}
/**
* Test snippet extraction when only document title matches searched terms.
* @throws MalformedURLException when the test document URL is malformed. Should not happen.
*/
@Test
public void testTextSnippetMatchTitle() throws MalformedURLException {
final URIMetadataNode testDoc = new URIMetadataNode(doc);
testDoc.addField(CollectionSchema.title.name(), "New test case title");
testDoc.addField(CollectionSchema.keywords.name(), "junit");
testDoc.addField(CollectionSchema.author.name(), "test author");
testDoc.addField(CollectionSchema.text_t.name(),
"A new testcase has been introduced. " + "It includes a few test lines but only title should match.");
final String querywords = "title";
final QueryGoal qg = new QueryGoal(querywords);
final TextSnippet ts = new TextSnippet(null, testDoc, qg.getIncludeWordsSet(), qg.getIncludeHashes(),
cacheStrategy, pre, snippetMaxLength, reindexing);
assertEquals("testTextSnippet Error Code: ", "", ts.getError());
assertTrue("Snippet line should be extracted from first text lines.",
ts.getLineRaw().startsWith("A new testcase has been introduced."));
} }
/** /**

Loading…
Cancel
Save