From 9bece5ac5f7b9e289d7db26b88fa6d9b2328b16b Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Fri, 17 Aug 2012 14:22:07 +0200 Subject: [PATCH] enhanced snippet fetch - removed a bug that caused documents to be parsed even if a solr text was available --- .../kelondro/data/meta/URIMetadataNode.java | 4 + .../net/yacy/search/query/SnippetProcess.java | 21 +++--- .../net/yacy/search/snippet/TextSnippet.java | 75 +++++++++++-------- 3 files changed, 58 insertions(+), 42 deletions(-) diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java index 2c86756c0..25936eb5e 100644 --- a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java +++ b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java @@ -99,6 +99,10 @@ public class URIMetadataNode implements URIMetadata { this.ranking = ranking; } + public SolrDocument getDocument() { + return this.doc; + } + private int getInt(YaCySchema field) { Integer x = (Integer) this.doc.getFieldValue(field.name()); if (x == null) return 0; diff --git a/source/net/yacy/search/query/SnippetProcess.java b/source/net/yacy/search/query/SnippetProcess.java index 23fc76a01..4498ac730 100644 --- a/source/net/yacy/search/query/SnippetProcess.java +++ b/source/net/yacy/search/query/SnippetProcess.java @@ -35,7 +35,6 @@ import net.yacy.cora.document.ASCII; import net.yacy.cora.document.Classification; import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.protocol.ResponseHeader; -import net.yacy.cora.services.federated.solr.SolrConnector; import net.yacy.cora.services.federated.yacy.CacheStrategy; import net.yacy.cora.sorting.ScoreMap; import net.yacy.cora.sorting.WeakPriorityBlockingQueue; @@ -45,6 +44,7 @@ import net.yacy.cora.storage.HandleSet; import net.yacy.cora.util.SpaceExceededException; import net.yacy.document.Condenser; import net.yacy.kelondro.data.meta.URIMetadata; +import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.index.RowHandleSet; import net.yacy.kelondro.logging.Log; @@ -54,6 +54,7 @@ import net.yacy.peers.graphics.ProfilingGraph; import net.yacy.repository.LoaderDispatcher; import net.yacy.search.EventTracker; import net.yacy.search.Switchboard; +import net.yacy.search.index.MetadataRepository; import net.yacy.search.index.Segment; import net.yacy.search.snippet.MediaSnippet; import net.yacy.search.snippet.ResultEntry; @@ -439,7 +440,7 @@ public class SnippetProcess { private final CacheStrategy cacheStrategy; private final int neededResults; private boolean shallrun; - private final SolrConnector solr; + private final MetadataRepository metadata; public Worker(final long maxlifetime, final CacheStrategy cacheStrategy, final int neededResults) { this.cacheStrategy = cacheStrategy; @@ -447,7 +448,7 @@ public class SnippetProcess { this.timeout = System.currentTimeMillis() + Math.max(1000, maxlifetime); this.neededResults = neededResults; this.shallrun = true; - this.solr = SnippetProcess.this.rankingProcess.getQuery().getSegment().getSolr(); + this.metadata = SnippetProcess.this.rankingProcess.getQuery().getSegment().urlMetadata(); } @Override @@ -496,16 +497,18 @@ public class SnippetProcess { // in case that we have an attached solr, we load also the solr document String solrContent = null; - if (this.solr != null) { - SolrDocument sd = null; + SolrDocument sd = null; + if (page instanceof URIMetadataNode) { + sd = ((URIMetadataNode) page).getDocument(); + } else { try { - sd = this.solr.get(ASCII.String(page.hash())); + sd = this.metadata.getSolr().get(ASCII.String(page.hash())); } catch (IOException e) { Log.logException(e); } - if (sd != null) { - solrContent = Switchboard.getSwitchboard().index.getSolrScheme().solrGetText(sd); - } + } + if (sd != null) { + solrContent = this.metadata.getSolrScheme().solrGetText(sd); } resultEntry = fetchSnippet(page, solrContent, this.cacheStrategy); // does not fetch snippets if snippetMode == 0 diff --git a/source/net/yacy/search/snippet/TextSnippet.java b/source/net/yacy/search/snippet/TextSnippet.java index 9f55a2cad..167316047 100644 --- a/source/net/yacy/search/snippet/TextSnippet.java +++ b/source/net/yacy/search/snippet/TextSnippet.java @@ -179,37 +179,24 @@ public class TextSnippet implements Comparable, Comparator sentences = null; - // try the solr text first - if (solrText != null) { - // compute sentences from solr query - final SentenceReader sr = new SentenceReader(solrText, pre); - sentences = new ArrayList(); - while (sr.hasNext()) { - sentences.add(sr.next()); - } - - if (sentences != null) { - try { - final SnippetExtractor tsr = new SnippetExtractor(sentences, remainingHashes, snippetMaxLength); - textline = tsr.getSnippet(); - remainingHashes = tsr.getRemainingWords(); - } catch (final UnsupportedOperationException e) { - init(url.hash(), null, ResultClass.ERROR_NO_MATCH, "no matching snippet found"); - return; - } - } - } - // try to get the snippet from metadata removeMatchingHashes(row.dc_title(), remainingHashes); removeMatchingHashes(row.dc_creator(), remainingHashes); removeMatchingHashes(row.dc_subject(), remainingHashes); removeMatchingHashes(row.url().toNormalform(true, true).replace('-', ' '), remainingHashes); - if (remainingHashes.isEmpty()) { - // the snippet is fully inside the metadata! + if (!remainingHashes.isEmpty()) { + // we did not find everything in the metadata, look further into the document itself. - if (de.anomic.crawler.Cache.has(url.hash())) { + // first acquire the sentences: + if (solrText != null) { + // compute sentences from solr query + final SentenceReader sr = new SentenceReader(solrText, pre); + sentences = new ArrayList(); + while (sr.hasNext()) { + sentences.add(sr.next()); + } + } else if (de.anomic.crawler.Cache.has(url.hash())) { // get the sentences from the cache final Request request = loader == null ? null : loader.request(url, true, reindexing); Response response; @@ -227,19 +214,41 @@ public class TextSnippet implements Comparable, Comparator= snippetMaxLength / 4 * 3) break; + + if (sentences.size() > 0) { + try { + final SnippetExtractor tsr = new SnippetExtractor(sentences, remainingHashes, snippetMaxLength); + textline = tsr.getSnippet(); + remainingHashes = tsr.getRemainingWords(); + } catch (final UnsupportedOperationException e) { + init(url.hash(), null, ResultClass.ERROR_NO_MATCH, "no matching snippet found"); + return; + } + } + } + + if (remainingHashes.isEmpty()) { + // we found the snippet + if (textline == null) { + if (sentences == null) { + textline = row.dc_subject(); + } else { + // use the first lines from the text as snippet + final StringBuilder s = new StringBuilder(snippetMaxLength); + for (final StringBuilder t: sentences) { + s.append(t).append(' '); + if (s.length() >= snippetMaxLength / 4 * 3) break; + } + if (s.length() > snippetMaxLength) { s.setLength(snippetMaxLength); s.trimToSize(); } + textline = s.toString(); + } } - if (s.length() > snippetMaxLength) { s.setLength(snippetMaxLength); s.trimToSize(); } - init(url.hash(), s.length() > 0 ? s.toString() : this.line, ResultClass.SOURCE_METADATA, null); + init(url.hash(), textline.length() > 0 ? textline : this.line, ResultClass.SOURCE_METADATA, null); return; } @@ -313,7 +322,7 @@ public class TextSnippet implements Comparable, Comparator" + videoline; //if (appline != null) line += (line.isEmpty()) ? appline : "
" + appline; //if (hrefline != null) line += (line.isEmpty()) ? hrefline : "
" + hrefline; - if (textline != null) snippetLine += (snippetLine.isEmpty()) ? textline : "
" + textline; + //if (textline != null) snippetLine += (snippetLine.isEmpty()) ? textline : "
" + textline; if (snippetLine == null || !remainingHashes.isEmpty()) { init(url.hash(), null, ResultClass.ERROR_NO_MATCH, "no matching snippet found");