From 9bece5ac5f7b9e289d7db26b88fa6d9b2328b16b Mon Sep 17 00:00:00 2001
From: Michael Peter Christen <mc@yacy.net>
Date: Fri, 17 Aug 2012 14:22:07 +0200
Subject: [PATCH] enhanced snippet fetch - removed a bug that caused documents
 to be parsed even if a solr text was available

---
 .../kelondro/data/meta/URIMetadataNode.java   |  4 +
 .../net/yacy/search/query/SnippetProcess.java | 21 +++---
 .../net/yacy/search/snippet/TextSnippet.java  | 75 +++++++++++--------
 3 files changed, 58 insertions(+), 42 deletions(-)

diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java
index 2c86756c0..25936eb5e 100644
--- a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java
+++ b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java
@@ -99,6 +99,10 @@ public class URIMetadataNode implements URIMetadata {
         this.ranking = ranking;
     }
 
+    public SolrDocument getDocument() {
+        return this.doc;
+    }
+
     private int getInt(YaCySchema field) {
         Integer x = (Integer) this.doc.getFieldValue(field.name());
         if (x == null) return 0;
diff --git a/source/net/yacy/search/query/SnippetProcess.java b/source/net/yacy/search/query/SnippetProcess.java
index 23fc76a01..4498ac730 100644
--- a/source/net/yacy/search/query/SnippetProcess.java
+++ b/source/net/yacy/search/query/SnippetProcess.java
@@ -35,7 +35,6 @@ import net.yacy.cora.document.ASCII;
 import net.yacy.cora.document.Classification;
 import net.yacy.cora.document.MultiProtocolURI;
 import net.yacy.cora.protocol.ResponseHeader;
-import net.yacy.cora.services.federated.solr.SolrConnector;
 import net.yacy.cora.services.federated.yacy.CacheStrategy;
 import net.yacy.cora.sorting.ScoreMap;
 import net.yacy.cora.sorting.WeakPriorityBlockingQueue;
@@ -45,6 +44,7 @@ import net.yacy.cora.storage.HandleSet;
 import net.yacy.cora.util.SpaceExceededException;
 import net.yacy.document.Condenser;
 import net.yacy.kelondro.data.meta.URIMetadata;
+import net.yacy.kelondro.data.meta.URIMetadataNode;
 import net.yacy.kelondro.data.word.Word;
 import net.yacy.kelondro.index.RowHandleSet;
 import net.yacy.kelondro.logging.Log;
@@ -54,6 +54,7 @@ import net.yacy.peers.graphics.ProfilingGraph;
 import net.yacy.repository.LoaderDispatcher;
 import net.yacy.search.EventTracker;
 import net.yacy.search.Switchboard;
+import net.yacy.search.index.MetadataRepository;
 import net.yacy.search.index.Segment;
 import net.yacy.search.snippet.MediaSnippet;
 import net.yacy.search.snippet.ResultEntry;
@@ -439,7 +440,7 @@ public class SnippetProcess {
         private final CacheStrategy cacheStrategy;
         private final int neededResults;
         private boolean shallrun;
-        private final SolrConnector solr;
+        private final MetadataRepository metadata;
 
         public Worker(final long maxlifetime, final CacheStrategy cacheStrategy, final int neededResults) {
             this.cacheStrategy = cacheStrategy;
@@ -447,7 +448,7 @@ public class SnippetProcess {
             this.timeout = System.currentTimeMillis() + Math.max(1000, maxlifetime);
             this.neededResults = neededResults;
             this.shallrun = true;
-            this.solr = SnippetProcess.this.rankingProcess.getQuery().getSegment().getSolr();
+            this.metadata = SnippetProcess.this.rankingProcess.getQuery().getSegment().urlMetadata();
         }
 
         @Override
@@ -496,16 +497,18 @@ public class SnippetProcess {
 
                     // in case that we have an attached solr, we load also the solr document
                     String solrContent = null;
-                    if (this.solr != null) {
-                        SolrDocument sd = null;
+                    SolrDocument sd = null;
+                    if (page instanceof URIMetadataNode) {
+                        sd = ((URIMetadataNode) page).getDocument();
+                    } else {
                         try {
-                            sd = this.solr.get(ASCII.String(page.hash()));
+                            sd = this.metadata.getSolr().get(ASCII.String(page.hash()));
                         } catch (IOException e) {
                             Log.logException(e);
                         }
-                        if (sd != null) {
-                            solrContent = Switchboard.getSwitchboard().index.getSolrScheme().solrGetText(sd);
-                        }
+                    }
+                    if (sd != null) {
+                        solrContent = this.metadata.getSolrScheme().solrGetText(sd);
                     }
 
                     resultEntry = fetchSnippet(page, solrContent, this.cacheStrategy); // does not fetch snippets if snippetMode == 0
diff --git a/source/net/yacy/search/snippet/TextSnippet.java b/source/net/yacy/search/snippet/TextSnippet.java
index 9f55a2cad..167316047 100644
--- a/source/net/yacy/search/snippet/TextSnippet.java
+++ b/source/net/yacy/search/snippet/TextSnippet.java
@@ -179,37 +179,24 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
         { //encapsulate potential expensive sentences
             Collection<StringBuilder> sentences = null;
 
-            // try the solr text first
-            if (solrText != null) {
-                // compute sentences from solr query
-                final SentenceReader sr = new SentenceReader(solrText, pre);
-                sentences = new ArrayList<StringBuilder>();
-                while (sr.hasNext()) {
-                    sentences.add(sr.next());
-                }
-
-                if (sentences != null) {
-                    try {
-                        final SnippetExtractor tsr = new SnippetExtractor(sentences, remainingHashes, snippetMaxLength);
-                        textline = tsr.getSnippet();
-                        remainingHashes =  tsr.getRemainingWords();
-                    } catch (final UnsupportedOperationException e) {
-                        init(url.hash(), null, ResultClass.ERROR_NO_MATCH, "no matching snippet found");
-                        return;
-                    }
-                }
-            }
-
             // try to get the snippet from metadata
             removeMatchingHashes(row.dc_title(), remainingHashes);
             removeMatchingHashes(row.dc_creator(), remainingHashes);
             removeMatchingHashes(row.dc_subject(), remainingHashes);
             removeMatchingHashes(row.url().toNormalform(true, true).replace('-', ' '), remainingHashes);
 
-            if (remainingHashes.isEmpty()) {
-                // the snippet is fully inside the metadata!
+            if (!remainingHashes.isEmpty()) {
+                // we did not find everything in the metadata, look further into the document itself.
 
-                if (de.anomic.crawler.Cache.has(url.hash())) {
+                // first acquire the sentences:
+                if (solrText != null) {
+                    // compute sentences from solr query
+                    final SentenceReader sr = new SentenceReader(solrText, pre);
+                    sentences = new ArrayList<StringBuilder>();
+                    while (sr.hasNext()) {
+                        sentences.add(sr.next());
+                    }
+                } else if (de.anomic.crawler.Cache.has(url.hash())) {
                     // get the sentences from the cache
                     final Request request = loader == null ? null : loader.request(url, true, reindexing);
                     Response response;
@@ -227,19 +214,41 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
                         }
                     }
                 }
-
                 if (sentences == null) {
+                    // no sentences available (neither from the solr text nor from the cache), so no snippet can be extracted
                     init(url.hash(), null, ResultClass.SOURCE_METADATA, null);
                     return;
                 }
-                // use the first lines from the text as snippet
-                final StringBuilder s = new StringBuilder(snippetMaxLength);
-                for (final StringBuilder t: sentences) {
-                    s.append(t).append(' ');
-                    if (s.length() >= snippetMaxLength / 4 * 3) break;
+
+                if (sentences.size() > 0) {
+                    try {
+                        final SnippetExtractor tsr = new SnippetExtractor(sentences, remainingHashes, snippetMaxLength);
+                        textline = tsr.getSnippet();
+                        remainingHashes = tsr.getRemainingWords();
+                    } catch (final UnsupportedOperationException e) {
+                        init(url.hash(), null, ResultClass.ERROR_NO_MATCH, "no matching snippet found");
+                        return;
+                    }
+                }
+           }
+
+           if (remainingHashes.isEmpty()) {
+                // we found the snippet
+                if (textline == null) {
+                    if (sentences == null) {
+                        textline = row.dc_subject();
+                    } else {
+                        // use the first lines from the text as snippet
+                        final StringBuilder s = new StringBuilder(snippetMaxLength);
+                        for (final StringBuilder t: sentences) {
+                        s.append(t).append(' ');
+                        if (s.length() >= snippetMaxLength / 4 * 3) break;
+                        }
+                        if (s.length() > snippetMaxLength) { s.setLength(snippetMaxLength); s.trimToSize(); }
+                        textline = s.toString();
+                    }
                 }
-                if (s.length() > snippetMaxLength) { s.setLength(snippetMaxLength); s.trimToSize(); }
-                init(url.hash(), s.length() > 0 ? s.toString() : this.line, ResultClass.SOURCE_METADATA, null);
+                init(url.hash(), textline.length() > 0 ? textline : this.line, ResultClass.SOURCE_METADATA, null);
                 return;
             }
 
@@ -313,7 +322,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
         //if (videoline != null) line += (line.isEmpty()) ? videoline : "<br />" + videoline;
         //if (appline   != null) line += (line.isEmpty()) ? appline   : "<br />" + appline;
         //if (hrefline  != null) line += (line.isEmpty()) ? hrefline  : "<br />" + hrefline;
-        if (textline  != null) snippetLine += (snippetLine.isEmpty()) ? textline  : "<br />" + textline;
+        //if (textline  != null) snippetLine += (snippetLine.isEmpty()) ? textline  : "<br />" + textline;
 
         if (snippetLine == null || !remainingHashes.isEmpty()) {
             init(url.hash(), null, ResultClass.ERROR_NO_MATCH, "no matching snippet found");