snippet computation update

Branch: pull/1/head
Author: orbiter (11 years ago)
Parent: cf9e7fdbb8
Commit: 487021fb0a

@@ -45,6 +45,7 @@ import net.yacy.cora.order.Base64Order;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.crawler.retrieval.Response;
 import net.yacy.document.Condenser;
+import net.yacy.document.SentenceReader;
 import net.yacy.kelondro.data.word.WordReferenceRow;
 import net.yacy.kelondro.data.word.WordReferenceVars;
 import net.yacy.kelondro.util.Bitfield;
@@ -211,6 +212,42 @@ public class URIMetadataNode extends SolrDocument {
         return a.get(0);
     }
 
+    public List<String> h1() {
+        ArrayList<String> a = getStringList(CollectionSchema.h1_txt);
+        if (a == null || a.size() == 0) return new ArrayList<String>(0);
+        return a;
+    }
+
+    public List<String> h2() {
+        ArrayList<String> a = getStringList(CollectionSchema.h2_txt);
+        if (a == null || a.size() == 0) return new ArrayList<String>(0);
+        return a;
+    }
+
+    public List<String> h3() {
+        ArrayList<String> a = getStringList(CollectionSchema.h3_txt);
+        if (a == null || a.size() == 0) return new ArrayList<String>(0);
+        return a;
+    }
+
+    public List<String> h4() {
+        ArrayList<String> a = getStringList(CollectionSchema.h4_txt);
+        if (a == null || a.size() == 0) return new ArrayList<String>(0);
+        return a;
+    }
+
+    public List<String> h5() {
+        ArrayList<String> a = getStringList(CollectionSchema.h5_txt);
+        if (a == null || a.size() == 0) return new ArrayList<String>(0);
+        return a;
+    }
+
+    public List<String> h6() {
+        ArrayList<String> a = getStringList(CollectionSchema.h6_txt);
+        if (a == null || a.size() == 0) return new ArrayList<String>(0);
+        return a;
+    }
+
     public String dc_creator() {
         return getString(CollectionSchema.author);
     }
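Note (not part of the patch): the six heading accessors all follow the same null/empty-handling pattern, so the repetition could be factored into one private helper. A minimal sketch, assuming a hypothetical helper name; only getStringList and the CollectionSchema heading fields are taken from the patch:

    // hypothetical consolidation of the repeated null/empty handling
    private List<String> getStringListOrEmpty(final CollectionSchema field) {
        ArrayList<String> a = getStringList(field);
        if (a == null || a.size() == 0) return new ArrayList<String>(0);
        return a;
    }

    public List<String> h1() {
        return getStringListOrEmpty(CollectionSchema.h1_txt);
    }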
@@ -418,6 +455,18 @@ public class URIMetadataNode extends SolrDocument {
         return getString(CollectionSchema.text_t);
     }
 
+    public List<StringBuilder> getSentences(final boolean pre) {
+        List<StringBuilder> sentences = new ArrayList<>();
+        String text = this.getText();
+        if (text == null || text.length() == 0) return sentences;
+        SentenceReader sr = new SentenceReader(text, pre);
+        while (sr.hasNext()) sentences.add(sr.next());
+        sr.close();
+        sr = null;
+        text = null;
+        return sentences;
+    }
+
     public ArrayList<String> getDescription() {
         return getStringList(CollectionSchema.description_txt);
     }
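For orientation, a minimal sketch of how a caller might consume the two new accessors. This code is not part of the patch; the method name is illustrative and the URIMetadataNode is assumed to be supplied by the caller (imports of java.util.List and the YaCy classes omitted):

    // Sketch only: summarize a result document using the new accessors.
    static String firstSentenceSummary(final URIMetadataNode doc) {
        final List<String> h1 = doc.h1();                              // never null, empty if no h1 was indexed
        final List<StringBuilder> sentences = doc.getSentences(false); // parsed from text_t, empty if there is no text
        final String headline = h1.isEmpty() ? "" : h1.get(0);
        final String body = sentences.isEmpty() ? "" : sentences.get(0).toString();
        return (headline + " " + body).trim();
    }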

@@ -30,6 +30,7 @@ import java.util.Collection;
 import java.util.Comparator;
 import java.util.HashSet;
 import java.util.Iterator;
+import java.util.List;
 import java.util.Set;
 import java.util.regex.Pattern;
@@ -178,8 +179,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnippet> {
         // this requires that the document is parsed after loading
         String textline = null;
         HandleSet remainingHashes = queryhashes.clone();
-        { //encapsulate potential expensive sentences
-            Collection<StringBuilder> sentences = null;
+        List<StringBuilder> sentences = null;
 
         // try to get the snippet from metadata
         removeMatchingHashes(row.url().toTokens(), remainingHashes);
@@ -192,16 +192,9 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnippet> {
             // first acquire the sentences:
             String solrText = row.getText();
-            if (solrText != null) {
+            if (solrText != null && solrText.length() > 0) {
                 // compute sentences from solr query
-                SentenceReader sr = new SentenceReader(solrText, pre);
-                sentences = new ArrayList<StringBuilder>();
-                while (sr.hasNext()) {
-                    sentences.add(sr.next());
-                }
-                sr.close();
-                sr = null;
-                solrText = null;
+                sentences = row.getSentences(pre);
             } else if (net.yacy.crawler.data.Cache.has(url.hash())) {
                 // get the sentences from the cache
                 final Request request = loader == null ? null : loader.request(url, true, reindexing);
@@ -241,12 +234,33 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnippet> {
             }
             if (remainingHashes.isEmpty()) {
-                // we found the snippet
-                if (textline == null) {
-                    if (sentences == null) {
+                // we found the snippet or the query is fully included in the headline or url
+                if (textline == null || textline.length() == 0) {
+                    // this is the case where we don't have a snippet because all search words are included in the headline or the url
+                    String solrText = row.getText();
+                    if (solrText != null && solrText.length() > 0) {
+                        // compute sentences from solr query
+                        sentences = row.getSentences(pre);
+                    }
+                    if (sentences == null || sentences.size() == 0) {
                         textline = row.dc_subject();
                     } else {
-                        // use the first lines from the text as snippet
+                        // use the first lines from the text after the h1 tag as snippet
+                        // first get the h1 tag
+                        List<String> h1 = row.h1();
+                        if (h1 != null && h1.size() > 0 && sentences.size() > 2) {
+                            // find the first appearance of the first h1 in the sentences and then take the next sentence
+                            String h1s = h1.get(0);
+                            if (h1s.length() > 0) {
+                                solrsearch: for (int i = 0; i < sentences.size() - 2; i++) {
+                                    if (sentences.get(i).toString().startsWith(h1s)) {
+                                        textline = sentences.get(i + 1).toString();
+                                        break solrsearch;
+                                    }
+                                }
+                            }
+                        }
+                        if (textline == null) {
                             final StringBuilder s = new StringBuilder(snippetMaxLength);
                             for (final StringBuilder t: sentences) {
                                 s.append(t).append(' ');
@@ -256,6 +270,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnippet> {
                             textline = s.toString();
                         }
                     }
+                }
                 init(url.hash(), textline.length() > 0 ? textline : this.line, false, ResultClass.SOURCE_METADATA, null);
                 return;
             }
@@ -318,7 +333,6 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnippet> {
                 return;
             }
             sentences = null;
-        } //encapsulate potential expensive sentences END
 
         if (textline == null || !remainingHashes.isEmpty()) {
             init(url.hash(), null, false, ResultClass.ERROR_NO_MATCH, "no matching snippet found");
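Taken together, the TextSnippet change implements this heuristic: when every query term is already covered by the headline or URL, so no matching text line was found, prefer the sentence that directly follows the first h1 heading in the parsed text, and only fall back to concatenating the leading sentences. A self-contained sketch of that selection step, separate from the patch; the class and method names and the exact truncation of the fallback are illustrative assumptions:

    import java.util.List;

    public class SnippetHeuristicSketch {

        // Prefer the sentence that follows the first sentence starting with the
        // first h1 heading; otherwise fall back to the leading sentences of the
        // text, truncated to maxLength characters.
        static String pickSnippet(final List<StringBuilder> sentences, final String h1, final int maxLength) {
            if (h1 != null && h1.length() > 0 && sentences.size() > 2) {
                for (int i = 0; i < sentences.size() - 2; i++) {
                    if (sentences.get(i).toString().startsWith(h1)) {
                        return sentences.get(i + 1).toString();
                    }
                }
            }
            final StringBuilder s = new StringBuilder(maxLength);
            for (final StringBuilder t : sentences) {
                s.append(t).append(' ');
                if (s.length() >= maxLength) break; // truncation rule is an assumption, not taken from the patch
            }
            return s.length() > maxLength ? s.substring(0, maxLength) : s.toString().trim();
        }
    }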
