yacy_search_server/source/net/yacy/document/SnippetExtractor.java

/**
 *  SnippetExtractor
 *  Copyright 2010 by Michael Peter Christen, mc@yacy.net, Frankfurt a. M., Germany
 *  First released 22.10.2010 at http://yacy.net
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program in the file lgpl21.txt
 *  If not, see <http://www.gnu.org/licenses/>.
 */

package net.yacy.document;

import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.TreeSet;

public class SnippetExtractor {

    private String snippetString;
    private Set<String> remainingTerms;

    
    public SnippetExtractor(final Iterable<StringBuilder> sentences, final Set<String> queryTerms, int maxLength) throws UnsupportedOperationException {
        if (sentences == null) throw new UnsupportedOperationException("sentences == null");
        if (queryTerms == null || queryTerms.isEmpty()) throw new UnsupportedOperationException("queryTerms == null");
        SortedMap<String, Integer> hs;
        final TreeMap<Long, StringBuilder> order = new TreeMap<Long, StringBuilder>();
        long uniqCounter = 999L;
        Integer pos;
        TreeSet<Integer> positions;
        int linenumber = 0;
        int fullmatchcounter = 0;
        lookup: for(final StringBuilder sentence : sentences) {
            hs = WordTokenizer.tokenizeSentence(sentence.toString(), 100);
            positions = new TreeSet<Integer>();
            for (final String word: queryTerms) {
                pos = hs.get(word);
                if (pos != null) {
                    positions.add(pos);
                }
            }
            int worddistance = positions.size() > 1 ? positions.last() - positions.first() : 0;
            // sort by
            // - 1st order: number of matching words
            // - 2nd order: word distance
            // - 3th order: line length (not too short and not too long)
            // - 4rd order: line number
            if (!positions.isEmpty()) {
                order.put(Long.valueOf(-100000000L * (linenumber == 0 ? 1 : 0) + 10000000L * positions.size() + 1000000L * worddistance + 100000L * linelengthKey(sentence.length(), maxLength) - 10000L * linenumber + uniqCounter--), sentence);
                if (order.size() > 5) order.remove(order.firstEntry().getKey());
                if (positions.size() == queryTerms.size()) fullmatchcounter++;
                if (fullmatchcounter >= 3) break lookup;
            }
            linenumber++;
        }

        StringBuilder sentence;
        SnippetExtractor tsr;
        while (!order.isEmpty()) {
            sentence = order.remove(order.lastKey()); // sentence with the biggest score
            try {
                tsr = new SnippetExtractor(sentence.toString(), queryTerms, maxLength);
            } catch (final UnsupportedOperationException e) {
                continue;
            }
            this.snippetString = tsr.snippetString;
            if (this.snippetString != null && this.snippetString.length() > 0) {
                this.remainingTerms = tsr.remainingTerms;
                if (this.remainingTerms.isEmpty()) {
                    // we have found the snippet
                    return; // finished!
                } else if (this.remainingTerms.size() < queryTerms.size()) {
                    // the result has not all words in it.
                    // find another sentence that represents the missing other words
                    // and find recursively more sentences
                    maxLength = maxLength - this.snippetString.length();
                    if (maxLength < 20) maxLength = 20;
                    try {
                        tsr = new SnippetExtractor(order.values(), this.remainingTerms, maxLength);
                    } catch (final UnsupportedOperationException e) {
                        throw e;
                    }
                    final String nextSnippet = tsr.snippetString;
                    if (nextSnippet == null) return;
                    this.snippetString = this.snippetString + (" / " + nextSnippet);
                    this.remainingTerms = tsr.remainingTerms;
                    return;
                } else {
                    // error
                    //assert remaininghashes.size() < queryhashes.size() : "remaininghashes.size() = " + remaininghashes.size() + ", queryhashes.size() = " + queryhashes.size() + ", sentence = '" + sentence + "', result = '" + result + "'";
                    continue;
                }
            }
        }
        throw new UnsupportedOperationException("no snippet computed");
    }

    private static int linelengthKey(int givenlength, int maxlength) {
        if (givenlength > maxlength) return 1;
        if (givenlength >= maxlength / 2 && givenlength < maxlength) return 7;
        if (givenlength >= maxlength / 4 && givenlength < maxlength / 2) return 5;
        if (givenlength >= maxlength / 8 && givenlength < maxlength / 4) return 3;
        return 0;
    }

    
    private SnippetExtractor(String sentence, final Set<String> queryTerms, final int maxLength) throws UnsupportedOperationException {
        try {
            if (sentence == null) throw new UnsupportedOperationException("no sentence given");
            if (queryTerms == null || queryTerms.isEmpty()) throw new UnsupportedOperationException("queryTerms == null");
            String term;

            // find all hashes that appear in the sentence
            final Map<String, Integer> hs = WordTokenizer.tokenizeSentence(sentence, 100);
            final Iterator<String> j = queryTerms.iterator();
            Integer pos;
            int p, minpos = sentence.length(), maxpos = -1;
            final Set<String> remainingTerms = new HashSet<>();
            while (j.hasNext()) {
                term = j.next();
                pos = hs.get(term);
                if (pos == null) {
                   remainingTerms.add(term);
                } else {
                    p = pos.intValue();
                    if (p > maxpos) maxpos = p;
                    if (p < minpos) minpos = p;
                }
            }
            // check result size
            maxpos = maxpos + 10;
            if (maxpos > sentence.length()) maxpos = sentence.length();
            if (minpos < 0) minpos = 0;
            // we have a result, but is it short enough?
            if (maxpos - minpos + 10 > maxLength) {
                // the string is too long, even if we cut at both ends
                // so cut here in the middle of the string
                final int lenb = sentence.length();
                sentence = sentence.substring(0, (minpos + 20 > sentence.length()) ? sentence.length() : minpos + 20).trim() +
                " [..] " +
                sentence.substring((maxpos + 26 > sentence.length()) ? sentence.length() : maxpos + 26).trim();
                maxpos = maxpos + lenb - sentence.length() + 6;
            } else if (maxpos > maxLength) {
                // the string is too long, even if we cut it at the end
                // so cut it here at both ends at once
                assert maxpos >= minpos;
                final int newlen = Math.max(10, maxpos - minpos + 10);
                assert maxLength >= newlen: "maxLength = " + maxLength + ", newlen = " + newlen;
                final int around = (maxLength - newlen) / 2;
                assert minpos - around < sentence.length() : "maxpos = " + maxpos + ", minpos = " + minpos + ", around = " + around + ", sentence.length() = " + sentence.length() + ", maxLength = " + maxLength + ", newlen = " + newlen; //maxpos = 435, minpos = 17, around = -124, sentence.length() = 44
                sentence = "[..] " + sentence.substring(minpos - around, ((maxpos + around) > sentence.length()) ? sentence.length() : (maxpos + around)).trim() + " [..]";
                minpos = around;
                maxpos = sentence.length() - around - 5;
            }
            if (sentence.length() > maxLength) {
                // trim sentence, 1st step (cut at right side)
                sentence = sentence.substring(0, Math.min(maxpos + 20, sentence.length())).trim() + " [..]";
            }
            if (sentence.length() > maxLength) {
                // trim sentence, 2nd step (cut at left side)
                sentence = "[..] " + sentence.substring(Math.max(minpos - 20, 0)).trim();
            }
            if (sentence.length() > maxLength) {
                // trim sentence, 3rd step (cut in the middle)
                sentence = sentence.substring(6, 20).trim() + " [..] " + sentence.substring(sentence.length() - 26, sentence.length() - 6).trim();
            }
            this.snippetString = sentence;
            this.remainingTerms = remainingTerms;
        } catch (final IndexOutOfBoundsException e) {
            throw new UnsupportedOperationException(e.getMessage());
        }
    }

    public String getSnippet() {
        return this.snippetString;
    }

    public Set<String> getRemainingTerms() {
		return this.remainingTerms;
	}
}