// ResultFetcher.java
// (C) 2005 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 10.10.2005 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.search;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;

import net.yacy.document.Condenser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.EventTracker;
import net.yacy.kelondro.util.SetTools;
import net.yacy.kelondro.util.SortStack;
import net.yacy.kelondro.util.SortStore;

import de.anomic.search.RankingProcess.NavigatorEntry;
import de.anomic.yacy.yacySeedDB;
import de.anomic.yacy.graphics.ProfilingGraph;
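
/**
 * The ResultFetcher takes the URLs that the RankingProcess has ranked, loads
 * their metadata and snippets with a pool of worker threads, and sorts the
 * enriched entries into the final result store.
 *
 * A minimal usage sketch (assuming a prepared RankingProcess, QueryParams and
 * yacySeedDB; the variable names here are illustrative only):
 *
 *   ResultFetcher fetcher = new ResultFetcher(rankedCache, query, peers, 100);
 *   ResultEntry entry = fetcher.oneResult(0); // may block until a worker delivers
 *   if (entry != null) System.out.println(entry.url());
 */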
public class ResultFetcher {

    // input values
    final RankingProcess rankedCache; // ordered search results, grows dynamically as all the query threads enrich this container
    QueryParams query;
    private final yacySeedDB peers;

    // result values
    protected Worker[] workerThreads;
    protected final SortStore<ResultEntry> result;
    protected final SortStore<MediaSnippet> images; // container to sort images by size
    protected final HashMap<String, String> failedURLs; // a mapping from a urlhash to a fail reason string
    protected final TreeSet<byte[]> snippetFetchWordHashes; // a set of word hashes that are used to match with the snippets
    long urlRetrievalAllTime;
    long snippetComputationAllTime;
    int taketimeout;
    @SuppressWarnings("unchecked")
    public ResultFetcher(
            RankingProcess rankedCache,
            final QueryParams query,
            final yacySeedDB peers,
            final int taketimeout) {
        this.rankedCache = rankedCache;
        this.query = query;
        this.peers = peers;
        this.taketimeout = taketimeout;
        this.urlRetrievalAllTime = 0;
        this.snippetComputationAllTime = 0;
        this.result = new SortStore<ResultEntry>(-1, true); // this is the result, enriched with snippets, ranked and ordered by ranking
        this.images = new SortStore<MediaSnippet>(-1, true);
        this.failedURLs = new HashMap<String, String>(); // a map from url hashes to reason strings for entries that a worker thread tried to process but failed on

        // snippets do not need to match with the complete query hashes,
        // only with the query hashes minus the stopwords, which were not used for the search
        final TreeSet<byte[]> filtered = SetTools.joinConstructive(query.queryHashes, Switchboard.stopwordHashes);
        this.snippetFetchWordHashes = (TreeSet<byte[]>) query.queryHashes.clone();
        if (filtered != null && !filtered.isEmpty()) {
            SetTools.excludeDestructive(this.snippetFetchWordHashes, Switchboard.stopwordHashes);
        }

        // start worker threads to fetch urls and snippets
        this.workerThreads = null;
        deployWorker(Math.min(10, query.itemsPerPage), query.neededResults());
        EventTracker.update("SEARCH", new ProfilingGraph.searchEvent(query.id(true), this.workerThreads.length + " online snippet fetch threads started", 0, 0), false, 30000, ProfilingGraph.maxTime);
    }
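
    /**
     * Start worker threads that pull ranked URLs from the cache and fetch
     * their snippets. If the query asks for online snippet fetching, up to
     * deployCount threads are started; otherwise a single thread suffices.
     * Does nothing while workers from a previous deployment are still alive.
     */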
    public void deployWorker(int deployCount, int neededResults) {
        if (anyWorkerAlive()) return;
        this.workerThreads = new Worker[(query.onlineSnippetFetch) ? deployCount : 1];
        for (int i = 0; i < workerThreads.length; i++) {
            this.workerThreads[i] = new Worker(i, 10000, (query.onlineSnippetFetch) ? 2 : 0, neededResults);
            this.workerThreads[i].start();
        }
    }
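
    /**
     * @return true if at least one worker thread is alive and has shown a
     *         life sign within the last three seconds
     */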
    boolean anyWorkerAlive() {
        if (this.workerThreads == null) return false;
        for (int i = 0; i < this.workerThreads.length; i++) {
            if ((this.workerThreads[i] != null) &&
                (this.workerThreads[i].isAlive()) &&
                (this.workerThreads[i].busytime() < 3000)) return true;
        }
        return false;
    }
    public long getURLRetrievalTime() {
        return this.urlRetrievalAllTime;
    }

    public long getSnippetComputationTime() {
        return this.snippetComputationAllTime;
    }
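
    /**
     * A Worker repeatedly takes the next ranked URL from the RankingProcess,
     * attaches a text or media snippet to it (depending on snippetMode) and
     * pushes the enriched entry onto the result store. It terminates when its
     * lifetime expires, enough results are collected, or the cache runs dry.
     */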
    protected class Worker extends Thread {

        private final long timeout; // the time until which this thread should try to work
        private long lastLifeSign; // when the run()-loop was executed the last time
        private final int id;
        private int snippetMode;
        private int neededResults;

        public Worker(final int id, final long maxlifetime, int snippetMode, int neededResults) {
            this.id = id;
            this.snippetMode = snippetMode;
            this.lastLifeSign = System.currentTimeMillis();
            this.timeout = System.currentTimeMillis() + Math.max(1000, maxlifetime);
            this.neededResults = neededResults;
        }
        @Override
        public void run() {
            // start fetching urls and snippets
            URIMetadataRow page;
            //final int fetchAhead = snippetMode == 0 ? 0 : 10;
            final boolean nav_topics = query.navigators.equals("all") || query.navigators.indexOf("topics") >= 0;
            try {
                while (System.currentTimeMillis() < this.timeout) {
                    if (result.size() > neededResults) break;
                    this.lastLifeSign = System.currentTimeMillis();

                    // check if we have enough
                    if ((query.contentdom == ContentDomain.IMAGE) && (images.size() >= query.neededResults() + 50)) break;
                    if ((query.contentdom != ContentDomain.IMAGE) && (result.size() >= query.neededResults() + 10)) break;

                    // get next entry
                    page = rankedCache.takeURL(true, taketimeout);
                    if (page == null) break;
                    if (failedURLs.get(page.hash()) != null) continue;

                    final ResultEntry resultEntry = fetchSnippet(page, snippetMode); // does not fetch snippets if snippetMode == 0
                    if (resultEntry == null) continue; // the entry had some problems, cannot be used
                    if (result.exists(resultEntry)) continue;

                    urlRetrievalAllTime += resultEntry.dbRetrievalTime;
                    snippetComputationAllTime += resultEntry.snippetComputationTime;
                    //System.out.println("+++DEBUG-resultWorker+++ fetched " + resultEntry.urlstring());

                    // place the result into the result vector and apply post-ranking
                    long ranking = rankedCache.getOrder().cardinal(resultEntry.word());
                    ranking += postRanking(resultEntry, rankedCache.getTopics());
                    //System.out.println("*** resultEntry.hash = " + resultEntry.hash());
                    result.push(resultEntry, ranking);
                    if (nav_topics) rankedCache.addTopics(resultEntry);
                    //System.out.println("DEBUG SNIPPET_LOADING: thread " + id + " got " + resultEntry.url());
                }
            } catch (final Exception e) {
                Log.logException(e);
            }
            Log.logInfo("SEARCH", "resultWorker thread " + id + " terminated");
        }
        public long busytime() {
            return System.currentTimeMillis() - this.lastLifeSign;
        }
    }
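
    /**
     * Load the metadata for a ranked URL and, depending on snippetMode, attach
     * a text or media snippet to it.
     *
     * @return the enriched ResultEntry, or null if the entry must be rejected
     *         (e.g. a mandatory snippet could not be fetched)
     */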
    protected ResultEntry fetchSnippet(final URIMetadataRow page, final int snippetMode) {
        // snippet fetching has 3 modes:
        // 0 - do not fetch snippets
        // 1 - fetch snippets offline only
        // 2 - online snippet fetch

        // load only urls if there was not yet a root url of that hash
        // find the url entry
        long startTime = System.currentTimeMillis();
        final URIMetadataRow.Components metadata = page.metadata();
        final long dbRetrievalTime = System.currentTimeMillis() - startTime;

        if (snippetMode == 0) {
            return new ResultEntry(page, query.getSegment(), peers, null, null, dbRetrievalTime, 0); // result without snippet
        }

        // load snippet
        if (query.contentdom == ContentDomain.TEXT) {
            // attach text snippet
            startTime = System.currentTimeMillis();
            final TextSnippet snippet = TextSnippet.retrieveTextSnippet(
                    metadata, snippetFetchWordHashes, (snippetMode == 2),
                    ((query.constraint != null) && (query.constraint.get(Condenser.flag_cat_indexof))),
                    180, (snippetMode == 2) ? Integer.MAX_VALUE : 30000, query.isGlobal());
            final long snippetComputationTime = System.currentTimeMillis() - startTime;
            Log.logInfo("SEARCH_EVENT", "text snippet load time for " + metadata.url() + ": " + snippetComputationTime + ", " + ((snippet.getErrorCode() < 11) ? "snippet found" : ("no snippet found (" + snippet.getError() + ")")));

            if (snippet.getErrorCode() < 11) {
                // we loaded the file and found the snippet
                return new ResultEntry(page, query.getSegment(), peers, snippet, null, dbRetrievalTime, snippetComputationTime); // result with snippet attached
            } else if (snippetMode == 1) {
                // we did not demand online loading, therefore a failure does not mean that the missing snippet causes a rejection of this result
                // this may happen during a remote search, because snippet loading is omitted to retrieve results faster
                return new ResultEntry(page, query.getSegment(), peers, null, null, dbRetrievalTime, snippetComputationTime); // result without snippet
            } else {
                // problems with snippet fetch
                registerFailure(page.hash(), "no text snippet for URL " + metadata.url());
                if (!peers.mySeed().isVirgin()) {
                    try {
                        TextSnippet.failConsequences(query.getSegment(), page.word(), snippet, query.id(false));
                    } catch (final IOException e) {
                        Log.logException(e);
                    }
                }
                return null;
            }
        } else {
            // attach media information
            startTime = System.currentTimeMillis();
            final ArrayList<MediaSnippet> mediaSnippets = MediaSnippet.retrieveMediaSnippets(metadata.url(), snippetFetchWordHashes, query.contentdom, (snippetMode == 2), 6000, query.isGlobal());
            final long snippetComputationTime = System.currentTimeMillis() - startTime;
            Log.logInfo("SEARCH_EVENT", "media snippet load time for " + metadata.url() + ": " + snippetComputationTime);

            if (mediaSnippets != null && !mediaSnippets.isEmpty()) {
                // found media snippets, return entry
                return new ResultEntry(page, query.getSegment(), peers, null, mediaSnippets, dbRetrievalTime, snippetComputationTime);
            } else if (snippetMode == 1) {
                return new ResultEntry(page, query.getSegment(), peers, null, null, dbRetrievalTime, snippetComputationTime);
            } else {
                // problems with snippet fetch
                registerFailure(page.hash(), "no media snippet for URL " + metadata.url());
                return null;
            }
        }
        // finished, no more actions possible here
    }
    private void registerFailure(final String urlhash, final String reason) {
        this.failedURLs.put(urlhash, reason);
        Log.logInfo("search", "sorted out hash " + urlhash + " during search: " + reason);
    }
    public ArrayList<NavigatorEntry> getHostNavigator(int maxentries) {
        return this.rankedCache.getHostNavigator(maxentries);
    }

    public ArrayList<NavigatorEntry> getTopicNavigator(final int maxentries) {
        // returns a set of words that are computed as the toplist
        return this.rankedCache.getTopicNavigator(maxentries);
    }

    public ArrayList<NavigatorEntry> getAuthorNavigator(final int maxentries) {
        // returns a list of authors so far seen in the result set
        return this.rankedCache.getAuthorNavigator(maxentries);
    }
    public int resultCount() {
        return this.result.size();
    }
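
    /**
     * Fetch one result entry by position. If the entry is not yet available,
     * (re)deploy worker threads and wait until a worker delivers it or all
     * workers have died.
     *
     * @return the entry at the given position, or null if none could be produced
     */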
    public ResultEntry oneResult(final int item) {
        // check if we already retrieved this item
        // (happens if a search page is accessed a second time)
        EventTracker.update("SEARCH", new ProfilingGraph.searchEvent(query.id(true), "obtain one result entry - start", 0, 0), false, 30000, ProfilingGraph.maxTime);
        if (this.result.sizeStore() > item) {
            // we have the wanted result already in the result array .. return that
            return this.result.element(item).element;
        }
        /*
        System.out.println("rankedCache.size() = " + this.rankedCache.size());
        System.out.println("result.size() = " + this.result.size());
        System.out.println("query.neededResults() = " + query.neededResults());
        */
        if ((!anyWorkerAlive()) &&
            (((query.contentdom == ContentDomain.IMAGE) && (images.size() + 30 < query.neededResults())) ||
             (this.result.size() < query.neededResults())) &&
            //(event.query.onlineSnippetFetch) &&
            (this.rankedCache.size() > this.result.size())
           ) {
            // start worker threads to fetch urls and snippets
            deployWorker(Math.min(10, query.itemsPerPage), query.neededResults());
        }

        // finally wait until enough results are produced by the snippet fetch process;
        // a minimum sleep avoids a busy-wait when item % itemsPerPage == 0
        while ((anyWorkerAlive()) && (result.size() <= item)) {
            try { Thread.sleep(Math.max(10L, (item % query.itemsPerPage) * 50L)); } catch (final InterruptedException e) {}
        }

        // finally, if there is something, return the result
        if (this.result.size() <= item) return null;
        return this.result.element(item).element;
    }
    private int resultCounter = 0;

    public ResultEntry nextResult() {
        final ResultEntry re = oneResult(resultCounter);
        resultCounter++;
        return re;
    }
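
    /**
     * Fetch one image snippet by position. Consumes the next text result and
     * pushes all of its image media snippets onto the image stack, ordered by
     * the media snippet ranking, before taking the requested item.
     */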
    public MediaSnippet oneImage(final int item) {
        // check if we already retrieved this item (happens if a search page is accessed a second time)
        if (this.images.sizeStore() > item) {
            // we have the wanted result already in the result array .. return that
            return this.images.element(item).element;
        }

        // generate result object
        final ResultEntry result = nextResult();
        MediaSnippet ms;
        if (result != null) {
            // iterate over all images in the result
            final ArrayList<MediaSnippet> imagemedia = result.mediaSnippets();
            if (imagemedia != null) {
                for (int j = 0; j < imagemedia.size(); j++) {
                    ms = imagemedia.get(j);
                    images.push(ms, Long.valueOf(ms.ranking));
                    //System.out.println("*** image " + ms.href.hash() + " images.size = " + images.size() + "/" + images.size());
                }
            }
        }

        // now take the specific item from the image stack
        if (this.images.size() <= item) return null;
        return this.images.element(item).element;
    }
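
    /**
     * Wait (up to waitingtime milliseconds) until the result store holds the
     * needed number of results or all workers have terminated, then return
     * the current result list.
     */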
    public ArrayList<SortStack<ResultEntry>.stackElement> completeResults(final long waitingtime) {
        final long timeout = System.currentTimeMillis() + waitingtime;
        while ((result.size() < query.neededResults()) && (anyWorkerAlive()) && (System.currentTimeMillis() < timeout)) {
            try { Thread.sleep(100); } catch (final InterruptedException e) {}
            //System.out.println("+++DEBUG-completeResults+++ sleeping 100");
        }
        return this.result.list(this.result.size());
    }
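
    /**
     * Compute a post-ranking boost for a single result entry: media-rich pages
     * are preferred for media searches, and matches of the 'prefer' pattern or
     * occurrences of top words and query hashes in the URL and title raise the
     * ranking. The coefficients come from the query's ranking profile.
     */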
    public long postRanking(
            final ResultEntry rentry,
            final Map<String, Integer> topwords) {
        long r = 0;

        // for media search: prefer pages with many links
        if (query.contentdom == ContentDomain.IMAGE) r += rentry.limage() << query.ranking.coeff_cathasimage;
        if (query.contentdom == ContentDomain.AUDIO) r += rentry.laudio() << query.ranking.coeff_cathasaudio;
        if (query.contentdom == ContentDomain.VIDEO) r += rentry.lvideo() << query.ranking.coeff_cathasvideo;
        if (query.contentdom == ContentDomain.APP)   r += rentry.lapp()   << query.ranking.coeff_cathasapp;

        // prefer hits matching the 'prefer' pattern
        if (rentry.url().toNormalform(true, true).matches(query.prefer)) r += 256 << query.ranking.coeff_prefer;
        if (rentry.title().matches(query.prefer)) r += 256 << query.ranking.coeff_prefer;

        // apply 'common-sense' heuristic using references
        final String urlstring = rentry.url().toNormalform(true, true);
        final String[] urlcomps = DigestURI.urlComps(urlstring);
        final String[] descrcomps = DigestURI.splitpattern.split(rentry.title().toLowerCase());
        Integer tc;
        for (int j = 0; j < urlcomps.length; j++) {
            tc = topwords.get(urlcomps[j]);
            if (tc != null) r += Math.max(1, tc.intValue()) << query.ranking.coeff_urlcompintoplist;
        }
        for (int j = 0; j < descrcomps.length; j++) {
            tc = topwords.get(descrcomps[j]);
            if (tc != null) r += Math.max(1, tc.intValue()) << query.ranking.coeff_descrcompintoplist;
        }

        // apply query-in-result matching
        final Set<byte[]> urlcomph = Word.words2hashSet(urlcomps);
        final Set<byte[]> descrcomph = Word.words2hashSet(descrcomps);
        final Iterator<byte[]> shi = query.queryHashes.iterator();
        byte[] queryhash;
        while (shi.hasNext()) {
            queryhash = shi.next();
            if (urlcomph.contains(queryhash)) r += 256 << query.ranking.coeff_appurl;
            if (descrcomph.contains(queryhash)) r += 256 << query.ranking.coeff_app_dc_title;
        }
        return r;
    }
}