/**
 * SnippetWorker
 * Copyright 2012 by Michael Peter Christen, mc@yacy.net, Frankfurt a. M., Germany
 * First released 01.11.2012 at http://yacy.net
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program in the file lgpl21.txt
 * If not, see <http://www.gnu.org/licenses/>.
 */

package net.yacy.search.query;

import java.util.Iterator;

import net.yacy.cora.document.Classification;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.sorting.ScoreMap;
import net.yacy.cora.sorting.WeakPriorityBlockingQueue.ReverseElement;
import net.yacy.cora.storage.HandleSet;
import net.yacy.document.Condenser;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.MemoryControl;
import net.yacy.search.index.Segment;
import net.yacy.search.snippet.ResultEntry;
import net.yacy.search.snippet.TextSnippet;

/**
 * A worker thread that drains URLs from a {@link SearchEvent}, loads/computes a
 * {@link TextSnippet} for each, applies a post-ranking and places the finished
 * {@link ResultEntry} into the search event's result queue.
 * The worker stops when enough results are collected, the feeding is finished,
 * memory runs short, the timeout elapses, or {@link #pleaseStop()} is called.
 */
public class SnippetWorker extends Thread {

    private final SearchEvent snippetProcess;

    /** the date until this thread should try to work */
    private final long timeout;

    /** when the last time the run()-loop was executed; read cross-thread by busytime() */
    private volatile long lastLifeSign;

    private final CacheStrategy cacheStrategy;
    private final int neededResults;

    /** cleared by pleaseStop() from another thread; volatile so the run()-loop observes it */
    private volatile boolean shallrun;

    /**
     * @param snippetProcess the search event whose URLs shall be processed
     * @param maxlifetime    maximum lifetime of this worker in milliseconds (floored to 1000)
     * @param cacheStrategy  snippet loading strategy; null means "no snippet fetch"
     * @param neededResults  stop as soon as this many results are available (online fetch only)
     */
    public SnippetWorker(final SearchEvent snippetProcess, final long maxlifetime, final CacheStrategy cacheStrategy, final int neededResults) {
        this.snippetProcess = snippetProcess;
        this.cacheStrategy = cacheStrategy;
        this.lastLifeSign = System.currentTimeMillis();
        this.timeout = System.currentTimeMillis() + Math.max(1000, maxlifetime);
        this.neededResults = neededResults;
        this.shallrun = true;
    }

    @Override
    public void run() {
        // start fetching urls and snippets
        URIMetadataNode page;
        ResultEntry resultEntry;
        final boolean nav_topics = this.snippetProcess.query.navigators.equals("all")
                || this.snippetProcess.query.navigators.indexOf("topics", 0) >= 0;
        try {
            while (this.shallrun && System.currentTimeMillis() < this.timeout) {
                this.lastLifeSign = System.currentTimeMillis();

                if (MemoryControl.shortStatus()) {
                    Log.logWarning("SnippetProcess", "shortStatus");
                    break;
                }

                // check if we have enough; we stop only if we can fetch online;
                // otherwise its better to run this to get better navigation
                if ((this.cacheStrategy == null || this.cacheStrategy.isAllowedToFetchOnline())
                        && this.snippetProcess.result.sizeAvailable() >= this.neededResults) {
                    Log.logWarning("SnippetProcess", this.snippetProcess.result.sizeAvailable() + " = result.sizeAvailable() >= this.neededResults = " + this.neededResults);
                    break;
                }

                // check if we can succeed if we try to take another url
                if (this.snippetProcess.rankingProcess.feedingIsFinished()
                        && this.snippetProcess.rankingProcess.rwiQueueSize() == 0
                        && this.snippetProcess.nodeStack.sizeAvailable() == 0) {
                    Log.logWarning("SnippetProcess", "rankingProcess.feedingIsFinished() && rankingProcess.sizeQueue() == 0");
                    break;
                }

                // get next entry
                page = this.snippetProcess.takeURL(true, Math.min(500, Math.max(20, this.timeout - System.currentTimeMillis())));
                if (page == null) {
                    Log.logWarning("SnippetProcess", "page == null");
                    break; // no more available
                }

                this.setName(page.url().toNormalform(true)); // to support debugging
                if (this.snippetProcess.query.filterfailurls && this.snippetProcess.workTables.failURLsContains(page.hash())) {
                    continue;
                }

                // in case that we have an attached solr, we load also the solr document
                final String solrContent = page.getText();

                resultEntry = fetchSnippet(page, solrContent, this.cacheStrategy); // does not fetch snippets if snippetMode == 0
                if (resultEntry == null) {
                    continue; // the entry had some problems, cannot be used
                }

                this.snippetProcess.urlRetrievalAllTime += resultEntry.dbRetrievalTime;
                this.snippetProcess.snippetComputationAllTime += resultEntry.snippetComputationTime;

                // place the result to the result vector and apply post-ranking
                long ranking = resultEntry.word() == null ? 0 : this.snippetProcess.rankingProcess.order.cardinal(resultEntry.word());
                ranking += postRanking(resultEntry, this.snippetProcess.rankingProcess.getTopicNavigator(10));
                resultEntry.ranking = ranking;
                this.snippetProcess.result.put(new ReverseElement<ResultEntry>(resultEntry, ranking)); // remove smallest in case of overflow
                if (nav_topics) {
                    this.snippetProcess.rankingProcess.addTopics(resultEntry);
                }
            }
            if (System.currentTimeMillis() >= this.timeout) {
                Log.logWarning("SnippetProcess", "worker ended with timeout");
            }
        } catch (final Exception e) {
            Log.logException(e);
        }
    }

    /** request this worker to stop after the current loop iteration */
    protected void pleaseStop() {
        this.shallrun = false;
    }

    /**
     * calculate the time since the worker has had the latest activity
     * @return time in milliseconds lasted since latest activity
     */
    protected long busytime() {
        return System.currentTimeMillis() - this.lastLifeSign;
    }

    /**
     * Compute an additional ranking contribution for a result entry based on media link
     * counts, citation count, the query's 'prefer' pattern and matches of the query
     * words / topic words inside the URL and the title.
     * @param rentry   the result entry to be ranked
     * @param topwords current topic navigator words with their counts
     * @return the post-ranking value to be added to the base ranking
     */
    private long postRanking(final ResultEntry rentry, final ScoreMap<String> topwords) {
        long r = 0;

        // for media search: prefer pages with many links
        r += rentry.limage() << this.snippetProcess.query.ranking.coeff_cathasimage;
        r += rentry.laudio() << this.snippetProcess.query.ranking.coeff_cathasaudio;
        r += rentry.lvideo() << this.snippetProcess.query.ranking.coeff_cathasvideo;
        r += rentry.lapp() << this.snippetProcess.query.ranking.coeff_cathasapp;

        // apply citation count
        r += (128 * rentry.referencesCount() / (1 + 2 * rentry.llocal() + rentry.lother())) << this.snippetProcess.query.ranking.coeff_citation;

        // prefer hit with 'prefer' pattern
        if (this.snippetProcess.query.prefer.matcher(rentry.url().toNormalform(true)).matches()) {
            r += 256 << this.snippetProcess.query.ranking.coeff_prefer;
        }
        if (this.snippetProcess.query.prefer.matcher(rentry.title()).matches()) {
            r += 256 << this.snippetProcess.query.ranking.coeff_prefer;
        }

        // apply 'common-sense' heuristic using references
        final String urlstring = rentry.url().toNormalform(true);
        final String[] urlcomps = MultiProtocolURI.urlComps(urlstring);
        final String[] descrcomps = MultiProtocolURI.splitpattern.split(rentry.title().toLowerCase());
        int tc;
        for (final String urlcomp : urlcomps) {
            tc = topwords.get(urlcomp);
            if (tc > 0) {
                r += Math.max(1, tc) << this.snippetProcess.query.ranking.coeff_urlcompintoplist;
            }
        }
        for (final String descrcomp : descrcomps) {
            tc = topwords.get(descrcomp);
            if (tc > 0) {
                r += Math.max(1, tc) << this.snippetProcess.query.ranking.coeff_descrcompintoplist;
            }
        }

        // apply query-in-result matching
        final HandleSet urlcomph = Word.words2hashesHandles(urlcomps);
        final HandleSet descrcomph = Word.words2hashesHandles(descrcomps);
        final Iterator<byte[]> shi = this.snippetProcess.query.query_include_hashes.iterator();
        byte[] queryhash;
        while (shi.hasNext()) {
            queryhash = shi.next();
            if (urlcomph.has(queryhash)) {
                r += 256 << this.snippetProcess.query.ranking.coeff_appurl;
            }
            if (descrcomph.has(queryhash)) {
                r += 256 << this.snippetProcess.query.ranking.coeff_app_dc_title;
            }
        }

        return r;
    }

    /**
     * Build a ResultEntry for the given page, attaching a text snippet if possible.
     * Snippet fetching has 3 modes:
     * 0 - do not fetch snippets (cacheStrategy == null)
     * 1 - fetch snippets offline only
     * 2 - online snippet fetch
     * @param page          the url metadata entry, may be null
     * @param solrText      the document text from an attached solr, used as snippet source
     * @param cacheStrategy snippet loading strategy; null disables snippet fetching
     * @return a ResultEntry (with or without snippet), or null if the entry was sorted out
     */
    private ResultEntry fetchSnippet(final URIMetadataNode page, final String solrText, final CacheStrategy cacheStrategy) {
        // load only urls if there was not yet a root url of that hash
        // find the url entry
        long startTime = System.currentTimeMillis();
        if (page == null) {
            return null;
        }
        final long dbRetrievalTime = System.currentTimeMillis() - startTime;

        if (cacheStrategy == null) {
            final TextSnippet snippet = new TextSnippet(
                    null,
                    solrText,
                    page,
                    this.snippetProcess.snippetFetchWordHashes,
                    null,
                    ((this.snippetProcess.query.constraint != null) && (this.snippetProcess.query.constraint.get(Condenser.flag_cat_indexof))),
                    SearchEvent.SNIPPET_MAX_LENGTH,
                    !this.snippetProcess.query.isLocal());
            return new ResultEntry(page, this.snippetProcess.query.getSegment(), this.snippetProcess.peers, snippet, null, dbRetrievalTime, 0); // result without snippet
        }

        // load snippet
        if (page.url().getContentDomain() == Classification.ContentDomain.TEXT || page.url().getContentDomain() == Classification.ContentDomain.ALL) {
            // attach text snippet
            startTime = System.currentTimeMillis();
            final TextSnippet snippet = new TextSnippet(
                    this.snippetProcess.loader,
                    solrText,
                    page,
                    this.snippetProcess.snippetFetchWordHashes,
                    cacheStrategy,
                    ((this.snippetProcess.query.constraint != null) && (this.snippetProcess.query.constraint.get(Condenser.flag_cat_indexof))),
                    180,
                    !this.snippetProcess.query.isLocal());
            final long snippetComputationTime = System.currentTimeMillis() - startTime;
            SearchEvent.log.logInfo("text snippet load time for " + page.url() + ": " + snippetComputationTime + ", " + (!snippet.getErrorCode().fail() ? "snippet found" : ("no snippet found (" + snippet.getError() + ")")));

            if (!snippet.getErrorCode().fail()) {
                // we loaded the file and found the snippet
                return new ResultEntry(page, this.snippetProcess.query.getSegment(), this.snippetProcess.peers, snippet, null, dbRetrievalTime, snippetComputationTime); // result with snippet attached
            } else if (cacheStrategy.mustBeOffline()) {
                // we did not demand online loading, therefore a failure does not mean that the missing snippet causes a rejection of this result
                // this may happen during a remote search, because snippet loading is omitted to retrieve results faster
                return new ResultEntry(page, this.snippetProcess.query.getSegment(), this.snippetProcess.peers, null, null, dbRetrievalTime, snippetComputationTime); // result without snippet
            } else {
                // problems with snippet fetch
                if (this.snippetProcess.snippetFetchWordHashes.has(Segment.catchallHash)) {
                    // we accept that because the word cannot be on the page
                    return new ResultEntry(page, this.snippetProcess.query.getSegment(), this.snippetProcess.peers, null, null, dbRetrievalTime, 0);
                }
                final String reason = "no text snippet; errorCode = " + snippet.getErrorCode();
                if (this.snippetProcess.deleteIfSnippetFail) {
                    this.snippetProcess.workTables.failURLsRegisterMissingWord(this.snippetProcess.query.getSegment().termIndex(), page.url(), this.snippetProcess.query.query_include_hashes, reason);
                }
                SearchEvent.log.logInfo("sorted out url " + page.url().toNormalform(true) + " during search: " + reason);
                return null;
            }
        }
        return new ResultEntry(page, this.snippetProcess.query.getSegment(), this.snippetProcess.peers, null, null, dbRetrievalTime, 0); // result without snippet
    }
}