/**
 * SnippetWorker
 * Copyright 2012 by Michael Peter Christen, mc@yacy.net, Frankfurt a. M., Germany
 * First released 01.11.2012 at http://yacy.net
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program in the file lgpl21.txt
 * If not, see <http://www.gnu.org/licenses/>.
 */
package net.yacy.search.query;

import java.util.Iterator;

import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.analysis.Classification;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.sorting.ConcurrentScoreMap;
import net.yacy.cora.sorting.ScoreMap;
import net.yacy.cora.sorting.WeakPriorityBlockingQueue.ReverseElement;
import net.yacy.cora.storage.HandleSet;
import net.yacy.document.Condenser;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.MemoryControl;
import net.yacy.search.index.Segment;
import net.yacy.search.snippet.ResultEntry;
import net.yacy.search.snippet.TextSnippet;
import net.yacy.search.snippet.TextSnippet.ResultClass;
public class SnippetWorker extends Thread {
|
|
private final SearchEvent snippetProcess;
|
|
private final long timeout; // the date until this thread should try to work
|
|
private long lastLifeSign; // when the last time the run()-loop was executed
|
|
private final CacheStrategy cacheStrategy;
|
|
private final int neededResults;
|
|
private boolean shallrun;
|
|
|
|
protected SnippetWorker(final SearchEvent snippetProcess, final long maxlifetime, final CacheStrategy cacheStrategy, final int neededResults) {
|
|
this.snippetProcess = snippetProcess;
|
|
this.cacheStrategy = cacheStrategy;
|
|
this.lastLifeSign = System.currentTimeMillis();
|
|
this.timeout = System.currentTimeMillis() + Math.max(1000, maxlifetime);
|
|
this.neededResults = neededResults;
|
|
this.shallrun = true;
|
|
}
|
|
|
|
@Override
|
|
public void run() {
|
|
|
|
// start fetching urls and snippets
|
|
URIMetadataNode page;
|
|
ResultEntry resultEntry;
|
|
try {
|
|
while (this.shallrun && System.currentTimeMillis() < this.timeout) {
|
|
this.lastLifeSign = System.currentTimeMillis();
|
|
|
|
if (MemoryControl.shortStatus()) {
|
|
Log.logWarning("SnippetProcess", "shortStatus");
|
|
break;
|
|
}
|
|
|
|
// check if we have enough; we stop only if we can fetch online; otherwise its better to run this to get better navigation
|
|
if ((this.cacheStrategy == null || this.cacheStrategy.isAllowedToFetchOnline()) && this.snippetProcess.result.sizeAvailable() >= this.neededResults) {
|
|
Log.logWarning("SnippetProcess", this.snippetProcess.result.sizeAvailable() + " = result.sizeAvailable() >= this.neededResults = " + this.neededResults);
|
|
break;
|
|
}
|
|
|
|
// check if we can succeed if we try to take another url
|
|
if (this.snippetProcess.rankingProcess.feedingIsFinished() && this.snippetProcess.rankingProcess.rwiQueueSize() == 0 && this.snippetProcess.nodeStack.sizeAvailable() == 0) {
|
|
Log.logWarning("SnippetProcess", "rankingProcess.feedingIsFinished() && rankingProcess.sizeQueue() == 0");
|
|
break;
|
|
}
|
|
|
|
// get next entry
|
|
page = this.snippetProcess.takeURL(true, Math.min(500, Math.max(20, this.timeout - System.currentTimeMillis())));
|
|
//if (page != null) Log.logInfo("SnippetProcess", "got one page: " + page.metadata().url().toNormalform(true, false));
|
|
//if (page == null) page = rankedCache.takeURL(false, this.timeout - System.currentTimeMillis());
|
|
if (page == null) {
|
|
//Log.logWarning("SnippetProcess", "page == null");
|
|
break; // no more available
|
|
}
|
|
|
|
this.setName(page.url().toNormalform(true)); // to support debugging
|
|
if (this.snippetProcess.query.filterfailurls && this.snippetProcess.workTables.failURLsContains(page.hash())) {
|
|
continue;
|
|
}
|
|
|
|
resultEntry = fetchSnippet(page, this.cacheStrategy); // does not fetch snippets if snippetMode == 0
|
|
if (resultEntry == null) {
|
|
continue; // the entry had some problems, cannot be used
|
|
}
|
|
|
|
//if (result.contains(resultEntry)) continue;
|
|
this.snippetProcess.snippetComputationAllTime += resultEntry.snippetComputationTime;
|
|
|
|
// place the result to the result vector
|
|
// apply post-ranking
|
|
long ranking = resultEntry.word() == null ? 0 : Long.valueOf(this.snippetProcess.rankingProcess.order.cardinal(resultEntry.word()));
|
|
ranking += postRanking(resultEntry, new ConcurrentScoreMap<String>() /*this.snippetProcess.rankingProcess.getTopicNavigator(10)*/);
|
|
resultEntry.ranking = ranking;
|
|
this.snippetProcess.result.put(new ReverseElement<ResultEntry>(resultEntry, ranking)); // remove smallest in case of overflow
|
|
this.snippetProcess.rankingProcess.addTopics(resultEntry);
|
|
}
|
|
if (System.currentTimeMillis() >= this.timeout) {
|
|
Log.logWarning("SnippetProcess", "worker ended with timeout");
|
|
}
|
|
//System.out.println("FINISHED WORKER " + id + " FOR " + this.neededResults + " RESULTS, loops = " + loops);
|
|
} catch (final Exception e) { Log.logException(e); }
|
|
//Log.logInfo("SEARCH", "resultWorker thread " + this.id + " terminated");
|
|
}
|
|
|
|
protected void pleaseStop() {
|
|
this.shallrun = false;
|
|
}
|
|
|
|
/**
|
|
* calculate the time since the worker has had the latest activity
|
|
* @return time in milliseconds lasted since latest activity
|
|
*/
|
|
protected long busytime() {
|
|
return System.currentTimeMillis() - this.lastLifeSign;
|
|
}
|
|
|
|
private long postRanking(
|
|
final ResultEntry rentry,
|
|
final ScoreMap<String> topwords) {
|
|
|
|
long r = 0;
|
|
|
|
// for media search: prefer pages with many links
|
|
r += rentry.limage() << this.snippetProcess.query.ranking.coeff_cathasimage;
|
|
r += rentry.laudio() << this.snippetProcess.query.ranking.coeff_cathasaudio;
|
|
r += rentry.lvideo() << this.snippetProcess.query.ranking.coeff_cathasvideo;
|
|
r += rentry.lapp() << this.snippetProcess.query.ranking.coeff_cathasapp;
|
|
|
|
// apply citation count
|
|
//System.out.println("POSTRANKING CITATION: references = " + rentry.referencesCount() + ", inbound = " + rentry.llocal() + ", outbound = " + rentry.lother());
|
|
r += (128 * rentry.referencesCount() / (1 + 2 * rentry.llocal() + rentry.lother())) << this.snippetProcess.query.ranking.coeff_citation;
|
|
|
|
// prefer hit with 'prefer' pattern
|
|
if (this.snippetProcess.query.prefer.matcher(rentry.url().toNormalform(true)).matches()) {
|
|
r += 256 << this.snippetProcess.query.ranking.coeff_prefer;
|
|
}
|
|
if (this.snippetProcess.query.prefer.matcher(rentry.title()).matches()) {
|
|
r += 256 << this.snippetProcess.query.ranking.coeff_prefer;
|
|
}
|
|
|
|
// apply 'common-sense' heuristic using references
|
|
final String urlstring = rentry.url().toNormalform(true);
|
|
final String[] urlcomps = MultiProtocolURI.urlComps(urlstring);
|
|
final String[] descrcomps = MultiProtocolURI.splitpattern.split(rentry.title().toLowerCase());
|
|
int tc;
|
|
for (final String urlcomp : urlcomps) {
|
|
tc = topwords.get(urlcomp);
|
|
if (tc > 0) {
|
|
r += Math.max(1, tc) << this.snippetProcess.query.ranking.coeff_urlcompintoplist;
|
|
}
|
|
}
|
|
for (final String descrcomp : descrcomps) {
|
|
tc = topwords.get(descrcomp);
|
|
if (tc > 0) {
|
|
r += Math.max(1, tc) << this.snippetProcess.query.ranking.coeff_descrcompintoplist;
|
|
}
|
|
}
|
|
|
|
// apply query-in-result matching
|
|
final HandleSet urlcomph = Word.words2hashesHandles(urlcomps);
|
|
final HandleSet descrcomph = Word.words2hashesHandles(descrcomps);
|
|
final Iterator<byte[]> shi = this.snippetProcess.query.getQueryGoal().getIncludeHashes().iterator();
|
|
byte[] queryhash;
|
|
while (shi.hasNext()) {
|
|
queryhash = shi.next();
|
|
if (urlcomph.has(queryhash)) {
|
|
r += 256 << this.snippetProcess.query.ranking.coeff_appurl;
|
|
}
|
|
if (descrcomph.has(queryhash)) {
|
|
r += 256 << this.snippetProcess.query.ranking.coeff_app_dc_title;
|
|
}
|
|
}
|
|
|
|
return r;
|
|
}
|
|
|
|
private ResultEntry fetchSnippet(final URIMetadataNode page, final CacheStrategy cacheStrategy) {
|
|
// Snippet Fetching can has 3 modes:
|
|
// 0 - do not fetch snippets
|
|
// 1 - fetch snippets offline only
|
|
// 2 - online snippet fetch
|
|
|
|
// load only urls if there was not yet a root url of that hash
|
|
// find the url entry
|
|
|
|
String solrsnippet = this.snippetProcess.snippets.get(ASCII.String(page.hash()));
|
|
if (solrsnippet != null && solrsnippet.length() > 0) {
|
|
final TextSnippet snippet = new TextSnippet(page.hash(), solrsnippet, true, ResultClass.SOURCE_CACHE, "");
|
|
return new ResultEntry(page, this.snippetProcess.query.getSegment(), this.snippetProcess.peers, snippet, null, 0);
|
|
}
|
|
|
|
if (cacheStrategy == null) {
|
|
final TextSnippet snippet = new TextSnippet(
|
|
null,
|
|
page,
|
|
this.snippetProcess.snippetFetchWordHashes,
|
|
//this.query.queryString,
|
|
null,
|
|
((this.snippetProcess.query.constraint != null) && (this.snippetProcess.query.constraint.get(Condenser.flag_cat_indexof))),
|
|
SearchEvent.SNIPPET_MAX_LENGTH,
|
|
!this.snippetProcess.query.isLocal());
|
|
return new ResultEntry(page, this.snippetProcess.query.getSegment(), this.snippetProcess.peers, snippet, null, 0); // result without snippet
|
|
}
|
|
|
|
// load snippet
|
|
if (page.url().getContentDomain() == Classification.ContentDomain.TEXT || page.url().getContentDomain() == Classification.ContentDomain.ALL) {
|
|
// attach text snippet
|
|
long startTime = System.currentTimeMillis();
|
|
final TextSnippet snippet = new TextSnippet(
|
|
this.snippetProcess.loader,
|
|
page,
|
|
this.snippetProcess.snippetFetchWordHashes,
|
|
cacheStrategy,
|
|
((this.snippetProcess.query.constraint != null) && (this.snippetProcess.query.constraint.get(Condenser.flag_cat_indexof))),
|
|
180,
|
|
!this.snippetProcess.query.isLocal());
|
|
final long snippetComputationTime = System.currentTimeMillis() - startTime;
|
|
SearchEvent.log.logInfo("text snippet load time for " + page.url() + ": " + snippetComputationTime + ", " + (!snippet.getErrorCode().fail() ? "snippet found" : ("no snippet found (" + snippet.getError() + ")")));
|
|
|
|
if (!snippet.getErrorCode().fail()) {
|
|
// we loaded the file and found the snippet
|
|
return new ResultEntry(page, this.snippetProcess.query.getSegment(), this.snippetProcess.peers, snippet, null, snippetComputationTime); // result with snippet attached
|
|
} else if (cacheStrategy.mustBeOffline()) {
|
|
// we did not demand online loading, therefore a failure does not mean that the missing snippet causes a rejection of this result
|
|
// this may happen during a remote search, because snippet loading is omitted to retrieve results faster
|
|
return new ResultEntry(page, this.snippetProcess.query.getSegment(), this.snippetProcess.peers, null, null, snippetComputationTime); // result without snippet
|
|
} else {
|
|
// problems with snippet fetch
|
|
if (this.snippetProcess.snippetFetchWordHashes.has(Segment.catchallHash)) {
|
|
// we accept that because the word cannot be on the page
|
|
return new ResultEntry(page, this.snippetProcess.query.getSegment(), this.snippetProcess.peers, null, null, 0);
|
|
}
|
|
final String reason = "no text snippet; errorCode = " + snippet.getErrorCode();
|
|
if (this.snippetProcess.deleteIfSnippetFail) {
|
|
this.snippetProcess.workTables.failURLsRegisterMissingWord(this.snippetProcess.query.getSegment().termIndex(), page.url(), this.snippetProcess.query.getQueryGoal().getIncludeHashes(), reason);
|
|
}
|
|
SearchEvent.log.logInfo("sorted out url " + page.url().toNormalform(true) + " during search: " + reason);
|
|
return null;
|
|
}
|
|
}
|
|
return new ResultEntry(page, this.snippetProcess.query.getSegment(), this.snippetProcess.peers, null, null, 0); // result without snippet
|
|
}
|
|
}
|