enhanced url constraint computation: better position of constraint check during retrieval process

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6272 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 16 years ago
parent 3ebb228ea1
commit b0637600d5

@ -53,6 +53,12 @@ public final class QueryParams {
public static final int CONTENTDOM_VIDEO = 3;
public static final int CONTENTDOM_APP = 4;
public static enum FetchMode {
NO_FETCH_NO_VERIFY,
FETCH_BUT_ACCEPT_OFFLINE_OR_USE_CACHE,
FETCH_AND_VERIFY_ONLINE;
}
public static final Bitfield empty_constraint = new Bitfield(4, "AAAAAA");
public static final Bitfield catchall_constraint = new Bitfield(4, "______");

@ -328,47 +328,80 @@ public final class RankingProcess extends Thread {
if (((stack.size() == 0) && (size() == 0))) break;
final SortStack<WordReferenceVars>.stackElement obrwi = takeRWI(skipDoubleDom);
if (obrwi == null) continue; // *** ? this happened and the thread was suspended silently. cause?
final URLMetadataRow u = indexSegment.urlMetadata().load(obrwi.element.metadataHash(), obrwi.element, obrwi.weight.longValue());
if (u != null) {
final URLMetadataRow.Components metadata = u.metadata();
// TODO: check url constraints
// evaluate information of metadata for navigation
// author navigation:
String author = metadata.dc_creator();
if (author != null && author.length() > 0) {
// add author to the author navigator
String authorhash = new String(Word.word2hash(author));
//System.out.println("*** DEBUG authorhash = " + authorhash + ", query.authorhash = " + this.query.authorhash + ", author = " + author);
// check if we already are filtering for authors
if (this.query.authorhash != null && !this.query.authorhash.equals(authorhash)) {
continue;
}
// add author to the author navigator
AuthorInfo in = this.authorNavigator.get(authorhash);
if (in == null) {
this.authorNavigator.put(authorhash, new AuthorInfo(author));
} else {
in.inc();
this.authorNavigator.put(authorhash, in);
}
} else if (this.query.authorhash != null) {
continue;
}
final URLMetadataRow page = indexSegment.urlMetadata().load(obrwi.element.metadataHash(), obrwi.element, obrwi.weight.longValue());
if (page == null) {
misses.add(obrwi.element.metadataHash());
continue;
}
// prepare values for constraint check
final URLMetadataRow.Components metadata = page.metadata();
// check url constraints
if (metadata.url() == null) {
continue; // rare case where the url is corrupted
}
final String pageurl = metadata.url().toNormalform(true, true);
final String pageauthor = metadata.dc_creator();
final String pagetitle = metadata.dc_title().toLowerCase();
// check exclusion
if ((QueryParams.matches(pagetitle, query.excludeHashes)) ||
(QueryParams.matches(pageurl.toLowerCase(), query.excludeHashes)) ||
(QueryParams.matches(pageauthor.toLowerCase(), query.excludeHashes))) {
continue;
}
// check url mask
if (!(pageurl.matches(query.urlMask))) {
continue;
}
// check index-of constraint
if ((query.constraint != null) &&
(query.constraint.get(Condenser.flag_cat_indexof)) &&
(!(pagetitle.startsWith("index of")))) {
final Iterator<byte[]> wi = query.queryHashes.iterator();
while (wi.hasNext()) try { indexSegment.termIndex().remove(wi.next(), page.hash()); } catch (IOException e) {}
continue;
}
// check content domain
if ((query.contentdom == QueryParams.CONTENTDOM_AUDIO && page.laudio() == 0) ||
(query.contentdom == QueryParams.CONTENTDOM_VIDEO && page.lvideo() == 0) ||
(query.contentdom == QueryParams.CONTENTDOM_IMAGE && page.limage() == 0) ||
(query.contentdom == QueryParams.CONTENTDOM_APP && page.lapp() == 0)) {
continue;
}
// evaluate information of metadata for navigation
// author navigation:
if (pageauthor != null && pageauthor.length() > 0) {
// add author to the author navigator
String authorhash = new String(Word.word2hash(pageauthor));
//System.out.println("*** DEBUG authorhash = " + authorhash + ", query.authorhash = " + this.query.authorhash + ", author = " + author);
// get the url
if (metadata.url() != null) {
String urlstring = metadata.url().toNormalform(true, true);
if (urlstring == null || !urlstring.matches(query.urlMask)) continue;
this.handover.add(u.hash()); // remember that we handed over this url
return u;
// check if we already are filtering for authors
if (this.query.authorhash != null && !this.query.authorhash.equals(authorhash)) {
continue;
}
// add author to the author navigator
AuthorInfo in = this.authorNavigator.get(authorhash);
if (in == null) {
this.authorNavigator.put(authorhash, new AuthorInfo(pageauthor));
} else {
in.inc();
this.authorNavigator.put(authorhash, in);
}
} else if (this.query.authorhash != null) {
continue;
}
misses.add(obrwi.element.metadataHash());
// accept url
this.handover.add(page.hash()); // remember that we handed over this url
return page;
}
return null;
}

@ -29,7 +29,6 @@ package de.anomic.search;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.TreeSet;
import de.anomic.document.Condenser;
@ -66,7 +65,7 @@ public class ResultFetcher {
@SuppressWarnings("unchecked")
ResultFetcher(
public ResultFetcher(
RankingProcess rankedCache,
final QueryParams query,
final Segment indexSegment,
@ -112,123 +111,7 @@ public class ResultFetcher {
}
}
protected ResultEntry obtainResultEntry(final URLMetadataRow page, final int snippetFetchMode) {
// a search result entry needs some work to produce a result Entry:
// - check if url entry exists in LURL-db
// - check exclusions, constraints, masks, media-domains
// - load snippet (see if page exists) and check if snippet contains searched word
// Snippet Fetching can has 3 modes:
// 0 - do not fetch snippets
// 1 - fetch snippets offline only
// 2 - online snippet fetch
// load only urls if there was not yet a root url of that hash
// find the url entry
long startTime = System.currentTimeMillis();
final URLMetadataRow.Components metadata = page.metadata();
final String pagetitle = metadata.dc_title().toLowerCase();
if (metadata.url() == null) {
registerFailure(page.hash(), "url corrupted (null)");
return null; // rare case where the url is corrupted
}
final String pageurl = metadata.url().toString().toLowerCase();
final String pageauthor = metadata.dc_creator().toLowerCase();
final long dbRetrievalTime = System.currentTimeMillis() - startTime;
// check exclusion
if ((QueryParams.matches(pagetitle, query.excludeHashes)) ||
(QueryParams.matches(pageurl, query.excludeHashes)) ||
(QueryParams.matches(pageauthor, query.excludeHashes))) {
return null;
}
// check url mask
if (!(pageurl.matches(query.urlMask))) {
return null;
}
// check constraints
if ((query.constraint != null) &&
(query.constraint.get(Condenser.flag_cat_indexof)) &&
(!(metadata.dc_title().startsWith("Index of")))) {
final Iterator<byte[]> wi = query.queryHashes.iterator();
while (wi.hasNext()) try { indexSegment.termIndex().remove(wi.next(), page.hash()); } catch (IOException e) {}
registerFailure(page.hash(), "index-of constraint not fullfilled");
return null;
}
if ((query.contentdom == QueryParams.CONTENTDOM_AUDIO) && (page.laudio() == 0)) {
registerFailure(page.hash(), "contentdom-audio constraint not fullfilled");
return null;
}
if ((query.contentdom == QueryParams.CONTENTDOM_VIDEO) && (page.lvideo() == 0)) {
registerFailure(page.hash(), "contentdom-video constraint not fullfilled");
return null;
}
if ((query.contentdom == QueryParams.CONTENTDOM_IMAGE) && (page.limage() == 0)) {
registerFailure(page.hash(), "contentdom-image constraint not fullfilled");
return null;
}
if ((query.contentdom == QueryParams.CONTENTDOM_APP) && (page.lapp() == 0)) {
registerFailure(page.hash(), "contentdom-app constraint not fullfilled");
return null;
}
if (snippetFetchMode == 0) {
return new ResultEntry(page, indexSegment, peers, null, null, dbRetrievalTime, 0); // result without snippet
}
// load snippet
if (query.contentdom == QueryParams.CONTENTDOM_TEXT) {
// attach text snippet
startTime = System.currentTimeMillis();
final TextSnippet snippet = TextSnippet.retrieveTextSnippet(metadata, snippetFetchWordHashes, (snippetFetchMode == 2), ((query.constraint != null) && (query.constraint.get(Condenser.flag_cat_indexof))), 180, (snippetFetchMode == 2) ? Integer.MAX_VALUE : 30000, query.isGlobal());
final long snippetComputationTime = System.currentTimeMillis() - startTime;
Log.logInfo("SEARCH_EVENT", "text snippet load time for " + metadata.url() + ": " + snippetComputationTime + ", " + ((snippet.getErrorCode() < 11) ? "snippet found" : ("no snippet found (" + snippet.getError() + ")")));
if (snippet.getErrorCode() < 11) {
// we loaded the file and found the snippet
return new ResultEntry(page, indexSegment, peers, snippet, null, dbRetrievalTime, snippetComputationTime); // result with snippet attached
} else if (snippetFetchMode == 1) {
// we did not demand online loading, therefore a failure does not mean that the missing snippet causes a rejection of this result
// this may happen during a remote search, because snippet loading is omitted to retrieve results faster
return new ResultEntry(page, indexSegment, peers, null, null, dbRetrievalTime, snippetComputationTime); // result without snippet
} else {
// problems with snippet fetch
registerFailure(page.hash(), "no text snippet for URL " + metadata.url());
if (!peers.mySeed().isVirgin())
try {
TextSnippet.failConsequences(snippet, query.id(false));
} catch (IOException e) {
e.printStackTrace();
}
return null;
}
} else {
// attach media information
startTime = System.currentTimeMillis();
final ArrayList<MediaSnippet> mediaSnippets = MediaSnippet.retrieveMediaSnippets(metadata.url(), snippetFetchWordHashes, query.contentdom, (snippetFetchMode == 2), 6000, query.isGlobal());
final long snippetComputationTime = System.currentTimeMillis() - startTime;
Log.logInfo("SEARCH_EVENT", "media snippet load time for " + metadata.url() + ": " + snippetComputationTime);
if ((mediaSnippets != null) && (mediaSnippets.size() > 0)) {
// found media snippets, return entry
return new ResultEntry(page, indexSegment, peers, null, mediaSnippets, dbRetrievalTime, snippetComputationTime);
} else if (snippetFetchMode == 1) {
return new ResultEntry(page, indexSegment, peers, null, null, dbRetrievalTime, snippetComputationTime);
} else {
// problems with snippet fetch
registerFailure(page.hash(), "no media snippet for URL " + metadata.url());
return null;
}
}
// finished, no more actions possible here
}
boolean anyWorkerAlive() {
boolean anyWorkerAlive() {
if (this.workerThreads == null) return false;
for (int i = 0; i < this.workerThreads.length; i++) {
if ((this.workerThreads[i] != null) &&
@ -281,7 +164,8 @@ public class ResultFetcher {
if (result.exists(page.hash().hashCode())) continue;
if (failedURLs.get(page.hash()) != null) continue;
final ResultEntry resultEntry = obtainResultEntry(page, snippetMode);
final ResultEntry resultEntry = fetchSnippet(page, snippetMode);
if (resultEntry == null) continue; // the entry had some problems, cannot be used
urlRetrievalAllTime += resultEntry.dbRetrievalTime;
snippetComputationAllTime += resultEntry.snippetComputationTime;
@ -305,6 +189,70 @@ public class ResultFetcher {
}
}
protected ResultEntry fetchSnippet(final URLMetadataRow page, final int snippetMode) {
// Snippet Fetching can has 3 modes:
// 0 - do not fetch snippets
// 1 - fetch snippets offline only
// 2 - online snippet fetch
// load only urls if there was not yet a root url of that hash
// find the url entry
long startTime = System.currentTimeMillis();
final URLMetadataRow.Components metadata = page.metadata();
final long dbRetrievalTime = System.currentTimeMillis() - startTime;
if (snippetMode == 0) {
return new ResultEntry(page, indexSegment, peers, null, null, dbRetrievalTime, 0); // result without snippet
}
// load snippet
if (query.contentdom == QueryParams.CONTENTDOM_TEXT) {
// attach text snippet
startTime = System.currentTimeMillis();
final TextSnippet snippet = TextSnippet.retrieveTextSnippet(metadata, snippetFetchWordHashes, (snippetMode == 2), ((query.constraint != null) && (query.constraint.get(Condenser.flag_cat_indexof))), 180, (snippetMode == 2) ? Integer.MAX_VALUE : 30000, query.isGlobal());
final long snippetComputationTime = System.currentTimeMillis() - startTime;
Log.logInfo("SEARCH_EVENT", "text snippet load time for " + metadata.url() + ": " + snippetComputationTime + ", " + ((snippet.getErrorCode() < 11) ? "snippet found" : ("no snippet found (" + snippet.getError() + ")")));
if (snippet.getErrorCode() < 11) {
// we loaded the file and found the snippet
return new ResultEntry(page, indexSegment, peers, snippet, null, dbRetrievalTime, snippetComputationTime); // result with snippet attached
} else if (snippetMode == 1) {
// we did not demand online loading, therefore a failure does not mean that the missing snippet causes a rejection of this result
// this may happen during a remote search, because snippet loading is omitted to retrieve results faster
return new ResultEntry(page, indexSegment, peers, null, null, dbRetrievalTime, snippetComputationTime); // result without snippet
} else {
// problems with snippet fetch
registerFailure(page.hash(), "no text snippet for URL " + metadata.url());
if (!peers.mySeed().isVirgin())
try {
TextSnippet.failConsequences(snippet, query.id(false));
} catch (IOException e) {
e.printStackTrace();
}
return null;
}
} else {
// attach media information
startTime = System.currentTimeMillis();
final ArrayList<MediaSnippet> mediaSnippets = MediaSnippet.retrieveMediaSnippets(metadata.url(), snippetFetchWordHashes, query.contentdom, (snippetMode == 2), 6000, query.isGlobal());
final long snippetComputationTime = System.currentTimeMillis() - startTime;
Log.logInfo("SEARCH_EVENT", "media snippet load time for " + metadata.url() + ": " + snippetComputationTime);
if ((mediaSnippets != null) && (mediaSnippets.size() > 0)) {
// found media snippets, return entry
return new ResultEntry(page, indexSegment, peers, null, mediaSnippets, dbRetrievalTime, snippetComputationTime);
} else if (snippetMode == 1) {
return new ResultEntry(page, indexSegment, peers, null, null, dbRetrievalTime, snippetComputationTime);
} else {
// problems with snippet fetch
registerFailure(page.hash(), "no media snippet for URL " + metadata.url());
return null;
}
}
// finished, no more actions possible here
}
private void registerFailure(final String urlhash, final String reason) {
this.failedURLs.put(urlhash, reason);
Log.logInfo("search", "sorted out hash " + urlhash + " during search: " + reason);

Loading…
Cancel
Save