From b0637600d51fee5cca5b1ce137065da3f93fbb74 Mon Sep 17 00:00:00 2001
From: orbiter
Date: Thu, 27 Aug 2009 20:20:07 +0000
Subject: [PATCH] enhanced url constraint computation: better position of
 constraint check during retrieval process

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6272 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 source/de/anomic/search/QueryParams.java    |   6 +
 source/de/anomic/search/RankingProcess.java | 109 ++++++++--
 source/de/anomic/search/ResultFetcher.java  | 188 +++++++-------
 3 files changed, 145 insertions(+), 158 deletions(-)

diff --git a/source/de/anomic/search/QueryParams.java b/source/de/anomic/search/QueryParams.java
index f2a7abcee..dd1f45d8b 100644
--- a/source/de/anomic/search/QueryParams.java
+++ b/source/de/anomic/search/QueryParams.java
@@ -53,6 +53,12 @@ public final class QueryParams {
     public static final int CONTENTDOM_VIDEO = 3;
     public static final int CONTENTDOM_APP = 4;
 
+    public static enum FetchMode {
+        NO_FETCH_NO_VERIFY,
+        FETCH_BUT_ACCEPT_OFFLINE_OR_USE_CACHE,
+        FETCH_AND_VERIFY_ONLINE;
+    }
+
     public static final Bitfield empty_constraint = new Bitfield(4, "AAAAAA");
     public static final Bitfield catchall_constraint = new Bitfield(4, "______");
 
diff --git a/source/de/anomic/search/RankingProcess.java b/source/de/anomic/search/RankingProcess.java
index 47af10229..00cb03a11 100644
--- a/source/de/anomic/search/RankingProcess.java
+++ b/source/de/anomic/search/RankingProcess.java
@@ -328,47 +328,80 @@ public final class RankingProcess extends Thread {
             if (((stack.size() == 0) && (size() == 0))) break;
             final SortStack.stackElement obrwi = takeRWI(skipDoubleDom);
             if (obrwi == null) continue; // *** ? this happened and the thread was suspended silently. cause?
-            final URLMetadataRow u = indexSegment.urlMetadata().load(obrwi.element.metadataHash(), obrwi.element, obrwi.weight.longValue());
-            if (u != null) {
-                final URLMetadataRow.Components metadata = u.metadata();
-
-                // TODO: check url constraints
-
-
-                // evaluate information of metadata for navigation
-                // author navigation:
-                String author = metadata.dc_creator();
-                if (author != null && author.length() > 0) {
-                    // add author to the author navigator
-                    String authorhash = new String(Word.word2hash(author));
-                    //System.out.println("*** DEBUG authorhash = " + authorhash + ", query.authorhash = " + this.query.authorhash + ", author = " + author);
-
-                    // check if we already are filtering for authors
-                    if (this.query.authorhash != null && !this.query.authorhash.equals(authorhash)) {
-                        continue;
-                    }
-
-                    // add author to the author navigator
-                    AuthorInfo in = this.authorNavigator.get(authorhash);
-                    if (in == null) {
-                        this.authorNavigator.put(authorhash, new AuthorInfo(author));
-                    } else {
-                        in.inc();
-                        this.authorNavigator.put(authorhash, in);
-                    }
-                } else if (this.query.authorhash != null) {
-                    continue;
-                }
+            final URLMetadataRow page = indexSegment.urlMetadata().load(obrwi.element.metadataHash(), obrwi.element, obrwi.weight.longValue());
+            if (page == null) {
+                misses.add(obrwi.element.metadataHash());
+                continue;
+            }
+
+            // prepare values for constraint check
+            final URLMetadataRow.Components metadata = page.metadata();
+
+            // check url constraints
+            if (metadata.url() == null) {
+                continue; // rare case where the url is corrupted
+            }
+
+            final String pageurl = metadata.url().toNormalform(true, true);
+            final String pageauthor = metadata.dc_creator();
+            final String pagetitle = metadata.dc_title().toLowerCase();
+
+            // check exclusion
+            if ((QueryParams.matches(pagetitle, query.excludeHashes)) ||
+                (QueryParams.matches(pageurl.toLowerCase(), query.excludeHashes)) ||
+                (QueryParams.matches(pageauthor.toLowerCase(), query.excludeHashes))) {
+                continue;
+            }
+
+            // check url mask
+            if (!(pageurl.matches(query.urlMask))) {
+                continue;
+            }
+
+            // check index-of constraint
+            if ((query.constraint != null) &&
+                (query.constraint.get(Condenser.flag_cat_indexof)) &&
+                (!(pagetitle.startsWith("index of")))) {
+                final Iterator wi = query.queryHashes.iterator();
+                while (wi.hasNext()) try { indexSegment.termIndex().remove(wi.next(), page.hash()); } catch (IOException e) {}
+                continue;
+            }
+
+            // check content domain
+            if ((query.contentdom == QueryParams.CONTENTDOM_AUDIO && page.laudio() == 0) ||
+                (query.contentdom == QueryParams.CONTENTDOM_VIDEO && page.lvideo() == 0) ||
+                (query.contentdom == QueryParams.CONTENTDOM_IMAGE && page.limage() == 0) ||
+                (query.contentdom == QueryParams.CONTENTDOM_APP && page.lapp() == 0)) {
+                continue;
+            }
+
+            // evaluate information of metadata for navigation
+            // author navigation:
+            if (pageauthor != null && pageauthor.length() > 0) {
+                // add author to the author navigator
+                String authorhash = new String(Word.word2hash(pageauthor));
+                //System.out.println("*** DEBUG authorhash = " + authorhash + ", query.authorhash = " + this.query.authorhash + ", author = " + author);
 
-            // get the url
-            if (metadata.url() != null) {
-                String urlstring = metadata.url().toNormalform(true, true);
-                if (urlstring == null || !urlstring.matches(query.urlMask)) continue;
-                this.handover.add(u.hash()); // remember that we handed over this url
-                return u;
+                // check if we already are filtering for authors
+                if (this.query.authorhash != null && !this.query.authorhash.equals(authorhash)) {
+                    continue;
+                }
+
+                // add author to the author navigator
+                AuthorInfo in = this.authorNavigator.get(authorhash);
+                if (in == null) {
+                    this.authorNavigator.put(authorhash, new AuthorInfo(pageauthor));
+                } else {
+                    in.inc();
+                    this.authorNavigator.put(authorhash, in);
                 }
+            } else if (this.query.authorhash != null) {
+                continue;
             }
-            misses.add(obrwi.element.metadataHash());
+
+            // accept url
+            this.handover.add(page.hash()); // remember that we handed over this url
+            return page;
         }
         return null;
     }
diff --git a/source/de/anomic/search/ResultFetcher.java b/source/de/anomic/search/ResultFetcher.java
index 038feb0f5..a1dff8b41 100644
--- a/source/de/anomic/search/ResultFetcher.java
+++ b/source/de/anomic/search/ResultFetcher.java
@@ -29,7 +29,6 @@ package de.anomic.search;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.HashMap;
-import java.util.Iterator;
 import java.util.TreeSet;
 
 import de.anomic.document.Condenser;
@@ -66,7 +65,7 @@ public class ResultFetcher {
 
     @SuppressWarnings("unchecked")
-    ResultFetcher(
+    public ResultFetcher(
             RankingProcess rankedCache,
             final QueryParams query,
             final Segment indexSegment,
@@ -112,123 +111,7 @@ public class ResultFetcher {
         }
     }
 
-    protected ResultEntry obtainResultEntry(final URLMetadataRow page, final int snippetFetchMode) {
-
-        // a search result entry needs some work to produce a result Entry:
-        // - check if url entry exists in LURL-db
-        // - check exclusions, constraints, masks, media-domains
-        // - load snippet (see if page exists) and check if snippet contains searched word
-
-        // Snippet Fetching can has 3 modes:
-        // 0 - do not fetch snippets
-        // 1 - fetch snippets offline only
-        // 2 - online snippet fetch
-
-        // load only urls if there was not yet a root url of that hash
-        // find the url entry
-
-        long startTime = System.currentTimeMillis();
-        final URLMetadataRow.Components metadata = page.metadata();
-        final String pagetitle = metadata.dc_title().toLowerCase();
-        if (metadata.url() == null) {
-            registerFailure(page.hash(), "url corrupted (null)");
-            return null; // rare case where the url is corrupted
-        }
-        final String pageurl = metadata.url().toString().toLowerCase();
-        final String pageauthor = metadata.dc_creator().toLowerCase();
-        final long dbRetrievalTime = System.currentTimeMillis() - startTime;
-
-        // check exclusion
-        if ((QueryParams.matches(pagetitle, query.excludeHashes)) ||
-            (QueryParams.matches(pageurl, query.excludeHashes)) ||
-            (QueryParams.matches(pageauthor, query.excludeHashes))) {
-            return null;
-        }
-
-        // check url mask
-        if (!(pageurl.matches(query.urlMask))) {
-            return null;
-        }
-
-        // check constraints
-        if ((query.constraint != null) &&
-            (query.constraint.get(Condenser.flag_cat_indexof)) &&
-            (!(metadata.dc_title().startsWith("Index of")))) {
-            final Iterator wi = query.queryHashes.iterator();
-            while (wi.hasNext()) try { indexSegment.termIndex().remove(wi.next(), page.hash()); } catch (IOException e) {}
-            registerFailure(page.hash(), "index-of constraint not fullfilled");
-            return null;
-        }
-
-        if ((query.contentdom == QueryParams.CONTENTDOM_AUDIO) && (page.laudio() == 0)) {
-            registerFailure(page.hash(), "contentdom-audio constraint not fullfilled");
-            return null;
-        }
-        if ((query.contentdom == QueryParams.CONTENTDOM_VIDEO) && (page.lvideo() == 0)) {
-            registerFailure(page.hash(), "contentdom-video constraint not fullfilled");
-            return null;
-        }
-        if ((query.contentdom == QueryParams.CONTENTDOM_IMAGE) && (page.limage() == 0)) {
-            registerFailure(page.hash(), "contentdom-image constraint not fullfilled");
-            return null;
-        }
-        if ((query.contentdom == QueryParams.CONTENTDOM_APP) && (page.lapp() == 0)) {
-            registerFailure(page.hash(), "contentdom-app constraint not fullfilled");
-            return null;
-        }
-
-        if (snippetFetchMode == 0) {
-            return new ResultEntry(page, indexSegment, peers, null, null, dbRetrievalTime, 0); // result without snippet
-        }
-
-        // load snippet
-        if (query.contentdom == QueryParams.CONTENTDOM_TEXT) {
-            // attach text snippet
-            startTime = System.currentTimeMillis();
-            final TextSnippet snippet = TextSnippet.retrieveTextSnippet(metadata, snippetFetchWordHashes, (snippetFetchMode == 2), ((query.constraint != null) && (query.constraint.get(Condenser.flag_cat_indexof))), 180, (snippetFetchMode == 2) ? Integer.MAX_VALUE : 30000, query.isGlobal());
-            final long snippetComputationTime = System.currentTimeMillis() - startTime;
-            Log.logInfo("SEARCH_EVENT", "text snippet load time for " + metadata.url() + ": " + snippetComputationTime + ", " + ((snippet.getErrorCode() < 11) ? "snippet found" : ("no snippet found (" + snippet.getError() + ")")));
-
-            if (snippet.getErrorCode() < 11) {
-                // we loaded the file and found the snippet
-                return new ResultEntry(page, indexSegment, peers, snippet, null, dbRetrievalTime, snippetComputationTime); // result with snippet attached
-            } else if (snippetFetchMode == 1) {
-                // we did not demand online loading, therefore a failure does not mean that the missing snippet causes a rejection of this result
-                // this may happen during a remote search, because snippet loading is omitted to retrieve results faster
-                return new ResultEntry(page, indexSegment, peers, null, null, dbRetrievalTime, snippetComputationTime); // result without snippet
-            } else {
-                // problems with snippet fetch
-                registerFailure(page.hash(), "no text snippet for URL " + metadata.url());
-                if (!peers.mySeed().isVirgin())
-                    try {
-                        TextSnippet.failConsequences(snippet, query.id(false));
-                    } catch (IOException e) {
-                        e.printStackTrace();
-                    }
-                return null;
-            }
-        } else {
-            // attach media information
-            startTime = System.currentTimeMillis();
-            final ArrayList mediaSnippets = MediaSnippet.retrieveMediaSnippets(metadata.url(), snippetFetchWordHashes, query.contentdom, (snippetFetchMode == 2), 6000, query.isGlobal());
-            final long snippetComputationTime = System.currentTimeMillis() - startTime;
-            Log.logInfo("SEARCH_EVENT", "media snippet load time for " + metadata.url() + ": " + snippetComputationTime);
-
-            if ((mediaSnippets != null) && (mediaSnippets.size() > 0)) {
-                // found media snippets, return entry
-                return new ResultEntry(page, indexSegment, peers, null, mediaSnippets, dbRetrievalTime, snippetComputationTime);
-            } else if (snippetFetchMode == 1) {
-                return new ResultEntry(page, indexSegment, peers, null, null, dbRetrievalTime, snippetComputationTime);
-            } else {
-                // problems with snippet fetch
-                registerFailure(page.hash(), "no media snippet for URL " + metadata.url());
-                return null;
-            }
-        }
-        // finished, no more actions possible here
-    }
-
-    boolean anyWorkerAlive() {
+    boolean anyWorkerAlive() {
         if (this.workerThreads == null) return false;
         for (int i = 0; i < this.workerThreads.length; i++) {
             if ((this.workerThreads[i] != null) &&
@@ -281,7 +164,8 @@ public class ResultFetcher {
                 if (result.exists(page.hash().hashCode())) continue;
                 if (failedURLs.get(page.hash()) != null) continue;
-                final ResultEntry resultEntry = obtainResultEntry(page, snippetMode);
+                final ResultEntry resultEntry = fetchSnippet(page, snippetMode);
+                if (resultEntry == null) continue; // the entry had some problems, cannot be used
                 urlRetrievalAllTime += resultEntry.dbRetrievalTime;
                 snippetComputationAllTime += resultEntry.snippetComputationTime;
@@ -305,6 +189,70 @@ public class ResultFetcher {
         }
     }
 
+    protected ResultEntry fetchSnippet(final URLMetadataRow page, final int snippetMode) {
+        // Snippet Fetching can have 3 modes:
+        // 0 - do not fetch snippets
+        // 1 - fetch snippets offline only
+        // 2 - online snippet fetch
+
+        // load only urls if there was not yet a root url of that hash
+        // find the url entry
+
+        long startTime = System.currentTimeMillis();
+        final URLMetadataRow.Components metadata = page.metadata();
+        final long dbRetrievalTime = System.currentTimeMillis() - startTime;
+
+        if (snippetMode == 0) {
+            return new ResultEntry(page, indexSegment, peers, null, null, dbRetrievalTime, 0); // result without snippet
+        }
+
+        // load snippet
+        if (query.contentdom == QueryParams.CONTENTDOM_TEXT) {
+            // attach text snippet
+            startTime = System.currentTimeMillis();
+            final TextSnippet snippet = TextSnippet.retrieveTextSnippet(metadata, snippetFetchWordHashes, (snippetMode == 2), ((query.constraint != null) && (query.constraint.get(Condenser.flag_cat_indexof))), 180, (snippetMode == 2) ? Integer.MAX_VALUE : 30000, query.isGlobal());
+            final long snippetComputationTime = System.currentTimeMillis() - startTime;
+            Log.logInfo("SEARCH_EVENT", "text snippet load time for " + metadata.url() + ": " + snippetComputationTime + ", " + ((snippet.getErrorCode() < 11) ? "snippet found" : ("no snippet found (" + snippet.getError() + ")")));
+
+            if (snippet.getErrorCode() < 11) {
+                // we loaded the file and found the snippet
+                return new ResultEntry(page, indexSegment, peers, snippet, null, dbRetrievalTime, snippetComputationTime); // result with snippet attached
+            } else if (snippetMode == 1) {
+                // we did not demand online loading, therefore a failure does not mean that the missing snippet causes a rejection of this result
+                // this may happen during a remote search, because snippet loading is omitted to retrieve results faster
+                return new ResultEntry(page, indexSegment, peers, null, null, dbRetrievalTime, snippetComputationTime); // result without snippet
+            } else {
+                // problems with snippet fetch
+                registerFailure(page.hash(), "no text snippet for URL " + metadata.url());
+                if (!peers.mySeed().isVirgin())
+                    try {
+                        TextSnippet.failConsequences(snippet, query.id(false));
+                    } catch (IOException e) {
+                        e.printStackTrace();
+                    }
+                return null;
+            }
+        } else {
+            // attach media information
+            startTime = System.currentTimeMillis();
+            final ArrayList mediaSnippets = MediaSnippet.retrieveMediaSnippets(metadata.url(), snippetFetchWordHashes, query.contentdom, (snippetMode == 2), 6000, query.isGlobal());
+            final long snippetComputationTime = System.currentTimeMillis() - startTime;
+            Log.logInfo("SEARCH_EVENT", "media snippet load time for " + metadata.url() + ": " + snippetComputationTime);
+
+            if ((mediaSnippets != null) && (mediaSnippets.size() > 0)) {
+                // found media snippets, return entry
+                return new ResultEntry(page, indexSegment, peers, null, mediaSnippets, dbRetrievalTime, snippetComputationTime);
+            } else if (snippetMode == 1) {
+                return new ResultEntry(page, indexSegment, peers, null, null, dbRetrievalTime, snippetComputationTime);
+            } else {
+                // problems with snippet fetch
+                registerFailure(page.hash(), "no media snippet for URL " + metadata.url());
+                return null;
+            }
+        }
+        // finished, no more actions possible here
+    }
+
     private void registerFailure(final String urlhash, final String reason) {
         this.failedURLs.put(urlhash, reason);
         Log.logInfo("search", "sorted out hash " + urlhash + " during search: " + reason);
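
Note on the RankingProcess change above: the url constraint checks (corrupted-url guard, exclusion hashes, url mask, index-of flag, content domain) now run while entries are taken from the ranked stack, so a rejected url costs only a metadata lookup and never reaches the snippet fetcher. The following is a condensed sketch of that filter chain, not the YaCy code itself; Metadata and Query are simplified stand-ins for URLMetadataRow.Components and QueryParams, which have richer interfaces in the real classes.

    // Condensed sketch of the relocated constraint check; Metadata and Query
    // are simplified stand-ins, introduced here only for illustration.
    interface Metadata {
        String url();       // normalized url, or null for a corrupted entry
        String title();
        String author();
    }

    interface Query {
        boolean excludes(String text);            // any exclusion word matches?
        String urlMask();                         // regular expression over the url
        boolean wantsIndexOf();                   // Condenser.flag_cat_indexof set?
        boolean matchesContentDomain(Metadata m); // laudio/lvideo/limage/lapp > 0?
    }

    final class ConstraintFilter {

        // Returns true when the entry may be handed over to the result fetcher;
        // the checks mirror the order used in the patch above.
        static boolean accept(final Metadata page, final Query query) {
            final String url = page.url();
            if (url == null) return false; // rare case where the url is corrupted

            final String title = page.title().toLowerCase();
            final String author = page.author().toLowerCase();

            // check exclusion: excluded words may match title, url or author
            if (query.excludes(title)
                    || query.excludes(url.toLowerCase())
                    || query.excludes(author)) return false;

            // check url mask
            if (!url.matches(query.urlMask())) return false;

            // check index-of constraint; the patch additionally deletes the
            // query hashes from the term index for pages that violate it
            if (query.wantsIndexOf() && !title.startsWith("index of")) return false;

            // check content domain (audio/video/image/app queries need media)
            return query.matchesContentDomain(page);
        }
    }

Moving these checks in front of the handover also removes the duplicated exclusion/mask/constraint logic from obtainResultEntry(), which is why the remaining snippet-only part could be reduced to fetchSnippet().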
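The new QueryParams.FetchMode enum names the three snippet-fetch modes, but fetchSnippet() in this commit still receives them as a plain int (0 = no fetch, 1 = offline/cache only, 2 = online fetch and verify). Below is a minimal sketch of how the constants could map onto the legacy int modes; the SnippetFetchModes helper is hypothetical and not part of this commit.

    import de.anomic.search.QueryParams.FetchMode;

    // Hypothetical bridge between the legacy int modes (still used by
    // fetchSnippet) and the new FetchMode enum introduced in QueryParams.
    final class SnippetFetchModes {

        static FetchMode fromLegacy(final int snippetMode) {
            switch (snippetMode) {
                case 0: return FetchMode.NO_FETCH_NO_VERIFY;
                case 1: return FetchMode.FETCH_BUT_ACCEPT_OFFLINE_OR_USE_CACHE;
                case 2: return FetchMode.FETCH_AND_VERIFY_ONLINE;
                default: throw new IllegalArgumentException("unknown snippet mode: " + snippetMode);
            }
        }

        static int toLegacy(final FetchMode mode) {
            switch (mode) {
                case NO_FETCH_NO_VERIFY:                    return 0;
                case FETCH_BUT_ACCEPT_OFFLINE_OR_USE_CACHE: return 1;
                case FETCH_AND_VERIFY_ONLINE:               return 2;
            }
            throw new IllegalStateException("unreachable");
        }
    }

Once callers pass FetchMode directly, expressions like (snippetMode == 2) in fetchSnippet() can become mode == FetchMode.FETCH_AND_VERIFY_ONLINE, and invalid mode values are rejected at the boundary instead of being silently treated as the media/offline case.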