From 61748285c39f98d9aaf64e6b886f203229b2c2a3 Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 27 Aug 2009 15:19:48 +0000 Subject: [PATCH] more refactoring of search git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6270 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/IndexControlRWIs_p.java | 4 +- source/de/anomic/search/RankingProcess.java | 76 +++++++++---------- ...SnippetFetcher.java => ResultFetcher.java} | 8 +- source/de/anomic/search/SearchEvent.java | 8 +- source/de/anomic/yacy/yacyClient.java | 2 +- 5 files changed, 48 insertions(+), 50 deletions(-) rename source/de/anomic/search/{SnippetFetcher.java => ResultFetcher.java} (98%) diff --git a/htroot/IndexControlRWIs_p.java b/htroot/IndexControlRWIs_p.java index 4b2b5afca..4ba7d9b17 100644 --- a/htroot/IndexControlRWIs_p.java +++ b/htroot/IndexControlRWIs_p.java @@ -366,7 +366,7 @@ public class IndexControlRWIs_p { URLMetadataRow entry; String us; long rn = -1; - while ((ranked.size() > 0) && ((entry = ranked.bestURL(false)) != null)) { + while ((ranked.size() > 0) && ((entry = ranked.takeURL(false)) != null)) { if ((entry == null) || (entry.metadata() == null)) continue; url = entry.metadata().url(); if (url == null) continue; @@ -480,7 +480,7 @@ public class IndexControlRWIs_p { public static RankingProcess genSearchresult(final serverObjects prop, final Switchboard sb, final byte[] keyhash, final Bitfield filter) { final QueryParams query = new QueryParams(new String(keyhash), -1, sb.getRanking(), filter); final RankingProcess ranked = new RankingProcess(sb.indexSegment, query, Integer.MAX_VALUE, 1); - ranked.execQuery(); + ranked.run(); if (ranked.filteredCount() == 0) { prop.put("searchresult", 2); diff --git a/source/de/anomic/search/RankingProcess.java b/source/de/anomic/search/RankingProcess.java index 1722811bb..47af10229 100644 --- a/source/de/anomic/search/RankingProcess.java +++ b/source/de/anomic/search/RankingProcess.java @@ -65,19 +65,21 @@ public final class RankingProcess extends Thread { private static boolean useYBR = true; private static final int maxDoubleDomAll = 20, maxDoubleDomSpecial = 10000; - private final SortStack stack; - private final HashMap> doubleDomCache; // key = domhash (6 bytes); value = like stack - private final HashSet handover; // key = urlhash; used for double-check of urls that had been handed over to search process + private final Segment indexSegment; private final QueryParams query; private final int maxentries; - private int remote_peerCount, remote_indexCount, remote_resourceSize, local_resourceSize; private final ReferenceOrder order; private final ConcurrentHashMap urlhashes; // map for double-check; String/Long relation, addresses ranking number (backreference for deletion) private final int[] flagcount; // flag counter private final TreeSet misses; // contains url-hashes that could not been found in the LURL-DB - private final Segment indexSegment; - private HashMap> localSearchInclusion; private final int[] domZones; + private HashMap> localSearchInclusion; + + private int remote_peerCount, remote_indexCount, remote_resourceSize, local_resourceSize; + private final SortStack stack; + private final HashMap> doubleDomCache; // key = domhash (6 bytes); value = like stack + private final HashSet handover; // key = urlhash; used for double-check of urls that had been handed over to search process + private final ConcurrentHashMap ref; // reference score computation for the commonSense heuristic private final ConcurrentHashMap hostNavigator; private final ConcurrentHashMap authorNavigator; @@ -114,12 +116,26 @@ public final class RankingProcess extends Thread { } public void run() { - // do a search concurrently + // do a search // sort the local containers and truncate it to a limited count, // so following sortings together with the global results will be fast try { - execQuery(); + long timer = System.currentTimeMillis(); + final TermSearch search = this.indexSegment.termIndex().query( + query.queryHashes, + query.excludeHashes, + null, + Segment.wordReferenceFactory, + query.maxDistance); + this.localSearchInclusion = search.inclusion(); + final ReferenceContainer index = search.joined(); + serverProfiling.update("SEARCH", new ProfilingGraph.searchEvent(query.id(true), SearchEvent.JOIN, index.size(), System.currentTimeMillis() - timer), false); + if (index.size() == 0) { + return; + } + + add(index, true, index.size()); } catch (final Exception e) { e.printStackTrace(); } @@ -133,26 +149,7 @@ public final class RankingProcess extends Thread { return this.domZones; } - public void execQuery() { - - long timer = System.currentTimeMillis(); - final TermSearch search = this.indexSegment.termIndex().query( - query.queryHashes, - query.excludeHashes, - null, - Segment.wordReferenceFactory, - query.maxDistance); - this.localSearchInclusion = search.inclusion(); - final ReferenceContainer index = search.joined(); - serverProfiling.update("SEARCH", new ProfilingGraph.searchEvent(query.id(true), SearchEvent.JOIN, index.size(), System.currentTimeMillis() - timer), false); - if (index.size() == 0) { - return; - } - - insertRanked(index, true, index.size()); - } - - public void insertRanked(final ReferenceContainer index, final boolean local, final int fullResource) { + public void add(final ReferenceContainer index, final boolean local, final int fullResource) { // we collect the urlhashes and construct a list with urlEntry objects // attention: if minEntries is too high, this method will not terminate within the maxTime @@ -173,14 +170,11 @@ public final class RankingProcess extends Thread { // iterate over normalized entries and select some that are better than currently stored timer = System.currentTimeMillis(); - final Iterator i = decodedEntries.iterator(); - WordReferenceVars iEntry; Long r; HostInfo hs; String domhash; boolean nav_hosts = this.query.navigators.equals("all") || this.query.navigators.indexOf("hosts") >= 0; - while (i.hasNext()) { - iEntry = i.next(); + for (WordReferenceVars iEntry: decodedEntries) { assert (iEntry.metadataHash().length() == index.row().primaryKeyLength); //if (iEntry.urlHash().length() != index.row().primaryKeyLength) continue; @@ -282,7 +276,7 @@ public final class RankingProcess extends Thread { // - root-domain guessing to prefer the root domain over other urls if search word appears in domain name - private SortStack.stackElement bestRWI(final boolean skipDoubleDom) { + private SortStack.stackElement takeRWI(final boolean skipDoubleDom) { // returns from the current RWI list the best entry and removes this entry from the list SortStack m; SortStack.stackElement rwi; @@ -328,16 +322,19 @@ public final class RankingProcess extends Thread { return bestEntry; } - public URLMetadataRow bestURL(final boolean skipDoubleDom) { + public URLMetadataRow takeURL(final boolean skipDoubleDom) { // returns from the current RWI list the best URL entry and removes this entry from the list while ((stack.size() > 0) || (size() > 0)) { if (((stack.size() == 0) && (size() == 0))) break; - final SortStack.stackElement obrwi = bestRWI(skipDoubleDom); + final SortStack.stackElement obrwi = takeRWI(skipDoubleDom); if (obrwi == null) continue; // *** ? this happened and the thread was suspended silently. cause? final URLMetadataRow u = indexSegment.urlMetadata().load(obrwi.element.metadataHash(), obrwi.element, obrwi.weight.longValue()); if (u != null) { final URLMetadataRow.Components metadata = u.metadata(); + // TODO: check url constraints + + // evaluate information of metadata for navigation // author navigation: String author = metadata.dc_creator(); @@ -376,11 +373,11 @@ public final class RankingProcess extends Thread { return null; } - public URLMetadataRow bestURL(final boolean skipDoubleDom, long timeout) { + public URLMetadataRow takeURL(final boolean skipDoubleDom, long timeout) { timeout += System.currentTimeMillis(); long wait = 10; while (System.currentTimeMillis() < timeout) { - URLMetadataRow row = bestURL(skipDoubleDom); + URLMetadataRow row = takeURL(skipDoubleDom); if (row != null) return row; try {Thread.sleep(wait);} catch (final InterruptedException e1) {} wait = wait * 2; @@ -391,8 +388,9 @@ public final class RankingProcess extends Thread { public int size() { //assert sortedRWIEntries.size() == urlhashes.size() : "sortedRWIEntries.size() = " + sortedRWIEntries.size() + ", urlhashes.size() = " + urlhashes.size(); int c = stack.size(); - final Iterator> i = this.doubleDomCache.values().iterator(); - while (i.hasNext()) c += i.next().size(); + for (SortStack s: this.doubleDomCache.values()) { + c += s.size(); + } return c; } diff --git a/source/de/anomic/search/SnippetFetcher.java b/source/de/anomic/search/ResultFetcher.java similarity index 98% rename from source/de/anomic/search/SnippetFetcher.java rename to source/de/anomic/search/ResultFetcher.java index ed1f8431f..038feb0f5 100644 --- a/source/de/anomic/search/SnippetFetcher.java +++ b/source/de/anomic/search/ResultFetcher.java @@ -45,7 +45,7 @@ import de.anomic.yacy.yacySeedDB; import de.anomic.yacy.logging.Log; import de.anomic.ymage.ProfilingGraph; -public class SnippetFetcher { +public class ResultFetcher { protected final static int workerThreadCount = 10; @@ -66,7 +66,7 @@ public class SnippetFetcher { @SuppressWarnings("unchecked") - SnippetFetcher( + ResultFetcher( RankingProcess rankedCache, final QueryParams query, final Segment indexSegment, @@ -112,7 +112,7 @@ public class SnippetFetcher { } } - ResultEntry obtainResultEntry(final URLMetadataRow page, final int snippetFetchMode) { + protected ResultEntry obtainResultEntry(final URLMetadataRow page, final int snippetFetchMode) { // a search result entry needs some work to produce a result Entry: // - check if url entry exists in LURL-db @@ -276,7 +276,7 @@ public class SnippetFetcher { if ((query.contentdom != QueryParams.CONTENTDOM_IMAGE) && (result.size() >= query.neededResults() + fetchAhead)) break; // get next entry - page = rankedCache.bestURL(true, 10000); + page = rankedCache.takeURL(true, 10000); if (page == null) break; if (result.exists(page.hash().hashCode())) continue; if (failedURLs.get(page.hash()) != null) continue; diff --git a/source/de/anomic/search/SearchEvent.java b/source/de/anomic/search/SearchEvent.java index 2772c37c5..df164896a 100644 --- a/source/de/anomic/search/SearchEvent.java +++ b/source/de/anomic/search/SearchEvent.java @@ -66,7 +66,7 @@ public final class SearchEvent { private final Segment indexSegment; private final yacySeedDB peers; private RankingProcess rankedCache; // ordered search results, grows dynamically as all the query threads enrich this container - private SnippetFetcher snippets; + private ResultFetcher snippets; // class variables for search abstracts private final IndexAbstracts rcAbstracts; // cache for index abstracts; word:TreeMap mapping where the embedded TreeMap is a urlhash:peerlist relation @@ -144,7 +144,7 @@ public final class SearchEvent { } else { // do a local search this.rankedCache = new RankingProcess(indexSegment, query, max_results_preparation, 2); - this.rankedCache.execQuery(); + this.rankedCache.run(); //CrawlSwitchboard.Finding finding = wordIndex.retrieveURLs(query, false, 2, ranking, process); if (generateAbstracts) { @@ -176,7 +176,7 @@ public final class SearchEvent { } // start worker threads to fetch urls and snippets - this.snippets = new SnippetFetcher(rankedCache, query, indexSegment, peers); + this.snippets = new ResultFetcher(rankedCache, query, indexSegment, peers); // clean up events SearchEventCache.cleanupEvents(false); @@ -400,7 +400,7 @@ public final class SearchEvent { //assert e != null; } - public SnippetFetcher result() { + public ResultFetcher result() { return this.snippets; } diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index 1d4d0d0d1..939ce94ce 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -602,7 +602,7 @@ public final class yacyClient { // store remote result to local result container synchronized (containerCache) { // insert one container into the search result buffer - containerCache.insertRanked(container[0], false, joincount); // one is enough + containerCache.add(container[0], false, joincount); // one is enough // integrate remote topwords final String references = result.get("references");