more refactoring of search

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6270 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 16 years ago
parent 323a8e733d
commit 61748285c3

@ -366,7 +366,7 @@ public class IndexControlRWIs_p {
URLMetadataRow entry; URLMetadataRow entry;
String us; String us;
long rn = -1; long rn = -1;
while ((ranked.size() > 0) && ((entry = ranked.bestURL(false)) != null)) { while ((ranked.size() > 0) && ((entry = ranked.takeURL(false)) != null)) {
if ((entry == null) || (entry.metadata() == null)) continue; if ((entry == null) || (entry.metadata() == null)) continue;
url = entry.metadata().url(); url = entry.metadata().url();
if (url == null) continue; if (url == null) continue;
@ -480,7 +480,7 @@ public class IndexControlRWIs_p {
public static RankingProcess genSearchresult(final serverObjects prop, final Switchboard sb, final byte[] keyhash, final Bitfield filter) { public static RankingProcess genSearchresult(final serverObjects prop, final Switchboard sb, final byte[] keyhash, final Bitfield filter) {
final QueryParams query = new QueryParams(new String(keyhash), -1, sb.getRanking(), filter); final QueryParams query = new QueryParams(new String(keyhash), -1, sb.getRanking(), filter);
final RankingProcess ranked = new RankingProcess(sb.indexSegment, query, Integer.MAX_VALUE, 1); final RankingProcess ranked = new RankingProcess(sb.indexSegment, query, Integer.MAX_VALUE, 1);
ranked.execQuery(); ranked.run();
if (ranked.filteredCount() == 0) { if (ranked.filteredCount() == 0) {
prop.put("searchresult", 2); prop.put("searchresult", 2);

@ -65,19 +65,21 @@ public final class RankingProcess extends Thread {
private static boolean useYBR = true; private static boolean useYBR = true;
private static final int maxDoubleDomAll = 20, maxDoubleDomSpecial = 10000; private static final int maxDoubleDomAll = 20, maxDoubleDomSpecial = 10000;
private final SortStack<WordReferenceVars> stack; private final Segment indexSegment;
private final HashMap<String, SortStack<WordReferenceVars>> doubleDomCache; // key = domhash (6 bytes); value = like stack
private final HashSet<String> handover; // key = urlhash; used for double-check of urls that had been handed over to search process
private final QueryParams query; private final QueryParams query;
private final int maxentries; private final int maxentries;
private int remote_peerCount, remote_indexCount, remote_resourceSize, local_resourceSize;
private final ReferenceOrder order; private final ReferenceOrder order;
private final ConcurrentHashMap<String, Integer> urlhashes; // map for double-check; String/Long relation, addresses ranking number (backreference for deletion) private final ConcurrentHashMap<String, Integer> urlhashes; // map for double-check; String/Long relation, addresses ranking number (backreference for deletion)
private final int[] flagcount; // flag counter private final int[] flagcount; // flag counter
private final TreeSet<String> misses; // contains url-hashes that could not been found in the LURL-DB private final TreeSet<String> misses; // contains url-hashes that could not been found in the LURL-DB
private final Segment indexSegment;
private HashMap<byte[], ReferenceContainer<WordReference>> localSearchInclusion;
private final int[] domZones; private final int[] domZones;
private HashMap<byte[], ReferenceContainer<WordReference>> localSearchInclusion;
private int remote_peerCount, remote_indexCount, remote_resourceSize, local_resourceSize;
private final SortStack<WordReferenceVars> stack;
private final HashMap<String, SortStack<WordReferenceVars>> doubleDomCache; // key = domhash (6 bytes); value = like stack
private final HashSet<String> handover; // key = urlhash; used for double-check of urls that had been handed over to search process
private final ConcurrentHashMap<String, Integer> ref; // reference score computation for the commonSense heuristic private final ConcurrentHashMap<String, Integer> ref; // reference score computation for the commonSense heuristic
private final ConcurrentHashMap<String, HostInfo> hostNavigator; private final ConcurrentHashMap<String, HostInfo> hostNavigator;
private final ConcurrentHashMap<String, AuthorInfo> authorNavigator; private final ConcurrentHashMap<String, AuthorInfo> authorNavigator;
@ -114,12 +116,26 @@ public final class RankingProcess extends Thread {
} }
public void run() { public void run() {
// do a search concurrently // do a search
// sort the local containers and truncate it to a limited count, // sort the local containers and truncate it to a limited count,
// so following sortings together with the global results will be fast // so following sortings together with the global results will be fast
try { try {
execQuery(); long timer = System.currentTimeMillis();
final TermSearch<WordReference> search = this.indexSegment.termIndex().query(
query.queryHashes,
query.excludeHashes,
null,
Segment.wordReferenceFactory,
query.maxDistance);
this.localSearchInclusion = search.inclusion();
final ReferenceContainer<WordReference> index = search.joined();
serverProfiling.update("SEARCH", new ProfilingGraph.searchEvent(query.id(true), SearchEvent.JOIN, index.size(), System.currentTimeMillis() - timer), false);
if (index.size() == 0) {
return;
}
add(index, true, index.size());
} catch (final Exception e) { } catch (final Exception e) {
e.printStackTrace(); e.printStackTrace();
} }
@ -133,26 +149,7 @@ public final class RankingProcess extends Thread {
return this.domZones; return this.domZones;
} }
public void execQuery() { public void add(final ReferenceContainer<WordReference> index, final boolean local, final int fullResource) {
long timer = System.currentTimeMillis();
final TermSearch<WordReference> search = this.indexSegment.termIndex().query(
query.queryHashes,
query.excludeHashes,
null,
Segment.wordReferenceFactory,
query.maxDistance);
this.localSearchInclusion = search.inclusion();
final ReferenceContainer<WordReference> index = search.joined();
serverProfiling.update("SEARCH", new ProfilingGraph.searchEvent(query.id(true), SearchEvent.JOIN, index.size(), System.currentTimeMillis() - timer), false);
if (index.size() == 0) {
return;
}
insertRanked(index, true, index.size());
}
public void insertRanked(final ReferenceContainer<WordReference> index, final boolean local, final int fullResource) {
// we collect the urlhashes and construct a list with urlEntry objects // we collect the urlhashes and construct a list with urlEntry objects
// attention: if minEntries is too high, this method will not terminate within the maxTime // attention: if minEntries is too high, this method will not terminate within the maxTime
@ -173,14 +170,11 @@ public final class RankingProcess extends Thread {
// iterate over normalized entries and select some that are better than currently stored // iterate over normalized entries and select some that are better than currently stored
timer = System.currentTimeMillis(); timer = System.currentTimeMillis();
final Iterator<WordReferenceVars> i = decodedEntries.iterator();
WordReferenceVars iEntry;
Long r; Long r;
HostInfo hs; HostInfo hs;
String domhash; String domhash;
boolean nav_hosts = this.query.navigators.equals("all") || this.query.navigators.indexOf("hosts") >= 0; boolean nav_hosts = this.query.navigators.equals("all") || this.query.navigators.indexOf("hosts") >= 0;
while (i.hasNext()) { for (WordReferenceVars iEntry: decodedEntries) {
iEntry = i.next();
assert (iEntry.metadataHash().length() == index.row().primaryKeyLength); assert (iEntry.metadataHash().length() == index.row().primaryKeyLength);
//if (iEntry.urlHash().length() != index.row().primaryKeyLength) continue; //if (iEntry.urlHash().length() != index.row().primaryKeyLength) continue;
@ -282,7 +276,7 @@ public final class RankingProcess extends Thread {
// - root-domain guessing to prefer the root domain over other urls if search word appears in domain name // - root-domain guessing to prefer the root domain over other urls if search word appears in domain name
private SortStack<WordReferenceVars>.stackElement bestRWI(final boolean skipDoubleDom) { private SortStack<WordReferenceVars>.stackElement takeRWI(final boolean skipDoubleDom) {
// returns from the current RWI list the best entry and removes this entry from the list // returns from the current RWI list the best entry and removes this entry from the list
SortStack<WordReferenceVars> m; SortStack<WordReferenceVars> m;
SortStack<WordReferenceVars>.stackElement rwi; SortStack<WordReferenceVars>.stackElement rwi;
@ -328,16 +322,19 @@ public final class RankingProcess extends Thread {
return bestEntry; return bestEntry;
} }
public URLMetadataRow bestURL(final boolean skipDoubleDom) { public URLMetadataRow takeURL(final boolean skipDoubleDom) {
// returns from the current RWI list the best URL entry and removes this entry from the list // returns from the current RWI list the best URL entry and removes this entry from the list
while ((stack.size() > 0) || (size() > 0)) { while ((stack.size() > 0) || (size() > 0)) {
if (((stack.size() == 0) && (size() == 0))) break; if (((stack.size() == 0) && (size() == 0))) break;
final SortStack<WordReferenceVars>.stackElement obrwi = bestRWI(skipDoubleDom); final SortStack<WordReferenceVars>.stackElement obrwi = takeRWI(skipDoubleDom);
if (obrwi == null) continue; // *** ? this happened and the thread was suspended silently. cause? if (obrwi == null) continue; // *** ? this happened and the thread was suspended silently. cause?
final URLMetadataRow u = indexSegment.urlMetadata().load(obrwi.element.metadataHash(), obrwi.element, obrwi.weight.longValue()); final URLMetadataRow u = indexSegment.urlMetadata().load(obrwi.element.metadataHash(), obrwi.element, obrwi.weight.longValue());
if (u != null) { if (u != null) {
final URLMetadataRow.Components metadata = u.metadata(); final URLMetadataRow.Components metadata = u.metadata();
// TODO: check url constraints
// evaluate information of metadata for navigation // evaluate information of metadata for navigation
// author navigation: // author navigation:
String author = metadata.dc_creator(); String author = metadata.dc_creator();
@ -376,11 +373,11 @@ public final class RankingProcess extends Thread {
return null; return null;
} }
public URLMetadataRow bestURL(final boolean skipDoubleDom, long timeout) { public URLMetadataRow takeURL(final boolean skipDoubleDom, long timeout) {
timeout += System.currentTimeMillis(); timeout += System.currentTimeMillis();
long wait = 10; long wait = 10;
while (System.currentTimeMillis() < timeout) { while (System.currentTimeMillis() < timeout) {
URLMetadataRow row = bestURL(skipDoubleDom); URLMetadataRow row = takeURL(skipDoubleDom);
if (row != null) return row; if (row != null) return row;
try {Thread.sleep(wait);} catch (final InterruptedException e1) {} try {Thread.sleep(wait);} catch (final InterruptedException e1) {}
wait = wait * 2; wait = wait * 2;
@ -391,8 +388,9 @@ public final class RankingProcess extends Thread {
public int size() { public int size() {
//assert sortedRWIEntries.size() == urlhashes.size() : "sortedRWIEntries.size() = " + sortedRWIEntries.size() + ", urlhashes.size() = " + urlhashes.size(); //assert sortedRWIEntries.size() == urlhashes.size() : "sortedRWIEntries.size() = " + sortedRWIEntries.size() + ", urlhashes.size() = " + urlhashes.size();
int c = stack.size(); int c = stack.size();
final Iterator<SortStack<WordReferenceVars>> i = this.doubleDomCache.values().iterator(); for (SortStack<WordReferenceVars> s: this.doubleDomCache.values()) {
while (i.hasNext()) c += i.next().size(); c += s.size();
}
return c; return c;
} }

@ -45,7 +45,7 @@ import de.anomic.yacy.yacySeedDB;
import de.anomic.yacy.logging.Log; import de.anomic.yacy.logging.Log;
import de.anomic.ymage.ProfilingGraph; import de.anomic.ymage.ProfilingGraph;
public class SnippetFetcher { public class ResultFetcher {
protected final static int workerThreadCount = 10; protected final static int workerThreadCount = 10;
@ -66,7 +66,7 @@ public class SnippetFetcher {
@SuppressWarnings("unchecked") @SuppressWarnings("unchecked")
SnippetFetcher( ResultFetcher(
RankingProcess rankedCache, RankingProcess rankedCache,
final QueryParams query, final QueryParams query,
final Segment indexSegment, final Segment indexSegment,
@ -112,7 +112,7 @@ public class SnippetFetcher {
} }
} }
ResultEntry obtainResultEntry(final URLMetadataRow page, final int snippetFetchMode) { protected ResultEntry obtainResultEntry(final URLMetadataRow page, final int snippetFetchMode) {
// a search result entry needs some work to produce a result Entry: // a search result entry needs some work to produce a result Entry:
// - check if url entry exists in LURL-db // - check if url entry exists in LURL-db
@ -276,7 +276,7 @@ public class SnippetFetcher {
if ((query.contentdom != QueryParams.CONTENTDOM_IMAGE) && (result.size() >= query.neededResults() + fetchAhead)) break; if ((query.contentdom != QueryParams.CONTENTDOM_IMAGE) && (result.size() >= query.neededResults() + fetchAhead)) break;
// get next entry // get next entry
page = rankedCache.bestURL(true, 10000); page = rankedCache.takeURL(true, 10000);
if (page == null) break; if (page == null) break;
if (result.exists(page.hash().hashCode())) continue; if (result.exists(page.hash().hashCode())) continue;
if (failedURLs.get(page.hash()) != null) continue; if (failedURLs.get(page.hash()) != null) continue;

@ -66,7 +66,7 @@ public final class SearchEvent {
private final Segment indexSegment; private final Segment indexSegment;
private final yacySeedDB peers; private final yacySeedDB peers;
private RankingProcess rankedCache; // ordered search results, grows dynamically as all the query threads enrich this container private RankingProcess rankedCache; // ordered search results, grows dynamically as all the query threads enrich this container
private SnippetFetcher snippets; private ResultFetcher snippets;
// class variables for search abstracts // class variables for search abstracts
private final IndexAbstracts rcAbstracts; // cache for index abstracts; word:TreeMap mapping where the embedded TreeMap is a urlhash:peerlist relation private final IndexAbstracts rcAbstracts; // cache for index abstracts; word:TreeMap mapping where the embedded TreeMap is a urlhash:peerlist relation
@ -144,7 +144,7 @@ public final class SearchEvent {
} else { } else {
// do a local search // do a local search
this.rankedCache = new RankingProcess(indexSegment, query, max_results_preparation, 2); this.rankedCache = new RankingProcess(indexSegment, query, max_results_preparation, 2);
this.rankedCache.execQuery(); this.rankedCache.run();
//CrawlSwitchboard.Finding finding = wordIndex.retrieveURLs(query, false, 2, ranking, process); //CrawlSwitchboard.Finding finding = wordIndex.retrieveURLs(query, false, 2, ranking, process);
if (generateAbstracts) { if (generateAbstracts) {
@ -176,7 +176,7 @@ public final class SearchEvent {
} }
// start worker threads to fetch urls and snippets // start worker threads to fetch urls and snippets
this.snippets = new SnippetFetcher(rankedCache, query, indexSegment, peers); this.snippets = new ResultFetcher(rankedCache, query, indexSegment, peers);
// clean up events // clean up events
SearchEventCache.cleanupEvents(false); SearchEventCache.cleanupEvents(false);
@ -400,7 +400,7 @@ public final class SearchEvent {
//assert e != null; //assert e != null;
} }
public SnippetFetcher result() { public ResultFetcher result() {
return this.snippets; return this.snippets;
} }

@ -602,7 +602,7 @@ public final class yacyClient {
// store remote result to local result container // store remote result to local result container
synchronized (containerCache) { synchronized (containerCache) {
// insert one container into the search result buffer // insert one container into the search result buffer
containerCache.insertRanked(container[0], false, joincount); // one is enough containerCache.add(container[0], false, joincount); // one is enough
// integrate remote topwords // integrate remote topwords
final String references = result.get("references"); final String references = result.get("references");

Loading…
Cancel
Save