From a4805defddd08c46ad0dc14a8e3fdcedf9db7f35 Mon Sep 17 00:00:00 2001 From: orbiter Date: Mon, 15 Jun 2009 11:46:23 +0000 Subject: [PATCH] added stub for new search process git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6074 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- source/de/anomic/search/IndexCollector.java | 275 ++++++++++++++++++++ 1 file changed, 275 insertions(+) create mode 100644 source/de/anomic/search/IndexCollector.java diff --git a/source/de/anomic/search/IndexCollector.java b/source/de/anomic/search/IndexCollector.java new file mode 100644 index 000000000..8114ff426 --- /dev/null +++ b/source/de/anomic/search/IndexCollector.java @@ -0,0 +1,275 @@ +// IndexCollector.java +// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany +// first published 15.06.2009 on http://yacy.net +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ +// $LastChangedRevision: 1986 $ +// $LastChangedBy: orbiter $ +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. 
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

package de.anomic.search;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.LinkedBlockingQueue;

import de.anomic.kelondro.index.BinSearch;
import de.anomic.kelondro.text.ReferenceContainer;
import de.anomic.kelondro.text.ReferenceOrder;
import de.anomic.kelondro.text.Segment;
import de.anomic.kelondro.text.TermSearch;
import de.anomic.kelondro.text.referencePrototype.WordReference;
import de.anomic.kelondro.text.referencePrototype.WordReferenceVars;
import de.anomic.kelondro.util.SortStack;
import de.anomic.plasma.plasmaProfiling;
import de.anomic.plasma.plasmaSearchEvent;
import de.anomic.plasma.parser.Condenser;
import de.anomic.search.Query;
import de.anomic.server.serverProfiling;
import de.anomic.yacy.yacyURL;

/**
 * Stub for the new search process: a thread that receives reference containers
 * through a queue, normalizes and filters their entries against the query, and
 * pushes the surviving entries onto a ranking stack.
 *
 * <p>Producers hand containers over with {@link #insertRanked}; the thread itself
 * first performs the local term search and enqueues that result, then consumes the
 * queue until it sees the {@link #poison} sentinel, which {@link #shutdown} enqueues.
 *
 * <p>NOTE(review): the generic type arguments in this class were reconstructed from
 * how the values are used below; confirm them against the declarations of
 * {@code SortStack}, {@code ReferenceContainer}, {@code TermSearch} and
 * {@code ReferenceOrder}.
 */
public final class IndexCollector extends Thread {

    public static BinSearch[] ybrTables = null; // block-rank tables
    public static final int maxYBR = 3; // the lower this value, the faster the search

    /** Sentinel container; {@link #run()} stops consuming when it takes this exact instance. */
    public static final ReferenceContainer<WordReference> poison = ReferenceContainer.emptyContainer(Segment.wordReferenceFactory, null, 0);

    private final SortStack<WordReferenceVars> stack;
    private final Query query;
    private final int maxentries;
    private int remote_peerCount, remote_indexCount, remote_resourceSize, local_resourceSize;
    private final ReferenceOrder order;
    // map for double-check; String/Long relation, addresses ranking number (backreference for deletion)
    // NOTE(review): nothing in this class ever puts into this map, so the
    // containsKey() checks in run() currently never reject an entry.
    private final ConcurrentHashMap<String, Long> urlhashes;
    private final int[] flagcount; // flag counter, one slot per flag bit
    private final Segment indexSegment;
    private final int[] domZones;
    private final ConcurrentHashMap<String, HostInfo> hostNavigator;
    // NOTE(review): key type byte[] is reconstructed, not visible here; verify against TermSearch.inclusion()
    private HashMap<byte[], ReferenceContainer<WordReference>> localSearchInclusion;
    private final BlockingQueue<ReferenceContainer<WordReference>> rwiQueue;

    /**
     * Creates a collector; the thread is not started here.
     *
     * @param indexSegment the segment whose term index is queried in {@link #run()}
     * @param query        the search query; when {@code null} no ranking order is created.
     *                     NOTE(review): {@link #run()} and {@link #ranking} dereference the
     *                     query/order unconditionally, so a null query only works if
     *                     neither is called.
     * @param maxentries   capacity passed to the ranking stack; a negative value disables
     *                     the "stack is full" checks in {@link #run()}
     * @param concurrency  concurrency level passed to the url-hash map
     */
    public IndexCollector(
            final Segment indexSegment,
            final Query query,
            final int maxentries,
            final int concurrency) {
        this.stack = new SortStack<WordReferenceVars>(maxentries);
        this.order = (query == null) ? null : new ReferenceOrder(query.ranking, query.targetlang);
        this.query = query;
        this.maxentries = maxentries;
        // the int counters and the two int arrays start at zero by language default,
        // so no explicit reset is needed
        this.urlhashes = new ConcurrentHashMap<String, Long>(0, 0.75f, concurrency);
        this.indexSegment = indexSegment;
        this.flagcount = new int[32];
        this.hostNavigator = new ConcurrentHashMap<String, HostInfo>();
        this.domZones = new int[8];
        this.localSearchInclusion = null;
        this.rwiQueue = new LinkedBlockingQueue<ReferenceContainer<WordReference>>();
    }

    /**
     * Computes the ranking value of a single entry using this collector's order.
     *
     * @param word the entry to rank
     * @return the value of {@code order.cardinal(word)}
     */
    public long ranking(final WordReferenceVars word) {
        return order.cardinal(word);
    }

    /**
     * @return the internal domain-zone counter array itself (not a copy)
     */
    public int[] zones() {
        return this.domZones;
    }

    /**
     * Updates the resource counters and enqueues a container for processing by
     * {@link #run()}. Empty containers are ignored and do not touch the counters.
     *
     * @param index        the container to enqueue; must not be {@code null}
     * @param local        {@code true} to account the container as a local result,
     *                     {@code false} to account it as coming from a remote peer
     * @param fullResource amount added to the local or remote resource size
     */
    public void insertRanked(final ReferenceContainer<WordReference> index, final boolean local, final int fullResource) {
        assert (index != null);
        if (index.size() == 0) return;
        if (local) {
            this.local_resourceSize += fullResource;
        } else {
            this.remote_resourceSize += fullResource;
            this.remote_peerCount++;
            this.remote_indexCount += index.size();
        }
        try {
            this.rwiQueue.put(index);
        } catch (InterruptedException e) {
            // keep the interruption visible to the caller instead of just printing it
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Enqueues the {@link #poison} sentinel so that {@link #run()} leaves its loop.
     *
     * @param waitfor when {@code true} and this thread is alive, block until it has terminated
     */
    public void shutdown(boolean waitfor) {
        try {
            this.rwiQueue.put(poison);
            if (waitfor && this.isAlive()) this.join();
        } catch (InterruptedException e) {
            // Restore the flag and return. In particular do not join() after a failed
            // put(): the sentinel was not delivered, so the join could block forever.
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Runs the local term search, enqueues its joined result, then processes queued
     * containers until the {@link #poison} sentinel is taken or the thread is interrupted.
     */
    @Override
    public void run() {

        long timer = System.currentTimeMillis();
        final TermSearch<WordReference> search = this.indexSegment.termIndex().query(
                query.queryHashes,
                query.excludeHashes,
                null,
                Segment.wordReferenceFactory,
                query.maxDistance);
        this.localSearchInclusion = search.inclusion();
        ReferenceContainer<WordReference> index = search.joined();
        insertRanked(index, true, index.size());

        serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), plasmaSearchEvent.JOIN, index.size(), System.currentTimeMillis() - timer), false);

        try {
            // identity comparison is intended: only the sentinel instance stops the loop
            while ((index = this.rwiQueue.take()) != poison) {

                // normalize entries
                final ArrayList<WordReferenceVars> decodedEntries = this.order.normalizeWith(index);
                serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), plasmaSearchEvent.NORMALIZING, index.size(), System.currentTimeMillis() - timer), false);

                // iterate over normalized entries and select some that are better than currently stored
                timer = System.currentTimeMillis();
                final Iterator<WordReferenceVars> i = decodedEntries.iterator();
                WordReferenceVars iEntry;
                Long r;
                HostInfo hs;
                String domhash;
                boolean nav_hosts = this.query.navigators.equals("all") || this.query.navigators.indexOf("hosts") >= 0;
                while (i.hasNext()) {
                    iEntry = i.next();
                    assert (iEntry.metadataHash().length() == index.row().primaryKeyLength);
                    //if (iEntry.urlHash().length() != index.row().primaryKeyLength) continue;

                    // increase flag counts (counted before any filtering)
                    for (int j = 0; j < 32; j++) {
                        if (iEntry.flags().get(j)) {flagcount[j]++;}
                    }

                    // kick out entries that are too bad according to current findings
                    r = Long.valueOf(order.cardinal(iEntry));
                    if ((maxentries >= 0) && (stack.size() >= maxentries) && (stack.bottom(r.longValue()))) continue;

                    // check constraints
                    if (!testFlags(iEntry)) continue;

                    // check document domain
                    if (!matchesContentDomain(iEntry)) continue;

                    // check tld domain: filter out all tld that do not match with wanted tld domain
                    if (!yacyURL.matchesAnyDomDomain(iEntry.metadataHash(), this.query.zonecode)) continue;

                    // check site constraints: filter out all domains that do not match with the site constraint
                    if (query.sitehash != null && !iEntry.metadataHash().substring(6).equals(query.sitehash)) continue;

                    // count domZones
                    this.domZones[yacyURL.domDomain(iEntry.metadataHash())]++;

                    // get statistics for host navigator
                    if (nav_hosts) {
                        domhash = iEntry.urlHash.substring(6);
                        hs = this.hostNavigator.get(domhash);
                        if (hs == null) {
                            this.hostNavigator.put(domhash, new HostInfo(iEntry.urlHash));
                        } else {
                            hs.inc();
                        }
                    }

                    // insert: while the stack has room accept any new entry; once it is
                    // full, accept only entries that are not worse than the current bottom
                    final boolean stackFull = (maxentries >= 0) && (stack.size() >= maxentries);
                    if (stackFull && stack.bottom(r.longValue())) continue;
                    // double-check
                    if (urlhashes.containsKey(iEntry.metadataHash())) continue;
                    stack.push(iEntry, r);
                }
            }
        } catch (InterruptedException e) {
            // stop consuming, but leave the interrupt flag set for whoever inspects this thread
            Thread.currentThread().interrupt();
        }
        //if ((query.neededResults() > 0) && (container.size() > query.neededResults())) remove(true, true);
        // NOTE(review): on a normal exit 'index' is the poison sentinel here, so the
        // size reported with PRESORT is the sentinel's size, not the number of processed entries.
        serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), plasmaSearchEvent.PRESORT, index.size(), System.currentTimeMillis() - timer), false);
    }

    /**
     * Gives direct access to the inclusion map of the local term search, as needed
     * for abstract generation.
     *
     * @return the map stored by {@link #run()}, or {@code null} if {@link #run()} has
     *         not stored it yet
     */
    public Map<byte[], ReferenceContainer<WordReference>> searchContainerMap() {
        return localSearchInclusion;
    }

    /**
     * Checks the entry's category flag against the query's content domain.
     * Text queries accept every entry.
     */
    private boolean matchesContentDomain(final WordReferenceVars entry) {
        if (query.contentdom == Query.CONTENTDOM_TEXT) return true;
        if ((query.contentdom == Query.CONTENTDOM_AUDIO) && (!(entry.flags().get(Condenser.flag_cat_hasaudio)))) return false;
        if ((query.contentdom == Query.CONTENTDOM_VIDEO) && (!(entry.flags().get(Condenser.flag_cat_hasvideo)))) return false;
        if ((query.contentdom == Query.CONTENTDOM_IMAGE) && (!(entry.flags().get(Condenser.flag_cat_hasimage)))) return false;
        if ((query.contentdom == Query.CONTENTDOM_APP) && (!(entry.flags().get(Condenser.flag_cat_hasapp)))) return false;
        return true;
    }

    /**
     * Tests an entry's flags against the query's constraint bits.
     *
     * @param ientry the entry to test
     * @return {@code true} if the query has no constraint; otherwise, when
     *         {@code query.allofconstraint} is set, {@code true} only if every constraint
     *         bit is also set on the entry; otherwise {@code true} if at least one
     *         constraint bit is set on the entry
     */
    private boolean testFlags(final WordReference ientry) {
        if (query.constraint == null) return true;
        if (query.allofconstraint) {
            // all constraint bits must be present on the entry
            for (int i = 0; i < 32; i++) {
                if ((query.constraint.get(i)) && (!ientry.flags().get(i))) return false;
            }
            return true;
        }
        // one matching bit is enough
        for (int i = 0; i < 32; i++) {
            if ((query.constraint.get(i)) && (ientry.flags().get(i))) return true;
        }
        return false;
    }

    /**
     * Per-host statistics for the host navigator: an occurrence counter and the
     * url hash of the first entry seen for that host.
     */
    public class HostInfo {
        public int count;
        public String hashsample;

        public HostInfo(String urlhash) {
            this.count = 1;
            this.hashsample = urlhash;
        }

        public void inc() {
            this.count++;
        }
    }
}