diff --git a/htroot/api/timeline.java b/htroot/api/timeline.java index b32658f51..239f40f27 100644 --- a/htroot/api/timeline.java +++ b/htroot/api/timeline.java @@ -31,6 +31,7 @@ import java.util.TreeSet; import de.anomic.http.httpRequestHeader; import de.anomic.kelondro.text.ReferenceContainer; import de.anomic.kelondro.text.Segment; +import de.anomic.kelondro.text.TermSearch; import de.anomic.kelondro.text.referencePrototype.WordReference; import de.anomic.kelondro.util.DateFormatter; import de.anomic.plasma.plasmaSearchQuery; @@ -78,13 +79,14 @@ public final class timeline { // retrieve index containers //yacyCore.log.logInfo("INIT TIMELINE SEARCH: " + plasmaSearchQuery.anonymizedQueryHashes(query[0]) + " - " + count + " links"); - // get the index container with the result vector - final ReferenceContainer index = sb.indexSegment.termIndex().query( + // get the index container with the result vector + final TermSearch search = sb.indexSegment.termIndex().query( q, Word.words2hashes(query[1]), null, Segment.wordReferenceFactory, maxdist); + ReferenceContainer index = search.joined(); Iterator i = index.entries(); WordReference entry; diff --git a/source/de/anomic/kelondro/text/AbstractIndex.java b/source/de/anomic/kelondro/text/AbstractIndex.java index 4842771cc..db34c68ec 100644 --- a/source/de/anomic/kelondro/text/AbstractIndex.java +++ b/source/de/anomic/kelondro/text/AbstractIndex.java @@ -122,40 +122,13 @@ public abstract class AbstractIndex implements return containers; } - @SuppressWarnings("unchecked") - public HashMap>[] searchTerm( - final TreeSet queryHashes, - final TreeSet excludeHashes, - final Set urlselection) { - // search for the set of hashes and return a map of of wordhash:indexContainer containing the seach result - - // retrieve entities that belong to the hashes - HashMap> inclusionContainers = - (queryHashes.size() == 0) ? 
- new HashMap>(0) : - this.searchConjunction(queryHashes, urlselection); - if ((inclusionContainers.size() != 0) && (inclusionContainers.size() < queryHashes.size())) inclusionContainers = new HashMap>(0); // prevent that only a subset is returned - final HashMap> exclusionContainers = - (inclusionContainers.size() == 0) ? - new HashMap>(0) : - this.searchConjunction(excludeHashes, urlselection); - return new HashMap[]{inclusionContainers, exclusionContainers}; - } - - public ReferenceContainer query( + public TermSearch query( final TreeSet queryHashes, final TreeSet excludeHashes, final Set urlselection, ReferenceFactory termFactory, int maxDistance) { - - HashMap>[] containerMaps = searchTerm(queryHashes, excludeHashes, urlselection); - - // join and exclude the result - return ReferenceContainer.joinExcludeContainers( - termFactory, - containerMaps[0].values(), - containerMaps[1].values(), - maxDistance); + + return new TermSearch(this, queryHashes, excludeHashes, urlselection, termFactory, maxDistance); } } diff --git a/source/de/anomic/kelondro/text/Index.java b/source/de/anomic/kelondro/text/Index.java index 6d4555e43..efd5e4b40 100644 --- a/source/de/anomic/kelondro/text/Index.java +++ b/source/de/anomic/kelondro/text/Index.java @@ -29,6 +29,7 @@ package de.anomic.kelondro.text; import java.io.IOException; +import java.util.HashMap; import java.util.Set; import java.util.TreeSet; @@ -131,7 +132,16 @@ public interface Index { boolean rot, int count ) throws IOException; - + + /** + * collect containers for given word hashes. This collection stops if a single container does not contain any references. + * In that case only an empty result is returned. 
+ * @param wordHashes + * @param urlselection + * @return map of wordhash:indexContainer + */ + public HashMap> searchConjunction(final TreeSet wordHashes, final Set urlselection); + /** * delete all references entries * @throws IOException diff --git a/source/de/anomic/kelondro/text/TermSearch.java b/source/de/anomic/kelondro/text/TermSearch.java new file mode 100644 index 000000000..749564281 --- /dev/null +++ b/source/de/anomic/kelondro/text/TermSearch.java @@ -0,0 +1,81 @@ +// TermSearch.java +// --------------- +// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany +// first published 3.6.2009 on http://yacy.net +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ +// $LastChangedRevision: 1986 $ +// $LastChangedBy: orbiter $ +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. 
+// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +package de.anomic.kelondro.text; + +import java.util.HashMap; +import java.util.Set; +import java.util.TreeSet; + +public class TermSearch { + + private ReferenceContainer joinResult; + HashMap> inclusionContainers, exclusionContainers; + + public TermSearch( + Index base, + final TreeSet queryHashes, + final TreeSet excludeHashes, + final Set urlselection, + ReferenceFactory termFactory, + int maxDistance) { + + this.inclusionContainers = + (queryHashes.size() == 0) ? + new HashMap>(0) : + base.searchConjunction(queryHashes, urlselection); + + if ((inclusionContainers.size() != 0) && + (inclusionContainers.size() < queryHashes.size())) + inclusionContainers = new HashMap>(0); // prevent that only a subset is returned + + this.exclusionContainers = + (inclusionContainers.size() == 0) ? 
+ new HashMap>(0) : + base.searchConjunction(excludeHashes, urlselection); + + // join and exclude the result + this.joinResult = ReferenceContainer.joinExcludeContainers( + termFactory, + inclusionContainers.values(), + exclusionContainers.values(), + maxDistance); + } + + public ReferenceContainer joined() { + return this.joinResult; + } + + public HashMap> inclusion() { + return this.inclusionContainers; + } + + public HashMap> exclusion() { + return this.exclusionContainers; + } + +} diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java index 67946cfe8..40236c8a7 100644 --- a/source/de/anomic/plasma/plasmaSearchEvent.java +++ b/source/de/anomic/plasma/plasmaSearchEvent.java @@ -187,9 +187,8 @@ public final class plasmaSearchEvent { int maxcount = -1; long mindhtdistance = Long.MAX_VALUE, l; byte[] wordhash; - assert this.rankedCache.searchContainerMaps() != null; - assert this.rankedCache.searchContainerMaps()[0] != null; - for (Map.Entry> entry : this.rankedCache.searchContainerMaps()[0].entrySet()) { + assert this.rankedCache.searchContainerMap() != null; + for (Map.Entry> entry : this.rankedCache.searchContainerMap().entrySet()) { wordhash = entry.getKey(); final ReferenceContainer container = entry.getValue(); assert (container.getTermHash().equals(wordhash)); @@ -206,7 +205,7 @@ public final class plasmaSearchEvent { IACount.put(wordhash, Integer.valueOf(container.size())); IAResults.put(wordhash, ReferenceContainer.compressIndex(container, null, 1000).toString()); } - serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), "abstract generation", this.rankedCache.searchContainerMaps()[0].size(), System.currentTimeMillis() - timer), false); + serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), "abstract generation", this.rankedCache.searchContainerMap().size(), System.currentTimeMillis() - timer), false); } } diff --git 
a/source/de/anomic/plasma/plasmaSearchRankingProcess.java b/source/de/anomic/plasma/plasmaSearchRankingProcess.java index 050cc9793..20be57ab2 100644 --- a/source/de/anomic/plasma/plasmaSearchRankingProcess.java +++ b/source/de/anomic/plasma/plasmaSearchRankingProcess.java @@ -45,6 +45,7 @@ import de.anomic.kelondro.text.Reference; import de.anomic.kelondro.text.ReferenceContainer; import de.anomic.kelondro.text.ReferenceOrder; import de.anomic.kelondro.text.Segment; +import de.anomic.kelondro.text.TermSearch; import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow; import de.anomic.kelondro.text.referencePrototype.WordReference; import de.anomic.kelondro.text.referencePrototype.WordReferenceVars; @@ -73,7 +74,7 @@ public final class plasmaSearchRankingProcess { private final int[] flagcount; // flag counter private final TreeSet misses; // contains url-hashes that could not been found in the LURL-DB private final Segment indexSegment; - private HashMap>[] localSearchContainerMaps; + private HashMap> localSearchInclusion; private final int[] domZones; private final ConcurrentHashMap hostNavigator; private final ConcurrentHashMap ref; // reference score computation for the commonSense heuristic @@ -86,7 +87,7 @@ public final class plasmaSearchRankingProcess { // we collect the urlhashes and construct a list with urlEntry objects // attention: if minEntries is too high, this method will not terminate within the maxTime // sortorder: 0 = hash, 1 = url, 2 = ranking - this.localSearchContainerMaps = null; + this.localSearchInclusion = null; this.stack = new SortStack(maxentries); this.doubleDomCache = new HashMap>(); this.handover = new HashMap(); @@ -119,12 +120,14 @@ public final class plasmaSearchRankingProcess { public void execQuery() { long timer = System.currentTimeMillis(); - final ReferenceContainer index = this.indexSegment.termIndex().query( + final TermSearch search = this.indexSegment.termIndex().query( query.queryHashes, query.excludeHashes, null, 
Segment.wordReferenceFactory, query.maxDistance); + this.localSearchInclusion = search.inclusion(); + final ReferenceContainer index = search.joined(); serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), plasmaSearchEvent.JOIN, index.size(), System.currentTimeMillis() - timer), false); if (index.size() == 0) { return; @@ -248,10 +251,10 @@ public final class plasmaSearchRankingProcess { return false; } - public Map>[] searchContainerMaps() { + public Map> searchContainerMap() { // direct access to the result maps is needed for abstract generation // this is only available if execQuery() was called before - return localSearchContainerMaps; + return localSearchInclusion; } // todo: