From e4d561971e59edd2bceed66d2a9d74c541c4ba79 Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 14 Oct 2010 11:40:02 +0000 Subject: [PATCH] added more score cluster options and made score cluster usage more transparent git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7248 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/Supporter.java | 7 +- htroot/Surftips.java | 7 +- source/de/anomic/crawler/ResultURLs.java | 15 +- source/de/anomic/data/DidYouMean.java | 5 +- source/de/anomic/data/DidYouMeanLibrary.java | 29 +- .../de/anomic/search/MetadataRepository.java | 8 +- source/de/anomic/search/ReferenceOrder.java | 5 +- source/de/anomic/yacy/dht/PeerSelection.java | 7 +- source/de/anomic/yacy/yacySearch.java | 5 +- .../net/yacy/cora/storage/DynamicScore.java | 31 ++ source/net/yacy/cora/storage/IntScore.java | 96 +++++++ .../util => cora/storage}/ScoreCluster.java | 244 +++++----------- source/net/yacy/cora/storage/ScoreMap.java | 265 ++++++++++++++++++ source/net/yacy/cora/storage/StaticScore.java | 67 +++++ .../net/yacy/kelondro/blob/MapDataMining.java | 15 +- source/net/yacy/yacy.java | 5 +- 16 files changed, 604 insertions(+), 207 deletions(-) create mode 100644 source/net/yacy/cora/storage/DynamicScore.java create mode 100644 source/net/yacy/cora/storage/IntScore.java rename source/net/yacy/{kelondro/util => cora/storage}/ScoreCluster.java (60%) create mode 100644 source/net/yacy/cora/storage/ScoreMap.java create mode 100644 source/net/yacy/cora/storage/StaticScore.java diff --git a/htroot/Supporter.java b/htroot/Supporter.java index 62db2f4ef..f011379e8 100644 --- a/htroot/Supporter.java +++ b/htroot/Supporter.java @@ -32,12 +32,13 @@ import java.util.HashMap; import java.util.Iterator; import net.yacy.cora.protocol.RequestHeader; +import net.yacy.cora.storage.DynamicScore; +import net.yacy.cora.storage.ScoreCluster; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.index.Row; import net.yacy.kelondro.index.Row.Entry; import 
net.yacy.kelondro.order.NaturalOrder; import net.yacy.kelondro.util.DateFormatter; -import net.yacy.kelondro.util.ScoreCluster; import net.yacy.repository.Blacklist; import de.anomic.search.Switchboard; @@ -104,7 +105,7 @@ public class Supporter { accumulateVotes(sb, negativeHashes, positiveHashes, yacyNewsPool.INCOMING_DB); //accumulateVotes(negativeHashes, positiveHashes, yacyNewsPool.OUTGOING_DB); //accumulateVotes(negativeHashes, positiveHashes, yacyNewsPool.PUBLISHED_DB); - final ScoreCluster ranking = new ScoreCluster(); // score cluster for url hashes + final DynamicScore ranking = new ScoreCluster(); // score cluster for url hashes final Row rowdef = new Row("String url-255, String title-120, String description-120, String refid-" + (DateFormatter.PATTERN_SHORT_SECOND.length() + 12), NaturalOrder.naturalOrder); final HashMap Supporter = new HashMap(); // a mapping from an url hash to a kelondroRow.Entry with display properties accumulateSupporter(sb, Supporter, ranking, rowdef, negativeHashes, positiveHashes, yacyNewsPool.INCOMING_DB); @@ -197,7 +198,7 @@ public class Supporter { private static void accumulateSupporter( final Switchboard sb, - final HashMap Supporter, final ScoreCluster ranking, final Row rowdef, + final HashMap Supporter, final DynamicScore ranking, final Row rowdef, final HashMap negativeHashes, final HashMap positiveHashes, final int dbtype) { final int maxCount = Math.min(1000, sb.peers.newsPool.size(dbtype)); yacyNewsDB.Record record; diff --git a/htroot/Surftips.java b/htroot/Surftips.java index 021754d28..865520a95 100644 --- a/htroot/Surftips.java +++ b/htroot/Surftips.java @@ -32,12 +32,13 @@ import java.util.HashMap; import java.util.Iterator; import net.yacy.cora.protocol.RequestHeader; +import net.yacy.cora.storage.DynamicScore; +import net.yacy.cora.storage.ScoreCluster; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.index.Row; import net.yacy.kelondro.index.Row.Entry; import 
net.yacy.kelondro.order.NaturalOrder; import net.yacy.kelondro.util.DateFormatter; -import net.yacy.kelondro.util.ScoreCluster; import net.yacy.repository.Blacklist; import de.anomic.search.Switchboard; @@ -112,7 +113,7 @@ public class Surftips { accumulateVotes(sb , negativeHashes, positiveHashes, yacyNewsPool.INCOMING_DB); //accumulateVotes(negativeHashes, positiveHashes, yacyNewsPool.OUTGOING_DB); //accumulateVotes(negativeHashes, positiveHashes, yacyNewsPool.PUBLISHED_DB); - final ScoreCluster ranking = new ScoreCluster(); // score cluster for url hashes + final DynamicScore ranking = new ScoreCluster(); // score cluster for url hashes final Row rowdef = new Row("String url-255, String title-120, String description-120, String refid-" + (DateFormatter.PATTERN_SHORT_SECOND.length() + 12), NaturalOrder.naturalOrder); final HashMap surftips = new HashMap(); // a mapping from an url hash to a kelondroRow.Entry with display properties accumulateSurftips(sb, surftips, ranking, rowdef, negativeHashes, positiveHashes, yacyNewsPool.INCOMING_DB); @@ -206,7 +207,7 @@ public class Surftips { private static void accumulateSurftips( final Switchboard sb, - final HashMap surftips, final ScoreCluster ranking, final Row rowdef, + final HashMap surftips, final DynamicScore ranking, final Row rowdef, final HashMap negativeHashes, final HashMap positiveHashes, final int dbtype) { final int maxCount = Math.min(1000, sb.peers.newsPool.size(dbtype)); yacyNewsDB.Record record; diff --git a/source/de/anomic/crawler/ResultURLs.java b/source/de/anomic/crawler/ResultURLs.java index 27bbac4cc..72ee5fc3d 100644 --- a/source/de/anomic/crawler/ResultURLs.java +++ b/source/de/anomic/crawler/ResultURLs.java @@ -32,19 +32,20 @@ import java.util.LinkedHashMap; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; +import net.yacy.cora.storage.ScoreCluster; +import net.yacy.cora.storage.DynamicScore; import net.yacy.kelondro.data.meta.DigestURI; import 
net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.order.Bitfield; import net.yacy.kelondro.util.ReverseMapIterator; -import net.yacy.kelondro.util.ScoreCluster; import de.anomic.crawler.retrieval.EventOrigin; public final class ResultURLs { private final Map> resultStacks; // a mapping from urlHash to Entries - private final Map> resultDomains; + private final Map> resultDomains; public class InitExecEntry { public byte[] initiatorHash, executorHash; @@ -57,7 +58,7 @@ public final class ResultURLs { public ResultURLs(int initialStackCapacity) { // init result stacks resultStacks = new ConcurrentHashMap>(initialStackCapacity); - resultDomains = new ConcurrentHashMap>(initialStackCapacity); + resultDomains = new ConcurrentHashMap>(initialStackCapacity); for (EventOrigin origin: EventOrigin.values()) { resultStacks.put(origin, new LinkedHashMap()); resultDomains.put(origin, new ScoreCluster()); @@ -82,7 +83,7 @@ public final class ResultURLs { return; } try { - final ScoreCluster domains = getDomains(stackType); + final DynamicScore domains = getDomains(stackType); if (domains != null) { domains.incScore(e.metadata().url().getHost()); } @@ -99,7 +100,7 @@ public final class ResultURLs { } public int getDomainListSize(final EventOrigin stack) { - final ScoreCluster domains = getDomains(stack); + final DynamicScore domains = getDomains(stack); if (domains == null) return 0; return domains.size(); } @@ -155,7 +156,7 @@ public final class ResultURLs { private Map getStack(final EventOrigin stack) { return resultStacks.get(stack); } - private ScoreCluster getDomains(final EventOrigin stack) { + private DynamicScore getDomains(final EventOrigin stack) { return resultDomains.get(stack); } @@ -166,7 +167,7 @@ public final class ResultURLs { public synchronized void clearStack(final EventOrigin stack) { final Map resultStack = getStack(stack); if (resultStack != null) resultStack.clear(); - final ScoreCluster 
resultDomains = getDomains(stack); + final DynamicScore resultDomains = getDomains(stack); if (resultDomains != null) { // we do not clear this completely, just remove most of the less important entries resultDomains.shrinkToMaxSize(100); diff --git a/source/de/anomic/data/DidYouMean.java b/source/de/anomic/data/DidYouMean.java index 11efbcf5c..61817a85e 100644 --- a/source/de/anomic/data/DidYouMean.java +++ b/source/de/anomic/data/DidYouMean.java @@ -7,11 +7,12 @@ import java.util.SortedSet; import java.util.TreeSet; import java.util.concurrent.LinkedBlockingQueue; +import net.yacy.cora.storage.DynamicScore; +import net.yacy.cora.storage.ScoreCluster; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.WordReference; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.rwi.IndexCell; -import net.yacy.kelondro.util.ScoreCluster; /** @@ -127,7 +128,7 @@ public class DidYouMean { if (this.word.indexOf(' ') > 0) return getSuggestions(this.word.split(" "), timeout, preSortSelection, this.index); SortedSet preSorted = getSuggestions(timeout); if (System.currentTimeMillis() > timelimit) return preSorted; - ScoreCluster scored = new ScoreCluster(); + DynamicScore scored = new ScoreCluster(); for (final String s: preSorted) { if (System.currentTimeMillis() > timelimit) break; if (scored.size() >= 2 * preSortSelection) break; diff --git a/source/de/anomic/data/DidYouMeanLibrary.java b/source/de/anomic/data/DidYouMeanLibrary.java index accfdc2f1..5c33b2367 100644 --- a/source/de/anomic/data/DidYouMeanLibrary.java +++ b/source/de/anomic/data/DidYouMeanLibrary.java @@ -38,6 +38,8 @@ import java.util.SortedSet; import java.util.TreeSet; import java.util.zip.GZIPInputStream; +import net.yacy.cora.storage.DynamicScore; +import net.yacy.cora.storage.ScoreMap; import net.yacy.kelondro.logging.Log; /** @@ -45,10 +47,17 @@ import net.yacy.kelondro.logging.Log; * */ public class DidYouMeanLibrary { - - private final File dictionaryPath; - private 
TreeSet dict, tcid; + // common word cache + private static final int commonWordsMaxSize = 100000; // maximum size of common word cache + private static final int commonWordsMinLength = 4; // words must have that length at minimum + private DynamicScore commonWords = new ScoreMap(); + + // dictionaries + private final File dictionaryPath; + private TreeSet dict; // the word dictionary + private TreeSet tcid; // the dictionary of reverse words + /** * create a new dictionary * This loads all files that ends with '.words' @@ -61,6 +70,20 @@ public class DidYouMeanLibrary { reload(); } + /** + * add a word to the generic dictionary + * @param word + */ + public void learn(String word) { + if (word == null) return; + word = word.trim().toLowerCase(); + if (word.length() < commonWordsMinLength) return; + commonWords.incScore(word); + if (commonWords.size() >= commonWordsMaxSize) { + commonWords.shrinkToMaxSize(commonWordsMaxSize / 2); + } + } + /** * scan the input directory and load all dictionaries (again) */ diff --git a/source/de/anomic/search/MetadataRepository.java b/source/de/anomic/search/MetadataRepository.java index f774e3270..875104aad 100644 --- a/source/de/anomic/search/MetadataRepository.java +++ b/source/de/anomic/search/MetadataRepository.java @@ -40,6 +40,8 @@ import java.util.TreeSet; import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.protocol.http.HTTPClient; +import net.yacy.cora.storage.DynamicScore; +import net.yacy.cora.storage.ScoreCluster; import net.yacy.document.parser.html.CharacterCoding; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataRow; @@ -52,12 +54,8 @@ import net.yacy.kelondro.index.RowSpaceExceededException; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.order.CloneableIterator; import net.yacy.kelondro.table.SplitTable; -import net.yacy.kelondro.util.ScoreCluster; import net.yacy.repository.Blacklist; -//import de.anomic.http.client.Client; -//import 
de.anomic.http.server.ResponseContainer; - public final class MetadataRepository implements Iterable { // class objects @@ -610,7 +608,7 @@ public final class MetadataRepository implements Iterable { Map map = domainSampleCollector(); // order elements by size - ScoreCluster s = new ScoreCluster(); + DynamicScore s = new ScoreCluster(); for (Map.Entry e: map.entrySet()) { s.addScore(e.getValue().urlhash, e.getValue().count); } diff --git a/source/de/anomic/search/ReferenceOrder.java b/source/de/anomic/search/ReferenceOrder.java index 08a06b947..1ab917459 100644 --- a/source/de/anomic/search/ReferenceOrder.java +++ b/source/de/anomic/search/ReferenceOrder.java @@ -33,6 +33,8 @@ import java.util.concurrent.BlockingQueue; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.Semaphore; +import net.yacy.cora.storage.DynamicScore; +import net.yacy.cora.storage.ScoreCluster; import net.yacy.document.Condenser; import net.yacy.document.LargeNumberCache; import net.yacy.kelondro.data.meta.DigestURI; @@ -42,7 +44,6 @@ import net.yacy.kelondro.data.word.WordReferenceVars; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.order.Bitfield; import net.yacy.kelondro.rwi.ReferenceContainer; -import net.yacy.kelondro.util.ScoreCluster; public class ReferenceOrder { @@ -51,7 +52,7 @@ public class ReferenceOrder { private int maxdomcount; private WordReferenceVars min, max; - private final ScoreCluster doms; // collected for "authority" heuristic + private final DynamicScore doms; // collected for "authority" heuristic private final RankingProfile ranking; private final String language; diff --git a/source/de/anomic/yacy/dht/PeerSelection.java b/source/de/anomic/yacy/dht/PeerSelection.java index 4c4f0e77b..679045a13 100755 --- a/source/de/anomic/yacy/dht/PeerSelection.java +++ b/source/de/anomic/yacy/dht/PeerSelection.java @@ -29,6 +29,8 @@ import java.util.HashMap; import java.util.Iterator; import java.util.Map; +import 
net.yacy.cora.storage.DynamicScore; +import net.yacy.cora.storage.ScoreCluster; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.index.HandleSet; import net.yacy.kelondro.index.RowSpaceExceededException; @@ -36,7 +38,6 @@ import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.order.Base64Order; import net.yacy.kelondro.order.Digest; import net.yacy.kelondro.util.DateFormatter; -import net.yacy.kelondro.util.ScoreCluster; import net.yacy.kelondro.util.kelondroException; import de.anomic.yacy.yacyCore; @@ -57,7 +58,7 @@ public class PeerSelection { byte[] wordhash, int redundancy, HashMap regularSeeds, - ScoreCluster ranking) { + DynamicScore ranking) { // this method is called from the search target computation long[] dhtVerticalTargets = seedDB.scheme.dhtPositions(wordhash); yacySeed seed; @@ -339,7 +340,7 @@ public class PeerSelection { if (count > seedDB.sizeConnected()) count = seedDB.sizeConnected(); // fill a score object - final ScoreCluster seedScore = new ScoreCluster(); + final DynamicScore seedScore = new ScoreCluster(); yacySeed ys; long absage; final Iterator s = seedDB.seedsConnected(true, false, null, (float) 0.0); diff --git a/source/de/anomic/yacy/yacySearch.java b/source/de/anomic/yacy/yacySearch.java index d85aa66d0..e14c5a968 100644 --- a/source/de/anomic/yacy/yacySearch.java +++ b/source/de/anomic/yacy/yacySearch.java @@ -31,10 +31,11 @@ import java.util.Map; import java.util.TreeMap; import java.util.regex.Pattern; +import net.yacy.cora.storage.DynamicScore; +import net.yacy.cora.storage.ScoreCluster; import net.yacy.kelondro.index.HandleSet; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.order.Bitfield; -import net.yacy.kelondro.util.ScoreCluster; import net.yacy.repository.Blacklist; import de.anomic.crawler.ResultURLs; @@ -185,7 +186,7 @@ public class yacySearch extends Thread { } // put in seeds according to dht - final ScoreCluster ranking = new ScoreCluster(); + final DynamicScore ranking = new 
ScoreCluster(); final HashMap regularSeeds = new HashMap(); final HashMap matchingSeeds = new HashMap(); yacySeed seed; diff --git a/source/net/yacy/cora/storage/DynamicScore.java b/source/net/yacy/cora/storage/DynamicScore.java new file mode 100644 index 000000000..c4718c86e --- /dev/null +++ b/source/net/yacy/cora/storage/DynamicScore.java @@ -0,0 +1,31 @@ +/** + * DynamicScore + * Copyright 2010 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany + * First released 14.10.2010 at http://yacy.net + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . 
+ */ + +package net.yacy.cora.storage; + +public interface DynamicScore extends StaticScore { + + public void incScore(final E obj); + + public void decScore(final E obj); + + public void addScore(final E obj, final int incrementScore); + +} diff --git a/source/net/yacy/cora/storage/IntScore.java b/source/net/yacy/cora/storage/IntScore.java new file mode 100644 index 000000000..bc9c785f4 --- /dev/null +++ b/source/net/yacy/cora/storage/IntScore.java @@ -0,0 +1,96 @@ +/** + * IntScore + * Copyright 2010 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany + * First released 14.10.2010 at http://yacy.net + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . + */ + +package net.yacy.cora.storage; + +import java.util.Comparator; + +/** + * This class acts as a replacement for Long and shall be used as counter object in Object-Counter relations + * The use case of this class is given when an value element of a map must be increased or decreased. If + * the normal Long class is used, the new value must be rewritten to the map with an increased and newly allocated number object + * When using this class, then only the score of the Number object can be changed without the need of + * rewriting the new key value to a map. 
+ */ +public class IntScore implements Comparable, Comparator { + + public static IntScore ZERO = new IntScore(0); + public static IntScore ONE = new IntScore(1); + + private int value; + + public IntScore(int value) { + this.value = value; + } + + public final static IntScore valueOf(final int n) { + return new IntScore(n); + } + + public int intValue() { + return this.value; + } + + public void inc() { + this.value++; + } + + public void inc(int n) { + this.value += n; + } + + public void dec() { + this.value--; + } + + public void dec(int n) { + this.value -= n; + } + + public void set(int n) { + this.value = n; + } + + public void min(int n) { + if (n < this.value) this.value = n; + } + + public void max(int n) { + if (n > this.value) this.value = n; + } + + public boolean equals(Object o) { + return (o instanceof IntScore) && this.value == ((IntScore) o).value; + } + + public int hashCode() { + return (int) (this.value ^ (this.value >>> 32)); + } + + public int compareTo(IntScore o) { + int thisVal = this.value; + int anotherVal = o.value; + return thisVal < anotherVal ? -1 : (thisVal == anotherVal ? 
0 : 1); + } + + public int compare(IntScore o1, IntScore o2) { + return o1.compareTo(o2); + } +} diff --git a/source/net/yacy/kelondro/util/ScoreCluster.java b/source/net/yacy/cora/storage/ScoreCluster.java similarity index 60% rename from source/net/yacy/kelondro/util/ScoreCluster.java rename to source/net/yacy/cora/storage/ScoreCluster.java index 6a06526eb..e2e7ba7df 100644 --- a/source/net/yacy/kelondro/util/ScoreCluster.java +++ b/source/net/yacy/cora/storage/ScoreCluster.java @@ -1,25 +1,24 @@ -// kelondroMScoreCluster.java -// ----------------------- -// (C) by Michael Peter Christen; mc@yacy.net -// first published on http://www.anomic.de -// Frankfurt, Germany, 2004 -// last major change: 28.09.2004 -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +/** + * ScoreCluster + * Copyright 2004, 2010 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany + * First released 28.09.2004 at http://yacy.net + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . + */ -package net.yacy.kelondro.util; +package net.yacy.cora.storage; import java.text.ParseException; import java.text.SimpleDateFormat; @@ -31,12 +30,13 @@ import java.util.Random; import java.util.SortedMap; import java.util.TreeMap; +import net.yacy.kelondro.util.kelondroOutOfLimitsException; -public final class ScoreCluster { +public final class ScoreCluster implements DynamicScore { - protected final TreeMap refkeyDB; // a mapping from a reference to the cluster key - protected final TreeMap keyrefDB; // a mapping from the cluster key to the reference + protected final Map map; // a mapping from a reference to the cluster key + protected final TreeMap pam; // a mapping from the cluster key to the reference private long gcount; private int encnt; @@ -45,19 +45,20 @@ public final class ScoreCluster { } public ScoreCluster(Comparator comparator) { - if(comparator != null) { - refkeyDB = new TreeMap(comparator); - } else { - refkeyDB = new TreeMap(); - } - keyrefDB = new TreeMap(); + if (comparator == null) { + //map = new HashMap(); + map = new TreeMap(comparator); + } else { + map = new TreeMap(comparator); + } + pam = new TreeMap(); gcount = 0; encnt = 0; } public synchronized void clear() { - refkeyDB.clear(); - keyrefDB.clear(); + map.clear(); + pam.clear(); gcount = 0; encnt = 0; } @@ -70,11 +71,11 @@ public final class ScoreCluster { if (maxsize < 0) return; Long key; synchronized (this) { - while (refkeyDB.size() > maxsize) { + while (map.size() > maxsize) { // find and remove smallest objects until cluster has demanded size - key = keyrefDB.firstKey(); 
+ key = pam.firstKey(); if (key == null) break; - refkeyDB.remove(keyrefDB.remove(key)); + map.remove(pam.remove(key)); } } } @@ -87,13 +88,13 @@ public final class ScoreCluster { int score; Long key; synchronized (this) { - while (keyrefDB.size() > 0) { + while (pam.size() > 0) { // find and remove objects where their score is smaller than the demanded minimum score - key = keyrefDB.firstKey(); + key = pam.firstKey(); if (key == null) break; score = (int) ((key.longValue() & 0xFFFFFFFF00000000L) >> 32); if (score >= minScore) break; - refkeyDB.remove(keyrefDB.remove(key)); + map.remove(pam.remove(key)); } } } @@ -178,19 +179,11 @@ public final class ScoreCluster { } public synchronized int size() { - return refkeyDB.size(); + return map.size(); } public synchronized boolean isEmpty() { - return refkeyDB.isEmpty(); - } - - public synchronized void incScore(final E[] objs) { - for (int i = 0; i < objs.length; i++) addScore(objs[i], 1); - } - - public synchronized void decScore(final E[] objs) { - for (int i = 0; i < objs.length; i++) addScore(objs[i], -1); + return map.isEmpty(); } public synchronized void incScore(final E obj) { @@ -204,7 +197,7 @@ public final class ScoreCluster { public void setScore(final E obj, final int newScore) { if (obj == null) return; synchronized (this) { - Long usk = refkeyDB.remove(obj); // get unique score key, old entry is not needed any more + Long usk = map.remove(obj); // get unique score key, old entry is not needed any more if (newScore < 0) throw new kelondroOutOfLimitsException(newScore); if (usk == null) { @@ -212,12 +205,12 @@ public final class ScoreCluster { usk = Long.valueOf(scoreKey(encnt++, newScore)); // put new value into cluster - refkeyDB.put(obj, usk); - keyrefDB.put(usk, obj); + map.put(obj, usk); + pam.put(usk, obj); } else { // delete old entry - keyrefDB.remove(usk); + pam.remove(usk); // get previous handle and score final long c = usk.longValue(); @@ -227,8 +220,8 @@ public final class ScoreCluster { // set 
new value usk = Long.valueOf(scoreKey(oldHandle, newScore)); // generates an unique key for a specific score - refkeyDB.put(obj, usk); - keyrefDB.put(usk, obj); + map.put(obj, usk); + pam.put(usk, obj); } } // increase overall counter @@ -238,7 +231,7 @@ public final class ScoreCluster { public void addScore(final E obj, final int incrementScore) { if (obj == null) return; synchronized (this) { - Long usk = refkeyDB.remove(obj); // get unique score key, old entry is not needed any more + Long usk = map.remove(obj); // get unique score key, old entry is not needed any more if (usk == null) { // set new value @@ -246,12 +239,12 @@ public final class ScoreCluster { usk = Long.valueOf(scoreKey(encnt++, incrementScore)); // put new value into cluster - refkeyDB.put(obj, usk); - keyrefDB.put(usk, obj); + map.put(obj, usk); + pam.put(usk, obj); } else { // delete old entry - keyrefDB.remove(usk); + pam.remove(usk); // get previous handle and score final long c = usk.longValue(); @@ -262,8 +255,8 @@ public final class ScoreCluster { final int newValue = oldScore + incrementScore; if (newValue < 0) throw new kelondroOutOfLimitsException(newValue); usk = Long.valueOf(scoreKey(oldHandle, newValue)); // generates an unique key for a specific score - refkeyDB.put(obj, usk); - keyrefDB.put(usk, obj); + map.put(obj, usk); + pam.put(usk, obj); } } // increase overall counter @@ -275,11 +268,11 @@ public final class ScoreCluster { if (obj == null) return 0; final Long usk; synchronized (this) { - usk = refkeyDB.remove(obj); // get unique score key, old entry is not needed any more + usk = map.remove(obj); // get unique score key, old entry is not needed any more if (usk == null) return 0; // delete old entry - keyrefDB.remove(usk); + pam.remove(usk); } // get previous handle and score @@ -292,61 +285,41 @@ public final class ScoreCluster { } public synchronized boolean existsScore(final E obj) { - return (refkeyDB.get(obj) != null); + return map.containsKey(obj); } public int 
getScore(final E obj) { if (obj == null) return 0; final Long cs; synchronized (this) { - cs = refkeyDB.get(obj); + cs = map.get(obj); } if (cs == null) return 0; return (int) ((cs.longValue() & 0xFFFFFFFF00000000L) >> 32); } public synchronized int getMaxScore() { - if (refkeyDB.isEmpty()) return -1; - return (int) ((keyrefDB.lastKey().longValue() & 0xFFFFFFFF00000000L) >> 32); + if (map.isEmpty()) return -1; + return (int) ((pam.lastKey().longValue() & 0xFFFFFFFF00000000L) >> 32); } public synchronized int getMinScore() { - if (refkeyDB.isEmpty()) return -1; - return (int) ((keyrefDB.firstKey().longValue() & 0xFFFFFFFF00000000L) >> 32); + if (map.isEmpty()) return -1; + return (int) ((pam.firstKey().longValue() & 0xFFFFFFFF00000000L) >> 32); } public synchronized E getMaxObject() { - if (refkeyDB.isEmpty()) return null; - return keyrefDB.get(keyrefDB.lastKey()); + if (map.isEmpty()) return null; + return pam.get(pam.lastKey()); } public synchronized E getMinObject() { - if (refkeyDB.isEmpty()) return null; - return keyrefDB.get(keyrefDB.firstKey()); - } - - public synchronized E[] getScores(final int maxCount, final boolean up) { - return getScores(maxCount, up, Integer.MIN_VALUE, Integer.MAX_VALUE); - } - - @SuppressWarnings("unchecked") - public synchronized E[] getScores(int maxCount, final boolean up, final int minScore, final int maxScore) { - if (maxCount > refkeyDB.size()) maxCount = refkeyDB.size(); - E[] s = (E[]) new Object[maxCount]; - final Iterator it = scores(up, minScore, maxScore); - int i = 0; - while ((i < maxCount) && (it.hasNext())) s[i++] = it.next(); - if (i < maxCount) { - // re-copy the result array - E[] sc = (E[]) new Object[i]; - System.arraycopy(s, 0, sc, 0, i); - s = sc; - } - return s; + if (map.isEmpty()) return null; + return pam.get(pam.firstKey()); } public String toString() { - return refkeyDB + " / " + keyrefDB; + return map + " / " + pam; } public synchronized Iterator scores(final boolean up) { @@ -354,66 +327,13 @@ public 
final class ScoreCluster { return new reverseScoreIterator(); } - public synchronized Iterator scores(final boolean up, final int minScore, final int maxScore) { - return new komplexScoreIterator(up, minScore, maxScore); - } - - private class komplexScoreIterator implements Iterator { - - boolean up; - TreeMap keyrefDBcopy; - E n; - int min, max; - - @SuppressWarnings("unchecked") - public komplexScoreIterator(final boolean up, final int minScore, final int maxScore) { - this.up = up; - this.min = minScore; - this.max = maxScore; - this.keyrefDBcopy = (TreeMap) keyrefDB.clone(); // NoSuchElementException here? - internalNext(); - } - - public boolean hasNext() { - return (n != null); - } - - private void internalNext() { - Long key; - int score = (max + min) / 2; - while (!keyrefDBcopy.isEmpty()) { - key = ((up) ? keyrefDBcopy.firstKey() : keyrefDBcopy.lastKey()); - n = keyrefDBcopy.remove(key); - score = (int) ((key.longValue() & 0xFFFFFFFF00000000L) >> 32); - if ((score >= min) && (score <= max)) return; - if (((up) && (score > max)) || ((!(up)) && (score < min))) { - keyrefDBcopy = new TreeMap(); - n = null; - return; - } - } - n = null; - } - - public E next() { - final E o = n; - internalNext(); - return o; - } - - public void remove() { - if (n != null) deleteScore(n); - } - - } - private class reverseScoreIterator implements Iterator { SortedMap view; Long key; public reverseScoreIterator() { - view = keyrefDB; + view = pam; } public boolean hasNext() { @@ -423,14 +343,14 @@ public final class ScoreCluster { public E next() { key = view.lastKey(); view = view.headMap(key); - final E value = keyrefDB.get(key); + final E value = pam.get(key); //System.out.println("cluster reverse iterator: score = " + ((((Long) key).longValue() & 0xFFFFFFFF00000000L) >> 32) + ", handle = " + (((Long) key).longValue() & 0xFFFFFFFFL) + ", value = " + value); return value; } public void remove() { - final Object val = keyrefDB.remove(key); - if (val != null) refkeyDB.remove(val); 
+ final Object val = pam.remove(key); + if (val != null) map.remove(val); } } @@ -441,7 +361,7 @@ public final class ScoreCluster { Map.Entry entry; public simpleScoreIterator() { - ii = keyrefDB.entrySet().iterator(); + ii = pam.entrySet().iterator(); } public boolean hasNext() { @@ -456,7 +376,7 @@ public final class ScoreCluster { public void remove() { ii.remove(); - if (entry.getValue() != null) refkeyDB.remove(entry.getValue()); + if (entry.getValue() != null) map.remove(entry.getValue()); } } @@ -502,24 +422,12 @@ public final class ScoreCluster { } System.out.println("result:"); - Object[] result; - result = s.getScores(s.size(), true); - for (int i = 0; i < s.size(); i++) System.out.println("up: " + result[i]); - result = s.getScores(s.size(), false); - for (int i = 0; i < s.size(); i++) System.out.println("down: " + result[i]); + Iterator i = s.scores(true); + while (i.hasNext()) System.out.println("up: " + i.next()); + i = s.scores(false); + while (i.hasNext()) System.out.println("down: " + i.next()); System.out.println("finished create. time = " + (System.currentTimeMillis() - time)); System.out.println("total=" + s.totalCount() + ", elements=" + s.size() + ", redundant count=" + c); - - /* - // delete cluster - time = System.currentTimeMillis(); - for (int i = 0; i < 10000; i++) { - s.deleteScore("score#" + i + "xxx" + i + "xxx" + i + "xxx" + i + "xxx"); - c -= i/10; - } - System.out.println("finished delete. 
time = " + (System.currentTimeMillis() - time)); - System.out.println("total=" + s.totalCount() + ", elements=" + s.size() + ", redundant count=" + c); - */ } } diff --git a/source/net/yacy/cora/storage/ScoreMap.java b/source/net/yacy/cora/storage/ScoreMap.java new file mode 100644 index 000000000..dad73bfb0 --- /dev/null +++ b/source/net/yacy/cora/storage/ScoreMap.java @@ -0,0 +1,265 @@ +/** + * ScoreMap + * Copyright 2010 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany + * First released 14.10.2010 at http://yacy.net + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . 
+ */ + +package net.yacy.cora.storage; + +import java.util.ArrayList; +import java.util.Comparator; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.TreeMap; +import java.util.TreeSet; + + +public class ScoreMap implements DynamicScore { + + protected final Map map; // a mapping from a reference to the cluster key + private long gcount; + + public ScoreMap() { + this(null); + } + + public ScoreMap(Comparator comparator) { + if (comparator == null) { + map = new HashMap(); + } else { + map = new TreeMap(comparator); + } + gcount = 0; + } + + public synchronized void clear() { + map.clear(); + gcount = 0; + } + + /** + * shrink the cluster to a demanded size + * @param maxsize + */ + public void shrinkToMaxSize(int maxsize) { + if (this.map.size() <= maxsize) return; + int minScore = getMinScore(); + while (this.map.size() > maxsize) { + minScore++; + shrinkToMinScore(minScore); + } + } + + /** + * shrink the cluster in such a way that the smallest score is equal or greater than a given minScore + * @param minScore + */ + public void shrinkToMinScore(int minScore) { + synchronized (this) { + Iterator> i = this.map.entrySet().iterator(); + Map.Entry entry; + while (i.hasNext()) { + entry = i.next(); + if (entry.getValue().intValue() < minScore) i.remove(); + } + } + } + + public synchronized long totalCount() { + return gcount; + } + + public synchronized int size() { + return map.size(); + } + + public synchronized boolean isEmpty() { + return map.isEmpty(); + } + + public void incScore(final E obj) { + if (obj == null) return; + synchronized (this) { + IntScore score = this.map.get(obj); + if (score == null) { + this.map.put(obj, IntScore.ONE); + } else { + score.inc(); + } + } + // increase overall counter + gcount++; + } + + public void decScore(final E obj) { + if (obj == null) return; + synchronized (this) { + IntScore score = 
this.map.get(obj); + if (score == null) { + this.map.put(obj, IntScore.valueOf(-1)); + } else { + score.dec(); + } + } + // decrease overall counter + gcount--; + } + + public void setScore(final E obj, final int newScore) { + if (obj == null) return; + synchronized (this) { + IntScore score = this.map.get(obj); + if (score == null) { + this.map.put(obj, IntScore.valueOf(newScore)); + } else { + gcount -= score.intValue(); + score.set(newScore); + } + } + // add the new score to the overall counter (a replaced old score was subtracted above) + gcount += newScore; + } + + public void addScore(final E obj, final int incrementScore) { + if (obj == null) return; + synchronized (this) { + IntScore score = this.map.get(obj); + if (score == null) { + this.map.put(obj, IntScore.valueOf(incrementScore)); + } else { + score.inc(incrementScore); + } + } + // increase overall counter + gcount += incrementScore; + } + + public int deleteScore(final E obj) { + // deletes entry and returns previous score + if (obj == null) return 0; + final IntScore score; + synchronized (this) { + score = map.remove(obj); + if (score == null) return 0; + } + + // decrease overall counter + gcount -= score.intValue(); + return score.intValue(); + } + + public synchronized boolean existsScore(final E obj) { + return map.containsKey(obj); + } + + public int getScore(final E obj) { + if (obj == null) return 0; + final IntScore score; + synchronized (this) { + score = map.get(obj); + } + if (score == null) return 0; + return score.intValue(); + } + + public int getMaxScore() { + if (map.isEmpty()) return -1; + int maxScore = Integer.MIN_VALUE; + synchronized (this) { + for (Map.Entry entry: this.map.entrySet()) if (entry.getValue().intValue() > maxScore) { + maxScore = entry.getValue().intValue(); + } + } + return maxScore; + } + + public int getMinScore() { + if (map.isEmpty()) return -1; + int minScore = Integer.MAX_VALUE; + synchronized (this) { + for (Map.Entry entry: this.map.entrySet()) if (entry.getValue().intValue() < minScore) { + minScore =
entry.getValue().intValue(); + } + } + return minScore; + } + + public E getMaxObject() { + if (map.isEmpty()) return null; + E maxObject = null; + int maxScore = Integer.MIN_VALUE; + synchronized (this) { + for (Map.Entry entry: this.map.entrySet()) if (entry.getValue().intValue() > maxScore) { + maxScore = entry.getValue().intValue(); + maxObject = entry.getKey(); + } + } + return maxObject; + } + + public E getMinObject() { + if (map.isEmpty()) return null; + E minObject = null; + int minScore = Integer.MAX_VALUE; + synchronized (this) { + for (Map.Entry entry: this.map.entrySet()) if (entry.getValue().intValue() < minScore) { + minScore = entry.getValue().intValue(); + minObject = entry.getKey(); + } + } + return minObject; + } + + public String toString() { + return map.toString(); + } + + public Iterator scores(boolean up) { + synchronized (this) { + // re-organize entries + TreeMap> m = new TreeMap>(); + Set s; + for (Map.Entry entry: this.map.entrySet()) { + s = m.get(entry.getValue()); + if (s == null) { + s = this.map instanceof TreeMap ? 
new TreeSet(((TreeMap) this.map).comparator()) : new HashSet(); + s.add(entry.getKey()); + m.put(entry.getValue(), s); + } else { + s.add(entry.getKey()); + } + } + + // flatten result + List l = new ArrayList(this.map.size()); + for (Set f: m.values()) { + for (E e: f) l.add(e); + } + if (up) return l.iterator(); + + // optionally reverse list: copy the ascending list l back to front + List r = new ArrayList(l.size()); + for (int i = l.size() - 1; i >= 0; i--) r.add(l.get(i)); + return r.iterator(); + } + } + +} diff --git a/source/net/yacy/cora/storage/StaticScore.java b/source/net/yacy/cora/storage/StaticScore.java new file mode 100644 index 000000000..211f09bf0 --- /dev/null +++ b/source/net/yacy/cora/storage/StaticScore.java @@ -0,0 +1,67 @@ +/** + * StaticScore + * Copyright 2010 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany + * First released 14.10.2010 at http://yacy.net + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see .
+ */ + +package net.yacy.cora.storage; + +import java.util.Iterator; + +public interface StaticScore { + + public void clear(); + + /** + * shrink the cluster to a demanded size + * @param maxsize + */ + public void shrinkToMaxSize(int maxsize); + + /** + * shrink the cluster in such a way that the smallest score is equal or greater than a given minScore + * @param minScore + */ + public void shrinkToMinScore(int minScore); + + public long totalCount(); + + public int size(); + + public boolean isEmpty(); + + public void setScore(final E obj, final int newScore); + + public int deleteScore(final E obj); + + public boolean existsScore(final E obj); + + public int getScore(final E obj); + + public int getMaxScore(); + + public int getMinScore(); + + public E getMaxObject(); + + public E getMinObject(); + + public String toString(); + + public Iterator scores(final boolean up); + +} diff --git a/source/net/yacy/kelondro/blob/MapDataMining.java b/source/net/yacy/kelondro/blob/MapDataMining.java index 877a8c981..607274255 100644 --- a/source/net/yacy/kelondro/blob/MapDataMining.java +++ b/source/net/yacy/kelondro/blob/MapDataMining.java @@ -34,12 +34,13 @@ import java.util.Iterator; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; +import net.yacy.cora.storage.ScoreCluster; +import net.yacy.cora.storage.StaticScore; import net.yacy.kelondro.index.RowSpaceExceededException; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.order.ByteOrder; import net.yacy.kelondro.order.CloneableIterator; import net.yacy.kelondro.util.LookAheadIterator; -import net.yacy.kelondro.util.ScoreCluster; public class MapDataMining extends MapHeap { @@ -48,7 +49,7 @@ public class MapDataMining extends MapHeap { private final static Double DOUBLE0 = Double.valueOf(0.0); private final String[] sortfields, longaccfields, doubleaccfields; - private Map> sortClusterMap; // a String-kelondroMScoreCluster - relation + private Map> sortClusterMap; // a 
String-kelondroMScoreCluster - relation private Map accLong; // to store accumulations of Long cells private Map accDouble; // to store accumulations of Double cells @@ -71,7 +72,7 @@ public class MapDataMining extends MapHeap { ScoreCluster[] cluster = null; if (sortfields == null) sortClusterMap = null; else { - sortClusterMap = new ConcurrentHashMap>(); + sortClusterMap = new ConcurrentHashMap>(); cluster = new ScoreCluster[sortfields.length]; for (int i = 0; i < sortfields.length; i++) { cluster[i] = new ScoreCluster(); @@ -154,7 +155,7 @@ public class MapDataMining extends MapHeap { public synchronized void clear() { super.clear(); if (sortfields == null) sortClusterMap = null; else { - sortClusterMap = new HashMap>(); + sortClusterMap = new HashMap>(); for (int i = 0; i < sortfields.length; i++) { sortClusterMap.put(sortfields[i], new ScoreCluster()); } @@ -240,7 +241,7 @@ public class MapDataMining extends MapHeap { private void updateSortCluster(final String key, final Map map) { Object cell; - ScoreCluster cluster; + StaticScore cluster; for (int i = 0; i < sortfields.length; i++) { cell = map.get(sortfields[i]); if (cell != null) { @@ -278,7 +279,7 @@ public class MapDataMining extends MapHeap { private void deleteSortCluster(final String key) { if (key == null) return; - ScoreCluster cluster; + StaticScore cluster; for (int i = 0; i < sortfields.length; i++) { cluster = sortClusterMap.get(sortfields[i]); cluster.deleteScore(key); @@ -289,7 +290,7 @@ public class MapDataMining extends MapHeap { public synchronized Iterator keys(final boolean up, /* sorted by */ final String field) { // sorted iteration using the sortClusters if (sortClusterMap == null) return null; - final ScoreCluster cluster = sortClusterMap.get(field); + final StaticScore cluster = sortClusterMap.get(field); if (cluster == null) return null; // sort field does not exist //System.out.println("DEBUG: cluster for field " + field + ": " + cluster.toString()); return new 
string2bytearrayIterator(cluster.scores(up)); diff --git a/source/net/yacy/yacy.java b/source/net/yacy/yacy.java index 40b88df66..8d8b3add2 100644 --- a/source/net/yacy/yacy.java +++ b/source/net/yacy/yacy.java @@ -49,6 +49,8 @@ import java.util.zip.ZipOutputStream; import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.http.HTTPClient; +import net.yacy.cora.storage.DynamicScore; +import net.yacy.cora.storage.ScoreCluster; import net.yacy.gui.YaCyApp; import net.yacy.gui.framework.Browser; import net.yacy.kelondro.blob.MapDataMining; @@ -65,7 +67,6 @@ import net.yacy.kelondro.util.DateFormatter; import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.Formatter; import net.yacy.kelondro.util.MemoryControl; -import net.yacy.kelondro.util.ScoreCluster; import net.yacy.kelondro.util.OS; //import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager; @@ -606,7 +607,7 @@ public final class yacy { final enumerateFiles ef = new enumerateFiles(new File(dbRoot, "WORDS"), true, false, true, true); File f; byte[] h; - final ScoreCluster hs = new ScoreCluster(); + final DynamicScore hs = new ScoreCluster(Base64Order.standardCoder); while (ef.hasMoreElements()) { f = ef.nextElement(); h = f.getName().substring(0, Word.commonHashLength).getBytes();