From 5892fff51fa0ab24a88e7e82c9e77700ba3edd37 Mon Sep 17 00:00:00 2001 From: orbiter Date: Sun, 13 Feb 2011 17:37:28 +0000 Subject: [PATCH] introduction of dht-burst modes: this can expand the number of target peers in some cases where a better heuristic is needed. The problematic cases are either when a multi-word search is made (still a hard case for our term-oriented DHT) or when a network operator wants that all robinson peers are asked. We therefore introduced two new network steering values that switch on more peers during the peer selection. Because the number of peers can now be very large, the number of maximum httpc connections was also increased. Please see new comments in yacy.network.freeworld.unit for details of the new DHT selection methods. The number of maximum peers is now not fixed to a specific number but may increase with - the partition exponent - the number of redundant peers - the robinson burst percentage - the multiword burst percentage The maximum can then be the number of senior peers (all visible peers). 
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7479 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- defaults/yacy.network.freeworld.unit | 53 +++++- htroot/yacy/search.java | 3 +- htroot/yacysearch.java | 2 +- source/de/anomic/data/YMarkTables.java | 10 +- source/de/anomic/search/SearchEvent.java | 15 +- source/de/anomic/search/SearchEventCache.java | 6 +- source/de/anomic/search/Segment.java | 7 +- source/de/anomic/search/Switchboard.java | 21 +-- .../anomic/search/SwitchboardConstants.java | 4 + .../de/anomic/yacy/dht/PartitionScheme.java | 2 +- source/de/anomic/yacy/dht/PeerSelection.java | 156 ++++++++++++------ source/de/anomic/yacy/yacyClient.java | 19 +-- source/de/anomic/yacy/yacySearch.java | 120 ++------------ source/de/anomic/yacy/yacySeed.java | 23 ++- .../yacy/cora/protocol/http/HTTPClient.java | 2 +- source/net/yacy/document/Condenser.java | 12 +- source/net/yacy/document/WordTokenizer.java | 4 +- 17 files changed, 237 insertions(+), 222 deletions(-) diff --git a/defaults/yacy.network.freeworld.unit b/defaults/yacy.network.freeworld.unit index 120370c7d..a484ec455 100644 --- a/defaults/yacy.network.freeworld.unit +++ b/defaults/yacy.network.freeworld.unit @@ -6,17 +6,68 @@ # this is a work in progress. 
disabled properties are not yet used # # -----------------------------------------------------------------# -# general network definition +# define the name of the network +# this nickname is also used to identify network requests network.unit.name = freeworld + +# the visible name of the network network.unit.description = Public YaCy Community + +# definition of the content domain: possible values are: +# global, local, any network.unit.domain = global + +# maximum search time for remote queries (deprecated) network.unit.search.time = 4 + +# flag to switch on dht transmission +# if the dht transmission is set to 'false' then for a global +# query all targets are accessed network.unit.dht = true + +# the number of redundant target peers: +# redundant peers get a copy of the original dht target information network.unit.dhtredundancy.junior = 1 network.unit.dhtredundancy.senior = 3 + +# the vertical partition of the dht: this applies a division +# of the dht into 2^partitionExponent fragments which get +# all the same word-partition targets but a document-dht computed +# fragment of all documents network.unit.dht.partitionExponent = 4 + +# network request burst attributes: this enables non-dht target +# positions for certain situations. This is not a 'traditional' burst-mode +# since it does not refer to a handshake to a single client but it refers +# to not-handshaking in a distributed way. It means to get data without using +# a dht transmission logic. + +# robinson burst: percentage of the number of robinson peers that +# shall be accessed for every search. This includes also robinson peers +# that do not have a matching peer tag. If this is set to 100 then all robinson +# peers are always asked +network.unit.dht.burst.robinson = 50 + +# multi-word burst: percentage of the number of all peers that +# shall be accessed for multi-word searches. Multi-word search is +# a hard problem when the distributed search network is divided by +# term (as done with yacy, partly..). 
+# Scientific solutions for this problem is to apply heuristics. +# This heuristic enables to switch on a full network scan to get also +# non-distributed multi-word positions. For a full scan set this value to 100. +# Attention: this may out-number the maxcount of available httpc network connections. +network.unit.dht.burst.multiword = 30 + +# switch to enable verification of search results +# must be set to true in untrusted networks and can be +# set to false in completely trusted networks network.unit.inspection.searchverify = true + +# speed of remote crawl de-queueing. this is the number of milliseconds +# as a pause between two requests network.unit.remotecrawl.speed = 300 + +# addresses of seed-list bootstrap locations network.unit.bootstrap.seedlist0 = http://www.yacy.net/seed.txt network.unit.bootstrap.seedlist1 = http://home.arcor.de/hermens/yacy/seed.txt network.unit.bootstrap.seedlist2 = http://low.audioattack.de/yacy/seed.txt diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index bb1a5f4ec..2af71e16c 100644 --- a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -61,6 +61,7 @@ import de.anomic.search.Segment; import de.anomic.search.Segments; import de.anomic.search.Switchboard; import de.anomic.search.ResultEntry; +import de.anomic.search.SwitchboardConstants; import de.anomic.server.serverCore; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -294,7 +295,7 @@ public final class search { yacyChannel.channels(yacyChannel.REMOTESEARCH).addMessage(new RSSMessage("Remote Search Request from " + ((remoteSeed == null) ? 
"unknown" : remoteSeed.getName()), QueryParams.anonymizedQueryHashes(theQuery.queryHashes), "")); // make event - theSearch = SearchEventCache.getEvent(theQuery, sb.peers, sb.tables, null, abstracts.length() > 0, sb.loader); + theSearch = SearchEventCache.getEvent(theQuery, sb.peers, sb.tables, null, abstracts.length() > 0, sb.loader, (int) sb.getConfigLong(SwitchboardConstants.DHT_BURST_ROBINSON, 0), (int) sb.getConfigLong(SwitchboardConstants.DHT_BURST_MULTIWORD, 0)); // set statistic details of search result and find best result index set joincount = theSearch.getRankingResult().getLocalIndexCount() - theSearch.getRankingResult().getMissCount(); diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index cb870ec22..bb03e6dc2 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -520,7 +520,7 @@ public class yacysearch { theQuery.setOffset(0); // in case that this is a new search, always start without a offset offset = 0; } - final SearchEvent theSearch = SearchEventCache.getEvent(theQuery, sb.peers, sb.tables, (sb.isRobinsonMode()) ? sb.clusterhashes : null, false, sb.loader); + final SearchEvent theSearch = SearchEventCache.getEvent(theQuery, sb.peers, sb.tables, (sb.isRobinsonMode()) ? sb.clusterhashes : null, false, sb.loader, (int) sb.getConfigLong(SwitchboardConstants.DHT_BURST_ROBINSON, 0), (int) sb.getConfigLong(SwitchboardConstants.DHT_BURST_MULTIWORD, 0)); try {Thread.sleep(global ? 
100 : 10);} catch (InterruptedException e1) {} // wait a little time to get first results in the search if (offset == 0) { diff --git a/source/de/anomic/data/YMarkTables.java b/source/de/anomic/data/YMarkTables.java index c9a889668..05679b1aa 100644 --- a/source/de/anomic/data/YMarkTables.java +++ b/source/de/anomic/data/YMarkTables.java @@ -501,13 +501,9 @@ public class YMarkTables { } public static TreeMap getWordCounts(final Document document) { - try { - if(document != null) { - return sortWordCounts(new Condenser(document, true, true, LibraryProvider.dymLib).words()); - } - } catch (IOException e) { - Log.logException(e); - } + if (document != null) { + return sortWordCounts(new Condenser(document, true, true, LibraryProvider.dymLib).words()); + } return new TreeMap(); } diff --git a/source/de/anomic/search/SearchEvent.java b/source/de/anomic/search/SearchEvent.java index 804972835..a869bb272 100644 --- a/source/de/anomic/search/SearchEvent.java +++ b/source/de/anomic/search/SearchEvent.java @@ -86,7 +86,9 @@ public final class SearchEvent { final WorkTables workTables, final SortedMap preselectedPeerHashes, final boolean generateAbstracts, - final LoaderDispatcher loader) { + final LoaderDispatcher loader, + final int burstRobinsonPercent, + final int burstMultiwordPercent) { this.eventTime = System.currentTimeMillis(); // for lifetime check this.peers = peers; this.workTables = workTables; @@ -106,9 +108,7 @@ public final class SearchEvent { boolean remote = (query.domType == QueryParams.SEARCHDOM_GLOBALDHT) || (query.domType == QueryParams.SEARCHDOM_CLUSTERALL); if (remote && peers.sizeConnected() == 0) remote = false; final long start = System.currentTimeMillis(); - if (remote) { - final int fetchpeers = 32; - + if (remote) { // initialize a ranking process that is the target for data // that is generated concurrently from local and global search threads this.rankingProcess = new RankingProcess(this.query, this.order, max_results_preparation); @@ -118,7 
+118,6 @@ public final class SearchEvent { // start global searches final long timer = System.currentTimeMillis(); - Log.logFine("SEARCH_EVENT", "STARTING " + fetchpeers + " THREADS TO CATCH EACH " + query.displayResults() + " URLs"); this.primarySearchThreads = (query.queryHashes.isEmpty()) ? null : yacySearch.primaryRemoteSearches( QueryParams.hashSet2hashString(query.queryHashes), QueryParams.hashSet2hashString(query.excludeHashes), @@ -133,11 +132,13 @@ public final class SearchEvent { peers, rankingProcess, secondarySearchSuperviser, - fetchpeers, Switchboard.urlBlacklist, query.ranking, query.constraint, - (query.domType == QueryParams.SEARCHDOM_GLOBALDHT) ? null : preselectedPeerHashes); + (query.domType == QueryParams.SEARCHDOM_GLOBALDHT) ? null : preselectedPeerHashes, + burstRobinsonPercent, + burstMultiwordPercent); + Log.logFine("SEARCH_EVENT", "STARTING " + this.primarySearchThreads.length + " THREADS TO CATCH EACH " + query.displayResults() + " URLs"); if (this.primarySearchThreads != null) { this.rankingProcess.moreFeeders(this.primarySearchThreads.length); EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.searchEvent(query.id(true), Type.REMOTESEARCH_START, "", this.primarySearchThreads.length, System.currentTimeMillis() - timer), false); diff --git a/source/de/anomic/search/SearchEventCache.java b/source/de/anomic/search/SearchEventCache.java index b821cea74..194cb89dc 100644 --- a/source/de/anomic/search/SearchEventCache.java +++ b/source/de/anomic/search/SearchEventCache.java @@ -104,7 +104,9 @@ public class SearchEventCache { final WorkTables workTables, final SortedMap preselectedPeerHashes, final boolean generateAbstracts, - final LoaderDispatcher loader) { + final LoaderDispatcher loader, + final int burstRobinsonPercent, + final int burstMultiwordPercent) { String id = query.id(false); SearchEvent event = SearchEventCache.lastEvents.get(id); @@ -126,7 +128,7 @@ public class SearchEventCache { } if (event == null) { // start 
a new event - event = new SearchEvent(query, peers, workTables, preselectedPeerHashes, generateAbstracts, loader); + event = new SearchEvent(query, peers, workTables, preselectedPeerHashes, generateAbstracts, loader, burstRobinsonPercent, burstMultiwordPercent); } return event; diff --git a/source/de/anomic/search/Segment.java b/source/de/anomic/search/Segment.java index 01baea82b..4861d7857 100644 --- a/source/de/anomic/search/Segment.java +++ b/source/de/anomic/search/Segment.java @@ -28,7 +28,6 @@ package de.anomic.search; import java.io.File; import java.io.IOException; -import java.io.UnsupportedEncodingException; import java.util.Date; import java.util.Iterator; import java.util.Map; @@ -423,11 +422,7 @@ public class Segment { } // get the word set Set words = null; - try { - words = new Condenser(document, true, true, null).words().keySet(); - } catch (final UnsupportedEncodingException e) { - Log.logException(e); - } + words = new Condenser(document, true, true, null).words().keySet(); // delete all word references int count = 0; diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java index ce74e58ab..cc44c554c 100644 --- a/source/de/anomic/search/Switchboard.java +++ b/source/de/anomic/search/Switchboard.java @@ -49,7 +49,6 @@ import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.io.Reader; -import java.io.UnsupportedEncodingException; import java.net.MalformedURLException; import java.security.NoSuchAlgorithmException; import java.security.PublicKey; @@ -1834,19 +1833,13 @@ public final class Switchboard extends serverSwitch { Condenser[] condenser = new Condenser[in.documents.length]; if (this.log.isFine()) log.logFine("Condensing for '" + in.queueEntry.url().toNormalform(false, true) + "'"); for (int i = 0; i < in.documents.length; i++) { - // strip out words and generate statistics - try { - condenser[i] = new Condenser(in.documents[i], 
in.queueEntry.profile().indexText(), in.queueEntry.profile().indexMedia(), LibraryProvider.dymLib); - - // update image result list statistics - // its good to do this concurrently here, because it needs a DNS lookup - // to compute a URL hash which is necessary for a double-check - final CrawlProfile profile = in.queueEntry.profile(); - ResultImages.registerImages(in.queueEntry.url(), in.documents[i], (profile == null) ? true : !profile.remoteIndexing()); - - } catch (final UnsupportedEncodingException e) { - return null; - } + condenser[i] = new Condenser(in.documents[i], in.queueEntry.profile().indexText(), in.queueEntry.profile().indexMedia(), LibraryProvider.dymLib); + + // update image result list statistics + // its good to do this concurrently here, because it needs a DNS lookup + // to compute a URL hash which is necessary for a double-check + final CrawlProfile profile = in.queueEntry.profile(); + ResultImages.registerImages(in.queueEntry.url(), in.documents[i], (profile == null) ? true : !profile.remoteIndexing()); } return new indexingQueueEntry(in.process, in.queueEntry, in.documents, condenser); } diff --git a/source/de/anomic/search/SwitchboardConstants.java b/source/de/anomic/search/SwitchboardConstants.java index fe3c0fe65..f0f006c8b 100644 --- a/source/de/anomic/search/SwitchboardConstants.java +++ b/source/de/anomic/search/SwitchboardConstants.java @@ -271,6 +271,10 @@ public final class SwitchboardConstants { public static final String CLUSTER_MODE_PRIVATE_CLUSTER = "privatecluster"; public static final String CLUSTER_MODE_PUBLIC_PEER = "publicpeer"; public static final String CLUSTER_PEERS_IPPORT = "cluster.peers.ipport"; + + public static final String DHT_BURST_ROBINSON = "network.unit.dht.burst.robinson"; + public static final String DHT_BURST_MULTIWORD = "network.unit.dht.burst.multiword"; + /** *

public static final String CRAWLER_THREADS_ACTIVE_MAX = "crawler.MaxActiveThreads"

*

Name of the setting how many active crawler-threads may maximal be running on the same time

diff --git a/source/de/anomic/yacy/dht/PartitionScheme.java b/source/de/anomic/yacy/dht/PartitionScheme.java index c897f41b0..de35fb832 100755 --- a/source/de/anomic/yacy/dht/PartitionScheme.java +++ b/source/de/anomic/yacy/dht/PartitionScheme.java @@ -65,7 +65,7 @@ public interface PartitionScheme { public long dhtPosition(final byte[] wordHash, final int verticalPosition); public int verticalPosition(final byte[] urlHash); - + public long[] dhtPositions(final byte[] wordHash); public long dhtDistance(final byte[] word, final String urlHash, final yacySeed peer); diff --git a/source/de/anomic/yacy/dht/PeerSelection.java b/source/de/anomic/yacy/dht/PeerSelection.java index 7b13063bf..4795049d1 100755 --- a/source/de/anomic/yacy/dht/PeerSelection.java +++ b/source/de/anomic/yacy/dht/PeerSelection.java @@ -26,9 +26,12 @@ package de.anomic.yacy.dht; import java.util.ArrayList; import java.util.HashMap; +import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Set; +import java.util.SortedMap; import net.yacy.cora.date.AbstractFormatter; import net.yacy.cora.storage.DynamicScore; @@ -53,13 +56,109 @@ import de.anomic.yacy.yacyVersion; */ public class PeerSelection { - - public static void selectDHTPositions( + + public static yacySeed[] selectClusterPeers(final yacySeedDB seedDB, final SortedMap peerhashes) { + final Iterator> i = peerhashes.entrySet().iterator(); + final List l = new ArrayList(); + Map.Entry entry; + yacySeed s; + while (i.hasNext()) { + entry = i.next(); + s = seedDB.get(new String(entry.getKey())); // should be getConnected; get only during testing time + if (s != null) { + s.setAlternativeAddress(entry.getValue()); + l.add(s); + } + } + return l.toArray(new yacySeed[l.size()]); + } + + public static yacySeed[] selectSearchTargets( + final yacySeedDB seedDB, + final HandleSet wordhashes, + int redundancy, + int burstRobinsonPercent, + int burstMultiwordPercent) { + // find out a specific 
number of seeds, that would be relevant for the given word hash(es) + // the result is ordered by relevance: [0] is most relevant + // the seedcount is the maximum number of wanted results + if (seedDB == null) { return null; } + + // put in seeds according to dht + final Map regularSeeds = new HashMap(); // dht position seeds + yacySeed seed; + Iterator dhtEnum; + Iterator iter = wordhashes.iterator(); + while (iter.hasNext()) { + selectDHTPositions(seedDB, iter.next(), redundancy, regularSeeds); + } + //int minimumseeds = Math.min(seedDB.scheme.verticalPartitions(), regularSeeds.size()); // that should be the minimum number of seeds that are returned + //int maximumseeds = seedDB.scheme.verticalPartitions() * redundancy; // this is the maximum number of seeds according to dht and heuristics. It can be more using burst mode. + + // put in some seeds according to size of peer. + // But not all, that would produce too much load on the largest peers + dhtEnum = seedDB.seedsSortedConnected(false, yacySeed.ICOUNT); + int c = Math.max(Math.min(5, seedDB.sizeConnected()), wordhashes.size() > 1 ? seedDB.sizeConnected() * burstMultiwordPercent / 100 : 0); + while (dhtEnum.hasNext() && c-- > 0) { + seed = dhtEnum.next(); + if (seed == null) continue; + if (seed.getAge() < 1) { // the 'workshop feature' + Log.logInfo("DHT", "selectPeers/Age: " + seed.hash + ":" + seed.getName() + ", is newbie, age = " + seed.getAge()); + regularSeeds.put(seed.hash, seed); + continue; + } + if (Math.random() * 100 + (wordhashes.size() > 1 ? 
burstMultiwordPercent : 25) >= 50) { + if (Log.isFine("DHT")) Log.logFine("DHT", "selectPeers/CountBurst: " + seed.hash + ":" + seed.getName() + ", RWIcount=" + seed.getWordCount()); + regularSeeds.put(seed.hash, seed); + continue; + } + } + + // create a set that contains only robinson peers because these get a special handling + dhtEnum = seedDB.seedsConnected(true, false, null, 0.50f); + Set robinson = new HashSet(); + while (dhtEnum.hasNext()) { + seed = dhtEnum.next(); + if (seed == null) continue; + if (!seed.getFlagAcceptRemoteIndex()) robinson.add(seed); + } + + // add robinson peers according to robinson burst rate + dhtEnum = robinson.iterator(); + c = robinson.size() * burstRobinsonPercent / 100; + while (dhtEnum.hasNext() && c-- > 0) { + seed = dhtEnum.next(); + if (Math.random() * 100 + burstRobinsonPercent >= 100) { + if (Log.isFine("DHT")) Log.logFine("DHT", "selectPeers/RobinsonBurst: " + seed.hash + ":" + seed.getName()); + regularSeeds.put(seed.hash, seed); + continue; + } + } + + // put in seeds that are public robinson peers and where the peer tags match with query + // or seeds that are newbies to ensure that private demonstrations always work + dhtEnum = robinson.iterator(); + while (dhtEnum.hasNext()) { + seed = dhtEnum.next(); + if (seed.matchPeerTags(wordhashes)) { + // peer tags match + String specialized = seed.getPeerTags().toString(); + if (!specialized.equals("[*]")) Log.logInfo("DHT", "selectPeers/PeerTags: " + seed.hash + ":" + seed.getName() + ", is specialized peer for " + specialized); + regularSeeds.put(seed.hash, seed); + } + } + + // produce return set + yacySeed[] result = new yacySeed[regularSeeds.size()]; + result = regularSeeds.values().toArray(result); + return result; + } + + private static void selectDHTPositions( final yacySeedDB seedDB, byte[] wordhash, int redundancy, - Map regularSeeds, - DynamicScore ranking) { + Map regularSeeds) { // this method is called from the search target computation final long[] 
dhtVerticalTargets = seedDB.scheme.dhtPositions(wordhash); yacySeed seed; @@ -72,50 +171,13 @@ public class PeerSelection { seed = dhtEnum.next(); if (seed == null || seed.hash == null) continue; if (!seed.getFlagAcceptRemoteIndex()) continue; // probably a robinson peer - if (Log.isFine("PLASMA")) Log.logFine("PLASMA", "selectPeers/DHTorder: " + seed.hash + ":" + seed.getName() + "/ score " + c); - ranking.inc(seed.hash, 2 * c); + if (Log.isFine("DHT")) Log.logFine("DHT", "selectPeers/DHTorder: " + seed.hash + ":" + seed.getName() + "/ score " + c); regularSeeds.put(seed.hash, seed); c--; } } } - - private static int guessedOwn = 0; - - public static boolean shallBeOwnWord(final yacySeedDB seedDB, final byte[] wordhash, final String urlhash, final int redundancy) { - // the guessIfOwnWord is a fast method that should only fail in case that a 'true' may be incorrect, but a 'false' shall always be correct - if (guessIfOwnWord(seedDB, wordhash, urlhash)) { - // this case must be verified, because it can be wrong. 
- guessedOwn++; - return verifyIfOwnWord(seedDB, wordhash, urlhash, redundancy); - } else { - return false; - } - - } - - private static boolean guessIfOwnWord(final yacySeedDB seedDB, final byte[] wordhash, final String urlhash) { - if (seedDB == null) return false; - int connected = seedDB.sizeConnected(); - if (connected == 0) return true; - final long target = seedDB.scheme.dhtPosition(wordhash, urlhash); - final long mypos = seedDB.scheme.dhtPosition(seedDB.mySeed().hash.getBytes(), urlhash); - long distance = FlatWordPartitionScheme.dhtDistance(target, mypos); - if (distance <= 0) return false; - if (distance <= Long.MAX_VALUE / connected * 2) return true; - return false; - } - - private static boolean verifyIfOwnWord(final yacySeedDB seedDB, byte[] wordhash, String urlhash, int redundancy) { - String myHash = seedDB.mySeed().hash; - wordhash = FlatWordPartitionScheme.positionToHash(seedDB.scheme.dhtPosition(wordhash, urlhash)); - final Iterator dhtEnum = getAcceptRemoteIndexSeeds(seedDB, wordhash, redundancy, true); - while (dhtEnum.hasNext()) { - if (dhtEnum.next().hash.equals(myHash)) return true; - } - return false; - } - + public static byte[] selectTransferStart() { return Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(Long.toString(System.currentTimeMillis()))).substring(2, 2 + Word.commonHashLength).getBytes(); } @@ -131,7 +193,7 @@ public class PeerSelection { final byte[] starthash, int max, boolean alsoMyOwn) { - final Iterator seedIter = PeerSelection.getAcceptRemoteIndexSeeds(seedDB, starthash, max, alsoMyOwn); + final Iterator seedIter = getAcceptRemoteIndexSeeds(seedDB, starthash, max, alsoMyOwn); final ArrayList targets = new ArrayList(); while (seedIter.hasNext() && max-- > 0) targets.add(seedIter.next()); return targets; @@ -159,7 +221,7 @@ public class PeerSelection { private int remaining; private boolean alsoMyOwn; - public acceptRemoteIndexSeedEnum(yacySeedDB seedDB, final byte[] starthash, int max, boolean alsoMyOwn) { + private 
acceptRemoteIndexSeedEnum(yacySeedDB seedDB, final byte[] starthash, int max, boolean alsoMyOwn) { this.seedDB = seedDB; this.se = getDHTSeeds(seedDB, starthash, yacyVersion.YACY_HANDLES_COLLECTION_INDEX); this.remaining = max; @@ -238,7 +300,7 @@ public class PeerSelection { private float minVersion; private yacySeedDB seedDB; - public seedDHTEnum(final yacySeedDB seedDB, final byte[] firstHash, final float minVersion) { + private seedDHTEnum(final yacySeedDB seedDB, final byte[] firstHash, final float minVersion) { this.seedDB = seedDB; this.steps = seedDB.sizeConnected(); this.minVersion = minVersion; @@ -290,7 +352,7 @@ public class PeerSelection { private yacySeed nextSeed; private yacySeedDB seedDB; - public providesRemoteCrawlURLsEnum(final yacySeedDB seedDB) { + private providesRemoteCrawlURLsEnum(final yacySeedDB seedDB) { this.seedDB = seedDB; se = getDHTSeeds(seedDB, null, yacyVersion.YACY_POVIDES_REMOTECRAWL_LISTS); nextSeed = nextInternal(); diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index 0c2e09395..0a6ad6806 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -422,7 +422,7 @@ public final class yacyClient { sitehash, authorhash, count, maxDistance, global, partitions, target.getHexHash() + ".yacyh", target.getClusterAddress(), secondarySearchSuperviser, rankingProfile, constraint); } catch (final IOException e) { - yacyCore.log.logInfo("SEARCH failed, Peer: " + target.hash + ":" + target.getName() + " (" + e.getMessage() + "), score=" + target.selectscore); + yacyCore.log.logInfo("SEARCH failed, Peer: " + target.hash + ":" + target.getName() + " (" + e.getMessage() + ")"); //yacyCore.peerActions.peerDeparture(target, "search request to peer created io exception: " + e.getMessage()); return -1; } @@ -553,17 +553,12 @@ public final class yacyClient { } // generate statistics - if (yacyCore.log.isFine()) yacyCore.log.logFine("SEARCH " - + result.urlcount - + " 
URLS FROM " - + target.hash - + ":" - + target.getName() - + ", score=" - + target.selectscore - + ", searchtime=" + result.searchtime + ", netdelay=" - + (totalrequesttime - result.searchtime) + ", references=" - + result.references); + if (yacyCore.log.isFine()) yacyCore.log.logFine( + "SEARCH " + result.urlcount + + " URLS FROM " + target.hash + ":" + target.getName() + + ", searchtime=" + result.searchtime + + ", netdelay=" + (totalrequesttime - result.searchtime) + + ", references=" + result.references); return result.urlcount; } diff --git a/source/de/anomic/yacy/yacySearch.java b/source/de/anomic/yacy/yacySearch.java index 9704677af..089b2f51e 100644 --- a/source/de/anomic/yacy/yacySearch.java +++ b/source/de/anomic/yacy/yacySearch.java @@ -24,16 +24,10 @@ package de.anomic.yacy; -import java.util.ArrayList; -import java.util.HashMap; import java.util.Iterator; -import java.util.List; -import java.util.Map; import java.util.SortedMap; import java.util.regex.Pattern; -import net.yacy.cora.storage.DynamicScore; -import net.yacy.cora.storage.ScoreCluster; import net.yacy.kelondro.index.HandleSet; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.order.Bitfield; @@ -151,104 +145,7 @@ public class yacySearch extends Thread { public yacySeed target() { return targetPeer; } - - private static yacySeed[] selectClusterPeers(final yacySeedDB seedDB, final SortedMap peerhashes) { - final Iterator> i = peerhashes.entrySet().iterator(); - final List l = new ArrayList(); - Map.Entry entry; - yacySeed s; - while (i.hasNext()) { - entry = i.next(); - s = seedDB.get(new String(entry.getKey())); // should be getConnected; get only during testing time - if (s != null) { - s.setAlternativeAddress(entry.getValue()); - l.add(s); - } - } -// final yacySeed[] result = new yacySeed[l.size()]; -// for (int j = 0; j < l.size(); j++) { -// result[j] = l.get(j); -// } -// return result; - return l.toArray(new yacySeed[0]); - } - private static yacySeed[] 
selectSearchTargets(final yacySeedDB seedDB, final HandleSet wordhashes, int seedcount, int redundancy) { - // find out a specific number of seeds, that would be relevant for the given word hash(es) - // the result is ordered by relevance: [0] is most relevant - // the seedcount is the maximum number of wanted results - if (seedDB == null) { return null; } - if ((seedcount >= seedDB.sizeConnected()) || (seedDB.noDHTActivity())) { - seedcount = seedDB.sizeConnected(); - } - - // put in seeds according to dht - final DynamicScore ranking = new ScoreCluster(); - final Map regularSeeds = new HashMap(); - final Map matchingSeeds = new HashMap(); - yacySeed seed; - Iterator dhtEnum; - Iterator iter = wordhashes.iterator(); - while (iter.hasNext()) { - PeerSelection.selectDHTPositions(seedDB, iter.next(), redundancy, regularSeeds, ranking); - } - - // put in seeds according to size of peer - dhtEnum = seedDB.seedsSortedConnected(false, yacySeed.ICOUNT); - int c = Math.min(seedDB.sizeConnected(), seedcount); - int score; - while (dhtEnum.hasNext() && c > 0) { - seed = dhtEnum.next(); - if (seed == null) continue; - if (!seed.getFlagAcceptRemoteIndex()) continue; // probably a robinson peer - score = (int) Math.round(Math.random() * ((c / 3) + 3)); - if (Log.isFine("PLASMA")) Log.logFine("PLASMA", "selectPeers/RWIcount: " + seed.hash + ":" + seed.getName() + ", RWIcount=" + seed.getWordCount() + ", score " + score); - ranking.inc(seed.hash, score); - regularSeeds.put(seed.hash, seed); - c--; - } - - // put in seeds that are public robinson peers and where the peer tags match with query - // or seeds that are newbies to ensure that public demonstrations always work - dhtEnum = seedDB.seedsConnected(true, false, null, (float) 0.50); - while (dhtEnum.hasNext()) { - seed = dhtEnum.next(); - if (seed == null) continue; - if (seed.matchPeerTags(wordhashes)) { - String specialized = seed.getPeerTags().toString(); - if (!specialized.equals("[*]")) Log.logInfo("PLASMA", 
"selectPeers/PeerTags: " + seed.hash + ":" + seed.getName() + ", is specialized peer for " + specialized); - regularSeeds.remove(seed.hash); - ranking.delete(seed.hash); - matchingSeeds.put(seed.hash, seed); - } else if (seed.getFlagAcceptRemoteIndex() && seed.getAge() < 1) { // the 'workshop feature' - Log.logInfo("PLASMA", "selectPeers/Age: " + seed.hash + ":" + seed.getName() + ", is newbie, age = " + seed.getAge()); - regularSeeds.remove(seed.hash); - ranking.delete(seed.hash); - matchingSeeds.put(seed.hash, seed); - } - } - - // evaluate the ranking score and select seeds - seedcount = Math.min(ranking.size(), seedcount); - final yacySeed[] result = new yacySeed[seedcount + matchingSeeds.size()]; - c = 0; - final Iterator iters = ranking.keys(false); // higher are better - while (iters.hasNext() && c < seedcount) { - seed = regularSeeds.get(iters.next()); - seed.selectscore = c; - Log.logInfo("PLASMA", "selectPeers/_dht_: " + seed.hash + ":" + seed.getName() + " is choice " + c); - result[c++] = seed; - } - for (final yacySeed s: matchingSeeds.values()) { - s.selectscore = c; - Log.logInfo("PLASMA", "selectPeers/_match_: " + s.hash + ":" + s.getName() + " is choice " + c); - result[c++] = s; - } - -// System.out.println("DEBUG yacySearch.selectPeers = " + seedcount + " seeds:"); for (int i = 0; i < seedcount; i++) System.out.println(" #" + i + ":" + result[i]); // debug - return result; - } - public static yacySearch[] primaryRemoteSearches( final String wordhashes, final String excludehashes, final Pattern prefer, final Pattern filter, String language, @@ -259,11 +156,12 @@ public class yacySearch extends Thread { final yacySeedDB peers, final RankingProcess containerCache, final SearchEvent.SecondarySearchSuperviser secondarySearchSuperviser, - int targets, final Blacklist blacklist, final RankingProfile rankingProfile, final Bitfield constraint, - final SortedMap clusterselection) { + final SortedMap clusterselection, + final int burstRobinsonPercent, + 
final int burstMultiwordPercent) { // check own peer status //if (wordIndex.seedDB.mySeed() == null || wordIndex.seedDB.mySeed().getPublicAddress() == null) { return null; } @@ -272,14 +170,15 @@ public class yacySearch extends Thread { assert wordhashes.length() >= 12 : "wordhashes = " + wordhashes; final yacySeed[] targetPeers = (clusterselection == null) ? - selectSearchTargets( + PeerSelection.selectSearchTargets( peers, QueryParams.hashes2Set(wordhashes), - targets, - peers.redundancy()) - : selectClusterPeers(peers, clusterselection); + peers.redundancy(), + burstRobinsonPercent, + burstMultiwordPercent) + : PeerSelection.selectClusterPeers(peers, clusterselection); if (targetPeers == null) return new yacySearch[0]; - targets = targetPeers.length; + int targets = targetPeers.length; if (targets == 0) return new yacySearch[0]; final yacySearch[] searchThreads = new yacySearch[targets]; for (int i = 0; i < targets; i++) { @@ -292,6 +191,7 @@ public class yacySearch extends Thread { indexSegment, peers, containerCache, secondarySearchSuperviser, blacklist, rankingProfile, constraint); searchThreads[i].start(); } catch (OutOfMemoryError e) { + e.printStackTrace(); break; } } diff --git a/source/de/anomic/yacy/yacySeed.java b/source/de/anomic/yacy/yacySeed.java index 772469525..a4148554c 100644 --- a/source/de/anomic/yacy/yacySeed.java +++ b/source/de/anomic/yacy/yacySeed.java @@ -47,6 +47,7 @@ import java.io.IOException; import java.net.InetAddress; import java.net.MalformedURLException; import java.net.URL; +import java.util.Comparator; import java.util.Date; import java.util.HashMap; import java.util.Iterator; @@ -71,7 +72,7 @@ import de.anomic.tools.bitfield; import de.anomic.tools.crypt; import de.anomic.yacy.dht.FlatWordPartitionScheme; -public class yacySeed implements Cloneable { +public class yacySeed implements Cloneable, Comparable, Comparator { public static String ANON_PREFIX = "_anon"; @@ -171,7 +172,6 @@ public class yacySeed implements Cloneable { 
public String hash; /** a set of identity founding values, eg. IP, name of the peer, YaCy-version, ...*/ private final ConcurrentHashMap dna; - protected int selectscore = -1; // only for debugging private String alternativeIP = null; public yacySeed(final String theHash, final ConcurrentHashMap theDna) { @@ -858,5 +858,24 @@ public class yacySeed implements Cloneable { ndna.putAll(this.dna); return new yacySeed(this.hash, ndna); } + + @Override + public int compareTo(yacySeed arg0) { + // TODO Auto-generated method stub + int o1 = this.hashCode(); + int o2 = arg0.hashCode(); + if (o1 > o2) return 1; + if (o2 > o1) return -1; + return 0; + } + + public int hashCode() { + return (int) (Base64Order.enhancedCoder.cardinal(this.hash) & ((long) Integer.MAX_VALUE)); + } + + @Override + public int compare(yacySeed o1, yacySeed o2) { + return o1.compareTo(o2); + } } diff --git a/source/net/yacy/cora/protocol/http/HTTPClient.java b/source/net/yacy/cora/protocol/http/HTTPClient.java index 455a3c5d5..ec8f04a0b 100644 --- a/source/net/yacy/cora/protocol/http/HTTPClient.java +++ b/source/net/yacy/cora/protocol/http/HTTPClient.java @@ -91,7 +91,7 @@ import org.apache.http.util.EntityUtils; */ public class HTTPClient { - private final static int maxcon = 20; + private final static int maxcon = 200; private static IdledConnectionEvictor idledConnectionEvictor = null; private static HttpClient httpClient = initConnectionManager(); private Header[] headers = null; diff --git a/source/net/yacy/document/Condenser.java b/source/net/yacy/document/Condenser.java index 38eaf1254..0e5df294c 100644 --- a/source/net/yacy/document/Condenser.java +++ b/source/net/yacy/document/Condenser.java @@ -105,7 +105,7 @@ public final class Condenser { final boolean indexText, final boolean indexMedia, final WordCache meaningLib - ) throws UnsupportedEncodingException { + ) { // if addMedia == true, then all the media links are also parsed and added to the words // added media words are flagged with the 
appropriate media flag this.words = new HashMap(); @@ -254,7 +254,7 @@ public final class Condenser { } } - public Condenser(final InputStream text, final WordCache meaningLib) throws UnsupportedEncodingException { + public Condenser(final InputStream text, final WordCache meaningLib) { this.languageIdentificator = null; // we don't need that here // analysis = new Properties(); words = new TreeMap(); @@ -278,7 +278,7 @@ public final class Condenser { return this.languageIdentificator.getLanguage(); } - private void createCondensement(final InputStream is, final WordCache meaningLib) throws UnsupportedEncodingException { + private void createCondensement(final InputStream is, final WordCache meaningLib) { assert is != null; final Set currsentwords = new HashSet(); StringBuilder sentence = new StringBuilder(100); @@ -461,11 +461,7 @@ public final class Condenser { } catch (UnsupportedEncodingException e1) { buffer = new ByteArrayInputStream(text.getBytes()); } - try { - return new Condenser(buffer, meaningLib).words(); - } catch (final UnsupportedEncodingException e) { - return null; - } + return new Condenser(buffer, meaningLib).words(); } public static void main(final String[] args) { diff --git a/source/net/yacy/document/WordTokenizer.java b/source/net/yacy/document/WordTokenizer.java index fcf75ba5d..27a25fe33 100644 --- a/source/net/yacy/document/WordTokenizer.java +++ b/source/net/yacy/document/WordTokenizer.java @@ -40,7 +40,7 @@ public class WordTokenizer implements Enumeration { private unsievedWordsEnum e; private WordCache meaningLib; - public WordTokenizer(final InputStream is, final WordCache meaningLib) throws UnsupportedEncodingException { + public WordTokenizer(final InputStream is, final WordCache meaningLib) { assert is != null; this.e = new unsievedWordsEnum(is); this.buffer = nextElement0(); @@ -83,7 +83,7 @@ public class WordTokenizer implements Enumeration { private List s; private int sIndex; - public unsievedWordsEnum(final InputStream is) 
throws UnsupportedEncodingException { + public unsievedWordsEnum(final InputStream is) { assert is != null; e = new SentenceReader(is); s = new ArrayList();