diff --git a/defaults/yacy.network.freeworld.unit b/defaults/yacy.network.freeworld.unit index 120370c7d..a484ec455 100644 --- a/defaults/yacy.network.freeworld.unit +++ b/defaults/yacy.network.freeworld.unit @@ -6,17 +6,68 @@ # this is a work in progress. disabled properties are not yet used # # -----------------------------------------------------------------# -# general network definition +# define the name of the network +# this nickname is also used to identify network requests network.unit.name = freeworld + +# the visible name of the network network.unit.description = Public YaCy Community + +# definition of the content domain: possible values are: +# global, local, any network.unit.domain = global + +# maximum search time for remote queries (deprecated) network.unit.search.time = 4 + +# flag to switch on dht transmission +# if the dht transmission is set to 'false' then for a global +# query all targets are accessed network.unit.dht = true + +# the number of redundant target peers: +# redundant peers get a copy of the original dht target information network.unit.dhtredundancy.junior = 1 network.unit.dhtredundancy.senior = 3 + +# the vertical partition of the dht: this applies a division +# of the dht into 2^partitionExponent fragments which get +# all the same word-partition targets but a document-dht computed +# fragment of all documents network.unit.dht.partitionExponent = 4 + +# network request burst attributes: this enables non-dht target +# positions for certain situations. This is not a 'traditional' burst-mode +# since it does not refer to a handshake to a single client but it refers +# to not-handshaking in a distributed way. It means to get data without using +# a dht transmission logic. + +# robinson burst: percentage of the number of robinson peers that +# shall be accessed for every search. This includes also robinson peers +# that do not have a matching peer tag. 
If this is set to 100 then all robinson +# peers are always asked +network.unit.dht.burst.robinson = 50 + +# multi-word burst: percentage of the number of all peers that +# shall be accessed for multi-word searches. Multi-word search is +# a hard problem when the distributed search network is divided by +# term (as is partly done in YaCy). +# The scientific solution to this problem is to apply heuristics. +# This heuristic makes it possible to switch on a full network scan to also get +# non-distributed multi-word positions. For a full scan set this value to 100. +# Attention: this may exceed the maximum number of available httpc network connections. +network.unit.dht.burst.multiword = 30 + +# switch to enable verification of search results +# must be set to true in untrusted networks and can be +# set to false in completely trusted networks network.unit.inspection.searchverify = true + +# speed of remote crawl de-queueing. this is the number of milliseconds +# as a pause between two requests network.unit.remotecrawl.speed = 300 + +# addresses of seed-list bootstrap locations network.unit.bootstrap.seedlist0 = http://www.yacy.net/seed.txt network.unit.bootstrap.seedlist1 = http://home.arcor.de/hermens/yacy/seed.txt network.unit.bootstrap.seedlist2 = http://low.audioattack.de/yacy/seed.txt diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index bb1a5f4ec..2af71e16c 100644 --- a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -61,6 +61,7 @@ import de.anomic.search.Segment; import de.anomic.search.Segments; import de.anomic.search.Switchboard; import de.anomic.search.ResultEntry; +import de.anomic.search.SwitchboardConstants; import de.anomic.server.serverCore; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -294,7 +295,7 @@ public final class search { yacyChannel.channels(yacyChannel.REMOTESEARCH).addMessage(new RSSMessage("Remote Search Request from " + ((remoteSeed == null) ? 
"unknown" : remoteSeed.getName()), QueryParams.anonymizedQueryHashes(theQuery.queryHashes), "")); // make event - theSearch = SearchEventCache.getEvent(theQuery, sb.peers, sb.tables, null, abstracts.length() > 0, sb.loader); + theSearch = SearchEventCache.getEvent(theQuery, sb.peers, sb.tables, null, abstracts.length() > 0, sb.loader, (int) sb.getConfigLong(SwitchboardConstants.DHT_BURST_ROBINSON, 0), (int) sb.getConfigLong(SwitchboardConstants.DHT_BURST_MULTIWORD, 0)); // set statistic details of search result and find best result index set joincount = theSearch.getRankingResult().getLocalIndexCount() - theSearch.getRankingResult().getMissCount(); diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index cb870ec22..bb03e6dc2 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -520,7 +520,7 @@ public class yacysearch { theQuery.setOffset(0); // in case that this is a new search, always start without a offset offset = 0; } - final SearchEvent theSearch = SearchEventCache.getEvent(theQuery, sb.peers, sb.tables, (sb.isRobinsonMode()) ? sb.clusterhashes : null, false, sb.loader); + final SearchEvent theSearch = SearchEventCache.getEvent(theQuery, sb.peers, sb.tables, (sb.isRobinsonMode()) ? sb.clusterhashes : null, false, sb.loader, (int) sb.getConfigLong(SwitchboardConstants.DHT_BURST_ROBINSON, 0), (int) sb.getConfigLong(SwitchboardConstants.DHT_BURST_MULTIWORD, 0)); try {Thread.sleep(global ? 
100 : 10);} catch (InterruptedException e1) {} // wait a little time to get first results in the search if (offset == 0) { diff --git a/source/de/anomic/data/YMarkTables.java b/source/de/anomic/data/YMarkTables.java index c9a889668..05679b1aa 100644 --- a/source/de/anomic/data/YMarkTables.java +++ b/source/de/anomic/data/YMarkTables.java @@ -501,13 +501,9 @@ public class YMarkTables { } public static TreeMap getWordCounts(final Document document) { - try { - if(document != null) { - return sortWordCounts(new Condenser(document, true, true, LibraryProvider.dymLib).words()); - } - } catch (IOException e) { - Log.logException(e); - } + if (document != null) { + return sortWordCounts(new Condenser(document, true, true, LibraryProvider.dymLib).words()); + } return new TreeMap(); } diff --git a/source/de/anomic/search/SearchEvent.java b/source/de/anomic/search/SearchEvent.java index 804972835..a869bb272 100644 --- a/source/de/anomic/search/SearchEvent.java +++ b/source/de/anomic/search/SearchEvent.java @@ -86,7 +86,9 @@ public final class SearchEvent { final WorkTables workTables, final SortedMap preselectedPeerHashes, final boolean generateAbstracts, - final LoaderDispatcher loader) { + final LoaderDispatcher loader, + final int burstRobinsonPercent, + final int burstMultiwordPercent) { this.eventTime = System.currentTimeMillis(); // for lifetime check this.peers = peers; this.workTables = workTables; @@ -106,9 +108,7 @@ public final class SearchEvent { boolean remote = (query.domType == QueryParams.SEARCHDOM_GLOBALDHT) || (query.domType == QueryParams.SEARCHDOM_CLUSTERALL); if (remote && peers.sizeConnected() == 0) remote = false; final long start = System.currentTimeMillis(); - if (remote) { - final int fetchpeers = 32; - + if (remote) { // initialize a ranking process that is the target for data // that is generated concurrently from local and global search threads this.rankingProcess = new RankingProcess(this.query, this.order, max_results_preparation); @@ -118,7 
+118,6 @@ public final class SearchEvent { // start global searches final long timer = System.currentTimeMillis(); - Log.logFine("SEARCH_EVENT", "STARTING " + fetchpeers + " THREADS TO CATCH EACH " + query.displayResults() + " URLs"); this.primarySearchThreads = (query.queryHashes.isEmpty()) ? null : yacySearch.primaryRemoteSearches( QueryParams.hashSet2hashString(query.queryHashes), QueryParams.hashSet2hashString(query.excludeHashes), @@ -133,11 +132,14 @@ public final class SearchEvent { peers, rankingProcess, secondarySearchSuperviser, - fetchpeers, Switchboard.urlBlacklist, query.ranking, query.constraint, - (query.domType == QueryParams.SEARCHDOM_GLOBALDHT) ? null : preselectedPeerHashes); + (query.domType == QueryParams.SEARCHDOM_GLOBALDHT) ? null : preselectedPeerHashes, + burstRobinsonPercent, + burstMultiwordPercent); if (this.primarySearchThreads != null) { + // primarySearchThreads is null when the query has no hashes, so the thread count may only be logged inside this guard + Log.logFine("SEARCH_EVENT", "STARTING " + this.primarySearchThreads.length + " THREADS TO CATCH EACH " + query.displayResults() + " URLs"); this.rankingProcess.moreFeeders(this.primarySearchThreads.length); EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.searchEvent(query.id(true), Type.REMOTESEARCH_START, "", this.primarySearchThreads.length, System.currentTimeMillis() - timer), false); diff --git a/source/de/anomic/search/SearchEventCache.java b/source/de/anomic/search/SearchEventCache.java index b821cea74..194cb89dc 100644 --- a/source/de/anomic/search/SearchEventCache.java +++ b/source/de/anomic/search/SearchEventCache.java @@ -104,7 +104,9 @@ public class SearchEventCache { final WorkTables workTables, final SortedMap preselectedPeerHashes, final boolean generateAbstracts, - final LoaderDispatcher loader) { + final LoaderDispatcher loader, + final int burstRobinsonPercent, + final int burstMultiwordPercent) { String id = query.id(false); SearchEvent event = SearchEventCache.lastEvents.get(id); @@ -126,7 +128,7 @@ public class SearchEventCache { } if (event == null) { // start 
a new event - event = new SearchEvent(query, peers, workTables, preselectedPeerHashes, generateAbstracts, loader); + event = new SearchEvent(query, peers, workTables, preselectedPeerHashes, generateAbstracts, loader, burstRobinsonPercent, burstMultiwordPercent); } return event; diff --git a/source/de/anomic/search/Segment.java b/source/de/anomic/search/Segment.java index 01baea82b..4861d7857 100644 --- a/source/de/anomic/search/Segment.java +++ b/source/de/anomic/search/Segment.java @@ -28,7 +28,6 @@ package de.anomic.search; import java.io.File; import java.io.IOException; -import java.io.UnsupportedEncodingException; import java.util.Date; import java.util.Iterator; import java.util.Map; @@ -423,11 +422,7 @@ public class Segment { } // get the word set Set words = null; - try { - words = new Condenser(document, true, true, null).words().keySet(); - } catch (final UnsupportedEncodingException e) { - Log.logException(e); - } + words = new Condenser(document, true, true, null).words().keySet(); // delete all word references int count = 0; diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java index ce74e58ab..cc44c554c 100644 --- a/source/de/anomic/search/Switchboard.java +++ b/source/de/anomic/search/Switchboard.java @@ -49,7 +49,6 @@ import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.io.Reader; -import java.io.UnsupportedEncodingException; import java.net.MalformedURLException; import java.security.NoSuchAlgorithmException; import java.security.PublicKey; @@ -1834,19 +1833,13 @@ public final class Switchboard extends serverSwitch { Condenser[] condenser = new Condenser[in.documents.length]; if (this.log.isFine()) log.logFine("Condensing for '" + in.queueEntry.url().toNormalform(false, true) + "'"); for (int i = 0; i < in.documents.length; i++) { - // strip out words and generate statistics - try { - condenser[i] = new Condenser(in.documents[i], 
in.queueEntry.profile().indexText(), in.queueEntry.profile().indexMedia(), LibraryProvider.dymLib); - - // update image result list statistics - // its good to do this concurrently here, because it needs a DNS lookup - // to compute a URL hash which is necessary for a double-check - final CrawlProfile profile = in.queueEntry.profile(); - ResultImages.registerImages(in.queueEntry.url(), in.documents[i], (profile == null) ? true : !profile.remoteIndexing()); - - } catch (final UnsupportedEncodingException e) { - return null; - } + condenser[i] = new Condenser(in.documents[i], in.queueEntry.profile().indexText(), in.queueEntry.profile().indexMedia(), LibraryProvider.dymLib); + + // update image result list statistics + // its good to do this concurrently here, because it needs a DNS lookup + // to compute a URL hash which is necessary for a double-check + final CrawlProfile profile = in.queueEntry.profile(); + ResultImages.registerImages(in.queueEntry.url(), in.documents[i], (profile == null) ? true : !profile.remoteIndexing()); } return new indexingQueueEntry(in.process, in.queueEntry, in.documents, condenser); } diff --git a/source/de/anomic/search/SwitchboardConstants.java b/source/de/anomic/search/SwitchboardConstants.java index fe3c0fe65..f0f006c8b 100644 --- a/source/de/anomic/search/SwitchboardConstants.java +++ b/source/de/anomic/search/SwitchboardConstants.java @@ -271,6 +271,10 @@ public final class SwitchboardConstants { public static final String CLUSTER_MODE_PRIVATE_CLUSTER = "privatecluster"; public static final String CLUSTER_MODE_PUBLIC_PEER = "publicpeer"; public static final String CLUSTER_PEERS_IPPORT = "cluster.peers.ipport"; + + public static final String DHT_BURST_ROBINSON = "network.unit.dht.burst.robinson"; + public static final String DHT_BURST_MULTIWORD = "network.unit.dht.burst.multiword"; + /** *

public static final String CRAWLER_THREADS_ACTIVE_MAX = "crawler.MaxActiveThreads"

*

Name of the setting how many active crawler-threads may maximal be running on the same time

diff --git a/source/de/anomic/yacy/dht/PartitionScheme.java b/source/de/anomic/yacy/dht/PartitionScheme.java index c897f41b0..de35fb832 100755 --- a/source/de/anomic/yacy/dht/PartitionScheme.java +++ b/source/de/anomic/yacy/dht/PartitionScheme.java @@ -65,7 +65,7 @@ public interface PartitionScheme { public long dhtPosition(final byte[] wordHash, final int verticalPosition); public int verticalPosition(final byte[] urlHash); - + public long[] dhtPositions(final byte[] wordHash); public long dhtDistance(final byte[] word, final String urlHash, final yacySeed peer); diff --git a/source/de/anomic/yacy/dht/PeerSelection.java b/source/de/anomic/yacy/dht/PeerSelection.java index 7b13063bf..4795049d1 100755 --- a/source/de/anomic/yacy/dht/PeerSelection.java +++ b/source/de/anomic/yacy/dht/PeerSelection.java @@ -26,9 +26,12 @@ package de.anomic.yacy.dht; import java.util.ArrayList; import java.util.HashMap; +import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Set; +import java.util.SortedMap; import net.yacy.cora.date.AbstractFormatter; import net.yacy.cora.storage.DynamicScore; @@ -53,13 +56,109 @@ import de.anomic.yacy.yacyVersion; */ public class PeerSelection { - - public static void selectDHTPositions( + + public static yacySeed[] selectClusterPeers(final yacySeedDB seedDB, final SortedMap peerhashes) { + final Iterator> i = peerhashes.entrySet().iterator(); + final List l = new ArrayList(); + Map.Entry entry; + yacySeed s; + while (i.hasNext()) { + entry = i.next(); + s = seedDB.get(new String(entry.getKey())); // should be getConnected; get only during testing time + if (s != null) { + s.setAlternativeAddress(entry.getValue()); + l.add(s); + } + } + return l.toArray(new yacySeed[l.size()]); + } + + public static yacySeed[] selectSearchTargets( + final yacySeedDB seedDB, + final HandleSet wordhashes, + int redundancy, + int burstRobinsonPercent, + int burstMultiwordPercent) { + // find out a specific 
number of seeds, that would be relevant for the given word hash(es) + // the result is ordered by relevance: [0] is most relevant + // the seedcount is the maximum number of wanted results + if (seedDB == null) { return null; } + + // put in seeds according to dht + final Map regularSeeds = new HashMap(); // dht position seeds + yacySeed seed; + Iterator dhtEnum; + Iterator iter = wordhashes.iterator(); + while (iter.hasNext()) { + selectDHTPositions(seedDB, iter.next(), redundancy, regularSeeds); + } + //int minimumseeds = Math.min(seedDB.scheme.verticalPartitions(), regularSeeds.size()); // that should be the minimum number of seeds that are returned + //int maximumseeds = seedDB.scheme.verticalPartitions() * redundancy; // this is the maximum number of seeds according to dht and heuristics. It can be more using burst mode. + + // put in some seeds according to size of peer. + // But not all, that would produce too much load on the largest peers + dhtEnum = seedDB.seedsSortedConnected(false, yacySeed.ICOUNT); + int c = Math.max(Math.min(5, seedDB.sizeConnected()), wordhashes.size() > 1 ? seedDB.sizeConnected() * burstMultiwordPercent / 100 : 0); + while (dhtEnum.hasNext() && c-- > 0) { + seed = dhtEnum.next(); + if (seed == null) continue; + if (seed.getAge() < 1) { // the 'workshop feature' + Log.logInfo("DHT", "selectPeers/Age: " + seed.hash + ":" + seed.getName() + ", is newbie, age = " + seed.getAge()); + regularSeeds.put(seed.hash, seed); + continue; + } + if (Math.random() * 100 + (wordhashes.size() > 1 ? 
burstMultiwordPercent : 25) >= 50) { + if (Log.isFine("DHT")) Log.logFine("DHT", "selectPeers/CountBurst: " + seed.hash + ":" + seed.getName() + ", RWIcount=" + seed.getWordCount()); + regularSeeds.put(seed.hash, seed); + continue; + } + } + + // create a set that contains only robinson peers because these get a special handling + dhtEnum = seedDB.seedsConnected(true, false, null, 0.50f); + Set robinson = new HashSet(); + while (dhtEnum.hasNext()) { + seed = dhtEnum.next(); + if (seed == null) continue; + if (!seed.getFlagAcceptRemoteIndex()) robinson.add(seed); + } + + // add robinson peers according to robinson burst rate + dhtEnum = robinson.iterator(); + c = robinson.size() * burstRobinsonPercent / 100; + while (dhtEnum.hasNext() && c-- > 0) { + seed = dhtEnum.next(); + if (Math.random() * 100 + burstRobinsonPercent >= 100) { + if (Log.isFine("DHT")) Log.logFine("DHT", "selectPeers/RobinsonBurst: " + seed.hash + ":" + seed.getName()); + regularSeeds.put(seed.hash, seed); + continue; + } + } + + // put in seeds that are public robinson peers and where the peer tags match with query + // or seeds that are newbies to ensure that private demonstrations always work + dhtEnum = robinson.iterator(); + while (dhtEnum.hasNext()) { + seed = dhtEnum.next(); + if (seed.matchPeerTags(wordhashes)) { + // peer tags match + String specialized = seed.getPeerTags().toString(); + if (!specialized.equals("[*]")) Log.logInfo("DHT", "selectPeers/PeerTags: " + seed.hash + ":" + seed.getName() + ", is specialized peer for " + specialized); + regularSeeds.put(seed.hash, seed); + } + } + + // produce return set + yacySeed[] result = new yacySeed[regularSeeds.size()]; + result = regularSeeds.values().toArray(result); + return result; + } + + private static void selectDHTPositions( final yacySeedDB seedDB, byte[] wordhash, int redundancy, - Map regularSeeds, - DynamicScore ranking) { + Map regularSeeds) { // this method is called from the search target computation final long[] 
dhtVerticalTargets = seedDB.scheme.dhtPositions(wordhash); yacySeed seed; @@ -72,50 +171,13 @@ public class PeerSelection { seed = dhtEnum.next(); if (seed == null || seed.hash == null) continue; if (!seed.getFlagAcceptRemoteIndex()) continue; // probably a robinson peer - if (Log.isFine("PLASMA")) Log.logFine("PLASMA", "selectPeers/DHTorder: " + seed.hash + ":" + seed.getName() + "/ score " + c); - ranking.inc(seed.hash, 2 * c); + if (Log.isFine("DHT")) Log.logFine("DHT", "selectPeers/DHTorder: " + seed.hash + ":" + seed.getName() + "/ score " + c); regularSeeds.put(seed.hash, seed); c--; } } } - - private static int guessedOwn = 0; - - public static boolean shallBeOwnWord(final yacySeedDB seedDB, final byte[] wordhash, final String urlhash, final int redundancy) { - // the guessIfOwnWord is a fast method that should only fail in case that a 'true' may be incorrect, but a 'false' shall always be correct - if (guessIfOwnWord(seedDB, wordhash, urlhash)) { - // this case must be verified, because it can be wrong. 
- guessedOwn++; - return verifyIfOwnWord(seedDB, wordhash, urlhash, redundancy); - } else { - return false; - } - - } - - private static boolean guessIfOwnWord(final yacySeedDB seedDB, final byte[] wordhash, final String urlhash) { - if (seedDB == null) return false; - int connected = seedDB.sizeConnected(); - if (connected == 0) return true; - final long target = seedDB.scheme.dhtPosition(wordhash, urlhash); - final long mypos = seedDB.scheme.dhtPosition(seedDB.mySeed().hash.getBytes(), urlhash); - long distance = FlatWordPartitionScheme.dhtDistance(target, mypos); - if (distance <= 0) return false; - if (distance <= Long.MAX_VALUE / connected * 2) return true; - return false; - } - - private static boolean verifyIfOwnWord(final yacySeedDB seedDB, byte[] wordhash, String urlhash, int redundancy) { - String myHash = seedDB.mySeed().hash; - wordhash = FlatWordPartitionScheme.positionToHash(seedDB.scheme.dhtPosition(wordhash, urlhash)); - final Iterator dhtEnum = getAcceptRemoteIndexSeeds(seedDB, wordhash, redundancy, true); - while (dhtEnum.hasNext()) { - if (dhtEnum.next().hash.equals(myHash)) return true; - } - return false; - } - + public static byte[] selectTransferStart() { return Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(Long.toString(System.currentTimeMillis()))).substring(2, 2 + Word.commonHashLength).getBytes(); } @@ -131,7 +193,7 @@ public class PeerSelection { final byte[] starthash, int max, boolean alsoMyOwn) { - final Iterator seedIter = PeerSelection.getAcceptRemoteIndexSeeds(seedDB, starthash, max, alsoMyOwn); + final Iterator seedIter = getAcceptRemoteIndexSeeds(seedDB, starthash, max, alsoMyOwn); final ArrayList targets = new ArrayList(); while (seedIter.hasNext() && max-- > 0) targets.add(seedIter.next()); return targets; @@ -159,7 +221,7 @@ public class PeerSelection { private int remaining; private boolean alsoMyOwn; - public acceptRemoteIndexSeedEnum(yacySeedDB seedDB, final byte[] starthash, int max, boolean alsoMyOwn) { + private 
acceptRemoteIndexSeedEnum(yacySeedDB seedDB, final byte[] starthash, int max, boolean alsoMyOwn) { this.seedDB = seedDB; this.se = getDHTSeeds(seedDB, starthash, yacyVersion.YACY_HANDLES_COLLECTION_INDEX); this.remaining = max; @@ -238,7 +300,7 @@ public class PeerSelection { private float minVersion; private yacySeedDB seedDB; - public seedDHTEnum(final yacySeedDB seedDB, final byte[] firstHash, final float minVersion) { + private seedDHTEnum(final yacySeedDB seedDB, final byte[] firstHash, final float minVersion) { this.seedDB = seedDB; this.steps = seedDB.sizeConnected(); this.minVersion = minVersion; @@ -290,7 +352,7 @@ public class PeerSelection { private yacySeed nextSeed; private yacySeedDB seedDB; - public providesRemoteCrawlURLsEnum(final yacySeedDB seedDB) { + private providesRemoteCrawlURLsEnum(final yacySeedDB seedDB) { this.seedDB = seedDB; se = getDHTSeeds(seedDB, null, yacyVersion.YACY_POVIDES_REMOTECRAWL_LISTS); nextSeed = nextInternal(); diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index 0c2e09395..0a6ad6806 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -422,7 +422,7 @@ public final class yacyClient { sitehash, authorhash, count, maxDistance, global, partitions, target.getHexHash() + ".yacyh", target.getClusterAddress(), secondarySearchSuperviser, rankingProfile, constraint); } catch (final IOException e) { - yacyCore.log.logInfo("SEARCH failed, Peer: " + target.hash + ":" + target.getName() + " (" + e.getMessage() + "), score=" + target.selectscore); + yacyCore.log.logInfo("SEARCH failed, Peer: " + target.hash + ":" + target.getName() + " (" + e.getMessage() + ")"); //yacyCore.peerActions.peerDeparture(target, "search request to peer created io exception: " + e.getMessage()); return -1; } @@ -553,17 +553,12 @@ public final class yacyClient { } // generate statistics - if (yacyCore.log.isFine()) yacyCore.log.logFine("SEARCH " - + result.urlcount - + " 
URLS FROM " - + target.hash - + ":" - + target.getName() - + ", score=" - + target.selectscore - + ", searchtime=" + result.searchtime + ", netdelay=" - + (totalrequesttime - result.searchtime) + ", references=" - + result.references); + if (yacyCore.log.isFine()) yacyCore.log.logFine( + "SEARCH " + result.urlcount + + " URLS FROM " + target.hash + ":" + target.getName() + + ", searchtime=" + result.searchtime + + ", netdelay=" + (totalrequesttime - result.searchtime) + + ", references=" + result.references); return result.urlcount; } diff --git a/source/de/anomic/yacy/yacySearch.java b/source/de/anomic/yacy/yacySearch.java index 9704677af..089b2f51e 100644 --- a/source/de/anomic/yacy/yacySearch.java +++ b/source/de/anomic/yacy/yacySearch.java @@ -24,16 +24,10 @@ package de.anomic.yacy; -import java.util.ArrayList; -import java.util.HashMap; import java.util.Iterator; -import java.util.List; -import java.util.Map; import java.util.SortedMap; import java.util.regex.Pattern; -import net.yacy.cora.storage.DynamicScore; -import net.yacy.cora.storage.ScoreCluster; import net.yacy.kelondro.index.HandleSet; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.order.Bitfield; @@ -151,104 +145,7 @@ public class yacySearch extends Thread { public yacySeed target() { return targetPeer; } - - private static yacySeed[] selectClusterPeers(final yacySeedDB seedDB, final SortedMap peerhashes) { - final Iterator> i = peerhashes.entrySet().iterator(); - final List l = new ArrayList(); - Map.Entry entry; - yacySeed s; - while (i.hasNext()) { - entry = i.next(); - s = seedDB.get(new String(entry.getKey())); // should be getConnected; get only during testing time - if (s != null) { - s.setAlternativeAddress(entry.getValue()); - l.add(s); - } - } -// final yacySeed[] result = new yacySeed[l.size()]; -// for (int j = 0; j < l.size(); j++) { -// result[j] = l.get(j); -// } -// return result; - return l.toArray(new yacySeed[0]); - } - private static yacySeed[] 
selectSearchTargets(final yacySeedDB seedDB, final HandleSet wordhashes, int seedcount, int redundancy) { - // find out a specific number of seeds, that would be relevant for the given word hash(es) - // the result is ordered by relevance: [0] is most relevant - // the seedcount is the maximum number of wanted results - if (seedDB == null) { return null; } - if ((seedcount >= seedDB.sizeConnected()) || (seedDB.noDHTActivity())) { - seedcount = seedDB.sizeConnected(); - } - - // put in seeds according to dht - final DynamicScore ranking = new ScoreCluster(); - final Map regularSeeds = new HashMap(); - final Map matchingSeeds = new HashMap(); - yacySeed seed; - Iterator dhtEnum; - Iterator iter = wordhashes.iterator(); - while (iter.hasNext()) { - PeerSelection.selectDHTPositions(seedDB, iter.next(), redundancy, regularSeeds, ranking); - } - - // put in seeds according to size of peer - dhtEnum = seedDB.seedsSortedConnected(false, yacySeed.ICOUNT); - int c = Math.min(seedDB.sizeConnected(), seedcount); - int score; - while (dhtEnum.hasNext() && c > 0) { - seed = dhtEnum.next(); - if (seed == null) continue; - if (!seed.getFlagAcceptRemoteIndex()) continue; // probably a robinson peer - score = (int) Math.round(Math.random() * ((c / 3) + 3)); - if (Log.isFine("PLASMA")) Log.logFine("PLASMA", "selectPeers/RWIcount: " + seed.hash + ":" + seed.getName() + ", RWIcount=" + seed.getWordCount() + ", score " + score); - ranking.inc(seed.hash, score); - regularSeeds.put(seed.hash, seed); - c--; - } - - // put in seeds that are public robinson peers and where the peer tags match with query - // or seeds that are newbies to ensure that public demonstrations always work - dhtEnum = seedDB.seedsConnected(true, false, null, (float) 0.50); - while (dhtEnum.hasNext()) { - seed = dhtEnum.next(); - if (seed == null) continue; - if (seed.matchPeerTags(wordhashes)) { - String specialized = seed.getPeerTags().toString(); - if (!specialized.equals("[*]")) Log.logInfo("PLASMA", 
"selectPeers/PeerTags: " + seed.hash + ":" + seed.getName() + ", is specialized peer for " + specialized); - regularSeeds.remove(seed.hash); - ranking.delete(seed.hash); - matchingSeeds.put(seed.hash, seed); - } else if (seed.getFlagAcceptRemoteIndex() && seed.getAge() < 1) { // the 'workshop feature' - Log.logInfo("PLASMA", "selectPeers/Age: " + seed.hash + ":" + seed.getName() + ", is newbie, age = " + seed.getAge()); - regularSeeds.remove(seed.hash); - ranking.delete(seed.hash); - matchingSeeds.put(seed.hash, seed); - } - } - - // evaluate the ranking score and select seeds - seedcount = Math.min(ranking.size(), seedcount); - final yacySeed[] result = new yacySeed[seedcount + matchingSeeds.size()]; - c = 0; - final Iterator iters = ranking.keys(false); // higher are better - while (iters.hasNext() && c < seedcount) { - seed = regularSeeds.get(iters.next()); - seed.selectscore = c; - Log.logInfo("PLASMA", "selectPeers/_dht_: " + seed.hash + ":" + seed.getName() + " is choice " + c); - result[c++] = seed; - } - for (final yacySeed s: matchingSeeds.values()) { - s.selectscore = c; - Log.logInfo("PLASMA", "selectPeers/_match_: " + s.hash + ":" + s.getName() + " is choice " + c); - result[c++] = s; - } - -// System.out.println("DEBUG yacySearch.selectPeers = " + seedcount + " seeds:"); for (int i = 0; i < seedcount; i++) System.out.println(" #" + i + ":" + result[i]); // debug - return result; - } - public static yacySearch[] primaryRemoteSearches( final String wordhashes, final String excludehashes, final Pattern prefer, final Pattern filter, String language, @@ -259,11 +156,12 @@ public class yacySearch extends Thread { final yacySeedDB peers, final RankingProcess containerCache, final SearchEvent.SecondarySearchSuperviser secondarySearchSuperviser, - int targets, final Blacklist blacklist, final RankingProfile rankingProfile, final Bitfield constraint, - final SortedMap clusterselection) { + final SortedMap clusterselection, + final int burstRobinsonPercent, + 
final int burstMultiwordPercent) { // check own peer status //if (wordIndex.seedDB.mySeed() == null || wordIndex.seedDB.mySeed().getPublicAddress() == null) { return null; } @@ -272,14 +170,15 @@ public class yacySearch extends Thread { assert wordhashes.length() >= 12 : "wordhashes = " + wordhashes; final yacySeed[] targetPeers = (clusterselection == null) ? - selectSearchTargets( + PeerSelection.selectSearchTargets( peers, QueryParams.hashes2Set(wordhashes), - targets, - peers.redundancy()) - : selectClusterPeers(peers, clusterselection); + peers.redundancy(), + burstRobinsonPercent, + burstMultiwordPercent) + : PeerSelection.selectClusterPeers(peers, clusterselection); if (targetPeers == null) return new yacySearch[0]; - targets = targetPeers.length; + int targets = targetPeers.length; if (targets == 0) return new yacySearch[0]; final yacySearch[] searchThreads = new yacySearch[targets]; for (int i = 0; i < targets; i++) { @@ -292,6 +191,7 @@ public class yacySearch extends Thread { indexSegment, peers, containerCache, secondarySearchSuperviser, blacklist, rankingProfile, constraint); searchThreads[i].start(); } catch (OutOfMemoryError e) { + e.printStackTrace(); break; } } diff --git a/source/de/anomic/yacy/yacySeed.java b/source/de/anomic/yacy/yacySeed.java index 772469525..a4148554c 100644 --- a/source/de/anomic/yacy/yacySeed.java +++ b/source/de/anomic/yacy/yacySeed.java @@ -47,6 +47,7 @@ import java.io.IOException; import java.net.InetAddress; import java.net.MalformedURLException; import java.net.URL; +import java.util.Comparator; import java.util.Date; import java.util.HashMap; import java.util.Iterator; @@ -71,7 +72,7 @@ import de.anomic.tools.bitfield; import de.anomic.tools.crypt; import de.anomic.yacy.dht.FlatWordPartitionScheme; -public class yacySeed implements Cloneable { +public class yacySeed implements Cloneable, Comparable, Comparator { public static String ANON_PREFIX = "_anon"; @@ -171,7 +172,6 @@ public class yacySeed implements Cloneable { 
public String hash; /** a set of identity founding values, eg. IP, name of the peer, YaCy-version, ...*/ private final ConcurrentHashMap dna; - protected int selectscore = -1; // only for debugging private String alternativeIP = null; public yacySeed(final String theHash, final ConcurrentHashMap theDna) { @@ -858,5 +858,24 @@ public class yacySeed implements Cloneable { ndna.putAll(this.dna); return new yacySeed(this.hash, ndna); } + + @Override + public int compareTo(yacySeed arg0) { + // TODO Auto-generated method stub + int o1 = this.hashCode(); + int o2 = arg0.hashCode(); + if (o1 > o2) return 1; + if (o2 > o1) return -1; + return 0; + } + + public int hashCode() { + return (int) (Base64Order.enhancedCoder.cardinal(this.hash) & ((long) Integer.MAX_VALUE)); + } + + @Override + public int compare(yacySeed o1, yacySeed o2) { + return o1.compareTo(o2); + } } diff --git a/source/net/yacy/cora/protocol/http/HTTPClient.java b/source/net/yacy/cora/protocol/http/HTTPClient.java index 455a3c5d5..ec8f04a0b 100644 --- a/source/net/yacy/cora/protocol/http/HTTPClient.java +++ b/source/net/yacy/cora/protocol/http/HTTPClient.java @@ -91,7 +91,7 @@ import org.apache.http.util.EntityUtils; */ public class HTTPClient { - private final static int maxcon = 20; + private final static int maxcon = 200; private static IdledConnectionEvictor idledConnectionEvictor = null; private static HttpClient httpClient = initConnectionManager(); private Header[] headers = null; diff --git a/source/net/yacy/document/Condenser.java b/source/net/yacy/document/Condenser.java index 38eaf1254..0e5df294c 100644 --- a/source/net/yacy/document/Condenser.java +++ b/source/net/yacy/document/Condenser.java @@ -105,7 +105,7 @@ public final class Condenser { final boolean indexText, final boolean indexMedia, final WordCache meaningLib - ) throws UnsupportedEncodingException { + ) { // if addMedia == true, then all the media links are also parsed and added to the words // added media words are flagged with the 
appropriate media flag this.words = new HashMap(); @@ -254,7 +254,7 @@ public final class Condenser { } } - public Condenser(final InputStream text, final WordCache meaningLib) throws UnsupportedEncodingException { + public Condenser(final InputStream text, final WordCache meaningLib) { this.languageIdentificator = null; // we don't need that here // analysis = new Properties(); words = new TreeMap(); @@ -278,7 +278,7 @@ public final class Condenser { return this.languageIdentificator.getLanguage(); } - private void createCondensement(final InputStream is, final WordCache meaningLib) throws UnsupportedEncodingException { + private void createCondensement(final InputStream is, final WordCache meaningLib) { assert is != null; final Set currsentwords = new HashSet(); StringBuilder sentence = new StringBuilder(100); @@ -461,11 +461,7 @@ public final class Condenser { } catch (UnsupportedEncodingException e1) { buffer = new ByteArrayInputStream(text.getBytes()); } - try { - return new Condenser(buffer, meaningLib).words(); - } catch (final UnsupportedEncodingException e) { - return null; - } + return new Condenser(buffer, meaningLib).words(); } public static void main(final String[] args) { diff --git a/source/net/yacy/document/WordTokenizer.java b/source/net/yacy/document/WordTokenizer.java index fcf75ba5d..27a25fe33 100644 --- a/source/net/yacy/document/WordTokenizer.java +++ b/source/net/yacy/document/WordTokenizer.java @@ -40,7 +40,7 @@ public class WordTokenizer implements Enumeration { private unsievedWordsEnum e; private WordCache meaningLib; - public WordTokenizer(final InputStream is, final WordCache meaningLib) throws UnsupportedEncodingException { + public WordTokenizer(final InputStream is, final WordCache meaningLib) { assert is != null; this.e = new unsievedWordsEnum(is); this.buffer = nextElement0(); @@ -83,7 +83,7 @@ public class WordTokenizer implements Enumeration { private List s; private int sIndex; - public unsievedWordsEnum(final InputStream is) 
throws UnsupportedEncodingException { + public unsievedWordsEnum(final InputStream is) { assert is != null; e = new SentenceReader(is); s = new ArrayList();