introduction of dht-burst modes: this can expand the number of target peers in some cases where a better heuristic is needed. The problematic cases are either when a multi-word search is made (still a hard case for our term-oriented DHT) or when a network operator wants all robinson peers to be asked. We therefore introduced two new network steering values that switch on more peers during the peer selection. Because the number of peers can now be very large, the maximum number of httpc connections was also increased.

Please see the new comments in yacy.network.freeworld.unit for details of the new DHT selection methods.
The number of maximum peers is now not fixed to a specific number but may increase with
- the partition exponent
- the number of redundant peers
- the robinson burst percentage
- the multiword burst percentage
The maximum can then be the number of senior peers (all visible peers).

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7479 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 14 years ago
parent 4588b5a291
commit 5892fff51f

@ -6,17 +6,68 @@
# this is a work in progress. disabled properties are not yet used #
# -----------------------------------------------------------------#
# general network definition
# define the name of the network
# this nickname is also used to identify network requests
network.unit.name = freeworld
# the visible name of the network
network.unit.description = Public YaCy Community
# definition of the content domain: possible values are:
# global, local, any
network.unit.domain = global
# maximum search time for remote queries (deprecated)
network.unit.search.time = 4
# flag to switch on dht transmission
# if the dht transmission is set to 'false' then for a global
# query all targets are accessed
network.unit.dht = true
# the number of redundant target peers:
# redundant peers get a copy of the original dht target information
network.unit.dhtredundancy.junior = 1
network.unit.dhtredundancy.senior = 3
# the vertical partition of the dht: this applies a division
# of the dht into 2^^<partitionExponent> fragments which get
# all the same word-partition targets but a document-dht computed
# fragment of all documents
network.unit.dht.partitionExponent = 4
# network request burst attributes: this enables non-dht target
# positions for certain situations. This is not a 'traditional' burst-mode
# since it does not refer to a handshake to a single client but it refers
# to not-handshaking in a distributed way. It means to get data without using
# a dht transmission logic.
# robinson burst: percentage of the number of robinson peers that
# shall be accessed for every search. This includes also robinson peers
# that do not have a matching peer tag. If this is set to 100 then all robinson
# peers are always asked
network.unit.dht.burst.robinson = 50
# multi-word burst: percentage of the number of all peers that
# shall be accessed for multi-word searches. Multi-word search is
# a hard problem when the distributed search network is divided by
# term (as done with yacy, partly..).
# A scientific solution to this problem is to apply heuristics.
# This heuristic enables to switch on a full network scan to get also
# non-distributed multi-word positions. For a full scan set this value to 100.
# Attention: this may out-number the maxcount of available httpc network connections.
network.unit.dht.burst.multiword = 30
# switch to enable verification of search results
# must be set to true in untrusted networks and can be
# set to false in completely trusted networks
network.unit.inspection.searchverify = true
# speed of remote crawl de-queueing. this is the number of milliseconds
# as a pause between two requests
network.unit.remotecrawl.speed = 300
# addresses of seed-list bootstrap locations
network.unit.bootstrap.seedlist0 = http://www.yacy.net/seed.txt
network.unit.bootstrap.seedlist1 = http://home.arcor.de/hermens/yacy/seed.txt
network.unit.bootstrap.seedlist2 = http://low.audioattack.de/yacy/seed.txt

@ -61,6 +61,7 @@ import de.anomic.search.Segment;
import de.anomic.search.Segments;
import de.anomic.search.Switchboard;
import de.anomic.search.ResultEntry;
import de.anomic.search.SwitchboardConstants;
import de.anomic.server.serverCore;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@ -294,7 +295,7 @@ public final class search {
yacyChannel.channels(yacyChannel.REMOTESEARCH).addMessage(new RSSMessage("Remote Search Request from " + ((remoteSeed == null) ? "unknown" : remoteSeed.getName()), QueryParams.anonymizedQueryHashes(theQuery.queryHashes), ""));
// make event
theSearch = SearchEventCache.getEvent(theQuery, sb.peers, sb.tables, null, abstracts.length() > 0, sb.loader);
theSearch = SearchEventCache.getEvent(theQuery, sb.peers, sb.tables, null, abstracts.length() > 0, sb.loader, (int) sb.getConfigLong(SwitchboardConstants.DHT_BURST_ROBINSON, 0), (int) sb.getConfigLong(SwitchboardConstants.DHT_BURST_MULTIWORD, 0));
// set statistic details of search result and find best result index set
joincount = theSearch.getRankingResult().getLocalIndexCount() - theSearch.getRankingResult().getMissCount();

@ -520,7 +520,7 @@ public class yacysearch {
theQuery.setOffset(0); // in case that this is a new search, always start without a offset
offset = 0;
}
final SearchEvent theSearch = SearchEventCache.getEvent(theQuery, sb.peers, sb.tables, (sb.isRobinsonMode()) ? sb.clusterhashes : null, false, sb.loader);
final SearchEvent theSearch = SearchEventCache.getEvent(theQuery, sb.peers, sb.tables, (sb.isRobinsonMode()) ? sb.clusterhashes : null, false, sb.loader, (int) sb.getConfigLong(SwitchboardConstants.DHT_BURST_ROBINSON, 0), (int) sb.getConfigLong(SwitchboardConstants.DHT_BURST_MULTIWORD, 0));
try {Thread.sleep(global ? 100 : 10);} catch (InterruptedException e1) {} // wait a little time to get first results in the search
if (offset == 0) {

@ -501,13 +501,9 @@ public class YMarkTables {
}
/**
 * Extracts the word counts of a document via a Condenser run and returns
 * them sorted; returns an empty map when no document is given.
 *
 * NOTE(review): the try/catch block below and the if-block that follows it
 * perform the same Condenser call twice. This looks like diff residue (the
 * old IOException-throwing Condenser constructor next to the new
 * non-throwing one) rather than intended double execution — confirm against
 * the repository before relying on this text.
 */
public static TreeMap<String,Word> getWordCounts(final Document document) {
try {
if(document != null) {
return sortWordCounts(new Condenser(document, true, true, LibraryProvider.dymLib).words());
}
} catch (IOException e) {
Log.logException(e);
}
if (document != null) {
return sortWordCounts(new Condenser(document, true, true, LibraryProvider.dymLib).words());
}
// no document given: hand back an empty, sorted word map
return new TreeMap<String, Word>();
}

@ -86,7 +86,9 @@ public final class SearchEvent {
final WorkTables workTables,
final SortedMap<byte[], String> preselectedPeerHashes,
final boolean generateAbstracts,
final LoaderDispatcher loader) {
final LoaderDispatcher loader,
final int burstRobinsonPercent,
final int burstMultiwordPercent) {
this.eventTime = System.currentTimeMillis(); // for lifetime check
this.peers = peers;
this.workTables = workTables;
@ -106,9 +108,7 @@ public final class SearchEvent {
boolean remote = (query.domType == QueryParams.SEARCHDOM_GLOBALDHT) || (query.domType == QueryParams.SEARCHDOM_CLUSTERALL);
if (remote && peers.sizeConnected() == 0) remote = false;
final long start = System.currentTimeMillis();
if (remote) {
final int fetchpeers = 32;
if (remote) {
// initialize a ranking process that is the target for data
// that is generated concurrently from local and global search threads
this.rankingProcess = new RankingProcess(this.query, this.order, max_results_preparation);
@ -118,7 +118,6 @@ public final class SearchEvent {
// start global searches
final long timer = System.currentTimeMillis();
Log.logFine("SEARCH_EVENT", "STARTING " + fetchpeers + " THREADS TO CATCH EACH " + query.displayResults() + " URLs");
this.primarySearchThreads = (query.queryHashes.isEmpty()) ? null : yacySearch.primaryRemoteSearches(
QueryParams.hashSet2hashString(query.queryHashes),
QueryParams.hashSet2hashString(query.excludeHashes),
@ -133,11 +132,13 @@ public final class SearchEvent {
peers,
rankingProcess,
secondarySearchSuperviser,
fetchpeers,
Switchboard.urlBlacklist,
query.ranking,
query.constraint,
(query.domType == QueryParams.SEARCHDOM_GLOBALDHT) ? null : preselectedPeerHashes);
(query.domType == QueryParams.SEARCHDOM_GLOBALDHT) ? null : preselectedPeerHashes,
burstRobinsonPercent,
burstMultiwordPercent);
Log.logFine("SEARCH_EVENT", "STARTING " + this.primarySearchThreads.length + " THREADS TO CATCH EACH " + query.displayResults() + " URLs");
if (this.primarySearchThreads != null) {
this.rankingProcess.moreFeeders(this.primarySearchThreads.length);
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.searchEvent(query.id(true), Type.REMOTESEARCH_START, "", this.primarySearchThreads.length, System.currentTimeMillis() - timer), false);

@ -104,7 +104,9 @@ public class SearchEventCache {
final WorkTables workTables,
final SortedMap<byte[], String> preselectedPeerHashes,
final boolean generateAbstracts,
final LoaderDispatcher loader) {
final LoaderDispatcher loader,
final int burstRobinsonPercent,
final int burstMultiwordPercent) {
String id = query.id(false);
SearchEvent event = SearchEventCache.lastEvents.get(id);
@ -126,7 +128,7 @@ public class SearchEventCache {
}
if (event == null) {
// start a new event
event = new SearchEvent(query, peers, workTables, preselectedPeerHashes, generateAbstracts, loader);
event = new SearchEvent(query, peers, workTables, preselectedPeerHashes, generateAbstracts, loader, burstRobinsonPercent, burstMultiwordPercent);
}
return event;

@ -28,7 +28,6 @@ package de.anomic.search;
import java.io.File;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.Date;
import java.util.Iterator;
import java.util.Map;
@ -423,11 +422,7 @@ public class Segment {
}
// get the word set
Set<String> words = null;
try {
words = new Condenser(document, true, true, null).words().keySet();
} catch (final UnsupportedEncodingException e) {
Log.logException(e);
}
words = new Condenser(document, true, true, null).words().keySet();
// delete all word references
int count = 0;

@ -49,7 +49,6 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.security.NoSuchAlgorithmException;
import java.security.PublicKey;
@ -1834,19 +1833,13 @@ public final class Switchboard extends serverSwitch {
Condenser[] condenser = new Condenser[in.documents.length];
if (this.log.isFine()) log.logFine("Condensing for '" + in.queueEntry.url().toNormalform(false, true) + "'");
for (int i = 0; i < in.documents.length; i++) {
// strip out words and generate statistics
try {
condenser[i] = new Condenser(in.documents[i], in.queueEntry.profile().indexText(), in.queueEntry.profile().indexMedia(), LibraryProvider.dymLib);
// update image result list statistics
// its good to do this concurrently here, because it needs a DNS lookup
// to compute a URL hash which is necessary for a double-check
final CrawlProfile profile = in.queueEntry.profile();
ResultImages.registerImages(in.queueEntry.url(), in.documents[i], (profile == null) ? true : !profile.remoteIndexing());
} catch (final UnsupportedEncodingException e) {
return null;
}
condenser[i] = new Condenser(in.documents[i], in.queueEntry.profile().indexText(), in.queueEntry.profile().indexMedia(), LibraryProvider.dymLib);
// update image result list statistics
// its good to do this concurrently here, because it needs a DNS lookup
// to compute a URL hash which is necessary for a double-check
final CrawlProfile profile = in.queueEntry.profile();
ResultImages.registerImages(in.queueEntry.url(), in.documents[i], (profile == null) ? true : !profile.remoteIndexing());
}
return new indexingQueueEntry(in.process, in.queueEntry, in.documents, condenser);
}

@ -271,6 +271,10 @@ public final class SwitchboardConstants {
public static final String CLUSTER_MODE_PRIVATE_CLUSTER = "privatecluster";
public static final String CLUSTER_MODE_PUBLIC_PEER = "publicpeer";
public static final String CLUSTER_PEERS_IPPORT = "cluster.peers.ipport";
public static final String DHT_BURST_ROBINSON = "network.unit.dht.burst.robinson";
public static final String DHT_BURST_MULTIWORD = "network.unit.dht.burst.multiword";
/**
* <p><code>public static final String <strong>CRAWLER_THREADS_ACTIVE_MAX</strong> = "crawler.MaxActiveThreads"</code></p>
* <p>Name of the setting how many active crawler-threads may maximal be running on the same time</p>

@ -65,7 +65,7 @@ public interface PartitionScheme {
public long dhtPosition(final byte[] wordHash, final int verticalPosition);
public int verticalPosition(final byte[] urlHash);
public long[] dhtPositions(final byte[] wordHash);
public long dhtDistance(final byte[] word, final String urlHash, final yacySeed peer);

@ -26,9 +26,12 @@ package de.anomic.yacy.dht;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import net.yacy.cora.date.AbstractFormatter;
import net.yacy.cora.storage.DynamicScore;
@ -53,13 +56,109 @@ import de.anomic.yacy.yacyVersion;
*/
public class PeerSelection {
public static void selectDHTPositions(
/**
 * Resolve a cluster definition (peer hash -> alternative address) into the
 * corresponding seed objects. Hashes that cannot be resolved in the seed
 * database are silently dropped; resolved seeds get the alternative address
 * from the map assigned before they are returned.
 *
 * @param seedDB     the seed database used to look up peer hashes
 * @param peerhashes map of peer hash bytes to alternative peer addresses
 * @return the resolved seeds, in the iteration order of the sorted map
 */
public static yacySeed[] selectClusterPeers(final yacySeedDB seedDB, final SortedMap<byte[], String> peerhashes) {
    final List<yacySeed> resolved = new ArrayList<yacySeed>();
    for (final Map.Entry<byte[], String> peerEntry : peerhashes.entrySet()) {
        // should be getConnected; get only during testing time
        final yacySeed candidate = seedDB.get(new String(peerEntry.getKey()));
        if (candidate == null) continue;
        candidate.setAlternativeAddress(peerEntry.getValue());
        resolved.add(candidate);
    }
    return resolved.toArray(new yacySeed[resolved.size()]);
}
/**
 * Select the target peers for a remote search over the given word hashes.
 * Candidates are merged from four sources:
 * (1) peers at the dht positions of each word hash,
 * (2) large peers picked by index size, boosted by the multi-word burst rate,
 * (3) robinson (non-dht) peers picked by the robinson burst rate,
 * (4) robinson peers whose peer tags match the query, plus newbie peers.
 *
 * @param seedDB                the seed database; may be null (returns null then)
 * @param wordhashes            the search term hashes
 * @param redundancy            number of redundant dht targets per word hash
 * @param burstRobinsonPercent  0..100 share of robinson peers to ask
 * @param burstMultiwordPercent 0..100 share of all peers to ask for multi-word queries
 * @return array of target seeds, or null if seedDB is null
 */
public static yacySeed[] selectSearchTargets(
final yacySeedDB seedDB,
final HandleSet wordhashes,
int redundancy,
int burstRobinsonPercent,
int burstMultiwordPercent) {
// find out a specific number of seeds, that would be relevant for the given word hash(es)
// the result is ordered by relevance: [0] is most relevant
// the seedcount is the maximum number of wanted results
// NOTE(review): the "ordered by relevance" remark above looks stale — the
// result is produced from a HashMap's values, which has no defined order;
// confirm with the callers whether ordering still matters.
if (seedDB == null) { return null; }
// put in seeds according to dht
final Map<String, yacySeed> regularSeeds = new HashMap<String, yacySeed>(); // dht position seeds
yacySeed seed;
Iterator<yacySeed> dhtEnum;
Iterator<byte[]> iter = wordhashes.iterator();
while (iter.hasNext()) {
selectDHTPositions(seedDB, iter.next(), redundancy, regularSeeds);
}
//int minimumseeds = Math.min(seedDB.scheme.verticalPartitions(), regularSeeds.size()); // that should be the minimum number of seeds that are returned
//int maximumseeds = seedDB.scheme.verticalPartitions() * redundancy; // this is the maximum number of seeds according to dht and heuristics. It can be more using burst mode.
// put in some seeds according to size of peer.
// But not all, that would produce too much load on the largest peers
dhtEnum = seedDB.seedsSortedConnected(false, yacySeed.ICOUNT);
// candidate budget: at least min(5, connected) peers; for multi-word queries
// the budget grows to burstMultiwordPercent percent of all connected peers
int c = Math.max(Math.min(5, seedDB.sizeConnected()), wordhashes.size() > 1 ? seedDB.sizeConnected() * burstMultiwordPercent / 100 : 0);
while (dhtEnum.hasNext() && c-- > 0) {
seed = dhtEnum.next();
if (seed == null) continue;
// newbie peers (age < 1) are always included so fresh installations get traffic
if (seed.getAge() < 1) { // the 'workshop feature'
Log.logInfo("DHT", "selectPeers/Age: " + seed.hash + ":" + seed.getName() + ", is newbie, age = " + seed.getAge());
regularSeeds.put(seed.hash, seed);
continue;
}
// probabilistic pick: acceptance chance rises with burstMultiwordPercent for
// multi-word queries; single-word queries use a fixed 25-point bias
if (Math.random() * 100 + (wordhashes.size() > 1 ? burstMultiwordPercent : 25) >= 50) {
if (Log.isFine("DHT")) Log.logFine("DHT", "selectPeers/CountBurst: " + seed.hash + ":" + seed.getName() + ", RWIcount=" + seed.getWordCount());
regularSeeds.put(seed.hash, seed);
continue;
}
}
// create a set that contains only robinson peers because these get a special handling
dhtEnum = seedDB.seedsConnected(true, false, null, 0.50f);
Set<yacySeed> robinson = new HashSet<yacySeed>();
while (dhtEnum.hasNext()) {
seed = dhtEnum.next();
if (seed == null) continue;
// a robinson peer is one that does not accept remote index transmissions
if (!seed.getFlagAcceptRemoteIndex()) robinson.add(seed);
}
// add robinson peers according to robinson burst rate
dhtEnum = robinson.iterator();
c = robinson.size() * burstRobinsonPercent / 100;
while (dhtEnum.hasNext() && c-- > 0) {
seed = dhtEnum.next();
// with burstRobinsonPercent == 100 this condition always holds, so all
// visited robinson peers are taken
if (Math.random() * 100 + burstRobinsonPercent >= 100) {
if (Log.isFine("DHT")) Log.logFine("DHT", "selectPeers/RobinsonBurst: " + seed.hash + ":" + seed.getName());
regularSeeds.put(seed.hash, seed);
continue;
}
}
// put in seeds that are public robinson peers and where the peer tags match with query
// or seeds that are newbies to ensure that private demonstrations always work
dhtEnum = robinson.iterator();
while (dhtEnum.hasNext()) {
seed = dhtEnum.next();
if (seed.matchPeerTags(wordhashes)) {
// peer tags match
String specialized = seed.getPeerTags().toString();
if (!specialized.equals("[*]")) Log.logInfo("DHT", "selectPeers/PeerTags: " + seed.hash + ":" + seed.getName() + ", is specialized peer for " + specialized);
regularSeeds.put(seed.hash, seed);
}
}
// produce return set
yacySeed[] result = new yacySeed[regularSeeds.size()];
result = regularSeeds.values().toArray(result);
return result;
}
private static void selectDHTPositions(
final yacySeedDB seedDB,
byte[] wordhash,
int redundancy,
Map<String, yacySeed> regularSeeds,
DynamicScore<String> ranking) {
Map<String, yacySeed> regularSeeds) {
// this method is called from the search target computation
final long[] dhtVerticalTargets = seedDB.scheme.dhtPositions(wordhash);
yacySeed seed;
@ -72,50 +171,13 @@ public class PeerSelection {
seed = dhtEnum.next();
if (seed == null || seed.hash == null) continue;
if (!seed.getFlagAcceptRemoteIndex()) continue; // probably a robinson peer
if (Log.isFine("PLASMA")) Log.logFine("PLASMA", "selectPeers/DHTorder: " + seed.hash + ":" + seed.getName() + "/ score " + c);
ranking.inc(seed.hash, 2 * c);
if (Log.isFine("DHT")) Log.logFine("DHT", "selectPeers/DHTorder: " + seed.hash + ":" + seed.getName() + "/ score " + c);
regularSeeds.put(seed.hash, seed);
c--;
}
}
}
private static int guessedOwn = 0;
/**
 * Decide whether the given word/url combination belongs to this peer's dht
 * responsibility. Uses a cheap heuristic first and only runs the exact
 * (slower) verification when the heuristic answers positively.
 *
 * @param seedDB     the seed database
 * @param wordhash   hash of the word in question
 * @param urlhash    hash of the url in question
 * @param redundancy number of redundant dht targets to consider
 * @return true if this peer shall own the word, false otherwise
 */
public static boolean shallBeOwnWord(final yacySeedDB seedDB, final byte[] wordhash, final String urlhash, final int redundancy) {
    // Fast pre-check: a 'false' from the guess is always correct,
    // only a 'true' may be wrong and needs confirmation.
    if (!guessIfOwnWord(seedDB, wordhash, urlhash)) return false;
    guessedOwn++;
    return verifyIfOwnWord(seedDB, wordhash, urlhash, redundancy);
}
/**
 * Cheap heuristic ownership check: compares the dht distance between the
 * word's target position and this peer's own position against roughly twice
 * the average inter-peer interval. A negative answer is always reliable;
 * a positive answer may be wrong and must be verified separately.
 */
private static boolean guessIfOwnWord(final yacySeedDB seedDB, final byte[] wordhash, final String urlhash) {
    if (seedDB == null) return false;
    final int connected = seedDB.sizeConnected();
    // a lonely peer owns every word by definition
    if (connected == 0) return true;
    final long targetPosition = seedDB.scheme.dhtPosition(wordhash, urlhash);
    final long ownPosition = seedDB.scheme.dhtPosition(seedDB.mySeed().hash.getBytes(), urlhash);
    final long distance = FlatWordPartitionScheme.dhtDistance(targetPosition, ownPosition);
    // own the word when the distance is positive and within about two
    // average peer intervals of the dht ring
    return distance > 0 && distance <= Long.MAX_VALUE / connected * 2;
}
/**
 * Exact ownership check: maps the word to its canonical dht position hash and
 * scans the redundancy-many peers responsible for that position, returning
 * true iff this peer is among them.
 */
private static boolean verifyIfOwnWord(final yacySeedDB seedDB, byte[] wordhash, String urlhash, int redundancy) {
    final String ownHash = seedDB.mySeed().hash;
    final byte[] positionHash = FlatWordPartitionScheme.positionToHash(seedDB.scheme.dhtPosition(wordhash, urlhash));
    for (final Iterator<yacySeed> targets = getAcceptRemoteIndexSeeds(seedDB, positionHash, redundancy, true); targets.hasNext();) {
        if (targets.next().hash.equals(ownHash)) return true;
    }
    return false;
}
/**
 * Produce a pseudo-random dht start hash for index transfers, derived from
 * the MD5 digest of the current time in milliseconds.
 *
 * @return a hash fragment of Word.commonHashLength bytes
 */
public static byte[] selectTransferStart() {
    final String now = Long.toString(System.currentTimeMillis());
    final String encoded = Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(now));
    // skip the first two characters and cut to the common word hash length
    return encoded.substring(2, 2 + Word.commonHashLength).getBytes();
}
@ -131,7 +193,7 @@ public class PeerSelection {
final byte[] starthash,
int max,
boolean alsoMyOwn) {
final Iterator<yacySeed> seedIter = PeerSelection.getAcceptRemoteIndexSeeds(seedDB, starthash, max, alsoMyOwn);
final Iterator<yacySeed> seedIter = getAcceptRemoteIndexSeeds(seedDB, starthash, max, alsoMyOwn);
final ArrayList<yacySeed> targets = new ArrayList<yacySeed>();
while (seedIter.hasNext() && max-- > 0) targets.add(seedIter.next());
return targets;
@ -159,7 +221,7 @@ public class PeerSelection {
private int remaining;
private boolean alsoMyOwn;
public acceptRemoteIndexSeedEnum(yacySeedDB seedDB, final byte[] starthash, int max, boolean alsoMyOwn) {
private acceptRemoteIndexSeedEnum(yacySeedDB seedDB, final byte[] starthash, int max, boolean alsoMyOwn) {
this.seedDB = seedDB;
this.se = getDHTSeeds(seedDB, starthash, yacyVersion.YACY_HANDLES_COLLECTION_INDEX);
this.remaining = max;
@ -238,7 +300,7 @@ public class PeerSelection {
private float minVersion;
private yacySeedDB seedDB;
public seedDHTEnum(final yacySeedDB seedDB, final byte[] firstHash, final float minVersion) {
private seedDHTEnum(final yacySeedDB seedDB, final byte[] firstHash, final float minVersion) {
this.seedDB = seedDB;
this.steps = seedDB.sizeConnected();
this.minVersion = minVersion;
@ -290,7 +352,7 @@ public class PeerSelection {
private yacySeed nextSeed;
private yacySeedDB seedDB;
public providesRemoteCrawlURLsEnum(final yacySeedDB seedDB) {
private providesRemoteCrawlURLsEnum(final yacySeedDB seedDB) {
this.seedDB = seedDB;
se = getDHTSeeds(seedDB, null, yacyVersion.YACY_POVIDES_REMOTECRAWL_LISTS);
nextSeed = nextInternal();

@ -422,7 +422,7 @@ public final class yacyClient {
sitehash, authorhash, count, maxDistance, global, partitions, target.getHexHash() + ".yacyh", target.getClusterAddress(),
secondarySearchSuperviser, rankingProfile, constraint);
} catch (final IOException e) {
yacyCore.log.logInfo("SEARCH failed, Peer: " + target.hash + ":" + target.getName() + " (" + e.getMessage() + "), score=" + target.selectscore);
yacyCore.log.logInfo("SEARCH failed, Peer: " + target.hash + ":" + target.getName() + " (" + e.getMessage() + ")");
//yacyCore.peerActions.peerDeparture(target, "search request to peer created io exception: " + e.getMessage());
return -1;
}
@ -553,17 +553,12 @@ public final class yacyClient {
}
// generate statistics
if (yacyCore.log.isFine()) yacyCore.log.logFine("SEARCH "
+ result.urlcount
+ " URLS FROM "
+ target.hash
+ ":"
+ target.getName()
+ ", score="
+ target.selectscore
+ ", searchtime=" + result.searchtime + ", netdelay="
+ (totalrequesttime - result.searchtime) + ", references="
+ result.references);
if (yacyCore.log.isFine()) yacyCore.log.logFine(
"SEARCH " + result.urlcount +
" URLS FROM " + target.hash + ":" + target.getName() +
", searchtime=" + result.searchtime +
", netdelay=" + (totalrequesttime - result.searchtime) +
", references=" + result.references);
return result.urlcount;
}

@ -24,16 +24,10 @@
package de.anomic.yacy;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.SortedMap;
import java.util.regex.Pattern;
import net.yacy.cora.storage.DynamicScore;
import net.yacy.cora.storage.ScoreCluster;
import net.yacy.kelondro.index.HandleSet;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Bitfield;
@ -151,104 +145,7 @@ public class yacySearch extends Thread {
public yacySeed target() {
return targetPeer;
}
/**
 * Resolve a cluster definition (peer hash -> alternative address) into seed
 * objects. Unresolvable hashes are silently dropped; resolved seeds get the
 * alternative address from the map assigned before being returned.
 *
 * @param seedDB     the seed database used to look up peer hashes
 * @param peerhashes map of peer hash bytes to alternative peer addresses
 * @return the resolved seeds, in the iteration order of the sorted map
 */
private static yacySeed[] selectClusterPeers(final yacySeedDB seedDB, final SortedMap<byte[], String> peerhashes) {
    final Iterator<Map.Entry<byte[], String>> i = peerhashes.entrySet().iterator();
    final List<yacySeed> l = new ArrayList<yacySeed>();
    Map.Entry<byte[], String> entry;
    yacySeed s;
    while (i.hasNext()) {
        entry = i.next();
        // NOTE(review): new String(byte[]) uses the platform charset — confirm
        // hashes are ASCII-safe before relying on this conversion.
        s = seedDB.get(new String(entry.getKey())); // should be getConnected; get only during testing time
        if (s != null) {
            s.setAlternativeAddress(entry.getValue());
            l.add(s);
        }
    }
    // commented-out manual copy loop removed: toArray does the same job
    return l.toArray(new yacySeed[0]);
}
/**
 * (Pre-burst variant.) Select up to seedcount target peers for a remote
 * search: dht-position peers and large peers are collected with a random
 * score into a ranking, then peers with matching peer tags and newbies are
 * moved into a separate always-included set. The result holds the best-ranked
 * regular seeds first, followed by all tag-matching/newbie seeds.
 *
 * @param seedDB     the seed database; may be null (returns null then)
 * @param wordhashes the search term hashes
 * @param seedcount  maximum number of ranked regular seeds to return
 * @param redundancy number of redundant dht targets per word hash
 * @return ordered target seeds, or null if seedDB is null
 */
private static yacySeed[] selectSearchTargets(final yacySeedDB seedDB, final HandleSet wordhashes, int seedcount, int redundancy) {
// find out a specific number of seeds, that would be relevant for the given word hash(es)
// the result is ordered by relevance: [0] is most relevant
// the seedcount is the maximum number of wanted results
if (seedDB == null) { return null; }
// cap the request at the number of connected peers; in no-dht networks
// all connected peers are candidates
if ((seedcount >= seedDB.sizeConnected()) || (seedDB.noDHTActivity())) {
seedcount = seedDB.sizeConnected();
}
// put in seeds according to dht
final DynamicScore<String> ranking = new ScoreCluster<String>();
final Map<String, yacySeed> regularSeeds = new HashMap<String, yacySeed>();
final Map<String, yacySeed> matchingSeeds = new HashMap<String, yacySeed>();
yacySeed seed;
Iterator<yacySeed> dhtEnum;
Iterator<byte[]> iter = wordhashes.iterator();
while (iter.hasNext()) {
PeerSelection.selectDHTPositions(seedDB, iter.next(), redundancy, regularSeeds, ranking);
}
// put in seeds according to size of peer
dhtEnum = seedDB.seedsSortedConnected(false, yacySeed.ICOUNT);
int c = Math.min(seedDB.sizeConnected(), seedcount);
int score;
while (dhtEnum.hasNext() && c > 0) {
seed = dhtEnum.next();
if (seed == null) continue;
if (!seed.getFlagAcceptRemoteIndex()) continue; // probably a robinson peer
// random score so repeated searches spread load over the large peers
score = (int) Math.round(Math.random() * ((c / 3) + 3));
if (Log.isFine("PLASMA")) Log.logFine("PLASMA", "selectPeers/RWIcount: " + seed.hash + ":" + seed.getName() + ", RWIcount=" + seed.getWordCount() + ", score " + score);
ranking.inc(seed.hash, score);
regularSeeds.put(seed.hash, seed);
c--;
}
// put in seeds that are public robinson peers and where the peer tags match with query
// or seeds that are newbies to ensure that public demonstrations always work
dhtEnum = seedDB.seedsConnected(true, false, null, (float) 0.50);
while (dhtEnum.hasNext()) {
seed = dhtEnum.next();
if (seed == null) continue;
if (seed.matchPeerTags(wordhashes)) {
String specialized = seed.getPeerTags().toString();
if (!specialized.equals("[*]")) Log.logInfo("PLASMA", "selectPeers/PeerTags: " + seed.hash + ":" + seed.getName() + ", is specialized peer for " + specialized);
// move from the ranked set into the always-included set
regularSeeds.remove(seed.hash);
ranking.delete(seed.hash);
matchingSeeds.put(seed.hash, seed);
} else if (seed.getFlagAcceptRemoteIndex() && seed.getAge() < 1) { // the 'workshop feature'
Log.logInfo("PLASMA", "selectPeers/Age: " + seed.hash + ":" + seed.getName() + ", is newbie, age = " + seed.getAge());
regularSeeds.remove(seed.hash);
ranking.delete(seed.hash);
matchingSeeds.put(seed.hash, seed);
}
}
// evaluate the ranking score and select seeds
seedcount = Math.min(ranking.size(), seedcount);
final yacySeed[] result = new yacySeed[seedcount + matchingSeeds.size()];
c = 0;
final Iterator<String> iters = ranking.keys(false); // higher are better
while (iters.hasNext() && c < seedcount) {
seed = regularSeeds.get(iters.next());
seed.selectscore = c;
Log.logInfo("PLASMA", "selectPeers/_dht_: " + seed.hash + ":" + seed.getName() + " is choice " + c);
result[c++] = seed;
}
// append the always-included tag-matching/newbie seeds after the ranked ones
for (final yacySeed s: matchingSeeds.values()) {
s.selectscore = c;
Log.logInfo("PLASMA", "selectPeers/_match_: " + s.hash + ":" + s.getName() + " is choice " + c);
result[c++] = s;
}
// System.out.println("DEBUG yacySearch.selectPeers = " + seedcount + " seeds:"); for (int i = 0; i < seedcount; i++) System.out.println("  #" + i + ":" + result[i]); // debug
return result;
}
public static yacySearch[] primaryRemoteSearches(
final String wordhashes, final String excludehashes,
final Pattern prefer, final Pattern filter, String language,
@ -259,11 +156,12 @@ public class yacySearch extends Thread {
final yacySeedDB peers,
final RankingProcess containerCache,
final SearchEvent.SecondarySearchSuperviser secondarySearchSuperviser,
int targets,
final Blacklist blacklist,
final RankingProfile rankingProfile,
final Bitfield constraint,
final SortedMap<byte[], String> clusterselection) {
final SortedMap<byte[], String> clusterselection,
final int burstRobinsonPercent,
final int burstMultiwordPercent) {
// check own peer status
//if (wordIndex.seedDB.mySeed() == null || wordIndex.seedDB.mySeed().getPublicAddress() == null) { return null; }
@ -272,14 +170,15 @@ public class yacySearch extends Thread {
assert wordhashes.length() >= 12 : "wordhashes = " + wordhashes;
final yacySeed[] targetPeers =
(clusterselection == null) ?
selectSearchTargets(
PeerSelection.selectSearchTargets(
peers,
QueryParams.hashes2Set(wordhashes),
targets,
peers.redundancy())
: selectClusterPeers(peers, clusterselection);
peers.redundancy(),
burstRobinsonPercent,
burstMultiwordPercent)
: PeerSelection.selectClusterPeers(peers, clusterselection);
if (targetPeers == null) return new yacySearch[0];
targets = targetPeers.length;
int targets = targetPeers.length;
if (targets == 0) return new yacySearch[0];
final yacySearch[] searchThreads = new yacySearch[targets];
for (int i = 0; i < targets; i++) {
@ -292,6 +191,7 @@ public class yacySearch extends Thread {
indexSegment, peers, containerCache, secondarySearchSuperviser, blacklist, rankingProfile, constraint);
searchThreads[i].start();
} catch (OutOfMemoryError e) {
e.printStackTrace();
break;
}
}

@ -47,6 +47,7 @@ import java.io.IOException;
import java.net.InetAddress;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Comparator;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
@ -71,7 +72,7 @@ import de.anomic.tools.bitfield;
import de.anomic.tools.crypt;
import de.anomic.yacy.dht.FlatWordPartitionScheme;
public class yacySeed implements Cloneable {
public class yacySeed implements Cloneable, Comparable<yacySeed>, Comparator<yacySeed> {
public static String ANON_PREFIX = "_anon";
@ -171,7 +172,6 @@ public class yacySeed implements Cloneable {
public String hash;
/** a set of identity founding values, eg. IP, name of the peer, YaCy-version, ...*/
private final ConcurrentHashMap<String, String> dna;
protected int selectscore = -1; // only for debugging
private String alternativeIP = null;
public yacySeed(final String theHash, final ConcurrentHashMap<String, String> theDna) {
@ -858,5 +858,24 @@ public class yacySeed implements Cloneable {
ndna.putAll(this.dna);
return new yacySeed(this.hash, ndna);
}
/**
 * Natural ordering of seeds: compares the int cardinals of the peer hashes
 * as produced by {@link #hashCode()}.
 *
 * @param arg0 the seed to compare against
 * @return a negative value, zero, or a positive value as this seed's hash
 *         cardinal is less than, equal to, or greater than arg0's
 */
@Override
public int compareTo(yacySeed arg0) {
    final int mine = this.hashCode();
    final int theirs = arg0.hashCode();
    if (mine == theirs) return 0;
    return (mine > theirs) ? 1 : -1;
}
/**
 * Hash code derived from the peer hash: the Base64 cardinal of the hash
 * string, masked to a non-negative int.
 *
 * NOTE(review): no equals(Object) override is visible in this chunk; if the
 * class uses identity equals, confirm the equals/hashCode contract against
 * the full class before using seeds in hash-based collections.
 */
@Override
public int hashCode() {
    return (int) (Base64Order.enhancedCoder.cardinal(this.hash) & ((long) Integer.MAX_VALUE));
}
/**
 * Comparator view of the seed ordering; delegates to
 * {@link #compareTo(yacySeed)} so both orderings stay consistent.
 */
@Override
public int compare(yacySeed o1, yacySeed o2) {
return o1.compareTo(o2);
}
}

@ -91,7 +91,7 @@ import org.apache.http.util.EntityUtils;
*/
public class HTTPClient {
private final static int maxcon = 20;
private final static int maxcon = 200;
private static IdledConnectionEvictor idledConnectionEvictor = null;
private static HttpClient httpClient = initConnectionManager();
private Header[] headers = null;

@ -105,7 +105,7 @@ public final class Condenser {
final boolean indexText,
final boolean indexMedia,
final WordCache meaningLib
) throws UnsupportedEncodingException {
) {
// if addMedia == true, then all the media links are also parsed and added to the words
// added media words are flagged with the appropriate media flag
this.words = new HashMap<String, Word>();
@ -254,7 +254,7 @@ public final class Condenser {
}
}
public Condenser(final InputStream text, final WordCache meaningLib) throws UnsupportedEncodingException {
public Condenser(final InputStream text, final WordCache meaningLib) {
this.languageIdentificator = null; // we don't need that here
// analysis = new Properties();
words = new TreeMap<String, Word>();
@ -278,7 +278,7 @@ public final class Condenser {
return this.languageIdentificator.getLanguage();
}
private void createCondensement(final InputStream is, final WordCache meaningLib) throws UnsupportedEncodingException {
private void createCondensement(final InputStream is, final WordCache meaningLib) {
assert is != null;
final Set<String> currsentwords = new HashSet<String>();
StringBuilder sentence = new StringBuilder(100);
@ -461,11 +461,7 @@ public final class Condenser {
} catch (UnsupportedEncodingException e1) {
buffer = new ByteArrayInputStream(text.getBytes());
}
try {
return new Condenser(buffer, meaningLib).words();
} catch (final UnsupportedEncodingException e) {
return null;
}
return new Condenser(buffer, meaningLib).words();
}
public static void main(final String[] args) {

@ -40,7 +40,7 @@ public class WordTokenizer implements Enumeration<String> {
private unsievedWordsEnum e;
private WordCache meaningLib;
public WordTokenizer(final InputStream is, final WordCache meaningLib) throws UnsupportedEncodingException {
public WordTokenizer(final InputStream is, final WordCache meaningLib) {
assert is != null;
this.e = new unsievedWordsEnum(is);
this.buffer = nextElement0();
@ -83,7 +83,7 @@ public class WordTokenizer implements Enumeration<String> {
private List<StringBuilder> s;
private int sIndex;
public unsievedWordsEnum(final InputStream is) throws UnsupportedEncodingException {
public unsievedWordsEnum(final InputStream is) {
assert is != null;
e = new SentenceReader(is);
s = new ArrayList<StringBuilder>();

Loading…
Cancel
Save