updated ranking tables (fresh computation)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@8103 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 13 years ago
parent 070d32ae21
commit bc5df0eef5

@@ -1 +1 @@
3o06fQFARhBYG1fyaaLgIKhaO0510AhYPjMYsX0sAB
1ZZwWAumBqYA

@@ -1 +1 @@
6zDuRYGIdeZYR-J6JRc14c_YpT2NrQy9plPa
-6gsqA020wSA1IWRpA2DGiZA3I173D4LHf1A4aCKyCFARhBYIgp3zAKYs_BDKlGntBP4hN7BTWSa6BWOOkgC_aJQZBlqygQCskjqmAyAS1GZ

@@ -1 +1 @@
-eu5sA-us6-Y0j9n6Q2dc2nY7694GZLEGNQZR473lZWe1RRBYUGRwYaIwR3Aay29rSoTEsB4vVZ-CRyrZU6Z
-dsznA0ffeYa0k8HYA1buONZ1hp4CZ1rmZEB2I2Z4a2OMHFD2dS4GC2snssC2xfZlA2yPtnA3_cOCZ3rYBLA43_QFA4DC3rZ4QT2QA4Xr0oT4aOOeC4gQ-UB4kBsyB4soqED4uvaoD4yEsRB569MNB5A85JD5AFAhB5NS-6D5OYk8A5T24mD5THrdA5qgbGB5sF1bD5unnvB6QOSwC6dTQBB6rptXD6usA-C76no0B7ka2oA7t6KsA9P-MCD9W06lD9x1MdAACBIaAAZvWMAB-1LsCB9ysWbBnGOKDCGjOTACbH3hCDpqPaBE4hAfBE73e0AE8wOQAEF18XDEclAMCEx09oBFLJT1AG-9VGbG121OCGKUAiQGYhaABGw3c_YHIYNRDHfJp2aHkNBlaJbhHIAK9ym_CKSZlzBKXICRBKZjSDDKg1NMZKokAECLpLpxCLpgIEbLzLEwAMXbAIDO1venZO9pUMAP4b63APW0DgAPhE6DDPpqN1BRDGZUARSMevBRZLwbDRyBkdQS5cf9YSA1H_DTQhlWCTQjllBTjKy9BU8tUYBUBU-UYUcb1FCUibu4AUjcguAUxQV7BVW4SCZVa3wTZVd39RAW6mcGQWvYdQbX5NQ7CXnME8aXqinsbXyU37DYTrnFCYvFrgBZceQYRZlEtda_5TwUB_9DWtB_BjKnDb5HNeAbI925CbJ8MBDbk0LFBboVvbBc6LB8Dc8kOWBcVIH9BcyK-7Dd3k77De6dtuAebtYjBep8doBfsVc0Ag-r_RBg7iLSDgBdF4ZgH2ToAg_tp9CgmKYUDgrGhFAhL2xDDiW6C7BicZSNDirHC-CjEI-jDjWy2JAkMqVVBkpOOUBkt_iIAl10fQDl1EgfalDSGZClN8sqAleZuuAm5lRIBmRbv6AmcsyJCmmz4-CmqQctCmxKFhAnOxx8Dnmc3wAnvUXNDnx345Bo2GfQCo2uawAo59ENBoTbaJbpV6YOBpaytfDq8ncZDqh9jGYqswa8BqyohEAr5F6PCr8wkPDr9NbuAroYZAYs5cngBsSmyICsX0sABs_VxeDsm2fwDsoevXCt1drJAt3YuIAt5KYbDtzQqwAuC9y_buMQoSAuPaHWSuT9E5BuisYQDvEp4rYvbE4iZvbmk1BverBJBvozSXRvuELQDvwejWAxawNOCxgso4ByKwk0Byr3IcAysR9FBysqjTBz5Lh6BzXCNNA

@@ -1 +1 @@
0kVGgD2JvFCh7eJMoQ7uXagC8I8z7B9VB_qBFp2fRQGThWkZGUAvOCK0oBfCKB4M3AMZ-kjZNANzdAOObI4BQVXU4BVUOQ5AXuknKQ_cbjnCcRaiLZhzy8YQlzZYMQmtlFPBoDRuDCt343uBtDH4rQvHR65BzntqxZ
-9a0tA-Ho8KA-IWRhA-N6LGB-RfRyA-WLVRR-pvIiR-zTGUY015OzA0CVWgA0Eck_A0F5BNA0FS6WD0Gf7WA0KHQrA0PMz3I0XgAcR0c-4sC0svxdA18HEbA19GMfA1AYK5A1GTzRA1WS-FA1uitOA1ygfnA1zncVA245hwA249wFA25q33A26YBgA2I4vyA2ILBAA2NyeoQ2PjZIA2UyolR2XVY8A2qM26A36DOEA3HFtDb3Je4gB3Pc7zR3UPfMA3_mhYA3_sELA3d6cwA3eqjQc3ooEUR3rXPsA3vbbOA3ydd7A41Qv4d44TBEA44yqKR47_4CA4BjcUA4KmySA4OwJZR4SMIEd4hW28D4kfwyA4mEdyA52tZEA58gsDA5DW-mA5PxFkA5QBqrA5RXd5A5SCrfA5Z0gIA5aMo2A5j6xBA5urnTR657elA6Cfy1A6OHmmB6PE77A6nYg1A6r62bR6vzwpR6wSzfA718X6A72gqQR76A2hT78gDIA7P3bNA7QRGAR7VlyiA885vKA8GQxVA8XBdpD8Xy6JA8xeLyA9-C6bZ94Ul4R9Fz0TA9PEwbR9aOzyc9dQuuR9jp3RA9qWyrA9rncTRAeoi-BC7kqvRCY2c1RCmbCgRCv0a1ADUIDrRDdMYvBDqWXjBE1SP3AE7iKlIEF8TAAEr2ZKREx_F6AF0ctZdF43jyRFHzvbRFNgKsRFxvahAG1JUicGAISPAGOsmRRGca52RGnWhFRH6wH8RI2Y5OAIgCE6dIj-97RInSN4ZJa0O_RK0KytRKJNAWAKRMIGRKWoccRLNOmMRLfQpzRM4zGZRMuecrAMy_0nYN-Q06RNVkbvRNxlJVRO6ZOBROFHCGRORipz4Od1hPROra0QROrgT9SOw50sROzLRuRPQNvdRPQr5MDPsK4aYPt81sRQ9SYcRQB6BmRQB6BmdQExs3RQHrJ3RQrPWDYQtDCeRR8gvxRRCyYyRRqwHERSSPtPRSVShmRSz7sxRTE6zHRTNBKMRTm8mkRTtpu-RUcqoyRUiwrFDVHUyHBVnuDNAVnuWKRVv-ziAW87FcRW9-QvRX7t7PAXPZo8RXSojIRXU2jURXmye6AY0HdSZY1b2BRYFYX4RYbg8GRZAumJRZNADHRZhnwNR_6JtuA_HyQ7A_O63wA_i190A_qwbfAa7-5QRaA-43RaNsjxRamVE8Qan3k9RbdvLoBcRr37DcvlccRcziVuRd_BoCRdvZwyReO0LKReWCuUAebXJGaehw1aRfIu9GYfbgqaRgRTGCAgWRHVRgYdleRiUOLbRiW10mRigeRoRjZjG3CjsZXnRk-_N-Rk6Ti9AkCKWxRkOUR0Akjov9Rl9M7WRlYQ2oAllVYHRlmJYLRloeuKRlrVltRoKrPlAoQB1SAoh6foRoo50PRopojSRp3qGLCpAecIRpO1r0RpS-cURpWvK2Ap_C_mRqBK_tAqCqZOdqS9z2RqSdXHRrzrBLDsZgzeRsgaHDQskzspRtDhTjRtNfYbAtom1SSu-3JrAu-yDaRuO0Z4RuYvAtBuoz83Av2DXwbv2wjGAvDPRPAvH2tlAvHn0zAvKwu1AvS_Q7Avd6T7Avh3YPAvrdkKAw-6G-Aw4R7gAwROSXAwUAWtAweQkIAwhhO5AwjJUaAwmtCZRwn6__RwnQlUAwnvgcRwruoLBx0l5FAxOgpUAxWX7FAxZS4zRxkwTIAxsFqaAy3Dbaay4KkrAy4RHURy8dvdRyAYvKAyGUedAzE667AzGc8KRzVcTSRz_2D0Az_OGtAzd_4nRzeudtAzgOyzAzoI1lAzrEL-R

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

@@ -73,6 +73,7 @@ public final class ReferenceContainerCache<ReferenceType extends Reference> exte
*/
public ReferenceContainerCache(final ReferenceFactory<ReferenceType> factory, final ByteOrder termOrder, final int termSize) {
super(factory);
assert termOrder != null;
this.termOrder = termOrder;
this.termSize = termSize;
this.containerOrder = new ContainerOrder<ReferenceType>(this.termOrder);
@@ -130,10 +131,12 @@ public final class ReferenceContainerCache<ReferenceType extends Reference> exte
// write wCache
long wordcount = 0, urlcount = 0;
byte[] term = null, lwh;
assert this.termKeyOrdering() != null;
for (final ReferenceContainer<ReferenceType> container: cachecopy) {
// get entries
lwh = term;
term = container.getTermHash();
if (term == null) continue;
// check consistency: entries must be ordered
assert (lwh == null || this.termKeyOrdering().compare(term, lwh) > 0);

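The two asserts added to ReferenceContainerCache guard the dump path: the term order must be set, and the term hashes taken from the sorted cache copy must be strictly ascending under that order. Below is a minimal, self-contained sketch of the same strict-ordering check, with a plain Comparator standing in for termKeyOrdering() and a key list standing in for cachecopy (both are stand-ins, not YaCy API):

// Sketch: verify that a sequence of byte[] keys is strictly ascending under a comparator,
// mirroring the consistency assert added to dump() above. Run with -ea to enable asserts.
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;

public class OrderCheckSketch {
    static void assertStrictlyAscending(List<byte[]> keys, Comparator<byte[]> order) {
        byte[] previous = null;
        for (byte[] key : keys) {
            if (key == null) continue; // dump() skips containers without a term hash
            // each key must compare strictly greater than its predecessor
            assert previous == null || order.compare(key, previous) > 0 : "keys out of order";
            previous = key;
        }
    }

    public static void main(String[] args) {
        Comparator<byte[]> order = Arrays::compare; // natural byte order as a stand-in
        assertStrictlyAscending(List.of(new byte[]{1}, new byte[]{2}, new byte[]{3}), order);
    }
}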
@@ -476,36 +476,39 @@ public final class Switchboard extends serverSwitch {
// load distributed ranking
// very large memory configurations allow to re-compute a ranking table
/*
final File hostIndexFile = new File(queuesRoot, "hostIndex.blob");
final File hostIndexFile = new File(this.queuesRoot, "hostIndex.blob");
if (MemoryControl.available() > 1024 * 1024 * 1024) new Thread() {
public void run() {
ReferenceContainerCache<HostReference> hostIndex; // this will get large, more than 0.5 million entries by now
if (!hostIndexFile.exists()) {
hostIndex = BlockRank.collect(peers, webStructure);
hostIndex = BlockRank.collect(Switchboard.this.peers, Switchboard.this.webStructure, Integer.MAX_VALUE);
BlockRank.saveHostIndex(hostIndex, hostIndexFile);
} else {
hostIndex = BlockRank.loadHostIndex(hostIndexFile);
}
// use an index segment to find hosts for given host hashes
String segmentName = getConfig(SwitchboardConstants.SEGMENT_PUBLIC, "default");
Segment segment = indexSegments.segment(segmentName);
MetadataRepository metadata = segment.urlMetadata();
final String segmentName = getConfig(SwitchboardConstants.SEGMENT_PUBLIC, "default");
final Segment segment = Switchboard.this.indexSegments.segment(segmentName);
final MetadataRepository metadata = segment.urlMetadata();
Map<String,HostStat> hostHashResolver;
try {
hostHashResolver = metadata.domainHashResolver(metadata.domainSampleCollector());
} catch (IOException e) {
} catch (final IOException e) {
hostHashResolver = new HashMap<String, HostStat>();
}
// recursively compute a new ranking table
Switchboard.this.log.logInfo("BLOCK RANK: computing new ranking tables...");
BlockRank.ybrTables = BlockRank.evaluate(hostIndex, hostHashResolver, null, 0);
hostIndex = null; // we don't need that here any more, so free the memory
// use the web structure and the hostHash resolver to analyse the ranking table
BlockRank.analyse(BlockRank.ybrTables, webStructure, hostHashResolver);
Switchboard.this.log.logInfo("BLOCK RANK: analysis of " + BlockRank.ybrTables.length + " tables...");
BlockRank.analyse(Switchboard.this.webStructure, hostHashResolver);
// store the new table
//BlockRank.storeBlockRankTable(rankingPath);
Switchboard.this.log.logInfo("BLOCK RANK: storing fresh table...");
BlockRank.storeBlockRankTable(rankingPath);
}
}.start();
*/

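The commented-out block above follows a cache-or-compute pattern: only if enough memory is available, a background thread either loads the host index from hostIndex.blob or collects it from the peers and saves it, and then derives fresh ranking tables from it. Here is a rough sketch of that pattern; computeIndex/loadIndex/saveIndex/evaluate are hypothetical stand-ins for the BlockRank calls, and the plain Runtime check stands in for MemoryControl.available():

// Sketch of the cache-or-compute pattern in the commented-out block above.
import java.io.File;

public class RankRecomputeSketch {
    public static void recomputeIfMemoryAllows(final File cacheFile) {
        // crude stand-in for MemoryControl.available() > 1 GB
        if (Runtime.getRuntime().maxMemory() < 1024L * 1024L * 1024L) return;
        new Thread() {
            @Override
            public void run() {
                Object hostIndex;
                if (!cacheFile.exists()) {
                    hostIndex = computeIndex();          // expensive: collect from all peers
                    saveIndex(hostIndex, cacheFile);     // cache the result for the next start
                } else {
                    hostIndex = loadIndex(cacheFile);    // cheap: reuse the cached blob
                }
                evaluate(hostIndex);                     // build fresh ranking tables
            }
        }.start();
    }

    static Object computeIndex() { return new Object(); }
    static Object loadIndex(File f) { return new Object(); }
    static void saveIndex(Object index, File f) { }
    static void evaluate(Object index) { }
}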
@@ -49,8 +49,8 @@ import net.yacy.peers.Seed;
import net.yacy.peers.SeedDB;
import net.yacy.peers.graphics.WebStructureGraph;
import net.yacy.peers.graphics.WebStructureGraph.HostReference;
import net.yacy.search.index.Segment;
import net.yacy.search.index.MetadataRepository.HostStat;
import net.yacy.search.index.Segment;
public class BlockRank {
@@ -64,32 +64,32 @@ public class BlockRank {
* @param seeds
* @return a merged host index from all peers
*/
public static ReferenceContainerCache<HostReference> collect(final SeedDB seeds, final WebStructureGraph myGraph) {
public static ReferenceContainerCache<HostReference> collect(final SeedDB seeds, final WebStructureGraph myGraph, int maxcount) {
ReferenceContainerCache<HostReference> index = new ReferenceContainerCache<HostReference>(WebStructureGraph.hostReferenceFactory, Base64Order.enhancedCoder, 6);
final ReferenceContainerCache<HostReference> index = new ReferenceContainerCache<HostReference>(WebStructureGraph.hostReferenceFactory, Base64Order.enhancedCoder, 6);
// start all jobs
Iterator<Seed> si = seeds.seedsConnected(true, false, null, 0.99f);
ArrayList<IndexRetrieval> jobs = new ArrayList<IndexRetrieval>();
while (si.hasNext()) {
IndexRetrieval loader = new IndexRetrieval(index, si.next());
final Iterator<Seed> si = seeds.seedsConnected(true, false, null, 0.99f);
final ArrayList<IndexRetrieval> jobs = new ArrayList<IndexRetrieval>();
while (maxcount-- > 0 && si.hasNext()) {
final IndexRetrieval loader = new IndexRetrieval(index, si.next());
loader.start();
jobs.add(loader);
}
// get the local index
if (myGraph != null) try {
ReferenceContainerCache<HostReference> myIndex = myGraph.incomingReferences();
final ReferenceContainerCache<HostReference> myIndex = myGraph.incomingReferences();
Log.logInfo("BlockRank", "loaded " + myIndex.size() + " host indexes from my peer");
index.merge(myIndex);
} catch (IOException e) {
} catch (final IOException e) {
Log.logException(e);
} catch (RowSpaceExceededException e) {
} catch (final RowSpaceExceededException e) {
Log.logException(e);
}
// wait for termination
for (IndexRetrieval job: jobs) try { job.join(); } catch (InterruptedException e) { }
for (final IndexRetrieval job: jobs) try { job.join(); } catch (final InterruptedException e) { }
Log.logInfo("BlockRank", "create " + index.size() + " host indexes from all peers");
return index;
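collect() now takes a maxcount argument that caps how many peer-retrieval threads are started; the Switchboard passes Integer.MAX_VALUE, i.e. no cap. The following is a self-contained sketch of the same bounded fan-out-and-merge pattern, where fetchFromPeer() is a hypothetical stand-in for Protocol.loadIDXHosts() and a ConcurrentHashMap stands in for the ReferenceContainerCache being merged into:

// Sketch: start at most maxcount worker threads, each fetching a partial result,
// then merge everything after join(), as collect() does above.
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

public class BoundedCollectSketch {
    public static Map<String, Integer> collect(Iterator<String> peers, int maxcount) throws InterruptedException {
        final Map<String, Integer> index = new ConcurrentHashMap<>();
        final List<Thread> jobs = new ArrayList<>();
        while (maxcount-- > 0 && peers.hasNext()) {   // the new cap added in this commit
            final String peer = peers.next();
            Thread job = new Thread(() -> index.merge(fetchFromPeer(peer), 1, Integer::sum));
            job.start();
            jobs.add(job);
        }
        for (Thread job : jobs) job.join();           // wait for all retrievals to finish
        return index;
    }

    static String fetchFromPeer(String peer) { return "host-of-" + peer; }
}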
@@ -100,20 +100,20 @@ public class BlockRank {
ReferenceContainerCache<HostReference> index;
Seed seed;
public IndexRetrieval(ReferenceContainerCache<HostReference> index, Seed seed) {
public IndexRetrieval(final ReferenceContainerCache<HostReference> index, final Seed seed) {
this.index = index;
this.seed = seed;
}
public void run() {
ReferenceContainerCache<HostReference> partialIndex = Protocol.loadIDXHosts(this.seed);
final ReferenceContainerCache<HostReference> partialIndex = Protocol.loadIDXHosts(this.seed);
if (partialIndex == null || partialIndex.size() == 0) return;
Log.logInfo("BlockRank", "loaded " + partialIndex.size() + " host indexes from peer " + this.seed.getName());
try {
this.index.merge(partialIndex);
} catch (IOException e) {
} catch (final IOException e) {
Log.logException(e);
} catch (RowSpaceExceededException e) {
} catch (final RowSpaceExceededException e) {
Log.logException(e);
}
}
@@ -124,27 +124,27 @@ public class BlockRank {
* @param index
* @param file
*/
public static void saveHostIndex(ReferenceContainerCache<HostReference> index, File file) {
public static void saveHostIndex(final ReferenceContainerCache<HostReference> index, final File file) {
Log.logInfo("BlockRank", "saving " + index.size() + " host indexes to file " + file.toString());
index.dump(file, Segment.writeBufferSize, false);
Log.logInfo("BlockRank", "saved " + index.size() + " host indexes to file " + file.toString());
}
public static ReferenceContainerCache<HostReference> loadHostIndex(File file) {
public static ReferenceContainerCache<HostReference> loadHostIndex(final File file) {
Log.logInfo("BlockRank", "reading host indexes from file " + file.toString());
ReferenceContainerCache<HostReference> index = new ReferenceContainerCache<HostReference>(WebStructureGraph.hostReferenceFactory, Base64Order.enhancedCoder, 6);
final ReferenceContainerCache<HostReference> index = new ReferenceContainerCache<HostReference>(WebStructureGraph.hostReferenceFactory, Base64Order.enhancedCoder, 6);
// load from file
try {
ReferenceIterator<HostReference> ri = new ReferenceIterator<HostReference>(file, WebStructureGraph.hostReferenceFactory);
final ReferenceIterator<HostReference> ri = new ReferenceIterator<HostReference>(file, WebStructureGraph.hostReferenceFactory);
while (ri.hasNext()) {
ReferenceContainer<HostReference> references = ri.next();
final ReferenceContainer<HostReference> references = ri.next();
index.add(references);
}
} catch (IOException e) {
} catch (final IOException e) {
Log.logException(e);
} catch (RowSpaceExceededException e) {
} catch (final RowSpaceExceededException e) {
Log.logException(e);
}
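saveHostIndex() and loadHostIndex() simply dump the in-memory host index to a blob file and stream it back in through a ReferenceIterator. The real blob holds full reference containers; the sketch below only illustrates the dump-and-reload idea with bare fixed-width keys, using the 6-byte term size configured above (file layout is an assumption for illustration, not the YaCy blob format):

// Sketch: write fixed-width 6-byte keys to a file and slice them back out on load.
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class HostIndexDumpSketch {
    static void save(List<byte[]> keys, File file) throws IOException {
        try (OutputStream out = new BufferedOutputStream(new FileOutputStream(file))) {
            for (byte[] key : keys) out.write(key);        // every key is 6 bytes wide
        }
    }

    static List<byte[]> load(File file) throws IOException {
        List<byte[]> keys = new ArrayList<>();
        byte[] all = Files.readAllBytes(file.toPath());
        for (int i = 0; i + 6 <= all.length; i += 6) {
            keys.add(Arrays.copyOfRange(all, i, i + 6));   // re-slice into 6-byte keys
        }
        return keys;
    }
}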
@@ -152,19 +152,19 @@ public class BlockRank {
return index;
}
public static BinSearch[] evaluate(final ReferenceContainerCache<HostReference> index, Map<String, HostStat> hostHashResolver, BinSearch[] referenceTable, int recusions) {
public static BinSearch[] evaluate(final ReferenceContainerCache<HostReference> index, final Map<String, HostStat> hostHashResolver, final BinSearch[] referenceTable, int recusions) {
// first find out the maximum count of the hostHashResolver
int maxHostCount = 1;
for (HostStat stat: hostHashResolver.values()) {
for (final HostStat stat: hostHashResolver.values()) {
if (stat.count > maxHostCount) maxHostCount = stat.count;
}
// then just count the number of references. all other information from the index is not used because they cannot be trusted
ScoreMap<byte[]> hostScore = new OrderedScoreMap<byte[]>(index.termKeyOrdering());
final ScoreMap<byte[]> hostScore = new OrderedScoreMap<byte[]>(index.termKeyOrdering());
HostStat hostStat;
int hostCount;
for (ReferenceContainer<HostReference> container: index) {
for (final ReferenceContainer<HostReference> container: index) {
if (container.size() == 0) continue;
if (referenceTable == null) {
hostStat = hostHashResolver.get(ASCII.String(container.getTermHash()));
@@ -172,7 +172,7 @@ public class BlockRank {
hostScore.set(container.getTermHash(), container.size() * maxHostCount / hostCount);
} else {
int score = 0;
Iterator<HostReference> hri = container.entries();
final Iterator<HostReference> hri = container.entries();
HostReference hr;
while (hri.hasNext()) {
hr = hri.next();
@@ -185,24 +185,24 @@ public class BlockRank {
}
// now divide the scores into two halves until the score map is empty
List<BinSearch> table = new ArrayList<BinSearch>();
final List<BinSearch> table = new ArrayList<BinSearch>();
while (hostScore.size() > 10) {
List<byte[]> smallest = hostScore.lowerHalf();
final List<byte[]> smallest = hostScore.lowerHalf();
if (smallest.size() == 0) break; // should never happen but this ensures termination of the loop
Log.logInfo("BlockRank", "index evaluation: computed partition of size " + smallest.size());
table.add(new BinSearch(smallest, 6));
for (byte[] host: smallest) hostScore.delete(host);
for (final byte[] host: smallest) hostScore.delete(host);
}
if (hostScore.size() > 0) {
ArrayList<byte[]> list = new ArrayList<byte[]>();
for (byte[] entry: hostScore) list.add(entry);
final ArrayList<byte[]> list = new ArrayList<byte[]>();
for (final byte[] entry: hostScore) list.add(entry);
Log.logInfo("BlockRank", "index evaluation: computed last partition of size " + list.size());
table.add(new BinSearch(list, 6));
}
// the last table entry has now a list of host hashes that has the most references
int binTables = Math.min(16, table.size());
BinSearch[] newTables = new BinSearch[binTables];
final int binTables = Math.min(16, table.size());
final BinSearch[] newTables = new BinSearch[binTables];
for (int i = 0; i < binTables; i++) newTables[i] = table.get(table.size() - i - 1);
// re-use the new table for a recursion
@@ -210,13 +210,13 @@ public class BlockRank {
return evaluate(index, hostHashResolver, newTables, --recusions); // one recursion step
}
public static void analyse(BinSearch[] tables, final WebStructureGraph myGraph, final Map<String, HostStat> hostHash2hostName) {
public static void analyse(final WebStructureGraph myGraph, final Map<String, HostStat> hostHash2hostName) {
byte[] hosth = new byte[6];
String hosths, hostn;
HostStat hs;
for (int ybr = 0; ybr < tables.length; ybr++) {
row: for (int i = 0; i < tables[ybr].size(); i++) {
hosth = tables[ybr].get(i, hosth);
for (int ybr = 0; ybr < ybrTables.length; ybr++) {
row: for (int i = 0; i < ybrTables[ybr].size(); i++) {
hosth = ybrTables[ybr].get(i, hosth);
hosths = ASCII.String(hosth);
hostn = myGraph.hostHash2hostName(hosths);
if (hostn == null) {
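evaluate() turns the host scores into ranking tables by repeatedly splitting off the lower-scoring half of the score map into a partition until only a handful of hosts remain, then keeping at most 16 partitions with the highest-scored one first. Below is a self-contained sketch of that partitioning loop, with an ordinary HashMap standing in for the OrderedScoreMap and string keys standing in for host hashes:

// Sketch of the lower-half partitioning loop in evaluate().
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

public class PartitionSketch {
    public static List<List<String>> partition(Map<String, Integer> hostScore, int maxTables) {
        List<List<String>> tables = new ArrayList<>();
        Map<String, Integer> remaining = new HashMap<>(hostScore);
        while (remaining.size() > 10) {
            // take the half with the lowest scores (stand-in for ScoreMap.lowerHalf())
            List<String> smallest = remaining.entrySet().stream()
                    .sorted(Map.Entry.comparingByValue())
                    .limit(remaining.size() / 2)
                    .map(Map.Entry::getKey)
                    .collect(Collectors.toList());
            if (smallest.isEmpty()) break;            // guards loop termination, as above
            tables.add(smallest);
            smallest.forEach(remaining::remove);
        }
        if (!remaining.isEmpty()) tables.add(new ArrayList<>(remaining.keySet()));
        Collections.reverse(tables);                  // highest-scored partition first
        return tables.size() > maxTables ? tables.subList(0, maxTables) : tables;
    }
}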
@@ -290,7 +290,7 @@ public class BlockRank {
return ranking(hash, ybrTables);
}
public static int ranking(final byte[] hash, BinSearch[] rankingTable) {
public static int ranking(final byte[] hash, final BinSearch[] rankingTable) {
if (rankingTable == null) return 16;
byte[] hosthash;
if (hash.length == 6) {

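The diff only shows the head of ranking(); a plausible reading, given the default value 16 and the table order produced by evaluate(), is that the rank of a host is the index of the first table that contains its 6-byte hash, with 16 as the fallback for unknown hosts. The following is a hedged sketch of that reading, not the verbatim YaCy implementation:

// Hedged sketch of a table-index ranking lookup.
import java.util.Arrays;
import java.util.List;

public class RankingLookupSketch {
    public static int ranking(byte[] hostHash, List<List<byte[]>> tables) {
        if (tables == null) return 16;                           // no tables: neutral default
        for (int ybr = 0; ybr < tables.size(); ybr++) {
            for (byte[] candidate : tables.get(ybr)) {
                if (Arrays.equals(candidate, hostHash)) return ybr;  // table index serves as the rank
            }
        }
        return 16;                                               // host not in any table
    }
}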