updated ranking tables (fresh computation)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@8103 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 13 years ago
parent 070d32ae21
commit bc5df0eef5

@ -1 +1 @@
3o06fQFARhBYG1fyaaLgIKhaO0510AhYPjMYsX0sAB 1ZZwWAumBqYA

@ -1 +1 @@
6zDuRYGIdeZYR-J6JRc14c_YpT2NrQy9plPa -6gsqA020wSA1IWRpA2DGiZA3I173D4LHf1A4aCKyCFARhBYIgp3zAKYs_BDKlGntBP4hN7BTWSa6BWOOkgC_aJQZBlqygQCskjqmAyAS1GZ

@ -1 +1 @@
-eu5sA-us6-Y0j9n6Q2dc2nY7694GZLEGNQZR473lZWe1RRBYUGRwYaIwR3Aay29rSoTEsB4vVZ-CRyrZU6Z -dsznA0ffeYa0k8HYA1buONZ1hp4CZ1rmZEB2I2Z4a2OMHFD2dS4GC2snssC2xfZlA2yPtnA3_cOCZ3rYBLA43_QFA4DC3rZ4QT2QA4Xr0oT4aOOeC4gQ-UB4kBsyB4soqED4uvaoD4yEsRB569MNB5A85JD5AFAhB5NS-6D5OYk8A5T24mD5THrdA5qgbGB5sF1bD5unnvB6QOSwC6dTQBB6rptXD6usA-C76no0B7ka2oA7t6KsA9P-MCD9W06lD9x1MdAACBIaAAZvWMAB-1LsCB9ysWbBnGOKDCGjOTACbH3hCDpqPaBE4hAfBE73e0AE8wOQAEF18XDEclAMCEx09oBFLJT1AG-9VGbG121OCGKUAiQGYhaABGw3c_YHIYNRDHfJp2aHkNBlaJbhHIAK9ym_CKSZlzBKXICRBKZjSDDKg1NMZKokAECLpLpxCLpgIEbLzLEwAMXbAIDO1venZO9pUMAP4b63APW0DgAPhE6DDPpqN1BRDGZUARSMevBRZLwbDRyBkdQS5cf9YSA1H_DTQhlWCTQjllBTjKy9BU8tUYBUBU-UYUcb1FCUibu4AUjcguAUxQV7BVW4SCZVa3wTZVd39RAW6mcGQWvYdQbX5NQ7CXnME8aXqinsbXyU37DYTrnFCYvFrgBZceQYRZlEtda_5TwUB_9DWtB_BjKnDb5HNeAbI925CbJ8MBDbk0LFBboVvbBc6LB8Dc8kOWBcVIH9BcyK-7Dd3k77De6dtuAebtYjBep8doBfsVc0Ag-r_RBg7iLSDgBdF4ZgH2ToAg_tp9CgmKYUDgrGhFAhL2xDDiW6C7BicZSNDirHC-CjEI-jDjWy2JAkMqVVBkpOOUBkt_iIAl10fQDl1EgfalDSGZClN8sqAleZuuAm5lRIBmRbv6AmcsyJCmmz4-CmqQctCmxKFhAnOxx8Dnmc3wAnvUXNDnx345Bo2GfQCo2uawAo59ENBoTbaJbpV6YOBpaytfDq8ncZDqh9jGYqswa8BqyohEAr5F6PCr8wkPDr9NbuAroYZAYs5cngBsSmyICsX0sABs_VxeDsm2fwDsoevXCt1drJAt3YuIAt5KYbDtzQqwAuC9y_buMQoSAuPaHWSuT9E5BuisYQDvEp4rYvbE4iZvbmk1BverBJBvozSXRvuELQDvwejWAxawNOCxgso4ByKwk0Byr3IcAysR9FBysqjTBz5Lh6BzXCNNA

@ -1 +1 @@
0kVGgD2JvFCh7eJMoQ7uXagC8I8z7B9VB_qBFp2fRQGThWkZGUAvOCK0oBfCKB4M3AMZ-kjZNANzdAOObI4BQVXU4BVUOQ5AXuknKQ_cbjnCcRaiLZhzy8YQlzZYMQmtlFPBoDRuDCt343uBtDH4rQvHR65BzntqxZ -9a0tA-Ho8KA-IWRhA-N6LGB-RfRyA-WLVRR-pvIiR-zTGUY015OzA0CVWgA0Eck_A0F5BNA0FS6WD0Gf7WA0KHQrA0PMz3I0XgAcR0c-4sC0svxdA18HEbA19GMfA1AYK5A1GTzRA1WS-FA1uitOA1ygfnA1zncVA245hwA249wFA25q33A26YBgA2I4vyA2ILBAA2NyeoQ2PjZIA2UyolR2XVY8A2qM26A36DOEA3HFtDb3Je4gB3Pc7zR3UPfMA3_mhYA3_sELA3d6cwA3eqjQc3ooEUR3rXPsA3vbbOA3ydd7A41Qv4d44TBEA44yqKR47_4CA4BjcUA4KmySA4OwJZR4SMIEd4hW28D4kfwyA4mEdyA52tZEA58gsDA5DW-mA5PxFkA5QBqrA5RXd5A5SCrfA5Z0gIA5aMo2A5j6xBA5urnTR657elA6Cfy1A6OHmmB6PE77A6nYg1A6r62bR6vzwpR6wSzfA718X6A72gqQR76A2hT78gDIA7P3bNA7QRGAR7VlyiA885vKA8GQxVA8XBdpD8Xy6JA8xeLyA9-C6bZ94Ul4R9Fz0TA9PEwbR9aOzyc9dQuuR9jp3RA9qWyrA9rncTRAeoi-BC7kqvRCY2c1RCmbCgRCv0a1ADUIDrRDdMYvBDqWXjBE1SP3AE7iKlIEF8TAAEr2ZKREx_F6AF0ctZdF43jyRFHzvbRFNgKsRFxvahAG1JUicGAISPAGOsmRRGca52RGnWhFRH6wH8RI2Y5OAIgCE6dIj-97RInSN4ZJa0O_RK0KytRKJNAWAKRMIGRKWoccRLNOmMRLfQpzRM4zGZRMuecrAMy_0nYN-Q06RNVkbvRNxlJVRO6ZOBROFHCGRORipz4Od1hPROra0QROrgT9SOw50sROzLRuRPQNvdRPQr5MDPsK4aYPt81sRQ9SYcRQB6BmRQB6BmdQExs3RQHrJ3RQrPWDYQtDCeRR8gvxRRCyYyRRqwHERSSPtPRSVShmRSz7sxRTE6zHRTNBKMRTm8mkRTtpu-RUcqoyRUiwrFDVHUyHBVnuDNAVnuWKRVv-ziAW87FcRW9-QvRX7t7PAXPZo8RXSojIRXU2jURXmye6AY0HdSZY1b2BRYFYX4RYbg8GRZAumJRZNADHRZhnwNR_6JtuA_HyQ7A_O63wA_i190A_qwbfAa7-5QRaA-43RaNsjxRamVE8Qan3k9RbdvLoBcRr37DcvlccRcziVuRd_BoCRdvZwyReO0LKReWCuUAebXJGaehw1aRfIu9GYfbgqaRgRTGCAgWRHVRgYdleRiUOLbRiW10mRigeRoRjZjG3CjsZXnRk-_N-Rk6Ti9AkCKWxRkOUR0Akjov9Rl9M7WRlYQ2oAllVYHRlmJYLRloeuKRlrVltRoKrPlAoQB1SAoh6foRoo50PRopojSRp3qGLCpAecIRpO1r0RpS-cURpWvK2Ap_C_mRqBK_tAqCqZOdqS9z2RqSdXHRrzrBLDsZgzeRsgaHDQskzspRtDhTjRtNfYbAtom1SSu-3JrAu-yDaRuO0Z4RuYvAtBuoz83Av2DXwbv2wjGAvDPRPAvH2tlAvHn0zAvKwu1AvS_Q7Avd6T7Avh3YPAvrdkKAw-6G-Aw4R7gAwROSXAwUAWtAweQkIAwhhO5AwjJUaAwmtCZRwn6__RwnQlUAwnvgcRwruoLBx0l5FAxOgpUAxWX7FAxZS4zRxkwTIAxsFqaAy3Dbaay4KkrAy4RHURy8dvdRyAYvKAyGUedAzE667AzGc8KRzVcTSRz_2D0Az_OGtAzd_4nRzeudtAzgOyzAzoI1lAzrEL-R

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

@ -73,6 +73,7 @@ public final class ReferenceContainerCache<ReferenceType extends Reference> exte
*/ */
public ReferenceContainerCache(final ReferenceFactory<ReferenceType> factory, final ByteOrder termOrder, final int termSize) { public ReferenceContainerCache(final ReferenceFactory<ReferenceType> factory, final ByteOrder termOrder, final int termSize) {
super(factory); super(factory);
assert termOrder != null;
this.termOrder = termOrder; this.termOrder = termOrder;
this.termSize = termSize; this.termSize = termSize;
this.containerOrder = new ContainerOrder<ReferenceType>(this.termOrder); this.containerOrder = new ContainerOrder<ReferenceType>(this.termOrder);
@ -130,10 +131,12 @@ public final class ReferenceContainerCache<ReferenceType extends Reference> exte
// write wCache // write wCache
long wordcount = 0, urlcount = 0; long wordcount = 0, urlcount = 0;
byte[] term = null, lwh; byte[] term = null, lwh;
assert this.termKeyOrdering() != null;
for (final ReferenceContainer<ReferenceType> container: cachecopy) { for (final ReferenceContainer<ReferenceType> container: cachecopy) {
// get entries // get entries
lwh = term; lwh = term;
term = container.getTermHash(); term = container.getTermHash();
if (term == null) continue;
// check consistency: entries must be ordered // check consistency: entries must be ordered
assert (lwh == null || this.termKeyOrdering().compare(term, lwh) > 0); assert (lwh == null || this.termKeyOrdering().compare(term, lwh) > 0);

@ -476,36 +476,39 @@ public final class Switchboard extends serverSwitch {
// load distributed ranking // load distributed ranking
// very large memory configurations allow to re-compute a ranking table // very large memory configurations allow to re-compute a ranking table
/* /*
final File hostIndexFile = new File(queuesRoot, "hostIndex.blob"); final File hostIndexFile = new File(this.queuesRoot, "hostIndex.blob");
if (MemoryControl.available() > 1024 * 1024 * 1024) new Thread() { if (MemoryControl.available() > 1024 * 1024 * 1024) new Thread() {
public void run() { public void run() {
ReferenceContainerCache<HostReference> hostIndex; // this will get large, more than 0.5 million entries by now ReferenceContainerCache<HostReference> hostIndex; // this will get large, more than 0.5 million entries by now
if (!hostIndexFile.exists()) { if (!hostIndexFile.exists()) {
hostIndex = BlockRank.collect(peers, webStructure); hostIndex = BlockRank.collect(Switchboard.this.peers, Switchboard.this.webStructure, Integer.MAX_VALUE);
BlockRank.saveHostIndex(hostIndex, hostIndexFile); BlockRank.saveHostIndex(hostIndex, hostIndexFile);
} else { } else {
hostIndex = BlockRank.loadHostIndex(hostIndexFile); hostIndex = BlockRank.loadHostIndex(hostIndexFile);
} }
// use an index segment to find hosts for given host hashes // use an index segment to find hosts for given host hashes
String segmentName = getConfig(SwitchboardConstants.SEGMENT_PUBLIC, "default"); final String segmentName = getConfig(SwitchboardConstants.SEGMENT_PUBLIC, "default");
Segment segment = indexSegments.segment(segmentName); final Segment segment = Switchboard.this.indexSegments.segment(segmentName);
MetadataRepository metadata = segment.urlMetadata(); final MetadataRepository metadata = segment.urlMetadata();
Map<String,HostStat> hostHashResolver; Map<String,HostStat> hostHashResolver;
try { try {
hostHashResolver = metadata.domainHashResolver(metadata.domainSampleCollector()); hostHashResolver = metadata.domainHashResolver(metadata.domainSampleCollector());
} catch (IOException e) { } catch (final IOException e) {
hostHashResolver = new HashMap<String, HostStat>(); hostHashResolver = new HashMap<String, HostStat>();
} }
// recursively compute a new ranking table // recursively compute a new ranking table
Switchboard.this.log.logInfo("BLOCK RANK: computing new ranking tables...");
BlockRank.ybrTables = BlockRank.evaluate(hostIndex, hostHashResolver, null, 0); BlockRank.ybrTables = BlockRank.evaluate(hostIndex, hostHashResolver, null, 0);
hostIndex = null; // we don't need that here any more, so free the memory hostIndex = null; // we don't need that here any more, so free the memory
// use the web structure and the hostHash resolver to analyse the ranking table // use the web structure and the hostHash resolver to analyse the ranking table
BlockRank.analyse(BlockRank.ybrTables, webStructure, hostHashResolver); Switchboard.this.log.logInfo("BLOCK RANK: analysis of " + BlockRank.ybrTables.length + " tables...");
BlockRank.analyse(Switchboard.this.webStructure, hostHashResolver);
// store the new table // store the new table
//BlockRank.storeBlockRankTable(rankingPath); Switchboard.this.log.logInfo("BLOCK RANK: storing fresh table...");
BlockRank.storeBlockRankTable(rankingPath);
} }
}.start(); }.start();
*/ */

@ -2,7 +2,7 @@
* BlockRankCollector * BlockRankCollector
* Copyright 2011 by Michael Christen * Copyright 2011 by Michael Christen
* First released 18.05.2011 at http://yacy.net * First released 18.05.2011 at http://yacy.net
* *
* $LastChangedDate: 2011-04-26 19:39:16 +0200 (Di, 26 Apr 2011) $ * $LastChangedDate: 2011-04-26 19:39:16 +0200 (Di, 26 Apr 2011) $
* $LastChangedRevision: 7676 $ * $LastChangedRevision: 7676 $
* $LastChangedBy: orbiter $ * $LastChangedBy: orbiter $
@ -11,12 +11,12 @@
* modify it under the terms of the GNU Lesser General Public * modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either * License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version. * version 2.1 of the License, or (at your option) any later version.
* *
* This library is distributed in the hope that it will be useful, * This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of * but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details. * Lesser General Public License for more details.
* *
* You should have received a copy of the GNU Lesser General Public License * You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt * along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>. * If not, see <http://www.gnu.org/licenses/>.
@ -49,71 +49,71 @@ import net.yacy.peers.Seed;
import net.yacy.peers.SeedDB; import net.yacy.peers.SeedDB;
import net.yacy.peers.graphics.WebStructureGraph; import net.yacy.peers.graphics.WebStructureGraph;
import net.yacy.peers.graphics.WebStructureGraph.HostReference; import net.yacy.peers.graphics.WebStructureGraph.HostReference;
import net.yacy.search.index.Segment;
import net.yacy.search.index.MetadataRepository.HostStat; import net.yacy.search.index.MetadataRepository.HostStat;
import net.yacy.search.index.Segment;
public class BlockRank { public class BlockRank {
public static BinSearch[] ybrTables = null; // block-rank tables public static BinSearch[] ybrTables = null; // block-rank tables
/** /**
* collect host index information from other peers. All peers in the seed database are asked * collect host index information from other peers. All peers in the seed database are asked
* this may take some time; please wait up to one minute * this may take some time; please wait up to one minute
* @param seeds * @param seeds
* @return a merged host index from all peers * @return a merged host index from all peers
*/ */
public static ReferenceContainerCache<HostReference> collect(final SeedDB seeds, final WebStructureGraph myGraph) { public static ReferenceContainerCache<HostReference> collect(final SeedDB seeds, final WebStructureGraph myGraph, int maxcount) {
ReferenceContainerCache<HostReference> index = new ReferenceContainerCache<HostReference>(WebStructureGraph.hostReferenceFactory, Base64Order.enhancedCoder, 6); final ReferenceContainerCache<HostReference> index = new ReferenceContainerCache<HostReference>(WebStructureGraph.hostReferenceFactory, Base64Order.enhancedCoder, 6);
// start all jobs // start all jobs
Iterator<Seed> si = seeds.seedsConnected(true, false, null, 0.99f); final Iterator<Seed> si = seeds.seedsConnected(true, false, null, 0.99f);
ArrayList<IndexRetrieval> jobs = new ArrayList<IndexRetrieval>(); final ArrayList<IndexRetrieval> jobs = new ArrayList<IndexRetrieval>();
while (si.hasNext()) { while (maxcount-- > 0 && si.hasNext()) {
IndexRetrieval loader = new IndexRetrieval(index, si.next()); final IndexRetrieval loader = new IndexRetrieval(index, si.next());
loader.start(); loader.start();
jobs.add(loader); jobs.add(loader);
} }
// get the local index // get the local index
if (myGraph != null) try { if (myGraph != null) try {
ReferenceContainerCache<HostReference> myIndex = myGraph.incomingReferences(); final ReferenceContainerCache<HostReference> myIndex = myGraph.incomingReferences();
Log.logInfo("BlockRank", "loaded " + myIndex.size() + " host indexes from my peer"); Log.logInfo("BlockRank", "loaded " + myIndex.size() + " host indexes from my peer");
index.merge(myIndex); index.merge(myIndex);
} catch (IOException e) { } catch (final IOException e) {
Log.logException(e); Log.logException(e);
} catch (RowSpaceExceededException e) { } catch (final RowSpaceExceededException e) {
Log.logException(e); Log.logException(e);
} }
// wait for termination // wait for termination
for (IndexRetrieval job: jobs) try { job.join(); } catch (InterruptedException e) { } for (final IndexRetrieval job: jobs) try { job.join(); } catch (final InterruptedException e) { }
Log.logInfo("BlockRank", "create " + index.size() + " host indexes from all peers"); Log.logInfo("BlockRank", "create " + index.size() + " host indexes from all peers");
return index; return index;
} }
public static class IndexRetrieval extends Thread { public static class IndexRetrieval extends Thread {
ReferenceContainerCache<HostReference> index; ReferenceContainerCache<HostReference> index;
Seed seed; Seed seed;
public IndexRetrieval(ReferenceContainerCache<HostReference> index, Seed seed) { public IndexRetrieval(final ReferenceContainerCache<HostReference> index, final Seed seed) {
this.index = index; this.index = index;
this.seed = seed; this.seed = seed;
} }
public void run() { public void run() {
ReferenceContainerCache<HostReference> partialIndex = Protocol.loadIDXHosts(this.seed); final ReferenceContainerCache<HostReference> partialIndex = Protocol.loadIDXHosts(this.seed);
if (partialIndex == null || partialIndex.size() == 0) return; if (partialIndex == null || partialIndex.size() == 0) return;
Log.logInfo("BlockRank", "loaded " + partialIndex.size() + " host indexes from peer " + this.seed.getName()); Log.logInfo("BlockRank", "loaded " + partialIndex.size() + " host indexes from peer " + this.seed.getName());
try { try {
this.index.merge(partialIndex); this.index.merge(partialIndex);
} catch (IOException e) { } catch (final IOException e) {
Log.logException(e); Log.logException(e);
} catch (RowSpaceExceededException e) { } catch (final RowSpaceExceededException e) {
Log.logException(e); Log.logException(e);
} }
} }
@ -124,47 +124,47 @@ public class BlockRank {
* @param index * @param index
* @param file * @param file
*/ */
public static void saveHostIndex(ReferenceContainerCache<HostReference> index, File file) { public static void saveHostIndex(final ReferenceContainerCache<HostReference> index, final File file) {
Log.logInfo("BlockRank", "saving " + index.size() + " host indexes to file " + file.toString()); Log.logInfo("BlockRank", "saving " + index.size() + " host indexes to file " + file.toString());
index.dump(file, Segment.writeBufferSize, false); index.dump(file, Segment.writeBufferSize, false);
Log.logInfo("BlockRank", "saved " + index.size() + " host indexes to file " + file.toString()); Log.logInfo("BlockRank", "saved " + index.size() + " host indexes to file " + file.toString());
} }
public static ReferenceContainerCache<HostReference> loadHostIndex(File file) { public static ReferenceContainerCache<HostReference> loadHostIndex(final File file) {
Log.logInfo("BlockRank", "reading host indexes from file " + file.toString()); Log.logInfo("BlockRank", "reading host indexes from file " + file.toString());
ReferenceContainerCache<HostReference> index = new ReferenceContainerCache<HostReference>(WebStructureGraph.hostReferenceFactory, Base64Order.enhancedCoder, 6); final ReferenceContainerCache<HostReference> index = new ReferenceContainerCache<HostReference>(WebStructureGraph.hostReferenceFactory, Base64Order.enhancedCoder, 6);
// load from file // load from file
try { try {
ReferenceIterator<HostReference> ri = new ReferenceIterator<HostReference>(file, WebStructureGraph.hostReferenceFactory); final ReferenceIterator<HostReference> ri = new ReferenceIterator<HostReference>(file, WebStructureGraph.hostReferenceFactory);
while (ri.hasNext()) { while (ri.hasNext()) {
ReferenceContainer<HostReference> references = ri.next(); final ReferenceContainer<HostReference> references = ri.next();
index.add(references); index.add(references);
} }
} catch (IOException e) { } catch (final IOException e) {
Log.logException(e); Log.logException(e);
} catch (RowSpaceExceededException e) { } catch (final RowSpaceExceededException e) {
Log.logException(e); Log.logException(e);
} }
Log.logInfo("BlockRank", "read " + index.size() + " host indexes from file " + file.toString()); Log.logInfo("BlockRank", "read " + index.size() + " host indexes from file " + file.toString());
return index; return index;
} }
public static BinSearch[] evaluate(final ReferenceContainerCache<HostReference> index, Map<String, HostStat> hostHashResolver, BinSearch[] referenceTable, int recusions) { public static BinSearch[] evaluate(final ReferenceContainerCache<HostReference> index, final Map<String, HostStat> hostHashResolver, final BinSearch[] referenceTable, int recusions) {
// first find out the maximum count of the hostHashResolver // first find out the maximum count of the hostHashResolver
int maxHostCount = 1; int maxHostCount = 1;
for (HostStat stat: hostHashResolver.values()) { for (final HostStat stat: hostHashResolver.values()) {
if (stat.count > maxHostCount) maxHostCount = stat.count; if (stat.count > maxHostCount) maxHostCount = stat.count;
} }
// then just count the number of references. all other information from the index is not used because they cannot be trusted // then just count the number of references. all other information from the index is not used because they cannot be trusted
ScoreMap<byte[]> hostScore = new OrderedScoreMap<byte[]>(index.termKeyOrdering()); final ScoreMap<byte[]> hostScore = new OrderedScoreMap<byte[]>(index.termKeyOrdering());
HostStat hostStat; HostStat hostStat;
int hostCount; int hostCount;
for (ReferenceContainer<HostReference> container: index) { for (final ReferenceContainer<HostReference> container: index) {
if (container.size() == 0) continue; if (container.size() == 0) continue;
if (referenceTable == null) { if (referenceTable == null) {
hostStat = hostHashResolver.get(ASCII.String(container.getTermHash())); hostStat = hostHashResolver.get(ASCII.String(container.getTermHash()));
@ -172,7 +172,7 @@ public class BlockRank {
hostScore.set(container.getTermHash(), container.size() * maxHostCount / hostCount); hostScore.set(container.getTermHash(), container.size() * maxHostCount / hostCount);
} else { } else {
int score = 0; int score = 0;
Iterator<HostReference> hri = container.entries(); final Iterator<HostReference> hri = container.entries();
HostReference hr; HostReference hr;
while (hri.hasNext()) { while (hri.hasNext()) {
hr = hri.next(); hr = hri.next();
@ -183,40 +183,40 @@ public class BlockRank {
hostScore.set(container.getTermHash(), score); hostScore.set(container.getTermHash(), score);
} }
} }
// now divide the scores into two halves until the score map is empty // now divide the scores into two halves until the score map is empty
List<BinSearch> table = new ArrayList<BinSearch>(); final List<BinSearch> table = new ArrayList<BinSearch>();
while (hostScore.size() > 10) { while (hostScore.size() > 10) {
List<byte[]> smallest = hostScore.lowerHalf(); final List<byte[]> smallest = hostScore.lowerHalf();
if (smallest.size() == 0) break; // should never happen but this ensures termination of the loop if (smallest.size() == 0) break; // should never happen but this ensures termination of the loop
Log.logInfo("BlockRank", "index evaluation: computed partition of size " + smallest.size()); Log.logInfo("BlockRank", "index evaluation: computed partition of size " + smallest.size());
table.add(new BinSearch(smallest, 6)); table.add(new BinSearch(smallest, 6));
for (byte[] host: smallest) hostScore.delete(host); for (final byte[] host: smallest) hostScore.delete(host);
} }
if (hostScore.size() > 0) { if (hostScore.size() > 0) {
ArrayList<byte[]> list = new ArrayList<byte[]>(); final ArrayList<byte[]> list = new ArrayList<byte[]>();
for (byte[] entry: hostScore) list.add(entry); for (final byte[] entry: hostScore) list.add(entry);
Log.logInfo("BlockRank", "index evaluation: computed last partition of size " + list.size()); Log.logInfo("BlockRank", "index evaluation: computed last partition of size " + list.size());
table.add(new BinSearch(list, 6)); table.add(new BinSearch(list, 6));
} }
// the last table entry has now a list of host hashes that has the most references // the last table entry has now a list of host hashes that has the most references
int binTables = Math.min(16, table.size()); final int binTables = Math.min(16, table.size());
BinSearch[] newTables = new BinSearch[binTables]; final BinSearch[] newTables = new BinSearch[binTables];
for (int i = 0; i < binTables; i++) newTables[i] = table.get(table.size() - i - 1); for (int i = 0; i < binTables; i++) newTables[i] = table.get(table.size() - i - 1);
// re-use the new table for a recursion // re-use the new table for a recursion
if (recusions == 0) return newTables; if (recusions == 0) return newTables;
return evaluate(index, hostHashResolver, newTables, --recusions); // one recursion step return evaluate(index, hostHashResolver, newTables, --recusions); // one recursion step
} }
public static void analyse(BinSearch[] tables, final WebStructureGraph myGraph, final Map<String, HostStat> hostHash2hostName) { public static void analyse(final WebStructureGraph myGraph, final Map<String, HostStat> hostHash2hostName) {
byte[] hosth = new byte[6]; byte[] hosth = new byte[6];
String hosths, hostn; String hosths, hostn;
HostStat hs; HostStat hs;
for (int ybr = 0; ybr < tables.length; ybr++) { for (int ybr = 0; ybr < ybrTables.length; ybr++) {
row: for (int i = 0; i < tables[ybr].size(); i++) { row: for (int i = 0; i < ybrTables[ybr].size(); i++) {
hosth = tables[ybr].get(i, hosth); hosth = ybrTables[ybr].get(i, hosth);
hosths = ASCII.String(hosth); hosths = ASCII.String(hosth);
hostn = myGraph.hostHash2hostName(hosths); hostn = myGraph.hostHash2hostName(hosths);
if (hostn == null) { if (hostn == null) {
@ -231,8 +231,8 @@ public class BlockRank {
} }
} }
} }
/** /**
* load YaCy Block Rank tables * load YaCy Block Rank tables
* These tables have a very simple structure: every file is a sequence of Domain hashes, ordered by b64. * These tables have a very simple structure: every file is a sequence of Domain hashes, ordered by b64.
@ -258,7 +258,7 @@ public class BlockRank {
} catch (final IOException e) { } catch (final IOException e) {
} }
} }
public static void storeBlockRankTable(final File rankingPath) { public static void storeBlockRankTable(final File rankingPath) {
String ybrName; String ybrName;
File f; File f;
@ -280,7 +280,7 @@ public class BlockRank {
} catch (final IOException e) { } catch (final IOException e) {
} }
} }
/** /**
* returns the YBR ranking value in a range of 0..15, where 0 means best ranking and 15 means worst ranking * returns the YBR ranking value in a range of 0..15, where 0 means best ranking and 15 means worst ranking
* @param hash * @param hash
@ -289,8 +289,8 @@ public class BlockRank {
public static int ranking(final byte[] hash) { public static int ranking(final byte[] hash) {
return ranking(hash, ybrTables); return ranking(hash, ybrTables);
} }
public static int ranking(final byte[] hash, BinSearch[] rankingTable) { public static int ranking(final byte[] hash, final BinSearch[] rankingTable) {
if (rankingTable == null) return 16; if (rankingTable == null) return 16;
byte[] hosthash; byte[] hosthash;
if (hash.length == 6) { if (hash.length == 6) {

Loading…
Cancel
Save