diff --git a/source/net/yacy/search/index/Fulltext.java b/source/net/yacy/search/index/Fulltext.java index d36cf995a..948d9ce85 100644 --- a/source/net/yacy/search/index/Fulltext.java +++ b/source/net/yacy/search/index/Fulltext.java @@ -32,7 +32,6 @@ import java.util.ArrayList; import java.util.Collection; import java.util.Date; import java.util.HashMap; -import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; @@ -56,7 +55,6 @@ import net.yacy.cora.federate.solr.instance.RemoteInstance; import net.yacy.cora.federate.solr.instance.ShardInstance; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.sorting.ReversibleScoreMap; -import net.yacy.cora.sorting.ScoreMap; import net.yacy.cora.sorting.WeakPriorityBlockingQueue; import net.yacy.cora.storage.ZIPReader; import net.yacy.cora.storage.ZIPWriter; @@ -87,7 +85,6 @@ public final class Fulltext { private final File segmentPath; private final File archivePath; private Export exportthread; // will have a export thread assigned if exporter is running - private ArrayList statsDump; private InstanceMirror solrInstances; private final CollectionConfiguration collectionConfiguration; private final WebgraphConfiguration webgraphConfiguration; @@ -98,7 +95,6 @@ public final class Fulltext { this.segmentPath = segmentPath; this.archivePath = archivePath; this.exportthread = null; // will have a export thread assigned if exporter is running - this.statsDump = null; this.solrInstances = new InstanceMirror(); this.collectionConfiguration = collectionConfiguration; this.webgraphConfiguration = webgraphConfiguration; @@ -206,9 +202,7 @@ public final class Fulltext { } public void clearCaches() { - if (this.statsDump != null) this.statsDump.clear(); this.solrInstances.clearCaches(); - this.statsDump = null; } public void clearLocalSolr() throws IOException { @@ -261,7 +255,6 @@ public final class Fulltext { } public void close() { - this.statsDump = null; try { this.solrInstances.close(); } catch (Throwable e) {} @@ -347,7 +340,6 @@ public final class Fulltext { } catch (final SolrException e) { throw new IOException(e.getMessage(), e); } - this.statsDump = null; if (MemoryControl.shortStatus()) clearCaches(); } @@ -359,7 +351,6 @@ public final class Fulltext { } catch (final SolrException e) { throw new IOException(e.getMessage(), e); } - this.statsDump = null; if (MemoryControl.shortStatus()) clearCaches(); } @@ -378,7 +369,6 @@ public final class Fulltext { } catch (final SolrException e) { throw new IOException(e.getMessage(), e); } - this.statsDump = null; if (MemoryControl.shortStatus()) clearCaches(); } @@ -398,16 +388,6 @@ public final class Fulltext { if (this.writeWebgraph) deleteDomainWithConstraint(this.getWebgraphConnector(), WebgraphSchema.source_host_id_s.getSolrFieldName(), hosthashes, (freshdate == null || freshdate.after(now)) ? null : (WebgraphSchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]")); - - // remove the line with statistics - if (Fulltext.this.statsDump != null) { - final Iterator hsi = Fulltext.this.statsDump.iterator(); - HostStat hs; - while (hsi.hasNext()) { - hs = hsi.next(); - if (hosthashes.contains(hs.hosthash)) hsi.remove(); - } - } } public void deleteStaleDomainNames(final Set hostnames, Date freshdate) { @@ -419,16 +399,6 @@ public final class Fulltext { if (this.writeWebgraph) deleteDomainWithConstraint(this.getWebgraphConnector(), WebgraphSchema.source_host_s.getSolrFieldName(), hostnames, (freshdate == null || freshdate.after(now)) ? null : (WebgraphSchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]")); - - // finally remove the line with statistics - if (Fulltext.this.statsDump != null) { - final Iterator hsi = Fulltext.this.statsDump.iterator(); - HostStat hs; - while (hsi.hasNext()) { - hs = hsi.next(); - if (hostnames.contains(hs.hostname)) hsi.remove(); - } - } } /** @@ -790,42 +760,5 @@ public final class Fulltext { } } - - public Iterator statistics(int count, final ScoreMap domainScore) { - // prevent too heavy IO. - if (this.statsDump != null && count <= this.statsDump.size()) return this.statsDump.iterator(); - - // fetch urls from the database to determine the host in clear text - final Iterator j = domainScore.keys(false); // iterate urlhash-examples in reverse order (biggest first) - String urlhash; - count += 10; // make some more to prevent that we have to do this again after deletions too soon. - if (count < 0 || domainScore.sizeSmaller(count)) count = domainScore.size(); - this.statsDump = new ArrayList(); - DigestURL url; - while (j.hasNext()) { - urlhash = j.next(); - if (urlhash == null) continue; - url = this.getURL(ASCII.getBytes(urlhash)); - if (url == null || url.getHost() == null) continue; - if (this.statsDump == null) return new ArrayList().iterator(); // some other operation has destroyed the object - this.statsDump.add(new HostStat(url.getHost(), url.getPort(), urlhash.substring(6), domainScore.get(urlhash))); - count--; - if (count == 0) break; - } - // finally return an iterator for the result array - return (this.statsDump == null) ? new ArrayList().iterator() : this.statsDump.iterator(); - } - - public static class HostStat { - public String hostname, hosthash; - public int port; - public int count; - private HostStat(final String host, final int port, final String urlhashfragment, final int count) { - assert urlhashfragment.length() == 6; - this.hostname = host; - this.port = port; - this.hosthash = urlhashfragment; - this.count = count; - } - } + } diff --git a/source/net/yacy/search/ranking/BlockRank.java b/source/net/yacy/search/ranking/BlockRank.java index c59e5d743..6311e6b25 100644 --- a/source/net/yacy/search/ranking/BlockRank.java +++ b/source/net/yacy/search/ranking/BlockRank.java @@ -29,13 +29,8 @@ import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.Iterator; -import java.util.List; -import java.util.Map; -import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.order.Base64Order; -import net.yacy.cora.sorting.OrderedScoreMap; -import net.yacy.cora.sorting.ScoreMap; import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.SpaceExceededException; import net.yacy.kelondro.index.BinSearch; @@ -47,10 +42,8 @@ import net.yacy.peers.Seed; import net.yacy.peers.SeedDB; import net.yacy.peers.graphics.WebStructureGraph; import net.yacy.peers.graphics.WebStructureGraph.HostReference; -import net.yacy.search.index.Fulltext.HostStat; import net.yacy.search.index.Segment; - public class BlockRank { /** @@ -149,64 +142,6 @@ public class BlockRank { return index; } - public static BinSearch[] evaluate(final ReferenceContainerCache index, final Map hostHashResolver, final BinSearch[] referenceTable, int recusions) { - - // first find out the maximum count of the hostHashResolver - int maxHostCount = 1; - for (final HostStat stat: hostHashResolver.values()) { - if (stat.count > maxHostCount) maxHostCount = stat.count; - } - - // then just count the number of references. all other information from the index is not used because they cannot be trusted - final ScoreMap hostScore = new OrderedScoreMap(index.termKeyOrdering()); - HostStat hostStat; - int hostCount; - for (final ReferenceContainer container: index) { - if (container.isEmpty()) continue; - if (referenceTable == null) { - hostStat = hostHashResolver.get(ASCII.String(container.getTermHash())); - hostCount = hostStat == null ? 6 /* high = a penalty for 'i do not know this', this may not be fair*/ : Math.max(1, hostStat.count); - hostScore.set(container.getTermHash(), container.size() * maxHostCount / hostCount); - } else { - int score = 0; - final Iterator hri = container.entries(); - HostReference hr; - while (hri.hasNext()) { - hr = hri.next(); - hostStat = hostHashResolver.get(ASCII.String(hr.urlhash())); - hostCount = hostStat == null ? 6 /* high = a penalty for 'i do not know this', this may not be fair*/ : Math.max(1, hostStat.count); - score += (17 - ranking(hr.urlhash(), referenceTable)) * maxHostCount / hostCount; - } - hostScore.set(container.getTermHash(), score); - } - } - - // now divide the scores into two halves until the score map is empty - final List table = new ArrayList(); - while (hostScore.size() > 10) { - final List smallest = hostScore.lowerHalf(); - if (smallest.isEmpty()) break; // should never happen but this ensures termination of the loop - ConcurrentLog.info("BlockRank", "index evaluation: computed partition of size " + smallest.size()); - table.add(new BinSearch(smallest, 6)); - for (final byte[] host: smallest) hostScore.delete(host); - } - if (!hostScore.isEmpty()) { - final ArrayList list = new ArrayList(); - for (final byte[] entry: hostScore) list.add(entry); - ConcurrentLog.info("BlockRank", "index evaluation: computed last partition of size " + list.size()); - table.add(new BinSearch(list, 6)); - } - - // the last table entry has now a list of host hashes that has the most references - final int binTables = Math.min(16, table.size()); - final BinSearch[] newTables = new BinSearch[binTables]; - for (int i = 0; i < binTables; i++) newTables[i] = table.get(table.size() - i - 1); - - // re-use the new table for a recursion - if (recusions == 0) return newTables; - return evaluate(index, hostHashResolver, newTables, --recusions); // one recursion step - } - public static int ranking(final byte[] hash, final BinSearch[] rankingTable) { if (rankingTable == null) return 16; byte[] hosthash;