diff --git a/source/de/anomic/kelondro/text/ReferenceOrder.java b/source/de/anomic/kelondro/text/ReferenceOrder.java
index 0c4fde03f..e1a77b92d 100644
--- a/source/de/anomic/kelondro/text/ReferenceOrder.java
+++ b/source/de/anomic/kelondro/text/ReferenceOrder.java
@@ -26,11 +26,11 @@
 
 package de.anomic.kelondro.text;
 
-import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.Iterator;
 import java.util.Map;
 import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.LinkedBlockingQueue;
 
 import de.anomic.document.Condenser;
 import de.anomic.kelondro.order.Bitfield;
@@ -43,8 +43,8 @@ import de.anomic.search.RankingProcess;
 import de.anomic.yacy.yacyURL;
 
 public class ReferenceOrder {
-    private WordReferenceVars min, max;
-    private final RankingProfile ranking;
+    private WordReferenceVars min, max;
+    private final RankingProfile ranking;
     private final ScoreCluster doms; // collected for "authority" heuristic
     private int maxdomcount;
     private String language;
@@ -58,8 +58,105 @@ public class ReferenceOrder {
         this.language = language;
     }
 
+    /**
+     * Decodes the references of one container in a thread of its own. Every decoded
+     * entry is forwarded to the queue returned by {@link #decoded()}; the minimum and
+     * maximum entries and the per-domain counts seen on the way are merged into the
+     * enclosing ReferenceOrder before the terminating WordReferenceVars.poison is queued.
+     * NOTE(review): the merge into min, max and doms is not synchronized - confirm that
+     * two containers are never normalized concurrently with the same ReferenceOrder.
+     */
+    public class Normalizer extends Thread {
+
+        private final ReferenceContainer container;
+        private final BlockingQueue<WordReferenceVars> decodedEntries;
+
+        public Normalizer(final ReferenceContainer container) {
+            // normalize ranking: find minimum and maximum of separate ranking criteria
+            assert (container != null);
+            this.container = container;
+            this.decodedEntries = new LinkedBlockingQueue<WordReferenceVars>();
+        }
+
+        @Override
+        public void run() {
+            try {
+                collect();
+            } finally {
+                // offer() on the unbounded queue neither blocks nor checks the interrupt
+                // flag, so the consumer is released even if collect() was interrupted
+                // or ended with a RuntimeException
+                decodedEntries.offer(WordReferenceVars.poison);
+            }
+        }
+
+        /**
+         * Drains the decoder queue, forwards every entry to the consumer and folds the
+         * collected statistics into the enclosing ReferenceOrder.
+         */
+        private void collect() {
+            final BlockingQueue<WordReferenceVars> vars = WordReferenceVars.transform(container);
+
+            WordReferenceVars entryMin = null;
+            WordReferenceVars entryMax = null;
+            final HashMap<String, Integer> doms0 = new HashMap<String, Integer>();
+
+            WordReferenceVars iEntry;
+            String dom;
+            Integer count;
+            try {
+                while ((iEntry = vars.take()) != WordReferenceVars.poison) {
+                    decodedEntries.put(iEntry);
+                    // find min/max
+                    if (entryMin == null) entryMin = iEntry.clone(); else entryMin.min(iEntry);
+                    if (entryMax == null) entryMax = iEntry.clone(); else entryMax.max(iEntry);
+                    // update domcount
+                    dom = iEntry.metadataHash().substring(6);
+                    count = doms0.get(dom);
+                    doms0.put(dom, Integer.valueOf((count == null) ? 1 : count.intValue() + 1));
+                }
+            } catch (InterruptedException e) {
+                // keep what was collected so far, but do not hide the interruption
+                Thread.currentThread().interrupt();
+            }
+
+            // an empty container (or an interruption before the first entry) yields no
+            // entry at all; entryMin and entryMax are still null then and must not be merged
+            if (entryMin != null) {
+                if (min == null) min = entryMin.clone(); else min.min(entryMin);
+                if (max == null) max = entryMax.clone(); else max.max(entryMax);
+            }
+            for (final Map.Entry<String, Integer> entry : doms0.entrySet()) {
+                doms.addScore(entry.getKey(), entry.getValue().intValue());
+            }
+
+            if (doms.size() > 0) maxdomcount = doms.getMaxScore();
+        }
+
+        /**
+         * @return the queue that receives the decoded entries, terminated by WordReferenceVars.poison
+         */
+        public BlockingQueue<WordReferenceVars> decoded() {
+            return this.decodedEntries;
+        }
+    }
+
+    /**
+     * Starts a Normalizer thread for the given container and returns at once.
+     *
+     * @param container the references to decode; must not be null
+     * @return queue of decoded entries; read it up to WordReferenceVars.poison before
+     *         relying on the statistics collected from this container
+     */
+    public BlockingQueue<WordReferenceVars> normalizeWith(final ReferenceContainer container) {
+        Normalizer n = new Normalizer(container);
+        n.start();
+        return n.decoded();
+    }
+
+    /*
     public ArrayList normalizeWith(final ReferenceContainer container) {
-        // normalize ranking: find minimum and maxiumum of separate ranking criteria
+        // normalize ranking: find minimum and maximum of separate ranking criteria
         assert (container != null);
         BlockingQueue vars = WordReferenceVars.transform(container);
 
@@ -101,6 +198,7 @@ public class ReferenceOrder {
         if (this.doms.size() > 0) this.maxdomcount = this.doms.getMaxScore();
         return decodedEntries;
     }
+    */
 
     public int authority(final String urlHash) {
         return (doms.getScore(urlHash.substring(6)) << 8) / (1 + this.maxdomcount);
@@ -154,42 +252,4 @@ public class ReferenceOrder {
         // this is to patch a bad language name setting that was used in 0.60 and before
         if (l.equals("uk")) return "en"; else return l;
     }
-
-    public static class minmaxfinder {
-
-        private WordReferenceVars entryMin;
-        private WordReferenceVars entryMax;
-        private final HashMap doms;
-        private final Integer int1;
-        private final ArrayList decodedEntries;
-
-        public minmaxfinder(final 
BlockingQueue vars) { - this.doms = new HashMap(); - this.int1 = 1; - this.decodedEntries = new ArrayList(); - this.entryMin = null; - this.entryMax = null; - WordReferenceVars iEntry; - String dom; - Integer count; - try { - while ((iEntry = vars.take()) != WordReferenceVars.poison) { - this.decodedEntries.add(iEntry); - // find min/max - if (this.entryMin == null) this.entryMin = iEntry.clone(); else this.entryMin.min(iEntry); - if (this.entryMax == null) this.entryMax = iEntry.clone(); else this.entryMax.max(iEntry); - // update domcount - dom = iEntry.metadataHash().substring(6); - count = doms.get(dom); - if (count == null) { - doms.put(dom, int1); - } else { - doms.put(dom, Integer.valueOf(count.intValue() + 1)); - } - } - } catch (InterruptedException e) {} - } - - } - } diff --git a/source/de/anomic/search/RankingProcess.java b/source/de/anomic/search/RankingProcess.java index 00cb03a11..eccc09916 100644 --- a/source/de/anomic/search/RankingProcess.java +++ b/source/de/anomic/search/RankingProcess.java @@ -37,6 +37,7 @@ import java.util.Iterator; import java.util.Map; import java.util.Set; import java.util.TreeSet; +import java.util.concurrent.BlockingQueue; import java.util.concurrent.ConcurrentHashMap; import de.anomic.document.Condenser; @@ -165,83 +166,96 @@ public final class RankingProcess extends Thread { long timer = System.currentTimeMillis(); // normalize entries - final ArrayList decodedEntries = this.order.normalizeWith(index); + final BlockingQueue decodedEntries = this.order.normalizeWith(index); serverProfiling.update("SEARCH", new ProfilingGraph.searchEvent(query.id(true), SearchEvent.NORMALIZING, index.size(), System.currentTimeMillis() - timer), false); // iterate over normalized entries and select some that are better than currently stored timer = System.currentTimeMillis(); - Long r; HostInfo hs; String domhash; boolean nav_hosts = this.query.navigators.equals("all") || this.query.navigators.indexOf("hosts") >= 0; - for 
(WordReferenceVars iEntry: decodedEntries) { - assert (iEntry.metadataHash().length() == index.row().primaryKeyLength); - //if (iEntry.urlHash().length() != index.row().primaryKeyLength) continue; + WordReferenceVars iEntry; + final ArrayList filteredEntries = new ArrayList(); + // apply all filter + try { + while ((iEntry = decodedEntries.take()) != WordReferenceVars.poison) { + assert (iEntry.metadataHash().length() == index.row().primaryKeyLength); + //if (iEntry.urlHash().length() != index.row().primaryKeyLength) continue; - // increase flag counts - for (int j = 0; j < 32; j++) { - if (iEntry.flags().get(j)) {flagcount[j]++;} - } - - // kick out entries that are too bad according to current findings - r = Long.valueOf(order.cardinal(iEntry)); - if ((maxentries >= 0) && (stack.size() >= maxentries) && (stack.bottom(r.longValue()))) continue; - - // check constraints - if (!testFlags(iEntry)) continue; - - // check document domain - if (query.contentdom != QueryParams.CONTENTDOM_TEXT) { - if ((query.contentdom == QueryParams.CONTENTDOM_AUDIO) && (!(iEntry.flags().get(Condenser.flag_cat_hasaudio)))) continue; - if ((query.contentdom == QueryParams.CONTENTDOM_VIDEO) && (!(iEntry.flags().get(Condenser.flag_cat_hasvideo)))) continue; - if ((query.contentdom == QueryParams.CONTENTDOM_IMAGE) && (!(iEntry.flags().get(Condenser.flag_cat_hasimage)))) continue; - if ((query.contentdom == QueryParams.CONTENTDOM_APP ) && (!(iEntry.flags().get(Condenser.flag_cat_hasapp )))) continue; - } + // increase flag counts + for (int j = 0; j < 32; j++) { + if (iEntry.flags().get(j)) {flagcount[j]++;} + } + + // check constraints + if (!testFlags(iEntry)) continue; + + // check document domain + if (query.contentdom != QueryParams.CONTENTDOM_TEXT) { + if ((query.contentdom == QueryParams.CONTENTDOM_AUDIO) && (!(iEntry.flags().get(Condenser.flag_cat_hasaudio)))) continue; + if ((query.contentdom == QueryParams.CONTENTDOM_VIDEO) && (!(iEntry.flags().get(Condenser.flag_cat_hasvideo)))) 
continue; + if ((query.contentdom == QueryParams.CONTENTDOM_IMAGE) && (!(iEntry.flags().get(Condenser.flag_cat_hasimage)))) continue; + if ((query.contentdom == QueryParams.CONTENTDOM_APP ) && (!(iEntry.flags().get(Condenser.flag_cat_hasapp )))) continue; + } - // check tld domain - if (!yacyURL.matchesAnyDomDomain(iEntry.metadataHash(), this.query.zonecode)) { - // filter out all tld that do not match with wanted tld domain - continue; - } - - // check site constraints - if (query.sitehash != null && !iEntry.metadataHash().substring(6).equals(query.sitehash)) { - // filter out all domains that do not match with the site constraint - continue; - } - - // count domZones - this.domZones[yacyURL.domDomain(iEntry.metadataHash())]++; - - // get statistics for host navigator - if (nav_hosts) { - domhash = iEntry.urlHash.substring(6); - hs = this.hostNavigator.get(domhash); - if (hs == null) { - this.hostNavigator.put(domhash, new HostInfo(iEntry.urlHash)); - } else { - hs.inc(); - } - } - - // insert - if ((maxentries < 0) || (stack.size() < maxentries)) { - // in case that we don't have enough yet, accept any new entry - if (urlhashes.containsKey(iEntry.metadataHash())) continue; - stack.push(iEntry, r); - } else { - // if we already have enough entries, insert only such that are necessary to get a better result - if (stack.bottom(r.longValue())) { - continue; - } - // double-check - if (urlhashes.containsKey(iEntry.metadataHash())) continue; - stack.push(iEntry, r); - } - - // increase counter for statistics - if (!local) this.remote_indexCount++; - } + // check tld domain + if (!yacyURL.matchesAnyDomDomain(iEntry.metadataHash(), this.query.zonecode)) { + // filter out all tld that do not match with wanted tld domain + continue; + } + + // check site constraints + if (query.sitehash != null && !iEntry.metadataHash().substring(6).equals(query.sitehash)) { + // filter out all domains that do not match with the site constraint + continue; + } + + // count domZones + 
this.domZones[yacyURL.domDomain(iEntry.metadataHash())]++; + + // get statistics for host navigator + if (nav_hosts) { + domhash = iEntry.urlHash.substring(6); + hs = this.hostNavigator.get(domhash); + if (hs == null) { + this.hostNavigator.put(domhash, new HostInfo(iEntry.urlHash)); + } else { + hs.inc(); + } + } + + // accept + filteredEntries.add(iEntry); + + // increase counter for statistics + if (!local) this.remote_indexCount++; + } + } catch (InterruptedException e) {} + + // do the ranking + Long r; + for (WordReferenceVars fEntry: filteredEntries) { + + // kick out entries that are too bad according to current findings + r = Long.valueOf(order.cardinal(fEntry)); + if ((maxentries >= 0) && (stack.size() >= maxentries) && (stack.bottom(r.longValue()))) continue; + + // insert + if ((maxentries < 0) || (stack.size() < maxentries)) { + // in case that we don't have enough yet, accept any new entry + if (urlhashes.containsKey(fEntry.metadataHash())) continue; + stack.push(fEntry, r); + } else { + // if we already have enough entries, insert only such that are necessary to get a better result + if (stack.bottom(r.longValue())) { + continue; + } + // double-check + if (urlhashes.containsKey(fEntry.metadataHash())) continue; + stack.push(fEntry, r); + } + + } //if ((query.neededResults() > 0) && (container.size() > query.neededResults())) remove(true, true); serverProfiling.update("SEARCH", new ProfilingGraph.searchEvent(query.id(true), SearchEvent.PRESORT, index.size(), System.currentTimeMillis() - timer), false);