separated RWI constraint evaluation from RWI ranking and added concurrency

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6274 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 16 years ago
parent ce7924d712
commit 0ba1beaf56
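The change replaces the synchronous minmaxfinder pass in ReferenceOrder with a Normalizer thread that decodes word references into a BlockingQueue and terminates the stream with a poison element, so RankingProcess can start constraint filtering while decoding is still in progress. A minimal, self-contained sketch of that poison-pill hand-off (class and entry names below are illustrative, not taken from the YaCy sources):

import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;

// Sketch of the producer/consumer hand-off used by Normalizer/normalizeWith (illustrative names).
public class PoisonPillSketch {

    // sentinel marking the end of the stream; compared by reference, like WordReferenceVars.poison
    static final String POISON = new String("poison");

    public static void main(final String[] args) throws InterruptedException {
        final BlockingQueue<String> queue = new LinkedBlockingQueue<String>();

        // producer: would decode and normalize entries, then enqueue the poison element
        final Thread producer = new Thread() {
            public void run() {
                try {
                    for (final String entry : new String[] {"a", "b", "c"}) {
                        queue.put(entry);      // decoding work happens here in the real Normalizer
                    }
                    queue.put(POISON);         // signal: no more entries will follow
                } catch (final InterruptedException e) {
                    Thread.currentThread().interrupt();
                }
            }
        };
        producer.start();

        // consumer: takes entries until it sees the sentinel (reference comparison is intentional)
        String entry;
        while ((entry = queue.take()) != POISON) {
            System.out.println("consumed " + entry);   // constraint checks and ranking would go here
        }
    }
}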

@@ -26,11 +26,11 @@
package de.anomic.kelondro.text;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import de.anomic.document.Condenser;
import de.anomic.kelondro.order.Bitfield;
@@ -43,8 +43,8 @@ import de.anomic.search.RankingProcess;
import de.anomic.yacy.yacyURL;
public class ReferenceOrder {
private WordReferenceVars min, max;
private final RankingProfile ranking;
private WordReferenceVars min, max;
private final RankingProfile ranking;
private final ScoreCluster<String> doms; // collected for "authority" heuristic
private int maxdomcount;
private String language;
@@ -58,8 +58,75 @@ public class ReferenceOrder {
this.language = language;
}
public class Normalizer extends Thread {
private ReferenceContainer<WordReference> container;
private BlockingQueue<WordReferenceVars> decodedEntries;
public Normalizer(final ReferenceContainer<WordReference> container) {
// normalize ranking: find minimum and maximum of separate ranking criteria
assert (container != null);
this.container = container;
this.decodedEntries = new LinkedBlockingQueue<WordReferenceVars>();
}
public void run() {
BlockingQueue<WordReferenceVars> vars = WordReferenceVars.transform(container);
WordReferenceVars entryMin = null;
WordReferenceVars entryMax = null;
HashMap<String, Integer> doms0 = new HashMap<String, Integer>();
Integer int1 = 1;
WordReferenceVars iEntry;
String dom;
Integer count;
try {
while ((iEntry = vars.take()) != WordReferenceVars.poison) {
decodedEntries.put(iEntry);
// find min/max
if (entryMin == null) entryMin = iEntry.clone(); else entryMin.min(iEntry);
if (entryMax == null) entryMax = iEntry.clone(); else entryMax.max(iEntry);
// update domcount
dom = iEntry.metadataHash().substring(6);
count = doms0.get(dom);
if (count == null) {
doms0.put(dom, int1);
} else {
doms0.put(dom, Integer.valueOf(count.intValue() + 1));
}
}
} catch (InterruptedException e) {}
if (min == null) min = entryMin.clone(); else min.min(entryMin);
if (max == null) max = entryMax.clone(); else max.max(entryMax);
Map.Entry<String, Integer> entry;
final Iterator<Map.Entry<String, Integer>> di = doms0.entrySet().iterator();
while (di.hasNext()) {
entry = di.next();
doms.addScore(entry.getKey(), (entry.getValue()).intValue());
}
if (doms.size() > 0) maxdomcount = doms.getMaxScore();
try {
decodedEntries.put(WordReferenceVars.poison);
} catch (InterruptedException e) {}
}
public BlockingQueue<WordReferenceVars> decoded() {
return this.decodedEntries;
}
}
public BlockingQueue<WordReferenceVars> normalizeWith(final ReferenceContainer<WordReference> container) {
Normalizer n = new Normalizer(container);
n.start();
return n.decoded();
}
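Callers drain the returned queue until they take the WordReferenceVars.poison sentinel, mirroring the loop added to RankingProcess further below. Roughly, on the consumer side (order and container stand for a local ReferenceOrder and the container to be normalized):

final BlockingQueue<WordReferenceVars> decoded = order.normalizeWith(container);
WordReferenceVars entry;
try {
    while ((entry = decoded.take()) != WordReferenceVars.poison) {
        // constraint checks and ranking are applied to 'entry' here
    }
} catch (final InterruptedException e) {
    Thread.currentThread().interrupt();
}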
/*
public ArrayList<WordReferenceVars> normalizeWith(final ReferenceContainer<WordReference> container) {
// normalize ranking: find minimum and maxiumum of separate ranking criteria
// normalize ranking: find minimum and maximum of separate ranking criteria
assert (container != null);
BlockingQueue<WordReferenceVars> vars = WordReferenceVars.transform(container);
@@ -101,6 +168,7 @@ public class ReferenceOrder {
if (this.doms.size() > 0) this.maxdomcount = this.doms.getMaxScore();
return decodedEntries;
}
*/
public int authority(final String urlHash) {
return (doms.getScore(urlHash.substring(6)) << 8) / (1 + this.maxdomcount);
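For example, with a hypothetical domain counted 3 times while the most frequent domain was counted 12 times, authority() returns (3 << 8) / (1 + 12) = 768 / 13 = 59, i.e. a score on a roughly 0..255 scale relative to the dominant domain.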
@@ -154,42 +222,4 @@ public class ReferenceOrder {
// this is to patch a bad language name setting that was used in 0.60 and before
if (l.equals("uk")) return "en"; else return l;
}
public static class minmaxfinder {
private WordReferenceVars entryMin;
private WordReferenceVars entryMax;
private final HashMap<String, Integer> doms;
private final Integer int1;
private final ArrayList<WordReferenceVars> decodedEntries;
public minmaxfinder(final BlockingQueue<WordReferenceVars> vars) {
this.doms = new HashMap<String, Integer>();
this.int1 = 1;
this.decodedEntries = new ArrayList<WordReferenceVars>();
this.entryMin = null;
this.entryMax = null;
WordReferenceVars iEntry;
String dom;
Integer count;
try {
while ((iEntry = vars.take()) != WordReferenceVars.poison) {
this.decodedEntries.add(iEntry);
// find min/max
if (this.entryMin == null) this.entryMin = iEntry.clone(); else this.entryMin.min(iEntry);
if (this.entryMax == null) this.entryMax = iEntry.clone(); else this.entryMax.max(iEntry);
// update domcount
dom = iEntry.metadataHash().substring(6);
count = doms.get(dom);
if (count == null) {
doms.put(dom, int1);
} else {
doms.put(dom, Integer.valueOf(count.intValue() + 1));
}
}
} catch (InterruptedException e) {}
}
}
}

@@ -37,6 +37,7 @@ import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import de.anomic.document.Condenser;
@@ -165,83 +166,96 @@ public final class RankingProcess extends Thread {
long timer = System.currentTimeMillis();
// normalize entries
final ArrayList<WordReferenceVars> decodedEntries = this.order.normalizeWith(index);
final BlockingQueue<WordReferenceVars> decodedEntries = this.order.normalizeWith(index);
serverProfiling.update("SEARCH", new ProfilingGraph.searchEvent(query.id(true), SearchEvent.NORMALIZING, index.size(), System.currentTimeMillis() - timer), false);
// iterate over normalized entries and select some that are better than currently stored
timer = System.currentTimeMillis();
Long r;
HostInfo hs;
String domhash;
boolean nav_hosts = this.query.navigators.equals("all") || this.query.navigators.indexOf("hosts") >= 0;
for (WordReferenceVars iEntry: decodedEntries) {
assert (iEntry.metadataHash().length() == index.row().primaryKeyLength);
//if (iEntry.urlHash().length() != index.row().primaryKeyLength) continue;
WordReferenceVars iEntry;
final ArrayList<WordReferenceVars> filteredEntries = new ArrayList<WordReferenceVars>();
// apply all filter
try {
while ((iEntry = decodedEntries.take()) != WordReferenceVars.poison) {
assert (iEntry.metadataHash().length() == index.row().primaryKeyLength);
//if (iEntry.urlHash().length() != index.row().primaryKeyLength) continue;
// increase flag counts
for (int j = 0; j < 32; j++) {
if (iEntry.flags().get(j)) {flagcount[j]++;}
}
// kick out entries that are too bad according to current findings
r = Long.valueOf(order.cardinal(iEntry));
if ((maxentries >= 0) && (stack.size() >= maxentries) && (stack.bottom(r.longValue()))) continue;
// check constraints
if (!testFlags(iEntry)) continue;
// check document domain
if (query.contentdom != QueryParams.CONTENTDOM_TEXT) {
if ((query.contentdom == QueryParams.CONTENTDOM_AUDIO) && (!(iEntry.flags().get(Condenser.flag_cat_hasaudio)))) continue;
if ((query.contentdom == QueryParams.CONTENTDOM_VIDEO) && (!(iEntry.flags().get(Condenser.flag_cat_hasvideo)))) continue;
if ((query.contentdom == QueryParams.CONTENTDOM_IMAGE) && (!(iEntry.flags().get(Condenser.flag_cat_hasimage)))) continue;
if ((query.contentdom == QueryParams.CONTENTDOM_APP ) && (!(iEntry.flags().get(Condenser.flag_cat_hasapp )))) continue;
}
// increase flag counts
for (int j = 0; j < 32; j++) {
if (iEntry.flags().get(j)) {flagcount[j]++;}
}
// check constraints
if (!testFlags(iEntry)) continue;
// check document domain
if (query.contentdom != QueryParams.CONTENTDOM_TEXT) {
if ((query.contentdom == QueryParams.CONTENTDOM_AUDIO) && (!(iEntry.flags().get(Condenser.flag_cat_hasaudio)))) continue;
if ((query.contentdom == QueryParams.CONTENTDOM_VIDEO) && (!(iEntry.flags().get(Condenser.flag_cat_hasvideo)))) continue;
if ((query.contentdom == QueryParams.CONTENTDOM_IMAGE) && (!(iEntry.flags().get(Condenser.flag_cat_hasimage)))) continue;
if ((query.contentdom == QueryParams.CONTENTDOM_APP ) && (!(iEntry.flags().get(Condenser.flag_cat_hasapp )))) continue;
}
// check tld domain
if (!yacyURL.matchesAnyDomDomain(iEntry.metadataHash(), this.query.zonecode)) {
// filter out all tld that do not match with wanted tld domain
continue;
}
// check site constraints
if (query.sitehash != null && !iEntry.metadataHash().substring(6).equals(query.sitehash)) {
// filter out all domains that do not match with the site constraint
continue;
}
// count domZones
this.domZones[yacyURL.domDomain(iEntry.metadataHash())]++;
// get statistics for host navigator
if (nav_hosts) {
domhash = iEntry.urlHash.substring(6);
hs = this.hostNavigator.get(domhash);
if (hs == null) {
this.hostNavigator.put(domhash, new HostInfo(iEntry.urlHash));
} else {
hs.inc();
}
}
// insert
if ((maxentries < 0) || (stack.size() < maxentries)) {
// in case that we don't have enough yet, accept any new entry
if (urlhashes.containsKey(iEntry.metadataHash())) continue;
stack.push(iEntry, r);
} else {
// if we already have enough entries, insert only such that are necessary to get a better result
if (stack.bottom(r.longValue())) {
continue;
}
// double-check
if (urlhashes.containsKey(iEntry.metadataHash())) continue;
stack.push(iEntry, r);
}
// increase counter for statistics
if (!local) this.remote_indexCount++;
}
// check tld domain
if (!yacyURL.matchesAnyDomDomain(iEntry.metadataHash(), this.query.zonecode)) {
// filter out all tld that do not match with wanted tld domain
continue;
}
// check site constraints
if (query.sitehash != null && !iEntry.metadataHash().substring(6).equals(query.sitehash)) {
// filter out all domains that do not match with the site constraint
continue;
}
// count domZones
this.domZones[yacyURL.domDomain(iEntry.metadataHash())]++;
// get statistics for host navigator
if (nav_hosts) {
domhash = iEntry.urlHash.substring(6);
hs = this.hostNavigator.get(domhash);
if (hs == null) {
this.hostNavigator.put(domhash, new HostInfo(iEntry.urlHash));
} else {
hs.inc();
}
}
// accept
filteredEntries.add(iEntry);
// increase counter for statistics
if (!local) this.remote_indexCount++;
}
} catch (InterruptedException e) {}
// do the ranking
Long r;
for (WordReferenceVars fEntry: filteredEntries) {
// kick out entries that are too bad according to current findings
r = Long.valueOf(order.cardinal(fEntry));
if ((maxentries >= 0) && (stack.size() >= maxentries) && (stack.bottom(r.longValue()))) continue;
// insert
if ((maxentries < 0) || (stack.size() < maxentries)) {
// in case that we don't have enough yet, accept any new entry
if (urlhashes.containsKey(fEntry.metadataHash())) continue;
stack.push(fEntry, r);
} else {
// if we already have enough entries, insert only such that are necessary to get a better result
if (stack.bottom(r.longValue())) {
continue;
}
// double-check
if (urlhashes.containsKey(fEntry.metadataHash())) continue;
stack.push(fEntry, r);
}
}
//if ((query.neededResults() > 0) && (container.size() > query.neededResults())) remove(true, true);
serverProfiling.update("SEARCH", new ProfilingGraph.searchEvent(query.id(true), SearchEvent.PRESORT, index.size(), System.currentTimeMillis() - timer), false);
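In outline, the reworked loop in RankingProcess now runs in two phases: it first drains the decoded queue and applies all constraint filters (flag counts, content domain, TLD zone, site hash) while the Normalizer is still producing, and only afterwards computes the ranking cardinal and pushes the survivors onto the bounded stack. A condensed sketch of that structure, where passesConstraints is a hypothetical helper standing in for the checks shown above:

// phase 1: consume the queue concurrently with decoding, keep only entries passing all constraints
final ArrayList<WordReferenceVars> filtered = new ArrayList<WordReferenceVars>();
WordReferenceVars entry;
try {
    while ((entry = decodedEntries.take()) != WordReferenceVars.poison) {
        if (!passesConstraints(entry)) continue;   // flag, content-domain, TLD and site checks
        filtered.add(entry);
    }
} catch (final InterruptedException e) {}

// phase 2: rank the filtered entries and insert them into the bounded stack
for (final WordReferenceVars f : filtered) {
    final Long r = Long.valueOf(order.cardinal(f));
    if (maxentries >= 0 && stack.size() >= maxentries && stack.bottom(r.longValue())) continue;
    if (urlhashes.containsKey(f.metadataHash())) continue;
    stack.push(f, r);
}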
