renamed topwords to topics and enhanced the computation of topics

topics are now computed from the document title only, not from the document url,
because the host navigator is now responsible for the statistical effects of urls.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6011 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 16 years ago
parent 61d9e131b4
commit ab06a6edd2
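
Note for orientation before the diff: the patch replaces the old ScoreCluster-based reference counting with a plain word counter that is fed from result titles only and read back as sorted name/count pairs for the topic navigator. The following condensed, standalone sketch illustrates that scheme. The class and method names here are illustrative only and are not part of the patch; the patch itself uses plasmaSearchRankingProcess.addTopic()/getTopicNavigator() as shown in the hunks below, and its addTopic() additionally skips a small stop-word list and the query words themselves.

    // TopicSketch.java -- illustrative sketch only, not YaCy code.
    // Counts words taken from document titles and returns the most frequent
    // ones as (word, count) pairs, mirroring the approach of the patch below.
    import java.util.ArrayList;
    import java.util.Collections;
    import java.util.Comparator;
    import java.util.Map;
    import java.util.concurrent.ConcurrentHashMap;

    public class TopicSketch {

        private final ConcurrentHashMap<String, Integer> ref = new ConcurrentHashMap<String, Integer>();

        // feed one document title; urls are deliberately not used any more
        public void addTopic(final String title) {
            for (final String word : title.toLowerCase().split("\\W+")) {
                if (word.length() > 2) {
                    final Integer c = ref.get(word);
                    if (c == null) ref.put(word, 1); else ref.put(word, c.intValue() + 1);
                }
            }
        }

        // return the count highest-scoring words, highest count first
        public ArrayList<Map.Entry<String, Integer>> topics(final int count) {
            final ArrayList<Map.Entry<String, Integer>> a =
                new ArrayList<Map.Entry<String, Integer>>(ref.entrySet());
            Collections.sort(a, new Comparator<Map.Entry<String, Integer>>() {
                public int compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2) {
                    return o2.getValue().compareTo(o1.getValue()); // descending by count
                }
            });
            return new ArrayList<Map.Entry<String, Integer>>(a.subList(0, Math.min(count, a.size())));
        }

        public static void main(final String[] args) {
            final TopicSketch t = new TopicSketch();
            t.addTopic("YaCy decentralized web search");
            t.addTopic("decentralized search with a distributed index");
            System.out.println(t.topics(3)); // e.g. [decentralized=2, search=2, ...]
        }
    }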

@@ -32,7 +32,6 @@ import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.Iterator;
 import java.util.Map;
-import java.util.Set;
 import java.util.TreeSet;
 
 import de.anomic.content.RSSMessage;
@@ -49,6 +48,7 @@ import de.anomic.plasma.plasmaSearchQuery;
 import de.anomic.plasma.plasmaSearchRankingProfile;
 import de.anomic.plasma.plasmaSwitchboard;
 import de.anomic.plasma.plasmaSearchEvent.ResultEntry;
+import de.anomic.plasma.plasmaSearchRankingProcess.NavigatorEntry;
 import de.anomic.server.serverCore;
 import de.anomic.server.serverObjects;
 import de.anomic.server.serverProfiling;
@@ -321,11 +321,10 @@ public final class search {
 
             // prepare reference hints
             final long timer = System.currentTimeMillis();
-            final Set<String> ws = theSearch.references(10);
+            final ArrayList<NavigatorEntry> ws = theSearch.topics(10);
             final StringBuilder refstr = new StringBuilder();
-            final Iterator<String> j = ws.iterator();
-            while (j.hasNext()) {
-                refstr.append(",").append(j.next());
+            for (NavigatorEntry e: ws) {
+                refstr.append(",").append(e.name);
             }
             prop.put("references", (refstr.length() > 0) ? refstr.substring(1) : refstr.toString());
             serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(theQuery.id(true), "reference collection", ws.size(), System.currentTimeMillis() - timer), false);

@@ -26,7 +26,6 @@
 
 import java.util.ArrayList;
 import java.util.Iterator;
-import java.util.Set;
 import java.util.TreeSet;
 
 import de.anomic.http.httpRequestHeader;
@@ -36,7 +35,7 @@ import de.anomic.plasma.plasmaProfiling;
 import de.anomic.plasma.plasmaSearchEvent;
 import de.anomic.plasma.plasmaSearchQuery;
 import de.anomic.plasma.plasmaSwitchboard;
-import de.anomic.plasma.plasmaSearchRankingProcess.hostnaventry;
+import de.anomic.plasma.plasmaSearchRankingProcess.NavigatorEntry;
 import de.anomic.server.serverObjects;
 import de.anomic.server.serverProfiling;
 import de.anomic.server.serverSwitch;
@@ -65,17 +64,17 @@ public class yacysearchtrailer {
 
         // compose search navigation
-        ArrayList<hostnaventry> hostNavigator = theSearch.getHostNavigator(10);
+        ArrayList<NavigatorEntry> hostNavigator = theSearch.getHostNavigator(10);
         if (hostNavigator == null) {
             prop.put("navigation", 0);
         } else {
             prop.put("navigation", 1);
-            hostnaventry entry;
+            NavigatorEntry entry;
             int i;
             for (i = 0; i < hostNavigator.size(); i++) {
                 entry = hostNavigator.get(i);
-                prop.put("navigation_domains_" + i + "_domain", plasmaSearchQuery.navurla(0, display, theQuery, theQuery.urlMask, "site:" + entry.host) + entry.host + " (" + entry.count + ")</a>");
-                prop.putJSON("navigation_domains_" + i + "_domain-json", plasmaSearchQuery.navurla(0, display, theQuery, theQuery.urlMask, "site:" + entry.host) + entry.host + " (" + entry.count + ")</a>");
+                prop.put("navigation_domains_" + i + "_domain", plasmaSearchQuery.navurla(0, display, theQuery, theQuery.urlMask, "site:" + entry.name) + entry.name + " (" + entry.count + ")</a>");
+                prop.putJSON("navigation_domains_" + i + "_domain-json", plasmaSearchQuery.navurla(0, display, theQuery, theQuery.urlMask, "site:" + entry.name) + entry.name + " (" + entry.count + ")</a>");
                 prop.put("navigation_domains_" + i + "_nl", 1);
             }
             i--;
@@ -84,16 +83,13 @@ public class yacysearchtrailer {
         }
 
         // attach the bottom line with search references (topwords)
-        final Set<String> references = theSearch.references(20);
+        final ArrayList<NavigatorEntry> references = theSearch.topics(20);
         if (references.size() > 0) {
             // get the topwords
             final TreeSet<String> topwords = new TreeSet<String>(NaturalOrder.naturalComparator);
-            String tmp = "";
-            final Iterator<String> i = references.iterator();
-            while (i.hasNext()) {
-                tmp = i.next();
-                if (tmp.matches("[a-z]+")) {
-                    topwords.add(tmp);
+            for (NavigatorEntry e: references) {
+                if (e.name.matches("[a-z]+")) {
+                    topwords.add(e.name);
                 }
             }

@@ -32,7 +32,6 @@ import java.util.Date;
 import java.util.HashMap;
 import java.util.Iterator;
 import java.util.Map;
-import java.util.Set;
 import java.util.TreeMap;
 import java.util.TreeSet;
 import java.util.concurrent.ConcurrentHashMap;
@@ -53,7 +52,7 @@ import de.anomic.kelondro.util.SortStore;
 import de.anomic.kelondro.util.Log;
 import de.anomic.plasma.parser.Word;
 import de.anomic.plasma.parser.Condenser;
-import de.anomic.plasma.plasmaSearchRankingProcess.hostnaventry;
+import de.anomic.plasma.plasmaSearchRankingProcess.NavigatorEntry;
 import de.anomic.plasma.plasmaSnippetCache.MediaSnippet;
 import de.anomic.server.serverProfiling;
 import de.anomic.yacy.yacySearch;
@@ -99,7 +98,7 @@ public final class plasmaSearchEvent {
     long urlRetrievalAllTime;
     long snippetComputationAllTime;
     public ResultURLs crawlResults;
-    private ArrayList<hostnaventry> hostNavigator;
+    private ArrayList<NavigatorEntry> hostNavigator;
 
     @SuppressWarnings("unchecked")
     private plasmaSearchEvent(final plasmaSearchQuery query,
@@ -559,7 +558,7 @@ public final class plasmaSearchEvent {
                     // place the result to the result vector
                     if (!result.exists(resultEntry)) {
                         result.push(resultEntry, Long.valueOf(rankedCache.getOrder().cardinal(resultEntry.word())));
-                        rankedCache.addReferences(resultEntry);
+                        rankedCache.addTopics(resultEntry);
                     }
                     //System.out.println("DEBUG SNIPPET_LOADING: thread " + id + " got " + resultEntry.url());
                 }
@@ -579,7 +578,7 @@ public final class plasmaSearchEvent {
        Log.logInfo("search", "sorted out hash " + urlhash + " during search: " + reason);
    }
 
-    public ArrayList<hostnaventry> getHostNavigator(int maxentries) {
+    public ArrayList<NavigatorEntry> getHostNavigator(int maxentries) {
        if (this.hostNavigator != null) return this.hostNavigator;
        if (localSearchThread != null && localSearchThread.isAlive()) {
            try {Thread.sleep(100L);} catch (final InterruptedException e) {}
@@ -778,9 +777,9 @@ public final class plasmaSearchEvent {
        //assert e != null;
    }
 
-    public Set<String> references(final int count) {
+    public ArrayList<NavigatorEntry> topics(final int count) {
        // returns a set of words that are computed as toplist
-        return this.rankedCache.getReferences(count);
+        return this.rankedCache.getTopicNavigator(count);
    }
 
    public static class ResultEntry {

@@ -48,7 +48,6 @@ import de.anomic.kelondro.text.Segment;
 import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
 import de.anomic.kelondro.text.referencePrototype.WordReference;
 import de.anomic.kelondro.text.referencePrototype.WordReferenceVars;
-import de.anomic.kelondro.util.ScoreCluster;
 import de.anomic.kelondro.util.SortStack;
 import de.anomic.kelondro.util.FileUtils;
 import de.anomic.plasma.parser.Word;
@@ -71,13 +70,13 @@ public final class plasmaSearchRankingProcess {
    private int remote_peerCount, remote_indexCount, remote_resourceSize, local_resourceSize;
    private final ReferenceOrder order;
    private final ConcurrentHashMap<String, Integer> urlhashes; // map for double-check; String/Long relation, addresses ranking number (backreference for deletion)
-    private final ScoreCluster<String> ref; // reference score computation for the commonSense heuristic
    private final int[] flagcount; // flag counter
    private final TreeSet<String> misses; // contains url-hashes that could not been found in the LURL-DB
    private final Segment indexSegment;
    private HashMap<byte[], ReferenceContainer<WordReference>>[] localSearchContainerMaps;
    private final int[] domZones;
-    private ConcurrentHashMap<String, hoststat> hostNavigator;
+    private final ConcurrentHashMap<String, hoststat> hostNavigator;
+    private final ConcurrentHashMap<String, Integer> ref; // reference score computation for the commonSense heuristic
 
    public plasmaSearchRankingProcess(
            final Segment indexSegment,
@@ -99,13 +98,13 @@ public final class plasmaSearchRankingProcess {
        this.remote_resourceSize = 0;
        this.local_resourceSize = 0;
        this.urlhashes = new ConcurrentHashMap<String, Integer>(0, 0.75f, concurrency);
-        this.ref = new ScoreCluster<String>();
        this.misses = new TreeSet<String>();
        this.indexSegment = indexSegment;
        this.flagcount = new int[32];
        for (int i = 0; i < 32; i++) {this.flagcount[i] = 0;}
-        this.domZones = new int[8];
        this.hostNavigator = new ConcurrentHashMap<String, hoststat>();
+        this.ref = new ConcurrentHashMap<String, Integer>();
+        this.domZones = new int[8];
        for (int i = 0; i < 8; i++) {this.domZones[i] = 0;}
    }
@@ -232,52 +231,6 @@
        serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), plasmaSearchEvent.PRESORT, index.size(), System.currentTimeMillis() - timer), false);
    }
 
-    public class hoststat {
-        public int count;
-        public String hashsample;
-        public hoststat(String urlhash) {
-            this.count = 1;
-            this.hashsample = urlhash;
-        }
-        public void inc() {
-            this.count++;
-        }
-    }
-
-    public static final Comparator<hoststat> hscomp = new Comparator<hoststat>() {
-        public int compare(hoststat o1, hoststat o2) {
-            if (o1.count < o2.count) return 1;
-            if (o2.count < o1.count) return -1;
-            return 0;
-        }
-    };
-
-    public class hostnaventry {
-        public int count;
-        public String host;
-        public hostnaventry(String host, int count) {
-            this.host = host;
-            this.count = count;
-        }
-    }
-
-    public ArrayList<hostnaventry> getHostNavigator(int maxentries) {
-        hoststat[] hsa = this.hostNavigator.values().toArray(new hoststat[this.hostNavigator.size()]);
-        Arrays.sort(hsa, hscomp);
-        int rc = Math.min(maxentries, hsa.length);
-        ArrayList<hostnaventry> result = new ArrayList<hostnaventry>();
-        URLMetadataRow mr;
-        yacyURL url;
-        for (int i = 0; i < rc; i++) {
-            mr = indexSegment.urlMetadata().load(hsa[i].hashsample, null, 0);
-            if (mr == null) continue;
-            url = mr.metadata().url();
-            if (url == null) continue;
-            result.add(new hostnaventry(url.getHost(), hsa[i].count));
-        }
-        return result;
-    }
-
    private boolean testFlags(final WordReference ientry) {
        if (query.constraint == null) return true;
        // test if ientry matches with filter
@@ -424,37 +377,103 @@
        return this.misses.iterator();
    }
 
-    public Set<String> getReferences(final int count) {
+    public class hoststat {
+        public int count;
+        public String hashsample;
+        public hoststat(String urlhash) {
+            this.count = 1;
+            this.hashsample = urlhash;
+        }
+        public void inc() {
+            this.count++;
+        }
+    }
+
+    public static final Comparator<hoststat> hscomp = new Comparator<hoststat>() {
+        public int compare(hoststat o1, hoststat o2) {
+            if (o1.count < o2.count) return 1;
+            if (o2.count < o1.count) return -1;
+            return 0;
+        }
+    };
+
+    public class NavigatorEntry {
+        public int count;
+        public String name;
+        public NavigatorEntry(String name, int count) {
+            this.name = name;
+            this.count = count;
+        }
+    }
+
+    public ArrayList<NavigatorEntry> getHostNavigator(int count) {
+        hoststat[] hsa = this.hostNavigator.values().toArray(new hoststat[this.hostNavigator.size()]);
+        Arrays.sort(hsa, hscomp);
+        int rc = Math.min(count, hsa.length);
+        ArrayList<NavigatorEntry> result = new ArrayList<NavigatorEntry>();
+        URLMetadataRow mr;
+        yacyURL url;
+        for (int i = 0; i < rc; i++) {
+            mr = indexSegment.urlMetadata().load(hsa[i].hashsample, null, 0);
+            if (mr == null) continue;
+            url = mr.metadata().url();
+            if (url == null) continue;
+            result.add(new NavigatorEntry(url.getHost(), hsa[i].count));
+        }
+        return result;
+    }
+
+    public static final Comparator<Map.Entry<String, Integer>> mecomp = new Comparator<Map.Entry<String, Integer>>() {
+        public int compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2) {
+            if (o1.getValue().intValue() < o2.getValue().intValue()) return 1;
+            if (o2.getValue().intValue() < o1.getValue().intValue()) return -1;
+            return 0;
+        }
+    };
+
+    @SuppressWarnings("unchecked")
+    public ArrayList<NavigatorEntry> getTopicNavigator(final int count) {
        // create a list of words that had been computed by statistics over all
        // words that appeared in the url or the description of all urls
-        final Object[] refs = ref.getScores(count, false, 2, Integer.MAX_VALUE);
-        final TreeSet<String> s = new TreeSet<String>(String.CASE_INSENSITIVE_ORDER);
-        for (int i = 0; i < refs.length; i++) {
-            s.add((String) refs[i]);
+        Map.Entry<String, Integer>[] a = this.ref.entrySet().toArray(new Map.Entry[this.ref.size()]);
+        Arrays.sort(a, mecomp);
+        int rc = Math.min(count, a.length);
+        ArrayList<NavigatorEntry> result = new ArrayList<NavigatorEntry>();
+        Map.Entry<String, Integer> e;
+        int c;
+        for (int i = 0; i < rc; i++) {
+            e = a[i];
+            c = e.getValue().intValue();
+            if (c == 0) break;
+            result.add(new NavigatorEntry(e.getKey(), c));
        }
-        return s;
+        return result;
    }
 
-    public void addReferences(final String[] words) {
+    public void addTopic(final String[] words) {
        String word;
        for (int i = 0; i < words.length; i++) {
            word = words[i].toLowerCase();
+            Integer c;
            if ((word.length() > 2) &&
                ("http_html_php_ftp_www_com_org_net_gov_edu_index_home_page_for_usage_the_and_".indexOf(word) < 0) &&
-                (!(query.queryHashes.contains(Word.word2hash(word)))))
-                ref.incScore(word);
+                (!(query.queryHashes.contains(Word.word2hash(word))))) {
+                c = ref.get(word);
+                if (c == null) ref.put(word, 1); else ref.put(word, c.intValue() + 1);
+            }
        }
    }
 
-    protected void addReferences(final plasmaSearchEvent.ResultEntry resultEntry) {
+    protected void addTopics(final plasmaSearchEvent.ResultEntry resultEntry) {
        // take out relevant information for reference computation
        if ((resultEntry.url() == null) || (resultEntry.title() == null)) return;
-        final String[] urlcomps = htmlFilterContentScraper.urlComps(resultEntry.url().toNormalform(true, true)); // word components of the url
+        //final String[] urlcomps = htmlFilterContentScraper.urlComps(resultEntry.url().toNormalform(true, true)); // word components of the url
        final String[] descrcomps = resultEntry.title().toLowerCase().split(htmlFilterContentScraper.splitrex); // words in the description
 
        // add references
-        addReferences(urlcomps);
-        addReferences(descrcomps);
+        //addTopic(urlcomps);
+        addTopic(descrcomps);
    }
 
    public ReferenceOrder getOrder() {

@@ -604,8 +604,8 @@
            yacyCore.log.logInfo("remote search (client): peer " + target.getName() + " sent references " + references);
            if (references != null) {
                // add references twice, so they can be countet (must have at least 2 entries)
-                containerCache.addReferences(references.split(","));
-                containerCache.addReferences(references.split(","));
+                containerCache.addTopic(references.split(","));
+                containerCache.addTopic(references.split(","));
            }
        }
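
Note: the hunk in class search near the top and the remote-search client hunk directly above form the peer-to-peer round trip for topics: the serving peer joins the topic names into one comma-separated "references" property, and the querying peer splits that string again and feeds it to addTopic() twice, so every remote word enters the counter with a value of at least 2. A minimal sketch of that round trip, with illustrative names only, assuming nothing beyond the string format visible in the hunks:

    // ReferencesRoundTrip.java -- illustrative sketch only, not YaCy code.
    import java.util.Arrays;
    import java.util.List;

    public class ReferencesRoundTrip {

        // serving peer: join topic names into the comma-separated "references" property
        static String join(final List<String> names) {
            final StringBuilder refstr = new StringBuilder();
            for (final String name : names) refstr.append(",").append(name);
            return (refstr.length() > 0) ? refstr.substring(1) : refstr.toString();
        }

        public static void main(final String[] args) {
            final String references = join(Arrays.asList("linux", "kernel", "driver"));
            System.out.println(references); // linux,kernel,driver

            // querying peer: split the property and feed each word to the topic counter
            final String[] words = references.split(",");
            System.out.println(Arrays.toString(words)); // [linux, kernel, driver]
        }
    }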
