some refactoring of topic generation: plasmaSearchEvent.topics() is renamed to getTopicNavigator(), the cached hostNavigator field is dropped, and the badword/stopword filtering of topic words moves from yacysearchtrailer into plasmaSearchRankingProcess

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6018 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 16 years ago
parent f28f62fb21
commit 15fad767c0

@@ -321,7 +321,7 @@ public final class search {
// prepare reference hints
final long timer = System.currentTimeMillis();
- final ArrayList<NavigatorEntry> ws = theSearch.topics(10);
+ final ArrayList<NavigatorEntry> ws = theSearch.getTopicNavigator(10);
final StringBuilder refstr = new StringBuilder();
for (NavigatorEntry e: ws) {
refstr.append(",").append(e.name);

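For context, search.java consumes the renamed call by joining the topic names into a comma-separated reference string. A minimal, self-contained sketch of that consumption pattern, using a hypothetical NavigatorEntryStub in place of plasmaSearchRankingProcess.NavigatorEntry (only its public name field is relied on in the hunk above); the leading-comma strip at the end is an assumption, not part of the shown diff:

```java
import java.util.ArrayList;
import java.util.List;

public class TopicRefStringSketch {

    // Hypothetical stand-in for plasmaSearchRankingProcess.NavigatorEntry;
    // the hunk above only relies on its public String field 'name'.
    static class NavigatorEntryStub {
        public final String name;
        NavigatorEntryStub(final String name) { this.name = name; }
    }

    // Builds the ",word1,word2,..." reference string the same way the hunk does,
    // then strips the leading comma (assumed; the strip is not shown in the diff).
    static String buildRefString(final List<NavigatorEntryStub> topics) {
        final StringBuilder refstr = new StringBuilder();
        for (final NavigatorEntryStub e : topics) {
            refstr.append(",").append(e.name);
        }
        return (refstr.length() > 0) ? refstr.substring(1) : "";
    }

    public static void main(final String[] args) {
        final List<NavigatorEntryStub> topics = new ArrayList<NavigatorEntryStub>();
        topics.add(new NavigatorEntryStub("wiki"));
        topics.add(new NavigatorEntryStub("software"));
        System.out.println(buildRefString(topics)); // prints: wiki,software
    }
}
```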
@@ -26,15 +26,11 @@
import java.util.ArrayList;
import java.util.Iterator;
- import java.util.TreeSet;
import de.anomic.http.httpRequestHeader;
- import de.anomic.kelondro.order.NaturalOrder;
- import de.anomic.kelondro.util.SetTools;
import de.anomic.plasma.plasmaProfiling;
import de.anomic.plasma.plasmaSearchEvent;
import de.anomic.plasma.plasmaSearchQuery;
- import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaSearchRankingProcess.NavigatorEntry;
import de.anomic.server.serverObjects;
import de.anomic.server.serverProfiling;
@@ -83,38 +79,17 @@ public class yacysearchtrailer {
}
// attach the bottom line with search references (topwords)
- final ArrayList<NavigatorEntry> references = theSearch.topics(20);
+ final ArrayList<NavigatorEntry> references = theSearch.getTopicNavigator(10);
if (references.size() > 0) {
// get the topwords
- final TreeSet<String> topwords = new TreeSet<String>(NaturalOrder.naturalComparator);
- for (NavigatorEntry e: references) {
- if (e.name.matches("[a-z]+")) {
- topwords.add(e.name);
- }
- }
- // filter out the badwords
- final TreeSet<String> filteredtopwords = SetTools.joinConstructive(topwords, plasmaSwitchboard.badwords);
- if (filteredtopwords.size() > 0) {
- SetTools.excludeDestructive(topwords, plasmaSwitchboard.badwords);
- }
- // avoid stopwords being topwords
- if (env.getConfig("filterOutStopwordsFromTopwords", "true").equals("true")) {
- if ((plasmaSwitchboard.stopwords != null) && (plasmaSwitchboard.stopwords.size() > 0)) {
- SetTools.excludeDestructive(topwords, plasmaSwitchboard.stopwords);
- }
- }
- String word;
int hintcount = 0;
- final Iterator<String> iter = topwords.iterator();
+ NavigatorEntry e;
+ Iterator<NavigatorEntry> iter = references.iterator();
while (iter.hasNext()) {
- word = iter.next();
+ e = iter.next();
if (/*(theQuery == null) ||*/ (theQuery.queryString == null)) break;
- if (word != null) {
- prop.putHTML("words_" + hintcount + "_word", word);
- prop.putHTML("words_" + hintcount + "_newsearch", theQuery.queryString.replace(' ', '+') + "+" + word);
+ if (e.name != null) {
+ prop.putHTML("words_" + hintcount + "_word", e.name);
+ prop.putHTML("words_" + hintcount + "_newsearch", theQuery.queryString.replace(' ', '+') + "+" + e.name);
prop.put("words_" + hintcount + "_count", theQuery.displayResults());
prop.put("words_" + hintcount + "_offset", "0");
prop.put("words_" + hintcount + "_display", display);

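The net effect in yacysearchtrailer is that the template loop now works directly on the pre-filtered NavigatorEntry list instead of building and pruning its own TreeSet of topwords. A minimal sketch of the new loop shape, with a plain Map standing in for serverObjects, a hypothetical Entry class standing in for NavigatorEntry, and a maxHints cap that is an assumption of the sketch (the real hint limit is not shown in the hunk):

```java
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

public class TrailerLoopSketch {

    // Hypothetical minimal entry; the real code iterates plasmaSearchRankingProcess.NavigatorEntry.
    static class Entry {
        final String name;
        Entry(final String name) { this.name = name; }
    }

    // Mirrors the simplified trailer loop: no local TreeSet, badword or stopword handling,
    // each entry is rendered straight into template properties.
    static Map<String, String> render(final List<Entry> references, final String queryString, final int maxHints) {
        final Map<String, String> prop = new HashMap<String, String>();
        int hintcount = 0;
        final Iterator<Entry> iter = references.iterator();
        while (iter.hasNext() && hintcount < maxHints) {
            final Entry e = iter.next();
            if (queryString == null) break;
            if (e.name != null) {
                prop.put("words_" + hintcount + "_word", e.name);
                prop.put("words_" + hintcount + "_newsearch", queryString.replace(' ', '+') + "+" + e.name);
                hintcount++;
            }
        }
        return prop;
    }

    public static void main(final String[] args) {
        final List<Entry> refs = new ArrayList<Entry>();
        refs.add(new Entry("linux"));
        refs.add(new Entry("kernel"));
        System.out.println(render(refs, "open source", 10));
    }
}
```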
@@ -98,7 +98,6 @@ public final class plasmaSearchEvent {
long urlRetrievalAllTime;
long snippetComputationAllTime;
public ResultURLs crawlResults;
- private ArrayList<NavigatorEntry> hostNavigator;
@SuppressWarnings("unchecked")
private plasmaSearchEvent(final plasmaSearchQuery query,
@@ -124,7 +123,6 @@ public final class plasmaSearchEvent {
this.snippetComputationAllTime = 0;
this.workerThreads = null;
this.localSearchThread = null;
- this.hostNavigator = null;
this.result = new SortStore<ResultEntry>(-1); // this is the result, enriched with snippets, ranked and ordered by ranking
this.images = new SortStore<plasmaSnippetCache.MediaSnippet>(-1);
this.failedURLs = new HashMap<String, String>(); // a map of urls to reason strings where a worker thread tried to work on, but failed.
@@ -578,13 +576,12 @@ public final class plasmaSearchEvent {
}
public ArrayList<NavigatorEntry> getHostNavigator(int maxentries) {
- if (this.hostNavigator != null) return this.hostNavigator;
if (localSearchThread != null && localSearchThread.isAlive()) {
try {Thread.sleep(100L);} catch (final InterruptedException e) {}
}
- this.hostNavigator = rankedCache.getHostNavigator(10);
- if (this.hostNavigator.size() == 0) this.hostNavigator = null;
- return this.hostNavigator;
+ return this.rankedCache.getHostNavigator(maxentries);
}
+ public ArrayList<NavigatorEntry> getTopicNavigator(final int maxentries) {
+ // returns a set of words that are computed as toplist
+ return this.rankedCache.getTopicNavigator(maxentries);
}
public ResultEntry oneResult(final int item) {
@@ -730,6 +727,7 @@ public final class plasmaSearchEvent {
if (peer.equals(mypeerhash)) continue; // we dont need to ask ourself
urls = entry1.getValue();
words = wordsFromPeer(peer, urls);
+ assert words.length() >= 12 : "words = " + words;
//System.out.println("DEBUG-INDEXABSTRACT ***: peer " + peer + " has urls: " + urls);
//System.out.println("DEBUG-INDEXABSTRACT ***: peer " + peer + " from words: " + words);
secondarySearchThreads[c++] = yacySearch.secondaryRemoteSearch(
@@ -776,11 +774,6 @@ public final class plasmaSearchEvent {
//assert e != null;
}
- public ArrayList<NavigatorEntry> topics(final int count) {
- // returns a set of words that are computed as toplist
- return this.rankedCache.getTopicNavigator(count);
- }
public static class ResultEntry {
// payload objects
private final URLMetadataRow urlentry;

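After these hunks, plasmaSearchEvent no longer keeps a cached hostNavigator of its own; both navigator accessors simply delegate to the ranked cache on every call. A self-contained sketch of that delegation shape, with hypothetical SearchEventSketch/RankedCacheSketch classes standing in for plasmaSearchEvent and plasmaSearchRankingProcess, and plain Strings in place of NavigatorEntry:

```java
import java.util.ArrayList;
import java.util.Arrays;

public class NavigatorDelegationSketch {

    // Hypothetical stand-in for plasmaSearchRankingProcess: owns the navigator data.
    static class RankedCacheSketch {
        ArrayList<String> getTopicNavigator(final int maxentries) {
            final ArrayList<String> topics = new ArrayList<String>(Arrays.asList("linux", "kernel", "wiki"));
            return new ArrayList<String>(topics.subList(0, Math.min(maxentries, topics.size())));
        }
        ArrayList<String> getHostNavigator(final int maxentries) {
            final ArrayList<String> hosts = new ArrayList<String>(Arrays.asList("example.org", "example.net"));
            return new ArrayList<String>(hosts.subList(0, Math.min(maxentries, hosts.size())));
        }
    }

    // Hypothetical stand-in for plasmaSearchEvent after the refactoring: no cached
    // hostNavigator field, both accessors forward to the ranked cache every time.
    static class SearchEventSketch {
        private final RankedCacheSketch rankedCache = new RankedCacheSketch();
        ArrayList<String> getTopicNavigator(final int maxentries) {
            return this.rankedCache.getTopicNavigator(maxentries);
        }
        ArrayList<String> getHostNavigator(final int maxentries) {
            return this.rankedCache.getHostNavigator(maxentries);
        }
    }

    public static void main(final String[] args) {
        final SearchEventSketch theSearch = new SearchEventSketch();
        System.out.println(theSearch.getTopicNavigator(10)); // [linux, kernel, wiki]
        System.out.println(theSearch.getHostNavigator(1));   // [example.org]
    }
}
```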
@@ -459,9 +459,12 @@ public final class plasmaSearchRankingProcess {
for (int i = 0; i < words.length; i++) {
word = words[i].toLowerCase();
Integer c;
- if ((word.length() > 2) &&
- ("http_html_php_ftp_www_com_org_net_gov_edu_index_home_page_for_usage_the_and_".indexOf(word) < 0) &&
- (!(query.queryHashes.contains(Word.word2hash(word))))) {
+ if (word.length() > 2 &&
+ "http_html_php_ftp_www_com_org_net_gov_edu_index_home_page_for_usage_the_and_".indexOf(word) < 0 &&
+ !query.queryHashes.contains(Word.word2hash(word)) &&
+ word.matches("[a-z]+") &&
+ !plasmaSwitchboard.badwords.contains(word) &&
+ !plasmaSwitchboard.stopwords.contains(word)) {
c = ref.get(word);
if (c == null) ref.put(word, 1); else ref.put(word, c.intValue() + 1);
}

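The badword/stopword filtering that yacysearchtrailer used to do after the fact is now folded into this single condition at the point where topic words are counted. A self-contained sketch of the consolidated predicate; the hard-coded BADWORDS/STOPWORDS sets are assumptions standing in for plasmaSwitchboard.badwords and plasmaSwitchboard.stopwords, and the query-hash check is left out because it needs the real Word/plasmaSearchQuery classes:

```java
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

public class TopicWordFilterSketch {

    // Junk tokens excluded from topic words, as in the hunk above.
    private static final String JUNK =
        "http_html_php_ftp_www_com_org_net_gov_edu_index_home_page_for_usage_the_and_";

    // Stand-ins for plasmaSwitchboard.badwords / plasmaSwitchboard.stopwords (assumed content).
    private static final Set<String> BADWORDS = new HashSet<String>(Arrays.asList("sex"));
    private static final Set<String> STOPWORDS = new HashSet<String>(Arrays.asList("the", "and", "with"));

    // Consolidated filter: only lower-case alphabetic words longer than two characters
    // that are neither junk tokens, badwords nor stopwords count as topic candidates.
    // (The real code additionally skips words that are part of the query itself.)
    static boolean isTopicCandidate(final String rawWord) {
        final String word = rawWord.toLowerCase();
        return word.length() > 2
            && JUNK.indexOf(word) < 0
            && word.matches("[a-z]+")
            && !BADWORDS.contains(word)
            && !STOPWORDS.contains(word);
    }

    public static void main(final String[] args) {
        for (final String w : new String[] {"linux", "www", "the", "Kernel", "a1b2"}) {
            System.out.println(w + " -> " + isTopicCandidate(w));
        }
    }
}
```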
@@ -289,7 +289,7 @@ public class yacySearch extends Thread {
final String targethash, final Blacklist blacklist,
final plasmaSearchRankingProfile rankingProfile,
final Bitfield constraint, final TreeMap<byte[], String> clusterselection) {
- assert wordhashes.length() >= 12;
+ assert wordhashes.length() >= 12 : "wordhashes = " + wordhashes;
// check own peer status
if (peers.mySeed() == null || peers.mySeed().getPublicAddress() == null) { return null; }

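Both assert changes in this commit attach the offending value to the assertion, so a failure reports what the malformed hash string actually was rather than a bare AssertionError. A tiny illustration of that pattern (run with java -ea; the 12-character minimum is taken from the asserts above, and checkWordHashes is a hypothetical helper for the example):

```java
public class AssertMessageSketch {

    // Hypothetical check mirroring the diff: the message form prints the bad value.
    static void checkWordHashes(final String wordhashes) {
        assert wordhashes.length() >= 12 : "wordhashes = " + wordhashes;
    }

    public static void main(final String[] args) {
        checkWordHashes("abcdefghijkl"); // ok: 12 characters
        checkWordHashes("tooShort");     // with -ea: fails with "wordhashes = tooShort"
    }
}
```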