diff --git a/ChangeLog b/ChangeLog index 59ac638a8..876be2172 100644 --- a/ChangeLog +++ b/ChangeLog @@ -5,6 +5,7 @@ version 0.44svn * ADDED: Show public Bookmarks in Bookmarks.html, private ones, if the user is logged in. (Allo) * FIXED: /xml/bookmarks/* now uses one file for private/public entries. private only with password. * ADDED: possibility to get the ranking for a url. (Allo) + * ADDED: yacy.badwords list to filter the topwords. (Borg-0300) version 0.43 * UPDATED: new database handling of index entry objects, less IO overhead (Orbiter) @@ -19,7 +20,7 @@ version 0.43 * UPDATED: Memorysettings now also working for Windows (Galaxis, VT100) * UPDATED: added more informations to network.xml, added possibility to limit number of results (hendi) * FIXED: some Filemodes were set wrong (theli) - * FIXED: minor bug-fix in Cache for some rare URLs (borg-0300) + * FIXED: minor bug-fix in Cache for some rare URLs (Borg-0300) * ADDED: The YaCy-Bookmarkssystem (allo) * UPDATED: New Searchstyle * FIXED: notifier.gif works even when only DATA is writable (allo) diff --git a/htroot/index.java b/htroot/index.java index 88036be04..bed7edb71 100644 --- a/htroot/index.java +++ b/htroot/index.java @@ -50,10 +50,12 @@ import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; import java.util.HashMap; +import java.util.Iterator; import java.util.TreeSet; import de.anomic.http.httpHeader; import de.anomic.kelondro.kelondroMSetTools; +import de.anomic.kelondro.kelondroNaturalOrder; import de.anomic.plasma.plasmaSearchRankingProfile; import de.anomic.plasma.plasmaSearchTimingProfile; import de.anomic.plasma.plasmaSwitchboard; @@ -67,6 +69,8 @@ import de.anomic.yacy.yacyCore; public class index { + public static final int MAX_TOPWORDS = 16; + public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) { final plasmaSwitchboard sb = (plasmaSwitchboard) env; @@ -223,20 +227,35 @@ public class index { prop.put("num-results_totalcount", totalcount); int hintcount = references.length; if (hintcount > 0) { - if (hintcount > 16) { hintcount = 16; } + prop.put("combine", 1); - String word; + + final TreeSet topwords = new TreeSet(kelondroNaturalOrder.naturalOrder); for (int i = 0; i < hintcount; i++) { - word = (String) references[i]; + topwords.add(references[i]); + } + + // filter out the badwords + final TreeSet filteredtopwords = kelondroMSetTools.joinConstructive(topwords, plasmaSwitchboard.badwords); + if (filteredtopwords.size() > 0) { + kelondroMSetTools.excludeDestructive(topwords, plasmaSwitchboard.badwords); + } + + String word; + hintcount = 0; + final Iterator iter = topwords.iterator(); + while (iter.hasNext()) { + word = (String) iter.next(); if (word != null) { - prop.put("combine_words_" + i + "_word", word); - prop.put("combine_words_" + i + "_newsearch", post.get("search", "").replace(' ', '+') + "+" + word); - prop.put("combine_words_" + i + "_count", count); - prop.put("combine_words_" + i + "_order", order); - prop.put("combine_words_" + i + "_resource", ((global) ? "global" : "local")); - prop.put("combine_words_" + i + "_time", (searchtime / 1000)); + prop.put("combine_words_" + hintcount + "_word", word); + prop.put("combine_words_" + hintcount + "_newsearch", post.get("search", "").replace(' ', '+') + "+" + word); + prop.put("combine_words_" + hintcount + "_count", count); + prop.put("combine_words_" + hintcount + "_order", order); + prop.put("combine_words_" + hintcount + "_resource", ((global) ? "global" : "local")); + prop.put("combine_words_" + hintcount + "_time", (searchtime / 1000)); } - prop.put("combine_words", i); + prop.put("combine_words", hintcount); + if (hintcount++ > MAX_TOPWORDS) { break; } } } } else { @@ -312,6 +331,4 @@ public class index { return prop; } - - } diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index c31c27e3e..5878c7f13 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -161,8 +161,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser public static int maxCRGDump = 200000; // couloured list management + public static TreeSet badwords = null; public static TreeSet blueList = null; - public static TreeSet stopwords = null; + public static TreeSet stopwords = null; public static plasmaURLPattern urlBlacklist; // storage management @@ -283,7 +284,16 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser urlBlacklist.size() + " entries, " + ppRamString(ulrBlackListFile.length()/1024)); } - + + // load badwords (to filter the topwords) + if (badwords == null) { + File badwordsFile = new File(rootPath, "yacy.badwords"); + badwords = kelondroMSetTools.loadList(badwordsFile, kelondroNaturalOrder.naturalOrder); + this.log.logConfig("loaded badwords from file " + badwordsFile.getName() + + ", " + badwords.size() + " entries, " + + ppRamString(badwordsFile.length()/1024)); + } + // load stopwords if (stopwords == null) { File stopwordsFile = new File(rootPath, "yacy.stopwords"); @@ -292,7 +302,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser stopwords.size() + " entries, " + ppRamString(stopwordsFile.length()/1024)); } - + // load ranking tables File YBRPath = new File(rootPath, "ranking/YBR"); if (YBRPath.exists()) { diff --git a/yacy.badwords b/yacy.badwords new file mode 100644 index 000000000..a9b8db7de --- /dev/null +++ b/yacy.badwords @@ -0,0 +1,29 @@ +ads +ads43 +dir +jhtml +shtml +nach +auf +mit +zum +bei +ueber +all +app +ist +eine +auch +aber +und +der +die +das +dass +oder +from +kein +weg +site=all +referrer=default +>>>