ADDED: yacy.badwords list to filter the topwords

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1711 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
borg-0300 19 years ago
parent d7da273d7e
commit 64441b1f78

@ -5,6 +5,7 @@ version 0.44svn
* ADDED: Show public Bookmarks in Bookmarks.html, private ones, if the user is logged in. (Allo) * ADDED: Show public Bookmarks in Bookmarks.html, private ones, if the user is logged in. (Allo)
* FIXED: /xml/bookmarks/* now uses one file for private/public entries. private only with password. * FIXED: /xml/bookmarks/* now uses one file for private/public entries. private only with password.
* ADDED: possibility to get the ranking for a url. (Allo) * ADDED: possibility to get the ranking for a url. (Allo)
* ADDED: yacy.badwords list to filter the topwords. (Borg-0300)
version 0.43 version 0.43
* UPDATED: new database handling of index entry objects, less IO overhead (Orbiter) * UPDATED: new database handling of index entry objects, less IO overhead (Orbiter)
@ -19,7 +20,7 @@ version 0.43
* UPDATED: Memorysettings now also working for Windows (Galaxis, VT100) * UPDATED: Memorysettings now also working for Windows (Galaxis, VT100)
* UPDATED: added more information to network.xml, added possibility to limit number of results (hendi) * UPDATED: added more information to network.xml, added possibility to limit number of results (hendi)
* FIXED: some Filemodes were set wrong (theli) * FIXED: some Filemodes were set wrong (theli)
* FIXED: minor bug-fix in Cache for some rare URLs (borg-0300) * FIXED: minor bug-fix in Cache for some rare URLs (Borg-0300)
* ADDED: The YaCy-Bookmarkssystem (allo) * ADDED: The YaCy-Bookmarkssystem (allo)
* UPDATED: New Searchstyle * UPDATED: New Searchstyle
* FIXED: notifier.gif works even when only DATA is writable (allo) * FIXED: notifier.gif works even when only DATA is writable (allo)

@ -50,10 +50,12 @@ import java.io.IOException;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import java.net.URL; import java.net.URL;
import java.util.HashMap; import java.util.HashMap;
import java.util.Iterator;
import java.util.TreeSet; import java.util.TreeSet;
import de.anomic.http.httpHeader; import de.anomic.http.httpHeader;
import de.anomic.kelondro.kelondroMSetTools; import de.anomic.kelondro.kelondroMSetTools;
import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.plasma.plasmaSearchRankingProfile; import de.anomic.plasma.plasmaSearchRankingProfile;
import de.anomic.plasma.plasmaSearchTimingProfile; import de.anomic.plasma.plasmaSearchTimingProfile;
import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboard;
@ -67,6 +69,8 @@ import de.anomic.yacy.yacyCore;
public class index { public class index {
public static final int MAX_TOPWORDS = 16;
public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) { public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
final plasmaSwitchboard sb = (plasmaSwitchboard) env; final plasmaSwitchboard sb = (plasmaSwitchboard) env;
@ -223,20 +227,35 @@ public class index {
prop.put("num-results_totalcount", totalcount); prop.put("num-results_totalcount", totalcount);
int hintcount = references.length; int hintcount = references.length;
if (hintcount > 0) { if (hintcount > 0) {
if (hintcount > 16) { hintcount = 16; }
prop.put("combine", 1); prop.put("combine", 1);
String word;
final TreeSet topwords = new TreeSet(kelondroNaturalOrder.naturalOrder);
for (int i = 0; i < hintcount; i++) { for (int i = 0; i < hintcount; i++) {
word = (String) references[i]; topwords.add(references[i]);
}
// filter out the badwords
final TreeSet filteredtopwords = kelondroMSetTools.joinConstructive(topwords, plasmaSwitchboard.badwords);
if (filteredtopwords.size() > 0) {
kelondroMSetTools.excludeDestructive(topwords, plasmaSwitchboard.badwords);
}
String word;
hintcount = 0;
final Iterator iter = topwords.iterator();
while (iter.hasNext()) {
word = (String) iter.next();
if (word != null) { if (word != null) {
prop.put("combine_words_" + i + "_word", word); prop.put("combine_words_" + hintcount + "_word", word);
prop.put("combine_words_" + i + "_newsearch", post.get("search", "").replace(' ', '+') + "+" + word); prop.put("combine_words_" + hintcount + "_newsearch", post.get("search", "").replace(' ', '+') + "+" + word);
prop.put("combine_words_" + i + "_count", count); prop.put("combine_words_" + hintcount + "_count", count);
prop.put("combine_words_" + i + "_order", order); prop.put("combine_words_" + hintcount + "_order", order);
prop.put("combine_words_" + i + "_resource", ((global) ? "global" : "local")); prop.put("combine_words_" + hintcount + "_resource", ((global) ? "global" : "local"));
prop.put("combine_words_" + i + "_time", (searchtime / 1000)); prop.put("combine_words_" + hintcount + "_time", (searchtime / 1000));
} }
prop.put("combine_words", i); prop.put("combine_words", hintcount);
if (hintcount++ > MAX_TOPWORDS) { break; }
} }
} }
} else { } else {
@ -312,6 +331,4 @@ public class index {
return prop; return prop;
} }
} }

@ -161,8 +161,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
public static int maxCRGDump = 200000; public static int maxCRGDump = 200000;
// coloured list management // coloured list management
public static TreeSet badwords = null;
public static TreeSet blueList = null; public static TreeSet blueList = null;
public static TreeSet stopwords = null; public static TreeSet stopwords = null;
public static plasmaURLPattern urlBlacklist; public static plasmaURLPattern urlBlacklist;
// storage management // storage management
@ -283,7 +284,16 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
urlBlacklist.size() + " entries, " + urlBlacklist.size() + " entries, " +
ppRamString(ulrBlackListFile.length()/1024)); ppRamString(ulrBlackListFile.length()/1024));
} }
// load badwords (to filter the topwords)
if (badwords == null) {
File badwordsFile = new File(rootPath, "yacy.badwords");
badwords = kelondroMSetTools.loadList(badwordsFile, kelondroNaturalOrder.naturalOrder);
this.log.logConfig("loaded badwords from file " + badwordsFile.getName() +
", " + badwords.size() + " entries, " +
ppRamString(badwordsFile.length()/1024));
}
// load stopwords // load stopwords
if (stopwords == null) { if (stopwords == null) {
File stopwordsFile = new File(rootPath, "yacy.stopwords"); File stopwordsFile = new File(rootPath, "yacy.stopwords");
@ -292,7 +302,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
stopwords.size() + " entries, " + stopwords.size() + " entries, " +
ppRamString(stopwordsFile.length()/1024)); ppRamString(stopwordsFile.length()/1024));
} }
// load ranking tables // load ranking tables
File YBRPath = new File(rootPath, "ranking/YBR"); File YBRPath = new File(rootPath, "ranking/YBR");
if (YBRPath.exists()) { if (YBRPath.exists()) {

@ -0,0 +1,29 @@
ads
ads43
dir
jhtml
shtml
nach
auf
mit
zum
bei
ueber
all
app
ist
eine
auch
aber
und
der
die
das
dass
oder
from
kein
weg
site=all
referrer=default
>>>
Loading…
Cancel
Save