- fix stopword handling for RWI (see example: http://bugs.yacy.net/view.php?id=247)

- append the stopword list specific to the configured language

- remove unused OVERHANG stack type
pull/1/head
reger 12 years ago
parent 5c7ddc67fe
commit 7480e87386

@ -525,9 +525,9 @@ public class yacysearch {
final int maxDistance = (querystring.indexOf('"', 0) >= 0) ? qg.getAllHashes().size() - 1 : Integer.MAX_VALUE;
// filter out stopwords
final SortedSet<String> filtered = SetTools.joinConstructiveByTest(qg.getIncludeStrings(), Switchboard.stopwords);
final SortedSet<String> filtered = SetTools.joinConstructiveByTest(qg.getIncludeStrings(), Switchboard.stopwords); //find matching stopwords
if ( !filtered.isEmpty() ) {
SetTools.excludeDestructiveByTestSmallInLarge(qg.getIncludeStrings(), Switchboard.stopwords);
SetTools.excludeDestructiveByTestSmallInLarge(qg.getIncludeStrings(), filtered); //remove stopwords
}
// if a minus-button was hit, remove a special reference first

@ -24,6 +24,7 @@ import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.util.Iterator;
import java.util.Set;
import net.yacy.cora.order.ByteOrder;
import net.yacy.cora.order.CloneableIterator;
@ -88,7 +89,8 @@ public interface HandleSet extends Iterable<byte[]>, Cloneable, Serializable {
public CloneableIterator<byte[]> keys(final boolean up, final byte[] firstKey);
public void excludeDestructive(final HandleSet other);
// public void excludeDestructive(final HandleSet other);
public void excludeDestructive(final Set<byte[]> other); // used for stopwordhashes etc.
@Override
public Iterator<byte[]> iterator();

@ -256,8 +256,7 @@ public class CrawlQueues {
final String stats = "LOCALCRAWL[" +
this.noticeURL.stackSize(NoticedURL.StackType.NOLOAD) + ", " +
this.noticeURL.stackSize(NoticedURL.StackType.LOCAL) + ", " +
this.noticeURL.stackSize(NoticedURL.StackType.GLOBAL) + ", " +
this.noticeURL.stackSize(NoticedURL.StackType.OVERHANG) +
this.noticeURL.stackSize(NoticedURL.StackType.GLOBAL) +
", " + this.noticeURL.stackSize(NoticedURL.StackType.REMOTE) + "]";
try {
if (this.noticeURL.stackSize(NoticedURL.StackType.NOLOAD) > 0) {
@ -585,7 +584,7 @@ public class CrawlQueues {
}
// we don't want to crawl a global URL globally, since WE are the global part. (from this point of view)
final String stats = "REMOTETRIGGEREDCRAWL[" + this.noticeURL.stackSize(NoticedURL.StackType.LOCAL) + ", " + this.noticeURL.stackSize(NoticedURL.StackType.GLOBAL) + ", " + this.noticeURL.stackSize(NoticedURL.StackType.OVERHANG) + ", "
final String stats = "REMOTETRIGGEREDCRAWL[" + this.noticeURL.stackSize(NoticedURL.StackType.LOCAL) + ", " + this.noticeURL.stackSize(NoticedURL.StackType.GLOBAL) + ", "
+ this.noticeURL.stackSize(NoticedURL.StackType.REMOTE) + "]";
try {
final Request urlEntry = this.noticeURL.pop(NoticedURL.StackType.REMOTE, true, this.sb.crawler, this.sb.robots);

@ -48,7 +48,7 @@ import net.yacy.kelondro.logging.Log;
public class NoticedURL {
public enum StackType {
LOCAL, GLOBAL, OVERHANG, REMOTE, NOLOAD;
LOCAL, GLOBAL, REMOTE, NOLOAD;
}
private static final int minimumLocalDeltaInit = 10; // the minimum time difference between access of the same local domain
@ -152,7 +152,6 @@ public class NoticedURL {
case NOLOAD: return (this.noloadStack == null) ? 0 : this.noloadStack.size();
case LOCAL: return (this.coreStack == null) ? 0 : this.coreStack.size();
case GLOBAL: return (this.limitStack == null) ? 0 : this.limitStack.size();
case OVERHANG: return 0;
case REMOTE: return (this.remoteStack == null) ? 0 : this.remoteStack.size();
default: return -1;
}

@ -36,6 +36,7 @@ import java.io.ObjectOutputStream;
import java.io.OutputStream;
import java.io.Serializable;
import java.util.Iterator;
import java.util.Set;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.order.ByteOrder;
@ -329,10 +330,17 @@ public final class RowHandleSet implements HandleSet, Iterable<byte[]>, Cloneabl
}
@Override
public void excludeDestructive(final HandleSet other) {
excludeDestructive(this, other);
}
/**
 * Removes from this set every key that is also contained in {@code other}
 * (used e.g. to strip stopword hashes from a set of query hashes).
 * The smaller of the two collections is the one that gets iterated.
 *
 * @param other the keys to remove; {@code null} or an empty set leaves this set unchanged
 */
public void excludeDestructive(final Set<byte[]> other) {
    if (other == null || other.isEmpty()) {
        return;
    }
    if (other.size() > this.size()) {
        // Collect the matches first and remove them afterwards: removing from this
        // set while its own iterator is still running can skip or corrupt entries.
        final java.util.List<byte[]> matches = new java.util.ArrayList<byte[]>();
        for (final byte[] key : this) {
            if (other.contains(key)) {
                // defensive copy in case the iterator hands out a reused buffer
                matches.add(key.clone());
            }
        }
        for (final byte[] key : matches) {
            this.remove(key);
        }
    } else {
        // iterating the foreign set is safe; only this set is modified
        for (final byte[] key : other) {
            this.remove(key);
        }
    }
}
/* not used 2013-06-06
private static void excludeDestructive(final HandleSet set1, final HandleSet set2) {
if (set1 == null) return;
if (set2 == null) return;
@ -354,7 +362,7 @@ public final class RowHandleSet implements HandleSet, Iterable<byte[]>, Cloneabl
final Iterator<byte[]> si = small.iterator();
while (si.hasNext()) large.remove(si.next());
}
*/
public static void main(String[] args) {
HandleSet s = new RowHandleSet(8, NaturalOrder.naturalOrder, 100);
try {

@ -527,10 +527,11 @@ public final class SetTools {
br = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
String line;
while ((line = br.readLine()) != null) {
int i = line.indexOf("|"); // ignore text after char (Solr stopwordfile syntax allows for # and | )
if (i>0) line = line.substring(0,i-1);
line = line.trim();
if (!line.isEmpty() && line.charAt(0) != '#') list.add(line.trim().toLowerCase());
}
br.close();
} catch (final IOException e) {
} finally {
if (br != null) try{br.close();}catch(final Exception e){}

@ -112,7 +112,6 @@ import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.protocol.TimeoutRequest;
import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.cora.protocol.http.ProxySettings;
import net.yacy.cora.storage.HandleSet;
import net.yacy.crawler.CrawlStacker;
import net.yacy.crawler.CrawlSwitchboard;
import net.yacy.crawler.HarvestProcess;
@ -218,9 +217,9 @@ public final class Switchboard extends serverSwitch {
public static SortedSet<String> badwords = new TreeSet<String>(NaturalOrder.naturalComparator);
public static SortedSet<String> stopwords = new TreeSet<String>(NaturalOrder.naturalComparator);
public static SortedSet<String> blueList = null;
public static HandleSet badwordHashes = null;
public static HandleSet blueListHashes = null;
public static HandleSet stopwordHashes = null;
// public static HandleSet badwordHashes = null; // not used 2013-06-06
// public static HandleSet blueListHashes = null; // not used 2013-06-06
public static SortedSet<byte[]> stopwordHashes = null;
public static Blacklist urlBlacklist = null;
public static WikiParser wikiParser = null;
@ -579,7 +578,7 @@ public final class Switchboard extends serverSwitch {
} else {
blueList = new TreeSet<String>();
}
blueListHashes = Word.words2hashesHandles(blueList);
// blueListHashes = Word.words2hashesHandles(blueList);
this.log.logConfig("loaded blue-list from file "
+ plasmaBlueListFile.getName()
+ ", "
@ -601,7 +600,7 @@ public final class Switchboard extends serverSwitch {
if ( badwords == null || badwords.isEmpty() ) {
final File badwordsFile = new File(appPath, SwitchboardConstants.LIST_BADWORDS_DEFAULT);
badwords = SetTools.loadList(badwordsFile, NaturalOrder.naturalComparator);
badwordHashes = Word.words2hashesHandles(badwords);
// badwordHashes = Word.words2hashesHandles(badwords);
this.log.logConfig("loaded badwords from file "
+ badwordsFile.getName()
+ ", "
@ -614,7 +613,20 @@ public final class Switchboard extends serverSwitch {
if ( stopwords == null || stopwords.isEmpty() ) {
final File stopwordsFile = new File(appPath, SwitchboardConstants.LIST_STOPWORDS_DEFAULT);
stopwords = SetTools.loadList(stopwordsFile, NaturalOrder.naturalComparator);
stopwordHashes = Word.words2hashesHandles(stopwords);
// append locale language stopwords using setting of interface language (file yacy.stopwords.xx)
//TODO: append / share Solr stopwords.txt
final File stopwordsFilelocale = new File (stopwordsFile.getAbsolutePath()+"."+this.getConfig("locale.language","default"));
if (stopwordsFilelocale.exists()) {
stopwords.addAll(SetTools.loadList(stopwordsFilelocale, NaturalOrder.naturalComparator));
}
if (!stopwords.isEmpty()) {
stopwordHashes = new TreeSet<byte[]>(NaturalOrder.naturalOrder);
for (final String wordstr : stopwords) {
stopwordHashes.add(Word.word2hash(wordstr));
}
}
this.log.logConfig("loaded stopwords from file "
+ stopwordsFile.getName()
+ ", "

@ -385,15 +385,17 @@ public final class SearchEvent {
// snippets do not need to match with the complete query hashes,
// only with the query minus the stopwords which had not been used for the search
HandleSet filtered;
try {
filtered = RowHandleSet.joinConstructive(query.getQueryGoal().getIncludeHashes(), Switchboard.stopwordHashes);
} catch (final SpaceExceededException e) {
Log.logException(e);
filtered = new RowHandleSet(query.getQueryGoal().getIncludeHashes().keylen(), query.getQueryGoal().getIncludeHashes().comparator(), 0);
boolean filtered = false;
// check if query contains stopword
Iterator<byte[]> it = query.getQueryGoal().getIncludeHashes().iterator();
while (it.hasNext()) {
if (Switchboard.stopwordHashes.contains((it.next()))) {
filtered = true;
break;
}
}
this.snippetFetchWordHashes = query.getQueryGoal().getIncludeHashes().clone();
if (filtered != null && !filtered.isEmpty()) {
if (filtered) { // remove stopwords
this.snippetFetchWordHashes.excludeDestructive(Switchboard.stopwordHashes);
}

@ -0,0 +1,3 @@
# Default stopword list (always loaded)
# if a language is configured, its language-specific stopword list is appended (e.g. yacy.stopwords.de)
#
Loading…
Cancel
Save