From 7480e87386b630b4680862045740d030fdc326a5 Mon Sep 17 00:00:00 2001 From: reger Date: Thu, 6 Jun 2013 22:07:54 +0200 Subject: [PATCH] - fix stopword handling for RWI see example http://bugs.yacy.net/view.php?id=247 - append language setting specific stopword list - remove unused OVERHANG stack type --- htroot/yacysearch.java | 4 +-- source/net/yacy/cora/storage/HandleSet.java | 6 +++-- source/net/yacy/crawler/data/CrawlQueues.java | 21 +++++++-------- source/net/yacy/crawler/data/NoticedURL.java | 3 +-- .../net/yacy/kelondro/index/RowHandleSet.java | 16 +++++++++--- source/net/yacy/kelondro/util/SetTools.java | 3 ++- source/net/yacy/search/Switchboard.java | 26 ++++++++++++++----- source/net/yacy/search/query/SearchEvent.java | 20 +++++++------- yacy.stopwords | 3 +++ 9 files changed, 64 insertions(+), 38 deletions(-) diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index bf9b29e16..f984bc4ba 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -525,9 +525,9 @@ public class yacysearch { final int maxDistance = (querystring.indexOf('"', 0) >= 0) ? 
qg.getAllHashes().size() - 1 : Integer.MAX_VALUE; // filter out stopwords - final SortedSet filtered = SetTools.joinConstructiveByTest(qg.getIncludeStrings(), Switchboard.stopwords); + final SortedSet filtered = SetTools.joinConstructiveByTest(qg.getIncludeStrings(), Switchboard.stopwords); //find matching stopwords if ( !filtered.isEmpty() ) { - SetTools.excludeDestructiveByTestSmallInLarge(qg.getIncludeStrings(), Switchboard.stopwords); + SetTools.excludeDestructiveByTestSmallInLarge(qg.getIncludeStrings(), filtered); //remove stopwords } // if a minus-button was hit, remove a special reference first diff --git a/source/net/yacy/cora/storage/HandleSet.java b/source/net/yacy/cora/storage/HandleSet.java index 40a9a311b..0eb4009de 100644 --- a/source/net/yacy/cora/storage/HandleSet.java +++ b/source/net/yacy/cora/storage/HandleSet.java @@ -24,6 +24,7 @@ import java.io.File; import java.io.IOException; import java.io.Serializable; import java.util.Iterator; +import java.util.Set; import net.yacy.cora.order.ByteOrder; import net.yacy.cora.order.CloneableIterator; @@ -88,8 +89,9 @@ public interface HandleSet extends Iterable, Cloneable, Serializable { public CloneableIterator keys(final boolean up, final byte[] firstKey); - public void excludeDestructive(final HandleSet other); - + // public void excludeDestructive(final HandleSet other); + public void excludeDestructive(final Set other); // used for stopwordhashes etc. 
+ @Override public Iterator iterator(); diff --git a/source/net/yacy/crawler/data/CrawlQueues.java b/source/net/yacy/crawler/data/CrawlQueues.java index 35e77b5c4..8a8a771d9 100644 --- a/source/net/yacy/crawler/data/CrawlQueues.java +++ b/source/net/yacy/crawler/data/CrawlQueues.java @@ -218,12 +218,12 @@ public class CrawlQueues { public int coreCrawlJobSize() { return this.noticeURL.stackSize(NoticedURL.StackType.LOCAL) + this.noticeURL.stackSize(NoticedURL.StackType.NOLOAD); } - - public boolean coreCrawlJob() { + + public boolean coreCrawlJob() { final boolean robinsonPrivateCase = (this.sb.isRobinsonMode() && !this.sb.getConfig(SwitchboardConstants.CLUSTER_MODE, "").equals(SwitchboardConstants.CLUSTER_MODE_PUBLIC_CLUSTER)); - if ((robinsonPrivateCase || coreCrawlJobSize() <= 20) && limitCrawlJobSize() > 0) { + if ((robinsonPrivateCase || coreCrawlJobSize() <= 20) && limitCrawlJobSize() > 0) { // move some tasks to the core crawl job so we have something to do final int toshift = Math.min(10, limitCrawlJobSize()); // this cannot be a big number because the balancer makes a forced waiting if it cannot balance for (int i = 0; i < toshift; i++) { @@ -256,8 +256,7 @@ public class CrawlQueues { final String stats = "LOCALCRAWL[" + this.noticeURL.stackSize(NoticedURL.StackType.NOLOAD) + ", " + this.noticeURL.stackSize(NoticedURL.StackType.LOCAL) + ", " + - this.noticeURL.stackSize(NoticedURL.StackType.GLOBAL) + ", " + - this.noticeURL.stackSize(NoticedURL.StackType.OVERHANG) + + this.noticeURL.stackSize(NoticedURL.StackType.GLOBAL) + ", " + this.noticeURL.stackSize(NoticedURL.StackType.REMOTE) + "]"; try { if (this.noticeURL.stackSize(NoticedURL.StackType.NOLOAD) > 0) { @@ -444,11 +443,11 @@ public class CrawlQueues { } return false; } - + if (coreCrawlJobSize() > 0 /*&& sb.indexingStorageProcessor.queueSize() > 0*/) { if (this.log.isFine()) { this.log.logFine("remoteCrawlLoaderJob: a local crawl is running, omitting processing"); - } + } return false; } @@ -585,7 
+584,7 @@ public class CrawlQueues { } // we don't want to crawl a global URL globally, since WE are the global part. (from this point of view) - final String stats = "REMOTETRIGGEREDCRAWL[" + this.noticeURL.stackSize(NoticedURL.StackType.LOCAL) + ", " + this.noticeURL.stackSize(NoticedURL.StackType.GLOBAL) + ", " + this.noticeURL.stackSize(NoticedURL.StackType.OVERHANG) + ", " + final String stats = "REMOTETRIGGEREDCRAWL[" + this.noticeURL.stackSize(NoticedURL.StackType.LOCAL) + ", " + this.noticeURL.stackSize(NoticedURL.StackType.GLOBAL) + ", " + this.noticeURL.stackSize(NoticedURL.StackType.REMOTE) + "]"; try { final Request urlEntry = this.noticeURL.pop(NoticedURL.StackType.REMOTE, true, this.sb.crawler, this.sb.robots); @@ -700,7 +699,7 @@ public class CrawlQueues { this.request.setStatus("worker-exception", WorkflowJob.STATUS_FINISHED); } finally { CrawlQueues.this.workers.remove(this.code); - } - } - } + } + } + } } diff --git a/source/net/yacy/crawler/data/NoticedURL.java b/source/net/yacy/crawler/data/NoticedURL.java index 8a4f78db6..f41809cc5 100644 --- a/source/net/yacy/crawler/data/NoticedURL.java +++ b/source/net/yacy/crawler/data/NoticedURL.java @@ -48,7 +48,7 @@ import net.yacy.kelondro.logging.Log; public class NoticedURL { public enum StackType { - LOCAL, GLOBAL, OVERHANG, REMOTE, NOLOAD; + LOCAL, GLOBAL, REMOTE, NOLOAD; } private static final int minimumLocalDeltaInit = 10; // the minimum time difference between access of the same local domain @@ -152,7 +152,6 @@ public class NoticedURL { case NOLOAD: return (this.noloadStack == null) ? 0 : this.noloadStack.size(); case LOCAL: return (this.coreStack == null) ? 0 : this.coreStack.size(); case GLOBAL: return (this.limitStack == null) ? 0 : this.limitStack.size(); - case OVERHANG: return 0; case REMOTE: return (this.remoteStack == null) ? 
0 : this.remoteStack.size(); default: return -1; } diff --git a/source/net/yacy/kelondro/index/RowHandleSet.java b/source/net/yacy/kelondro/index/RowHandleSet.java index d2203b5db..eb13e749b 100644 --- a/source/net/yacy/kelondro/index/RowHandleSet.java +++ b/source/net/yacy/kelondro/index/RowHandleSet.java @@ -36,6 +36,7 @@ import java.io.ObjectOutputStream; import java.io.OutputStream; import java.io.Serializable; import java.util.Iterator; +import java.util.Set; import net.yacy.cora.document.UTF8; import net.yacy.cora.order.ByteOrder; @@ -329,10 +330,17 @@ public final class RowHandleSet implements HandleSet, Iterable, Cloneabl } @Override - public void excludeDestructive(final HandleSet other) { - excludeDestructive(this, other); + public void excludeDestructive (final Set other) { + if (other == null) return; + if (other.isEmpty()) return; + + if (other.size() > this.size()) { + for (byte[] b: this) {if (other.contains(b)) this.remove(b);} + } else { + for (byte[] b: other) {this.remove(b) ;} + } } - +/* not used 2013-06-06 private static void excludeDestructive(final HandleSet set1, final HandleSet set2) { if (set1 == null) return; if (set2 == null) return; @@ -354,7 +362,7 @@ public final class RowHandleSet implements HandleSet, Iterable, Cloneabl final Iterator si = small.iterator(); while (si.hasNext()) large.remove(si.next()); } - +*/ public static void main(String[] args) { HandleSet s = new RowHandleSet(8, NaturalOrder.naturalOrder, 100); try { diff --git a/source/net/yacy/kelondro/util/SetTools.java b/source/net/yacy/kelondro/util/SetTools.java index 526a49964..7220a4d8c 100644 --- a/source/net/yacy/kelondro/util/SetTools.java +++ b/source/net/yacy/kelondro/util/SetTools.java @@ -527,10 +527,11 @@ public final class SetTools { br = new BufferedReader(new InputStreamReader(new FileInputStream(file))); String line; while ((line = br.readLine()) != null) { + int i = line.indexOf("|"); // ignore text after char (Solr stopwordfile syntax allows for # and | ) 
+ if (i>=0) line = line.substring(0,i); // cut at the '|' itself: the end index is exclusive, and i==0 means a comment-only line line = line.trim(); if (!line.isEmpty() && line.charAt(0) != '#') list.add(line.trim().toLowerCase()); } - br.close(); } catch (final IOException e) { } finally { if (br != null) try{br.close();}catch(final Exception e){} diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index c3075a748..a29ac0166 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -112,7 +112,6 @@ import net.yacy.cora.protocol.ResponseHeader; import net.yacy.cora.protocol.TimeoutRequest; import net.yacy.cora.protocol.http.HTTPClient; import net.yacy.cora.protocol.http.ProxySettings; -import net.yacy.cora.storage.HandleSet; import net.yacy.crawler.CrawlStacker; import net.yacy.crawler.CrawlSwitchboard; import net.yacy.crawler.HarvestProcess; @@ -218,9 +217,9 @@ public final class Switchboard extends serverSwitch { public static SortedSet badwords = new TreeSet(NaturalOrder.naturalComparator); public static SortedSet stopwords = new TreeSet(NaturalOrder.naturalComparator); public static SortedSet blueList = null; - public static HandleSet badwordHashes = null; - public static HandleSet blueListHashes = null; - public static HandleSet stopwordHashes = null; +// public static HandleSet badwordHashes = null; // not used 2013-06-06 +// public static HandleSet blueListHashes = null; // not used 2013-06-06 + public static SortedSet stopwordHashes = null; public static Blacklist urlBlacklist = null; public static WikiParser wikiParser = null; @@ -579,7 +578,7 @@ public final class Switchboard extends serverSwitch { } else { blueList = new TreeSet(); } - blueListHashes = Word.words2hashesHandles(blueList); + // blueListHashes = Word.words2hashesHandles(blueList); this.log.logConfig("loaded blue-list from file " + plasmaBlueListFile.getName() + ", " @@ -601,7 +600,7 @@ public final class Switchboard extends serverSwitch { if ( badwords == null || badwords.isEmpty() ) { final 
File badwordsFile = new File(appPath, SwitchboardConstants.LIST_BADWORDS_DEFAULT); badwords = SetTools.loadList(badwordsFile, NaturalOrder.naturalComparator); - badwordHashes = Word.words2hashesHandles(badwords); +// badwordHashes = Word.words2hashesHandles(badwords); this.log.logConfig("loaded badwords from file " + badwordsFile.getName() + ", " @@ -614,7 +613,20 @@ public final class Switchboard extends serverSwitch { if ( stopwords == null || stopwords.isEmpty() ) { final File stopwordsFile = new File(appPath, SwitchboardConstants.LIST_STOPWORDS_DEFAULT); stopwords = SetTools.loadList(stopwordsFile, NaturalOrder.naturalComparator); - stopwordHashes = Word.words2hashesHandles(stopwords); + // append locale language stopwords using setting of interface language (file yacy.stopwords.xx) + //TODO: append / share Solr stopwords.txt + final File stopwordsFilelocale = new File (stopwordsFile.getAbsolutePath()+"."+this.getConfig("locale.language","default")); + if (stopwordsFilelocale.exists()) { + stopwords.addAll(SetTools.loadList(stopwordsFilelocale, NaturalOrder.naturalComparator)); + } + + if (!stopwords.isEmpty()) { + stopwordHashes = new TreeSet(NaturalOrder.naturalOrder); + for (final String wordstr : stopwords) { + stopwordHashes.add(Word.word2hash(wordstr)); + } + } + this.log.logConfig("loaded stopwords from file " + stopwordsFile.getName() + ", " diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java index 591c58563..f4e6a44e3 100644 --- a/source/net/yacy/search/query/SearchEvent.java +++ b/source/net/yacy/search/query/SearchEvent.java @@ -384,16 +384,18 @@ public final class SearchEvent { this.resultList = new WeakPriorityBlockingQueue(Math.max(1000, 10 * query.itemsPerPage()), true); // this is the result, enriched with snippets, ranked and ordered by ranking // snippets do not need to match with the complete query hashes, - // only with the query minus the stopwords which had not been used for the search 
- HandleSet filtered; - try { - filtered = RowHandleSet.joinConstructive(query.getQueryGoal().getIncludeHashes(), Switchboard.stopwordHashes); - } catch (final SpaceExceededException e) { - Log.logException(e); - filtered = new RowHandleSet(query.getQueryGoal().getIncludeHashes().keylen(), query.getQueryGoal().getIncludeHashes().comparator(), 0); + // only with the query minus the stopwords which had not been used for the search + boolean filtered = false; + // check if query contains stopword + Iterator it = query.getQueryGoal().getIncludeHashes().iterator(); + while (Switchboard.stopwordHashes != null && it.hasNext()) { // stopwordHashes stays null when no stopwords were loaded + if (Switchboard.stopwordHashes.contains((it.next()))) { + filtered = true; + break; + } } - this.snippetFetchWordHashes = query.getQueryGoal().getIncludeHashes().clone(); - if (filtered != null && !filtered.isEmpty()) { + this.snippetFetchWordHashes = query.getQueryGoal().getIncludeHashes().clone(); + if (filtered) { // remove stopwords this.snippetFetchWordHashes.excludeDestructive(Switchboard.stopwordHashes); } diff --git a/yacy.stopwords b/yacy.stopwords index e69de29bb..8659daa67 100644 --- a/yacy.stopwords +++ b/yacy.stopwords @@ -0,0 +1,3 @@ +# Default stopword list (always loaded) +# a configured language specific stopword list is appended (like: yacy.stopwords.de) +#