From deda54d6844e134eee36873009b1a98f2ca2d1bb Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 5 May 2011 22:37:06 +0000 Subject: [PATCH] - relaxed matching of string-search (this is now case-insensitive) - added transport of string-search pattern to remote search protocol - fixed a problem parsing snippets with a '-' inside git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7700 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/ConfigParser.html | 3 ++- htroot/Table_API_p.java | 5 +++-- htroot/yacy/search.java | 4 ++-- source/de/anomic/search/QueryParams.java | 15 +++++++------ source/de/anomic/search/SearchEvent.java | 1 + source/de/anomic/search/TextSnippet.java | 19 ++++++++++++++--- source/de/anomic/yacy/yacyClient.java | 14 ++++++++----- source/de/anomic/yacy/yacySearch.java | 21 ++++++++++++------- source/net/yacy/document/Condenser.java | 1 + .../kelondro/rwi/ReferenceContainerCache.java | 1 - 10 files changed, 56 insertions(+), 28 deletions(-) diff --git a/htroot/ConfigParser.html b/htroot/ConfigParser.html index f018338ca..11c39bfb3 100644 --- a/htroot/ConfigParser.html +++ b/htroot/ConfigParser.html @@ -13,7 +13,8 @@

With these settings you can activate or deactivate parsing of additional content-types based on their MIME-types.
For a detailed description of the various MIME-types take a look at - http://www.iana.org/assignments/media-types/ + http://www.iana.org/assignments/media-types/.
+ If you want to test a specific parser you can do so using the File Viewer.

diff --git a/htroot/Table_API_p.java b/htroot/Table_API_p.java index 4f553bb2b..754c35ad7 100644 --- a/htroot/Table_API_p.java +++ b/htroot/Table_API_p.java @@ -35,6 +35,7 @@ import net.yacy.kelondro.index.RowSpaceExceededException; import net.yacy.kelondro.logging.Log; import de.anomic.data.WorkTables; +import de.anomic.search.QueryParams; import de.anomic.search.Switchboard; import de.anomic.search.SwitchboardConstants; import de.anomic.server.serverObjects; @@ -51,7 +52,7 @@ public class Table_API_p { int startRecord = 0; int maximumRecords = 25; - Pattern query = Pattern.compile(".*"); + Pattern query = QueryParams.catchall_pattern; if (post != null && post.containsKey("startRecord")) startRecord = post.getInt("startRecord", 0); if (post != null && post.containsKey("maximumRecords")) maximumRecords = post.getInt("maximumRecords", 0); if (post != null && post.containsKey("query") && !post.get("query", "").isEmpty()) { @@ -63,7 +64,7 @@ public class Table_API_p { prop.put("inline", (inline) ? 
1 : 0); - Pattern typefilter = Pattern.compile(".*"); + Pattern typefilter = QueryParams.catchall_pattern; if (post != null && post.containsKey("filter") && post.get("filter", "").length() > 0) { typefilter = Pattern.compile(post.get("filter", ".*")); } diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index 0d47d76fd..f9ffe683d 100644 --- a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -102,8 +102,8 @@ public final class search { final int maxdist= post.getInt("maxdist", Integer.MAX_VALUE); final String prefer = post.get("prefer", ""); final String contentdom = post.get("contentdom", "text"); - final String filter = post.get("filter", ".*"); - final Pattern snippetPattern = Pattern.compile(post.get("snippet", ".*")); + final String filter = post.get("filter", ".*"); // a filter on the url + final Pattern snippetPattern = Pattern.compile(post.get("snippet", ".*")); // a filter on the snippet String sitehash = post.get("sitehash", ""); if (sitehash.length() == 0) sitehash = null; String authorhash = post.get("authorhash", ""); if (authorhash.length() == 0) authorhash = null; String language = post.get("language", ""); diff --git a/source/de/anomic/search/QueryParams.java b/source/de/anomic/search/QueryParams.java index 3c5d409d5..4cfc2df17 100644 --- a/source/de/anomic/search/QueryParams.java +++ b/source/de/anomic/search/QueryParams.java @@ -133,7 +133,7 @@ public final class QueryParams { this.excludeHashes = Word.words2hashesHandles(cq[1]); this.fullqueryHashes = Word.words2hashesHandles(cq[2]); } - this.snippetMatcher = Pattern.compile(".*"); + this.snippetMatcher = QueryParams.catchall_pattern; this.ranking = ranking; this.tenant = null; this.maxDistance = Integer.MAX_VALUE; @@ -540,22 +540,25 @@ public final class QueryParams { } private static Pattern StringMatchPattern = Pattern.compile(".*?(\".*?\").*"); - /** * calculate a pattern to match with a string search * @param query * @return */ public static Pattern 
stringSearchPattern(String query) { - String p = ""; + StringBuilder p = new StringBuilder(query.length()); + p.append("(?iu)"); + int seqc = 0; while (query.length() > 0) { Matcher m = StringMatchPattern.matcher(query); if (!m.matches()) break; - p += ".*" + query.substring(m.start(1) + 1, m.end(1) - 1); + p.append(".*?").append(query.substring(m.start(1) + 1, m.end(1) - 1)); query = query.substring(m.end(1)); + seqc++; } - p += ".*"; - return Pattern.compile(p); + if (seqc == 0) return QueryParams.catchall_pattern; + p.append(".*"); + return Pattern.compile(p.toString()); } public static void main(String[] args) { diff --git a/source/de/anomic/search/SearchEvent.java b/source/de/anomic/search/SearchEvent.java index c9afc0ea0..63d95af76 100644 --- a/source/de/anomic/search/SearchEvent.java +++ b/source/de/anomic/search/SearchEvent.java @@ -128,6 +128,7 @@ public final class SearchEvent { QueryParams.hashSet2hashString(query.excludeHashes), query.prefer, query.urlMask, + query.snippetMatcher, query.targetlang == null ? "" : query.targetlang, query.sitehash == null ? "" : query.sitehash, query.authorhash == null ? 
"" : query.authorhash, diff --git a/source/de/anomic/search/TextSnippet.java b/source/de/anomic/search/TextSnippet.java index 4ecc8a39a..8f3e70e4a 100644 --- a/source/de/anomic/search/TextSnippet.java +++ b/source/de/anomic/search/TextSnippet.java @@ -132,11 +132,23 @@ public class TextSnippet implements Comparable, Comparator, Comparator i = queryHashes.iterator(); byte[] h; - final String[] words = line.split(" "); + final String[] words = splitPattern.split(line); while (i.hasNext()) { h = i.next(); for (int j = 0; j < words.length; j++) { diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index c82f81cd4..0c6f6d2ad 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -409,6 +409,7 @@ public final class yacyClient { final String urlhashes, final Pattern prefer, final Pattern filter, + final Pattern snippet, final String language, final String sitehash, final String authorhash, @@ -445,7 +446,7 @@ public final class yacyClient { try { result = new SearchResult( yacyNetwork.basicRequestParts(Switchboard.getSwitchboard(), target.hash, crypt.randomSalt()), - mySeed, wordhashes, excludehashes, urlhashes, prefer, filter, language, + mySeed, wordhashes, excludehashes, urlhashes, prefer, filter, snippet, language, sitehash, authorhash, count, time, maxDistance, global, partitions, target.getHexHash() + ".yacyh", target.getClusterAddress(), secondarySearchSuperviser, rankingProfile, constraint); } catch (final IOException e) { @@ -613,6 +614,7 @@ public final class yacyClient { final String urlhashes, final Pattern prefer, final Pattern filter, + final Pattern snippet, final String language, final String sitehash, final String authorhash, @@ -659,8 +661,9 @@ public final class yacyClient { parts.put("exclude", UTF8.StringBody(excludehashes)); parts.put("duetime", UTF8.StringBody("1000")); parts.put("urls", UTF8.StringBody(urlhashes)); - parts.put("prefer", 
UTF8.StringBody(prefer.toString())); - parts.put("filter", UTF8.StringBody(filter.toString())); + parts.put("prefer", UTF8.StringBody(prefer.pattern())); + parts.put("filter", UTF8.StringBody(filter.pattern())); + parts.put("snippet", UTF8.StringBody(snippet.pattern())); parts.put("language", UTF8.StringBody(language)); parts.put("sitehash", UTF8.StringBody(sitehash)); parts.put("authorhash", UTF8.StringBody(authorhash)); @@ -1073,8 +1076,9 @@ public final class yacyClient { UTF8.String(wordhashe), "", // excludehashes, "", // urlhashes, - Pattern.compile(""), // prefer, - Pattern.compile(".*"), // filter, + QueryParams.matchnothing_pattern, // prefer, + QueryParams.catchall_pattern, // filter, + QueryParams.catchall_pattern, // snippet, "", // language, "", // sitehash, "", // authorhash, diff --git a/source/de/anomic/yacy/yacySearch.java b/source/de/anomic/yacy/yacySearch.java index 7213930b9..dd7be2651 100644 --- a/source/de/anomic/yacy/yacySearch.java +++ b/source/de/anomic/yacy/yacySearch.java @@ -57,7 +57,7 @@ public class yacySearch extends Thread { private final int count, maxDistance; private final long time; final private RankingProfile rankingProfile; - final private Pattern prefer, filter; + final private Pattern prefer, filter, snippet; final private String language; final private Bitfield constraint; final private yacySeedDB peers; @@ -65,7 +65,9 @@ public class yacySearch extends Thread { public yacySearch( final String wordhashes, final String excludehashes, final String urlhashes, - final Pattern prefer, final Pattern filter, + final Pattern prefer, + final Pattern filter, + final Pattern snippet, final String language, final String sitehash, final String authorhash, final int count, final long time, final int maxDistance, @@ -86,6 +88,7 @@ public class yacySearch extends Thread { this.urlhashes = urlhashes; this.prefer = prefer; this.filter = filter; + this.snippet = snippet; this.language = language; this.sitehash = sitehash; this.authorhash = 
authorhash; @@ -110,8 +113,9 @@ public class yacySearch extends Thread { try { this.urls = yacyClient.search( peers.mySeed(), - wordhashes, excludehashes, urlhashes, prefer, filter, language, - sitehash, authorhash, + wordhashes, excludehashes, urlhashes, + prefer, filter, snippet, + language, sitehash, authorhash, count, time, maxDistance, global, partitions, targetPeer, indexSegment, containerCache, secondarySearchSuperviser, blacklist, rankingProfile, constraint); @@ -151,7 +155,8 @@ public class yacySearch extends Thread { public static yacySearch[] primaryRemoteSearches( final String wordhashes, final String excludehashes, - final Pattern prefer, final Pattern filter, String language, + final Pattern prefer, final Pattern filter, final Pattern snippet, + final String language, final String sitehash, final String authorhash, final int count, long time, final int maxDist, @@ -188,8 +193,8 @@ public class yacySearch extends Thread { if (targetPeers[i] == null || targetPeers[i].hash == null) continue; try { searchThreads[i] = new yacySearch( - wordhashes, excludehashes, "", prefer, filter, language, - sitehash, authorhash, + wordhashes, excludehashes, "", prefer, filter, snippet, + language, sitehash, authorhash, count, time, maxDist, true, targets, targetPeers[i], indexSegment, peers, containerCache, secondarySearchSuperviser, blacklist, rankingProfile, constraint); searchThreads[i].start(); @@ -222,7 +227,7 @@ public class yacySearch extends Thread { if (targetPeer == null || targetPeer.hash == null) return null; if (clusterselection != null) targetPeer.setAlternativeAddress(clusterselection.get(UTF8.getBytes(targetPeer.hash))); final yacySearch searchThread = new yacySearch( - wordhashes, "", urlhashes, Pattern.compile(""), Pattern.compile(".*"), "", "", "", 20, time, 9999, true, 0, targetPeer, + wordhashes, "", urlhashes, QueryParams.matchnothing_pattern, QueryParams.catchall_pattern, QueryParams.catchall_pattern, "", "", "", 20, time, 9999, true, 0, 
targetPeer, indexSegment, peers, containerCache, null, blacklist, rankingProfile, constraint); searchThread.start(); return searchThread; diff --git a/source/net/yacy/document/Condenser.java b/source/net/yacy/document/Condenser.java index 00e7c9538..9830a5235 100644 --- a/source/net/yacy/document/Condenser.java +++ b/source/net/yacy/document/Condenser.java @@ -102,6 +102,7 @@ public final class Condenser { final boolean indexMedia, final WordCache meaningLib ) { + Thread.currentThread().setName("condenser-" + document.dc_identifier()); // for debugging // if addMedia == true, then all the media links are also parsed and added to the words // added media words are flagged with the appropriate media flag this.intStringFormatter.setMinimumIntegerDigits(numlength); diff --git a/source/net/yacy/kelondro/rwi/ReferenceContainerCache.java b/source/net/yacy/kelondro/rwi/ReferenceContainerCache.java index 305bbd89c..e037467a4 100644 --- a/source/net/yacy/kelondro/rwi/ReferenceContainerCache.java +++ b/source/net/yacy/kelondro/rwi/ReferenceContainerCache.java @@ -392,7 +392,6 @@ public final class ReferenceContainerCache exte public void add(final ReferenceContainer container) throws RowSpaceExceededException { // this puts the entries into the cache - assert this.cache != null; if (this.cache == null || container == null || container.isEmpty()) return; // put new words into cache