From 5dfd6359cbd37d3c1bb27e59300fd01e07106a77 Mon Sep 17 00:00:00 2001 From: orbiter Date: Sun, 18 Nov 2012 01:22:41 +0100 Subject: [PATCH] redesign of the QueryParams class: introduced QueryGoal which holds the query string parser. This shall be used to create a proper full-string matching which is handled then by QueryGoal. --- htroot/AccessTracker_p.java | 4 +- htroot/api/timeline.java | 7 +- htroot/gsa/searchresult.java | 5 +- htroot/yacy/search.java | 25 +- htroot/yacysearch.java | 31 +-- htroot/yacysearchitem.java | 20 +- htroot/yacysearchtrailer.java | 16 +- .../document/parser/html/AbstractScraper.java | 8 +- .../document/parser/html/ContentScraper.java | 6 +- source/net/yacy/peers/Protocol.java | 2 +- source/net/yacy/peers/RemoteSearch.java | 6 +- .../net/yacy/peers/graphics/NetworkGraph.java | 3 +- source/net/yacy/search/index/Segment.java | 2 +- .../net/yacy/search/query/AccessTracker.java | 10 +- source/net/yacy/search/query/QueryGoal.java | 230 ++++++++++++++++++ source/net/yacy/search/query/QueryParams.java | 207 ++-------------- .../net/yacy/search/query/RankingProcess.java | 8 +- source/net/yacy/search/query/SearchEvent.java | 20 +- .../query/SecondarySearchSuperviser.java | 4 +- .../net/yacy/search/query/SnippetWorker.java | 4 +- .../net/yacy/search/snippet/TextSnippet.java | 4 +- 21 files changed, 334 insertions(+), 288 deletions(-) create mode 100644 source/net/yacy/search/query/QueryGoal.java diff --git a/htroot/AccessTracker_p.java b/htroot/AccessTracker_p.java index 5c9c2987e..eaf98ef12 100644 --- a/htroot/AccessTracker_p.java +++ b/htroot/AccessTracker_p.java @@ -165,11 +165,11 @@ public class AccessTracker_p { if (page == 2) { // local search prop.putNum("page_list_" + m + "_offset", query.offset); - prop.putHTML("page_list_" + m + "_querystring", query.queryString); + prop.putHTML("page_list_" + m + "_querystring", query.getQueryGoal().getQueryString()); } else { // remote search prop.putHTML("page_list_" + m + "_peername", (query.remotepeer == null) ? "" : query.remotepeer.getName()); - prop.put("page_list_" + m + "_queryhashes", QueryParams.anonymizedQueryHashes(query.query_include_hashes)); + prop.put("page_list_" + m + "_queryhashes", QueryParams.anonymizedQueryHashes(query.getQueryGoal().getIncludeHashes())); } prop.putNum("page_list_" + m + "_querycount", query.itemsPerPage); prop.putNum("page_list_" + m + "_transmitcount", query.transmitcount); diff --git a/htroot/api/timeline.java b/htroot/api/timeline.java index 04ee5950f..ce64564db 100644 --- a/htroot/api/timeline.java +++ b/htroot/api/timeline.java @@ -41,6 +41,7 @@ import net.yacy.kelondro.util.ISO639; import net.yacy.peers.Network; import net.yacy.search.Switchboard; import net.yacy.search.index.Segment; +import net.yacy.search.query.QueryGoal; import net.yacy.search.query.QueryParams; import net.yacy.server.serverObjects; import net.yacy.server.serverSwitch; @@ -68,8 +69,8 @@ public final class timeline { language = (agent == null) ? "en" : ISO639.userAgentLanguageDetection(agent); if (language == null) language = "en"; } - final List[] query = QueryParams.cleanQuery(querystring); // converts also umlaute - HandleSet q = Word.words2hashesHandles(query[0]); + final QueryGoal qg = new QueryGoal(querystring); + HandleSet q = qg.getIncludeHashes(); // tell all threads to do nothing for a specific time sb.intermissionAllThreads(3000); @@ -87,7 +88,7 @@ public final class timeline { // get the index container with the result vector TermSearch search = null; try { - search = segment.termIndex().query(q, Word.words2hashesHandles(query[1]), null, Segment.wordReferenceFactory, maxdist); + search = segment.termIndex().query(q, qg.getExcludeHashes(), null, Segment.wordReferenceFactory, maxdist); } catch (SpaceExceededException e) { Log.logException(e); } diff --git a/htroot/gsa/searchresult.java b/htroot/gsa/searchresult.java index 8a325ef0f..efdc40cef 100644 --- a/htroot/gsa/searchresult.java +++ b/htroot/gsa/searchresult.java @@ -36,6 +36,7 @@ import net.yacy.cora.protocol.RequestHeader; import net.yacy.kelondro.logging.Log; import net.yacy.search.Switchboard; import net.yacy.search.query.AccessTracker; +import net.yacy.search.query.QueryGoal; import net.yacy.search.query.QueryParams; import net.yacy.search.query.SearchEvent; import net.yacy.server.serverObjects; @@ -110,8 +111,8 @@ public class searchresult { post.put("originalQuery", q); // get a solr query string - List[] cq = QueryParams.cleanQuery(q); - q = QueryParams.solrQueryString(cq[0], cq[1], sb.index.fulltext().getSolrScheme()).toString(); + QueryGoal qg = new QueryGoal(q); + q = qg.solrQueryString(sb.index.fulltext().getSolrScheme()).toString(); post.put(CommonParams.ROWS, post.remove("num")); post.put(CommonParams.ROWS, Math.min(post.getInt(CommonParams.ROWS, 10), (authenticated) ? 5000 : 100)); diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index 5d76c7b1b..2ae486344 100644 --- a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -68,6 +68,7 @@ import net.yacy.search.Switchboard; import net.yacy.search.SwitchboardConstants; import net.yacy.search.index.Segment; import net.yacy.search.query.AccessTracker; +import net.yacy.search.query.QueryGoal; import net.yacy.search.query.QueryParams; import net.yacy.search.query.SearchEvent; import net.yacy.search.query.SearchEventCache; @@ -219,12 +220,9 @@ public final class search { if (query.isEmpty() && abstractSet != null) { // this is _not_ a normal search, only a request for index abstracts final Segment indexSegment = sb.index; + QueryGoal qg = new QueryGoal(abstractSet, new RowHandleSet(WordReferenceRow.urlEntryRow.primaryKeyLength, WordReferenceRow.urlEntryRow.objectOrder, 0), abstractSet); theQuery = new QueryParams( - null, - null, null, null, - abstractSet, - new RowHandleSet(WordReferenceRow.urlEntryRow.primaryKeyLength, WordReferenceRow.urlEntryRow.objectOrder, 0), - abstractSet, + qg, modifier, maxdist, prefer, @@ -251,11 +249,11 @@ public final class search { header.get(RequestHeader.USER_AGENT, ""), false, 0.0d, 0.0d, 0.0d ); - Network.log.logInfo("INIT HASH SEARCH (abstracts only): " + QueryParams.anonymizedQueryHashes(theQuery.query_include_hashes) + " - " + theQuery.itemsPerPage() + " links"); + Network.log.logInfo("INIT HASH SEARCH (abstracts only): " + QueryParams.anonymizedQueryHashes(theQuery.getQueryGoal().getIncludeHashes()) + " - " + theQuery.itemsPerPage() + " links"); final long timer = System.currentTimeMillis(); //final Map>[] containers = sb.indexSegment.index().searchTerm(theQuery.queryHashes, theQuery.excludeHashes, plasmaSearchQuery.hashes2StringSet(urls)); - final TreeMap> incc = indexSegment.termIndex().searchConjunction(theQuery.query_include_hashes, QueryParams.hashes2Handles(urls)); + final TreeMap> incc = indexSegment.termIndex().searchConjunction(theQuery.getQueryGoal().getIncludeHashes(), QueryParams.hashes2Handles(urls)); EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(theQuery.id(true), SearchEventType.COLLECTION, "", incc.size(), System.currentTimeMillis() - timer), false); if (incc != null) { @@ -284,12 +282,9 @@ public final class search { RowHandleSet allHashes = new RowHandleSet(WordReferenceRow.urlEntryRow.primaryKeyLength, WordReferenceRow.urlEntryRow.objectOrder, 0); try {allHashes.putAll(queryhashes);} catch (SpaceExceededException e) {} try {allHashes.putAll(excludehashes);} catch (SpaceExceededException e) {} + QueryGoal qg = new QueryGoal(queryhashes, excludehashes, allHashes); theQuery = new QueryParams( - null, - null, null, null, - queryhashes, - excludehashes, - allHashes, + qg, modifier, maxdist, prefer, @@ -316,8 +311,8 @@ public final class search { header.get(RequestHeader.USER_AGENT, ""), false, 0.0d, 0.0d, 0.0d ); - Network.log.logInfo("INIT HASH SEARCH (query-" + abstracts + "): " + QueryParams.anonymizedQueryHashes(theQuery.query_include_hashes) + " - " + theQuery.itemsPerPage() + " links"); - EventChannel.channels(EventChannel.REMOTESEARCH).addMessage(new RSSMessage("Remote Search Request from " + ((remoteSeed == null) ? "unknown" : remoteSeed.getName()), QueryParams.anonymizedQueryHashes(theQuery.query_include_hashes), "")); + Network.log.logInfo("INIT HASH SEARCH (query-" + abstracts + "): " + QueryParams.anonymizedQueryHashes(theQuery.getQueryGoal().getIncludeHashes()) + " - " + theQuery.itemsPerPage() + " links"); + EventChannel.channels(EventChannel.REMOTESEARCH).addMessage(new RSSMessage("Remote Search Request from " + ((remoteSeed == null) ? "unknown" : remoteSeed.getName()), QueryParams.anonymizedQueryHashes(theQuery.getQueryGoal().getIncludeHashes()), "")); // make event theSearch = SearchEventCache.getEvent(theQuery, sb.peers, sb.tables, null, abstracts.length() > 0, sb.loader, count, maxtime, (int) sb.getConfigLong(SwitchboardConstants.DHT_BURST_ROBINSON, 0), (int) sb.getConfigLong(SwitchboardConstants.DHT_BURST_MULTIWORD, 0)); @@ -433,7 +428,7 @@ public final class search { // log Network.log.logInfo("EXIT HASH SEARCH: " + - QueryParams.anonymizedQueryHashes(theQuery.query_include_hashes) + " - " + theQuery.getResultCount() + " links found, " + + QueryParams.anonymizedQueryHashes(theQuery.getQueryGoal().getIncludeHashes()) + " - " + theQuery.getResultCount() + " links found, " + prop.get("linkcount", "?") + " links selected, " + indexabstractContainercount + " index abstracts, " + (System.currentTimeMillis() - timestamp) + " milliseconds"); diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index 9d814ea44..1504911a1 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -34,7 +34,6 @@ import java.util.Collection; import java.util.ConcurrentModificationException; import java.util.HashMap; import java.util.Iterator; -import java.util.List; import java.util.Map; import java.util.SortedSet; import java.util.TreeSet; @@ -78,6 +77,7 @@ import net.yacy.search.Switchboard; import net.yacy.search.SwitchboardConstants; import net.yacy.search.index.Segment; import net.yacy.search.query.AccessTracker; +import net.yacy.search.query.QueryGoal; import net.yacy.search.query.QueryParams; import net.yacy.search.query.SearchEvent; import net.yacy.search.query.SearchEventCache; @@ -594,14 +594,13 @@ public class yacysearch { } // the query - final List[] query = QueryParams.cleanQuery(querystring.trim()); // converts also umlaute - - final int maxDistance = (querystring.indexOf('"', 0) >= 0) ? query.length - 1 : Integer.MAX_VALUE; + final QueryGoal qg = new QueryGoal(querystring.trim()); + final int maxDistance = (querystring.indexOf('"', 0) >= 0) ? qg.getAllWords().size() - 1 : Integer.MAX_VALUE; // filter out stopwords - final SortedSet filtered = SetTools.joinConstructiveByTest(query[0], Switchboard.stopwords); + final SortedSet filtered = SetTools.joinConstructiveByTest(qg.getIncludeWords(), Switchboard.stopwords); if ( !filtered.isEmpty() ) { - SetTools.excludeDestructiveByTestSmallInLarge(query[0], Switchboard.stopwords); + SetTools.excludeDestructiveByTestSmallInLarge(qg.getIncludeWords(), Switchboard.stopwords); } // if a minus-button was hit, remove a special reference first @@ -614,7 +613,7 @@ public class yacysearch { // delete the index entry locally final String delHash = post.get("deleteref", ""); // urlhash - indexSegment.termIndex().remove(Word.words2hashesHandles(query[0]), delHash.getBytes()); + indexSegment.termIndex().remove(qg.getIncludeHashes(), delHash.getBytes()); // make new news message with negative voting if ( !sb.isRobinsonMode() ) { @@ -715,13 +714,7 @@ public class yacysearch { // do the search final QueryParams theQuery = new QueryParams( - originalquerystring, - query[0], - query[1], - query[2], - Word.words2hashesHandles(query[0]), - Word.words2hashesHandles(query[1]), - Word.words2hashesHandles(query[2]), + qg, modifier.toString().trim(), maxDistance, prefermask, @@ -764,22 +757,22 @@ public class yacysearch { sb.intermissionAllThreads(3000); // filter out words that appear in bluelist - theQuery.filterOut(Switchboard.blueList); + theQuery.getQueryGoal().filterOut(Switchboard.blueList); // log Log.logInfo( "LOCAL_SEARCH", "INIT WORD SEARCH: " - + theQuery.queryString + + theQuery.getQueryGoal().getQueryString() + ":" - + QueryParams.hashSet2hashString(theQuery.query_include_hashes) + + QueryParams.hashSet2hashString(theQuery.getQueryGoal().getIncludeHashes()) + " - " + theQuery.neededResults() + " links to be computed, " + theQuery.itemsPerPage() + " lines to be displayed"); EventChannel.channels(EventChannel.LOCALSEARCH).addMessage( - new RSSMessage("Local Search Request", theQuery.queryString, "")); + new RSSMessage("Local Search Request", theQuery.getQueryGoal().getQueryString(), "")); final long timestamp = System.currentTimeMillis(); // create a new search event @@ -818,7 +811,7 @@ public class yacysearch { // log Log.logInfo("LOCAL_SEARCH", "EXIT WORD SEARCH: " - + theQuery.queryString + + theQuery.getQueryGoal().getQueryString() + " - " + "local_rwi_available(" + theSearch.query.local_rwi_available.get() + "), " + "local_rwi_stored(" + theSearch.query.local_rwi_stored.get() + "), " diff --git a/htroot/yacysearchitem.java b/htroot/yacysearchitem.java index d04611b45..9407261cc 100644 --- a/htroot/yacysearchitem.java +++ b/htroot/yacysearchitem.java @@ -25,7 +25,7 @@ // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA import java.net.MalformedURLException; -import java.util.Collection; +import java.util.List; import net.yacy.cora.date.GenericFormatter; import net.yacy.cora.document.ASCII; @@ -134,10 +134,10 @@ public class yacysearchitem { prop.put("content_authorized", authenticated ? "1" : "0"); final String urlhash = ASCII.String(result.hash()); prop.put("content_authorized_bookmark", sb.tables.bookmarks.hasBookmark("admin", urlhash) ? "0" : "1"); - prop.putHTML("content_authorized_bookmark_bookmarklink", "/yacysearch.html?query=" + theSearch.query.queryString.replace(' ', '+') + "&Enter=Search&count=" + theSearch.query.itemsPerPage() + "&offset=" + (theSearch.query.neededResults() - theSearch.query.itemsPerPage()) + "&order=" + crypt.simpleEncode(theSearch.query.ranking.toExternalString()) + "&resource=" + resource + "&time=3&bookmarkref=" + urlhash + "&urlmaskfilter=.*"); + prop.putHTML("content_authorized_bookmark_bookmarklink", "/yacysearch.html?query=" + theSearch.query.getQueryGoal().getQueryString().replace(' ', '+') + "&Enter=Search&count=" + theSearch.query.itemsPerPage() + "&offset=" + (theSearch.query.neededResults() - theSearch.query.itemsPerPage()) + "&order=" + crypt.simpleEncode(theSearch.query.ranking.toExternalString()) + "&resource=" + resource + "&time=3&bookmarkref=" + urlhash + "&urlmaskfilter=.*"); prop.put("content_authorized_recommend", (sb.peers.newsPool.getSpecific(NewsPool.OUTGOING_DB, NewsPool.CATEGORY_SURFTIPP_ADD, "url", resultUrlstring) == null) ? "1" : "0"); - prop.putHTML("content_authorized_recommend_deletelink", "/yacysearch.html?query=" + theSearch.query.queryString.replace(' ', '+') + "&Enter=Search&count=" + theSearch.query.itemsPerPage() + "&offset=" + (theSearch.query.neededResults() - theSearch.query.itemsPerPage()) + "&order=" + crypt.simpleEncode(theSearch.query.ranking.toExternalString()) + "&resource=" + resource + "&time=3&deleteref=" + urlhash + "&urlmaskfilter=.*"); - prop.putHTML("content_authorized_recommend_recommendlink", "/yacysearch.html?query=" + theSearch.query.queryString.replace(' ', '+') + "&Enter=Search&count=" + theSearch.query.itemsPerPage() + "&offset=" + (theSearch.query.neededResults() - theSearch.query.itemsPerPage()) + "&order=" + crypt.simpleEncode(theSearch.query.ranking.toExternalString()) + "&resource=" + resource + "&time=3&recommendref=" + urlhash + "&urlmaskfilter=.*"); + prop.putHTML("content_authorized_recommend_deletelink", "/yacysearch.html?query=" + theSearch.query.getQueryGoal().getQueryString().replace(' ', '+') + "&Enter=Search&count=" + theSearch.query.itemsPerPage() + "&offset=" + (theSearch.query.neededResults() - theSearch.query.itemsPerPage()) + "&order=" + crypt.simpleEncode(theSearch.query.ranking.toExternalString()) + "&resource=" + resource + "&time=3&deleteref=" + urlhash + "&urlmaskfilter=.*"); + prop.putHTML("content_authorized_recommend_recommendlink", "/yacysearch.html?query=" + theSearch.query.getQueryGoal().getQueryString().replace(' ', '+') + "&Enter=Search&count=" + theSearch.query.itemsPerPage() + "&offset=" + (theSearch.query.neededResults() - theSearch.query.itemsPerPage()) + "&order=" + crypt.simpleEncode(theSearch.query.ranking.toExternalString()) + "&resource=" + resource + "&time=3&recommendref=" + urlhash + "&urlmaskfilter=.*"); prop.put("content_authorized_urlhash", urlhash); final String resulthashString = urlhash; prop.putHTML("content_title", result.title()); @@ -206,18 +206,18 @@ public class yacysearchitem { prop.putHTML("content_publisher", result.publisher()); prop.putHTML("content_creator", result.creator());// author prop.putHTML("content_subject", result.subject()); - final Collection[] query = theSearch.query.queryWords(); - final StringBuilder s = new StringBuilder(query[0].size() * 20); - for (final String t: query[0]) { + final List query = theSearch.query.getQueryGoal().getAllWords(); + final StringBuilder s = new StringBuilder(query.size() * 20); + for (final String t: query) { s.append('+').append(t); } final String words = (s.length() > 0) ? s.substring(1) : ""; prop.putHTML("content_words", words); prop.putHTML("content_showParser_words", words); - prop.putHTML("content_former", theSearch.query.queryString); - prop.putHTML("content_showPictures_former", theSearch.query.queryString); + prop.putHTML("content_former", theSearch.query.getQueryGoal().getQueryString()); + prop.putHTML("content_showPictures_former", theSearch.query.getQueryGoal().getQueryString()); final TextSnippet snippet = result.textSnippet(); - final String desc = (snippet == null) ? "" : snippet.isMarked() ? snippet.getLineRaw() : snippet.getLineMarked(theSearch.query.query_all_hashes); + final String desc = (snippet == null) ? "" : snippet.isMarked() ? snippet.getLineRaw() : snippet.getLineMarked(theSearch.query.getQueryGoal()); prop.put("content_description", desc); prop.putXML("content_description-xml", desc); prop.putJSON("content_description-json", desc); diff --git a/htroot/yacysearchtrailer.java b/htroot/yacysearchtrailer.java index b41e8d054..0f5f993e0 100644 --- a/htroot/yacysearchtrailer.java +++ b/htroot/yacysearchtrailer.java @@ -79,7 +79,7 @@ public class yacysearchtrailer { count = theSearch.namespaceNavigator.get(name); if (count == 0) break; nav = "inurl%3A" + name; - queryStringForUrl = theSearch.query.queryStringForUrl(); + queryStringForUrl = theSearch.query.getQueryGoal().queryStringForUrl(); p = queryStringForUrl.indexOf(nav); if (p < 0) { pos++; @@ -119,7 +119,7 @@ public class yacysearchtrailer { count = hostNavigator.get(name); if (count == 0) break; nav = "site%3A" + name; - queryStringForUrl = theSearch.query.queryStringForUrl(); + queryStringForUrl = theSearch.query.getQueryGoal().queryStringForUrl(); p = queryStringForUrl.indexOf(nav); if (p < 0) { pos++; @@ -158,7 +158,7 @@ public class yacysearchtrailer { count = theSearch.authorNavigator.get(name); if (count == 0) break; nav = (name.indexOf(' ', 0) < 0) ? "author%3A" + name : "author%3A%28" + name.replace(" ", "+") + "%29"; - queryStringForUrl = theSearch.query.queryStringForUrl(); + queryStringForUrl = theSearch.query.getQueryGoal().queryStringForUrl(); p = queryStringForUrl.indexOf(nav); if (p < 0) { pos++; @@ -197,9 +197,9 @@ public class yacysearchtrailer { name = navigatorIterator.next(); count = topicNavigator.get(name); if (count == 0) break; - if (theSearch.query.queryString == null) break; + if (theSearch.query.getQueryGoal().getQueryString() == null) break; if (name != null) { - queryStringForUrl = theSearch.query.queryStringForUrl(); + queryStringForUrl = theSearch.query.getQueryGoal().queryStringForUrl(); prop.put("nav-topics_element_" + i + "_on", 1); prop.put(fileType, "nav-topics_element_" + i + "_modifier", name); prop.put(fileType, "nav-topics_element_" + i + "_name", name); @@ -227,7 +227,7 @@ public class yacysearchtrailer { count = theSearch.protocolNavigator.get(name); if (count == 0) break; nav = "%2F" + name; - queryStringForUrl = theSearch.query.queryStringForUrl(); + queryStringForUrl = theSearch.query.getQueryGoal().queryStringForUrl(); p = queryStringForUrl.indexOf(nav); if (p < 0) { pos++; @@ -266,7 +266,7 @@ public class yacysearchtrailer { count = theSearch.filetypeNavigator.get(name); if (count == 0) break; nav = "filetype%3A" + name; - queryStringForUrl = theSearch.query.queryStringForUrl(); + queryStringForUrl = theSearch.query.getQueryGoal().queryStringForUrl(); p = queryStringForUrl.indexOf(nav); if (p < 0) { pos++; @@ -310,7 +310,7 @@ public class yacysearchtrailer { count = ve.getValue().get(name); if (count == 0) break; nav = "%2Fvocabulary%2F" + navname + "%2F" + MultiProtocolURI.escape(Tagging.encodePrintname(name)).toString(); - queryStringForUrl = theSearch.query.queryStringForUrl(); + queryStringForUrl = theSearch.query.getQueryGoal().queryStringForUrl(); p = queryStringForUrl.indexOf(nav); if (p < 0) { queryStringForUrl += "+" + nav; diff --git a/source/net/yacy/document/parser/html/AbstractScraper.java b/source/net/yacy/document/parser/html/AbstractScraper.java index ff0c404e6..d3bc2ffeb 100644 --- a/source/net/yacy/document/parser/html/AbstractScraper.java +++ b/source/net/yacy/document/parser/html/AbstractScraper.java @@ -77,8 +77,8 @@ public abstract class AbstractScraper implements Scraper { @Override public abstract void scrapeTag1(String tagname, Properties tagopts, char[] text); - protected static String stripAllTags(final char[] s) { - if (!MemoryControl.request(s.length * 2, false)) return ""; + public static String stripAllTags(final char[] s) { + if (s.length > 80 && !MemoryControl.request(s.length * 2, false)) return ""; final StringBuilder r = new StringBuilder(s.length); int bc = 0; for (final char c : s) { @@ -114,10 +114,6 @@ public abstract class AbstractScraper implements Scraper { return sb.toString().trim(); } - public static String stripAll(final char[] s) { - return CharacterCoding.html2unicode(stripAllTags(s)); - } - @Override public void close() { // free resources diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index 5c724c191..502e63da3 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -535,7 +535,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { } private String recursiveParse(final char[] inlineHtml) { - if (inlineHtml.length < 14) return cleanLine(super.stripAll(inlineHtml)); + if (inlineHtml.length < 14) return cleanLine(CharacterCoding.html2unicode(stripAllTags(inlineHtml))); // start a new scraper to parse links inside this text // parsing the content @@ -545,7 +545,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { FileUtils.copy(new CharArrayReader(inlineHtml), writer); } catch (final IOException e) { Log.logException(e); - return cleanLine(super.stripAll(inlineHtml)); + return cleanLine(CharacterCoding.html2unicode(stripAllTags(inlineHtml))); } finally { try { writer.close(); @@ -557,7 +557,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { } this.images.putAll(scraper.images); - String line = cleanLine(super.stripAll(scraper.content.getChars())); + String line = cleanLine(CharacterCoding.html2unicode(stripAllTags(scraper.content.getChars()))); scraper.close(); return line; } diff --git a/source/net/yacy/peers/Protocol.java b/source/net/yacy/peers/Protocol.java index 3916a9d0b..2bfbbf8cd 100644 --- a/source/net/yacy/peers/Protocol.java +++ b/source/net/yacy/peers/Protocol.java @@ -1033,7 +1033,7 @@ public final class Protocol final Seed target, final Blacklist blacklist) { - if (event.query.queryString == null || event.query.queryString.length() == 0) { + if (event.query.getQueryGoal().getQueryString() == null || event.query.getQueryGoal().getQueryString().length() == 0) { return -1; // we cannot query solr only with word hashes, there is no clear text string } event.addExpectedRemoteReferences(count); diff --git a/source/net/yacy/peers/RemoteSearch.java b/source/net/yacy/peers/RemoteSearch.java index b62b620f1..7e0873ae9 100644 --- a/source/net/yacy/peers/RemoteSearch.java +++ b/source/net/yacy/peers/RemoteSearch.java @@ -149,7 +149,7 @@ public class RemoteSearch extends Thread { (clusterselection == null) ? DHTSelection.selectSearchTargets( event.peers, - event.query.query_include_hashes, + event.query.getQueryGoal().getIncludeHashes(), event.peers.redundancy(), burstRobinsonPercent, burstMultiwordPercent) @@ -172,8 +172,8 @@ public class RemoteSearch extends Thread { try { RemoteSearch rs = new RemoteSearch( event, - QueryParams.hashSet2hashString(event.query.query_include_hashes), - QueryParams.hashSet2hashString(event.query.query_exclude_hashes), + QueryParams.hashSet2hashString(event.query.getQueryGoal().getIncludeHashes()), + QueryParams.hashSet2hashString(event.query.getQueryGoal().getExcludeHashes()), event.query.modifier, event.query.targetlang == null ? "" : event.query.targetlang, event.query.nav_sitehash == null ? "" : event.query.nav_sitehash, diff --git a/source/net/yacy/peers/graphics/NetworkGraph.java b/source/net/yacy/peers/graphics/NetworkGraph.java index 87eff83f0..5b6e52112 100644 --- a/source/net/yacy/peers/graphics/NetworkGraph.java +++ b/source/net/yacy/peers/graphics/NetworkGraph.java @@ -45,7 +45,6 @@ import net.yacy.peers.Seed; import net.yacy.peers.SeedDB; import net.yacy.search.Switchboard; import net.yacy.search.SwitchboardConstants; -import net.yacy.search.query.QueryParams; import net.yacy.search.query.SearchEvent; import net.yacy.search.query.SearchEventCache; import net.yacy.visualization.PrintTool; @@ -157,7 +156,7 @@ public class NetworkGraph { */ // draw in the search target - final Iterator i = event.query.query_include_hashes.iterator(); + final Iterator i = event.query.getQueryGoal().getIncludeHashes().iterator(); eventPicture.setColor(RasterPlotter.GREY); while (i.hasNext()) { byte[] wordHash = i.next(); diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index cc9ea86f9..0b5d69478 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -428,7 +428,7 @@ public class Segment { // data during search-time. To transfer indexed data directly to the search process // the following lines push the index data additionally to the search process // this is done only for searched words - if (searchEvent != null && !searchEvent.query.query_exclude_hashes.has(wordhash) && searchEvent.query.query_include_hashes.has(wordhash)) { + if (searchEvent != null && !searchEvent.query.getQueryGoal().getExcludeHashes().has(wordhash) && searchEvent.query.getQueryGoal().getIncludeHashes().has(wordhash)) { // if the page was added in the context of a heuristic this shall ensure that findings will fire directly into the search result ReferenceContainer container; try { diff --git a/source/net/yacy/search/query/AccessTracker.java b/source/net/yacy/search/query/AccessTracker.java index def62139e..6f84a98df 100644 --- a/source/net/yacy/search/query/AccessTracker.java +++ b/source/net/yacy/search/query/AccessTracker.java @@ -67,9 +67,9 @@ public class AccessTracker { private static void add(final LinkedList list, final QueryParams query) { // learn that this word can be a word completion for the DidYouMeanLibrary - if (query.getResultCount() > 10 && query.queryString != null && query.queryString.length() > 0) { - final StringBuilder sb = new StringBuilder(query.queryString); - sb.append(query.queryString); + if (query.getResultCount() > 10 && query.getQueryGoal().getQueryString() != null && query.getQueryGoal().getQueryString().length() > 0) { + final StringBuilder sb = new StringBuilder(query.getQueryGoal().getQueryString()); + sb.append(query.getQueryGoal().getQueryString()); WordCache.learn(sb); } @@ -108,8 +108,8 @@ public class AccessTracker { } private static void addToDump(final QueryParams query) { - if (query.queryString == null || query.queryString.isEmpty()) return; - addToDump(query.queryString, Integer.toString(query.getResultCount()), new Date(query.starttime)); + if (query.getQueryGoal().getQueryString() == null || query.getQueryGoal().getQueryString().isEmpty()) return; + addToDump(query.getQueryGoal().getQueryString(), Integer.toString(query.getResultCount()), new Date(query.starttime)); } public static void addToDump(String querystring, String resultcount) { diff --git a/source/net/yacy/search/query/QueryGoal.java b/source/net/yacy/search/query/QueryGoal.java new file mode 100644 index 000000000..922716531 --- /dev/null +++ b/source/net/yacy/search/query/QueryGoal.java @@ -0,0 +1,230 @@ +/** + * QueryGoal + * Copyright 2012 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany + * First published 16.11.2005 on http://yacy.net + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . + */ + + +package net.yacy.search.query; + +import java.io.UnsupportedEncodingException; +import java.net.URLEncoder; +import java.util.ArrayList; +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.SortedSet; + +import net.yacy.cora.document.UTF8; +import net.yacy.cora.federate.solr.YaCySchema; +import net.yacy.cora.order.Base64Order; +import net.yacy.cora.storage.HandleSet; +import net.yacy.cora.util.SpaceExceededException; +import net.yacy.document.Condenser; +import net.yacy.document.parser.html.AbstractScraper; +import net.yacy.document.parser.html.CharacterCoding; +import net.yacy.kelondro.data.word.Word; +import net.yacy.kelondro.data.word.WordReferenceRow; +import net.yacy.kelondro.index.RowHandleSet; +import net.yacy.kelondro.logging.Log; +import net.yacy.search.index.Segment; +import net.yacy.search.index.SolrConfiguration; + +public class QueryGoal { + + private static String seps = "'.,/&_"; static {seps += '"';} + + private String querystring; + private HandleSet include_hashes, exclude_hashes, all_hashes; + private final ArrayList include_words, exclude_words, all_words; + + public QueryGoal(HandleSet include_hashes, HandleSet exclude_hashes, HandleSet all_hashes) { + this.querystring = null; + this.include_words = null; + this.exclude_words = null; + this.all_words = null; + this.include_hashes = include_hashes; + this.exclude_hashes = exclude_hashes; + this.all_hashes = all_hashes; + } + public QueryGoal(String querystring) { + this.querystring = querystring; + this.include_words = new ArrayList(); + this.exclude_words = new ArrayList(); + this.all_words = new ArrayList(); + byte[] queryHash; + if ((querystring.length() == 12) && (Base64Order.enhancedCoder.wellformed(queryHash = UTF8.getBytes(querystring)))) { + this.querystring = null; + this.include_hashes = new RowHandleSet(WordReferenceRow.urlEntryRow.primaryKeyLength, WordReferenceRow.urlEntryRow.objectOrder, 0); + this.exclude_hashes = new RowHandleSet(WordReferenceRow.urlEntryRow.primaryKeyLength, WordReferenceRow.urlEntryRow.objectOrder, 0); + this.all_hashes = new RowHandleSet(WordReferenceRow.urlEntryRow.primaryKeyLength, WordReferenceRow.urlEntryRow.objectOrder, 0); + try { + this.include_hashes.put(queryHash); + this.all_hashes.put(queryHash); + } catch (final SpaceExceededException e) { + Log.logException(e); + } + } else if ((querystring != null) && (!querystring.isEmpty())) { + + // remove funny symbols + querystring = CharacterCoding.html2unicode(AbstractScraper.stripAllTags(querystring.toCharArray())).toLowerCase().trim(); + int c; + for (int i = 0; i < seps.length(); i++) { + while ((c = querystring.indexOf(seps.charAt(i))) >= 0) { + querystring = querystring.substring(0, c) + (((c + 1) < querystring.length()) ? (' ' + querystring.substring(c + 1)) : ""); + } + } + + String s; + int l; + // the string is clean now, but we must generate a set out of it + final String[] queries = querystring.split(" "); + for (String quer : queries) { + if (quer.startsWith("-")) { + String x = quer.substring(1); + if (!exclude_words.contains(x)) exclude_words.add(x); + } else { + while ((c = quer.indexOf('-')) >= 0) { + s = quer.substring(0, c); + l = s.length(); + if (l >= Condenser.wordminsize && !include_words.contains(s)) {include_words.add(s);} + if (l > 0 && !all_words.contains(s)) {all_words.add(s);} + quer = quer.substring(c + 1); + } + l = quer.length(); + if (l >= Condenser.wordminsize && !include_words.contains(quer)) {include_words.add(quer);} + if (l > 0 && !all_words.contains(quer)) {all_words.add(quer);} + } + } + } + + this.include_hashes = null; + this.exclude_hashes = null; + this.all_hashes = null; + } + + public String getQueryString() { + return this.querystring; + } + + public String queryStringForUrl() { + try { + return URLEncoder.encode(this.querystring, "UTF-8"); + } catch (final UnsupportedEncodingException e) { + Log.logException(e); + return this.querystring; + } + } + + public HandleSet getIncludeHashes() { + if (include_hashes == null) include_hashes = Word.words2hashesHandles(include_words); + return include_hashes; + } + + public HandleSet getExcludeHashes() { + if (exclude_hashes == null) exclude_hashes = Word.words2hashesHandles(exclude_words); + return exclude_hashes; + } + + public HandleSet getAllHashes() { + if (all_hashes == null) all_hashes = Word.words2hashesHandles(all_words); + return all_hashes; + } + + public ArrayList getIncludeWords() { + return include_words; + } + + public ArrayList getExcludeWords() { + return exclude_words; + } + + public ArrayList getAllWords() { + return all_words; + } + + public void filterOut(final SortedSet blueList) { + // filter out words that appear in this set + // this is applied to the queryHashes + final HandleSet blues = Word.words2hashesHandles(blueList); + for (final byte[] b: blues) this.include_hashes.remove(b); + } + + private final static YaCySchema[] fields = new YaCySchema[]{ + YaCySchema.sku,YaCySchema.title,YaCySchema.h1_txt,YaCySchema.h2_txt, + YaCySchema.author,YaCySchema.description,YaCySchema.keywords,YaCySchema.text_t,YaCySchema.synonyms_sxt + }; + + private final static Map boosts = new LinkedHashMap(); + static { + boosts.put(YaCySchema.sku, 20.0f); + boosts.put(YaCySchema.url_paths_sxt, 20.0f); + boosts.put(YaCySchema.title, 15.0f); + boosts.put(YaCySchema.h1_txt, 11.0f); + boosts.put(YaCySchema.h2_txt, 10.0f); + boosts.put(YaCySchema.author, 8.0f); + boosts.put(YaCySchema.description, 5.0f); + boosts.put(YaCySchema.keywords, 2.0f); + boosts.put(YaCySchema.text_t, 1.0f); + } + + public StringBuilder solrQueryString(SolrConfiguration configuration) { + final StringBuilder q = new StringBuilder(80); + + // parse special requests + if (include_words.size() == 1 && exclude_words.size() == 0) { + String w = include_words.get(0); + if (Segment.catchallString.equals(w)) return new StringBuilder("*:*"); + } + + // add text query + int wc = 0; + StringBuilder w = new StringBuilder(80); + for (String s: include_words) { + if (wc > 0) w.append(" AND "); + w.append(s); + wc++; + } + for (String s: exclude_words){ + if (wc > 0) w.append(" AND -"); + w.append(s); + wc++; + } + if (wc > 1) {w.insert(0, '('); w.append(')');} + + // combine these queries for all relevant fields + wc = 0; + Float boost; + for (YaCySchema field: fields) { + if (configuration != null && !configuration.contains(field.getSolrFieldName())) continue; + if (wc > 0) q.append(" OR "); + q.append('('); + q.append(field.getSolrFieldName()).append(':').append(w); + boost = boosts.get(field); + if (boost != null) q.append('^').append(boost.toString()); + q.append(')'); + wc++; + } + q.insert(0, '('); + q.append(')'); + + // add filter to prevent that results come from failed urls + q.append(" AND -").append(YaCySchema.failreason_t.getSolrFieldName()).append(":[* TO *]"); + + return q; + } + +} diff --git a/source/net/yacy/search/query/QueryParams.java b/source/net/yacy/search/query/QueryParams.java index 6c914c776..5bd1e6875 100644 --- a/source/net/yacy/search/query/QueryParams.java +++ b/source/net/yacy/search/query/QueryParams.java @@ -26,15 +26,11 @@ package net.yacy.search.query; -import java.io.UnsupportedEncodingException; -import java.net.URLEncoder; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.Iterator; -import java.util.LinkedHashMap; -import java.util.List; import java.util.Map; import java.util.Set; import java.util.SortedSet; @@ -50,7 +46,6 @@ import net.yacy.cora.document.ASCII; import net.yacy.cora.document.Classification; import net.yacy.cora.document.Classification.ContentDomain; import net.yacy.cora.document.MultiProtocolURI; -import net.yacy.cora.document.UTF8; import net.yacy.cora.federate.solr.YaCySchema; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.geo.GeoLocation; @@ -59,7 +54,6 @@ import net.yacy.cora.order.Base64Order; import net.yacy.cora.storage.HandleSet; import net.yacy.cora.util.SpaceExceededException; import net.yacy.document.Condenser; -import net.yacy.document.parser.html.AbstractScraper; import net.yacy.document.parser.html.CharacterCoding; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataRow; @@ -71,7 +65,6 @@ import net.yacy.kelondro.util.Bitfield; import net.yacy.kelondro.util.SetTools; import net.yacy.peers.Seed; import net.yacy.search.index.Segment; -import net.yacy.search.index.SolrConfiguration; import net.yacy.search.ranking.RankingProfile; public final class QueryParams { @@ -113,9 +106,7 @@ public final class QueryParams { public static final Pattern catchall_pattern = Pattern.compile(".*"); private static final Pattern matchnothing_pattern = Pattern.compile(""); - public final String queryString; - public final HandleSet query_include_hashes, query_exclude_hashes, query_all_hashes; - private final List query_include_words, query_exclude_words, query_all_words; + public final QueryGoal queryGoal; public int itemsPerPage; public int offset; public final Pattern urlMask, prefer; @@ -167,31 +158,7 @@ public final class QueryParams { final Segment indexSegment, final RankingProfile ranking, final String userAgent) { - byte[] queryHash; - if ((queryString.length() == 12) && (Base64Order.enhancedCoder.wellformed(queryHash = UTF8.getBytes(queryString)))) { - this.queryString = null; - this.query_include_words = null; - this.query_exclude_words = null; - this.query_all_words = null; - this.query_include_hashes = new RowHandleSet(WordReferenceRow.urlEntryRow.primaryKeyLength, WordReferenceRow.urlEntryRow.objectOrder, 0); - this.query_exclude_hashes = new RowHandleSet(WordReferenceRow.urlEntryRow.primaryKeyLength, WordReferenceRow.urlEntryRow.objectOrder, 0); - this.query_all_hashes = new RowHandleSet(WordReferenceRow.urlEntryRow.primaryKeyLength, WordReferenceRow.urlEntryRow.objectOrder, 0); - try { - this.query_include_hashes.put(queryHash); - this.query_all_hashes.put(queryHash); - } catch (final SpaceExceededException e) { - Log.logException(e); - } - } else { - this.queryString = queryString; - final List[] cq = cleanQuery(queryString); - this.query_include_words = cq[0]; - this.query_exclude_words = cq[1]; - this.query_all_words = cq[2]; - this.query_include_hashes = Word.words2hashesHandles(cq[0]); - this.query_exclude_hashes = Word.words2hashesHandles(cq[1]); - this.query_all_hashes = Word.words2hashesHandles(cq[2]); - } + this.queryGoal = new QueryGoal(queryString); this.ranking = ranking; this.modifier = new Modifier(""); this.maxDistance = Integer.MAX_VALUE; @@ -237,13 +204,7 @@ public final class QueryParams { } public QueryParams( - final String queryString, - final List queryWords, - final List excludeWords, - final List fullqueryWords, - final HandleSet queryHashes, - final HandleSet excludeHashes, - final HandleSet fullqueryHashes, + final QueryGoal queryGoal, final String modifier, final int maxDistance, final String prefer, final ContentDomain contentdom, final String language, @@ -264,14 +225,7 @@ public final class QueryParams { final String userAgent, final boolean filterfailurls, final double lat, final double lon, final double radius) { - - this.queryString = queryString; - this.query_include_words = queryWords; - this.query_exclude_words = excludeWords; - this.query_all_words = fullqueryWords; - this.query_include_hashes = queryHashes; - this.query_exclude_hashes = excludeHashes; - this.query_all_hashes = fullqueryHashes; + this.queryGoal = queryGoal; this.modifier = new Modifier(modifier == null ? "" : modifier); this.ranking = ranking; this.maxDistance = maxDistance; @@ -429,8 +383,8 @@ public final class QueryParams { private final boolean matchesText(final String text) { boolean ret = false; final HandleSet wordhashes = Word.words2hashesHandles(Condenser.getWords(text, null).keySet()); - if (!SetTools.anymatch(wordhashes, this.query_exclude_hashes)) { - ret = SetTools.totalInclusion(this.query_include_hashes, wordhashes); + if (!SetTools.anymatch(wordhashes, this.queryGoal.getExcludeHashes())) { + ret = SetTools.totalInclusion(this.queryGoal.getIncludeHashes(), wordhashes); } return ret; } @@ -443,83 +397,21 @@ public final class QueryParams { return SetTools.anymatch(wordhashes, keyhashes); } - private static String seps = "'.,/&_"; static {seps += '"';} - - @SuppressWarnings("unchecked") - public static List[] cleanQuery(String querystring) { - // returns three sets: a query set, an exclude set and a full query set - final List query_include_words = new ArrayList(); - final List query_exclude_words = new ArrayList(); - final List query_all_words = new ArrayList(); - - if ((querystring != null) && (!querystring.isEmpty())) { - - // convert Umlaute - querystring = AbstractScraper.stripAll(querystring.toCharArray()).toLowerCase().trim(); - int c; - for (int i = 0; i < seps.length(); i++) { - while ((c = querystring.indexOf(seps.charAt(i))) >= 0) { - querystring = querystring.substring(0, c) + (((c + 1) < querystring.length()) ? (" " + querystring.substring(c + 1)) : ""); - } - } - - String s; - int l; - // the string is clean now, but we must generate a set out of it - final String[] queries = querystring.split(" "); - for (String quer : queries) { - if (quer.startsWith("-")) { - String x = quer.substring(1); - if (!query_exclude_words.contains(x)) query_exclude_words.add(x); - } else { - while ((c = quer.indexOf('-')) >= 0) { - s = quer.substring(0, c); - l = s.length(); - if (l >= Condenser.wordminsize && !query_include_words.contains(s)) {query_include_words.add(s);} - if (l > 0 && !query_all_words.contains(s)) {query_all_words.add(s);} - quer = quer.substring(c + 1); - } - l = quer.length(); - if (l >= Condenser.wordminsize && !query_include_words.contains(quer)) {query_include_words.add(quer);} - if (l > 0 && !query_all_words.contains(quer)) {query_all_words.add(quer);} - } - } - } - return new List[]{query_include_words, query_exclude_words, query_all_words}; - } - public String queryString(final boolean encodeHTML) { final String ret; if (encodeHTML){ - ret = CharacterCoding.unicode2html(this.queryString, true); + ret = CharacterCoding.unicode2html(this.queryGoal.getQueryString(), true); } else { - ret = this.queryString; + ret = this.queryGoal.getQueryString(); } return ret; } - private final static YaCySchema[] fields = new YaCySchema[]{ - YaCySchema.sku,YaCySchema.title,YaCySchema.h1_txt,YaCySchema.h2_txt, - YaCySchema.author,YaCySchema.description,YaCySchema.keywords,YaCySchema.text_t,YaCySchema.synonyms_sxt - }; - - private final static Map boosts = new LinkedHashMap(); - static { - boosts.put(YaCySchema.sku, 20.0f); - boosts.put(YaCySchema.url_paths_sxt, 20.0f); - boosts.put(YaCySchema.title, 15.0f); - boosts.put(YaCySchema.h1_txt, 11.0f); - boosts.put(YaCySchema.h2_txt, 10.0f); - boosts.put(YaCySchema.author, 8.0f); - boosts.put(YaCySchema.description, 5.0f); - boosts.put(YaCySchema.keywords, 2.0f); - boosts.put(YaCySchema.text_t, 1.0f); - } public SolrQuery solrQuery() { - if (this.query_include_words == null || this.query_include_words.size() == 0) return null; + if (this.queryGoal.getIncludeWords().size() == 0) return null; // get text query - final StringBuilder q = solrQueryString(this.query_include_words, this.query_exclude_words, this.indexSegment.fulltext().getSolrScheme()); + final StringBuilder q = this.queryGoal.solrQueryString(this.indexSegment.fulltext().getSolrScheme()); // add constraints if (this.nav_sitehash == null && this.nav_sitehost == null) { @@ -591,74 +483,11 @@ public final class QueryParams { Log.logInfo("Protocol", "SOLR QUERY: " + params.toString()); return params; } - - public static StringBuilder solrQueryString(List include, List exclude, SolrConfiguration configuration) { - final StringBuilder q = new StringBuilder(80); - - // parse special requests - if (include.size() == 1 && exclude.size() == 0) { - String w = include.get(0); - if (Segment.catchallString.equals(w)) return new StringBuilder("*:*"); - } - - // add text query - int wc = 0; - StringBuilder w = new StringBuilder(80); - for (String s: include) { - if (wc > 0) w.append(" AND "); - w.append(s); - wc++; - } - for (String s: exclude){ - if (wc > 0) w.append(" AND -"); - w.append(s); - wc++; - } - if (wc > 1) {w.insert(0, '('); w.append(')');} - - // combine these queries for all relevant fields - wc = 0; - Float boost; - for (YaCySchema field: fields) { - if (configuration != null && !configuration.contains(field.getSolrFieldName())) continue; - if (wc > 0) q.append(" OR "); - q.append('('); - q.append(field.getSolrFieldName()).append(':').append(w); - boost = boosts.get(field); - if (boost != null) q.append('^').append(boost.toString()); - q.append(')'); - wc++; - } - q.insert(0, '('); - q.append(')'); - - // add filter to prevent that results come from failed urls - q.append(" AND -").append(YaCySchema.failreason_t.getSolrFieldName()).append(":[* TO *]"); - return q; + public QueryGoal getQueryGoal() { + return this.queryGoal; } - public String queryStringForUrl() { - try { - return URLEncoder.encode(this.queryString, "UTF-8"); - } catch (final UnsupportedEncodingException e) { - Log.logException(e); - return this.queryString; - } - } - - public List[] queryWords() { - return cleanQuery(this.queryString); - } - - public void filterOut(final SortedSet blueList) { - // filter out words that appear in this set - // this is applied to the queryHashes - final HandleSet blues = Word.words2hashesHandles(blueList); - for (final byte[] b: blues) this.query_include_hashes.remove(b); - } - - public final Map separateMatches(final Map links) { final Map matcher = new HashMap(); final Iterator > i = links.entrySet().iterator(); @@ -695,13 +524,13 @@ public final class QueryParams { // generate a string that identifies a search so results can be re-used in a cache final StringBuilder context = new StringBuilder(180); if (anonymized) { - context.append(anonymizedQueryHashes(this.query_include_hashes)); + context.append(anonymizedQueryHashes(this.queryGoal.getIncludeHashes())); context.append('-'); - context.append(anonymizedQueryHashes(this.query_exclude_hashes)); + context.append(anonymizedQueryHashes(this.queryGoal.getExcludeHashes())); } else { - context.append(hashSet2hashString(this.query_include_hashes)); + context.append(hashSet2hashString(this.queryGoal.getIncludeHashes())); context.append('-'); - context.append(hashSet2hashString(this.query_exclude_hashes)); + context.append(hashSet2hashString(this.queryGoal.getExcludeHashes())); } //context.append(asterisk); //context.append(this.domType); @@ -755,7 +584,7 @@ public final class QueryParams { sb.append("/yacysearch."); sb.append(ext); sb.append("?query="); - sb.append(newQueryString == null ? theQuery.queryStringForUrl() : newQueryString); + sb.append(newQueryString == null ? theQuery.getQueryGoal().queryStringForUrl() : newQueryString); sb.append(ampersand); sb.append("maximumRecords="); @@ -786,7 +615,7 @@ public final class QueryParams { sb.append(ampersand); sb.append("former="); - sb.append(theQuery.queryStringForUrl()); + sb.append(theQuery.getQueryGoal().queryStringForUrl()); return sb; } diff --git a/source/net/yacy/search/query/RankingProcess.java b/source/net/yacy/search/query/RankingProcess.java index 746181b81..a519be058 100644 --- a/source/net/yacy/search/query/RankingProcess.java +++ b/source/net/yacy/search/query/RankingProcess.java @@ -222,8 +222,8 @@ public final class RankingProcess extends Thread { .getSegment() .termIndex() .query( - this.query.query_include_hashes, - this.query.query_exclude_hashes, + this.query.getQueryGoal().getIncludeHashes(), + this.query.getQueryGoal().getExcludeHashes(), null, Segment.wordReferenceFactory, this.query.maxDistance); @@ -234,7 +234,7 @@ public final class RankingProcess extends Thread { new ProfilingGraph.EventSearch( this.query.id(true), SearchEventType.JOIN, - this.query.queryString, + this.query.getQueryGoal().getQueryString(), index.size(), System.currentTimeMillis() - timer), false); @@ -488,7 +488,7 @@ public final class RankingProcess extends Thread { if ( word.length() > 2 && "http_html_php_ftp_www_com_org_net_gov_edu_index_home_page_for_usage_the_and_zum_der_die_das_und_the_zur_bzw_mit_blog_wiki_aus_bei_off" .indexOf(word) < 0 - && !this.query.query_include_hashes.has(Word.word2hash(word)) + && !this.query.getQueryGoal().getIncludeHashes().has(Word.word2hash(word)) && lettermatch.matcher(word).matches() && !Switchboard.badwords.contains(word) && !Switchboard.stopwords.contains(word) ) { diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java index c2a5d39f3..60a85b3cd 100644 --- a/source/net/yacy/search/query/SearchEvent.java +++ b/source/net/yacy/search/query/SearchEvent.java @@ -151,7 +151,7 @@ public final class SearchEvent { this.snippets = new ConcurrentHashMap(); this.secondarySearchSuperviser = - (this.query.query_include_hashes.size() > 1) ? new SecondarySearchSuperviser(this) : null; // generate abstracts only for combined searches + (this.query.getQueryGoal().getIncludeHashes().size() > 1) ? new SecondarySearchSuperviser(this) : null; // generate abstracts only for combined searches if ( this.secondarySearchSuperviser != null ) { this.secondarySearchSuperviser.start(); } @@ -184,7 +184,7 @@ public final class SearchEvent { if (this.remote) { // start global searches final long timer = System.currentTimeMillis(); - if (this.query.query_include_hashes.isEmpty()) { + if (this.query.getQueryGoal().getIncludeHashes().isEmpty()) { this.primarySearchThreadsL = null; } else { this.primarySearchThreadsL = new ArrayList(); @@ -281,12 +281,12 @@ public final class SearchEvent { // only with the query minus the stopwords which had not been used for the search HandleSet filtered; try { - filtered = RowHandleSet.joinConstructive(query.query_include_hashes, Switchboard.stopwordHashes); + filtered = RowHandleSet.joinConstructive(query.getQueryGoal().getIncludeHashes(), Switchboard.stopwordHashes); } catch (final SpaceExceededException e) { Log.logException(e); - filtered = new RowHandleSet(query.query_include_hashes.keylen(), query.query_include_hashes.comparator(), 0); + filtered = new RowHandleSet(query.getQueryGoal().getIncludeHashes().keylen(), query.getQueryGoal().getIncludeHashes().comparator(), 0); } - this.snippetFetchWordHashes = query.query_include_hashes.clone(); + this.snippetFetchWordHashes = query.getQueryGoal().getIncludeHashes().clone(); if (filtered != null && !filtered.isEmpty()) { this.snippetFetchWordHashes.excludeDestructive(Switchboard.stopwordHashes); } @@ -763,17 +763,17 @@ public final class SearchEvent { final String pagetitle = page.dc_title().toLowerCase(); // check exclusion - if ( this.query.query_exclude_hashes != null && !this.query.query_exclude_hashes.isEmpty() && - ((QueryParams.anymatch(pagetitle, this.query.query_exclude_hashes)) - || (QueryParams.anymatch(pageurl.toLowerCase(), this.query.query_exclude_hashes)) - || (QueryParams.anymatch(pageauthor.toLowerCase(), this.query.query_exclude_hashes)))) { + if ( !this.query.getQueryGoal().getExcludeHashes().isEmpty() && + ((QueryParams.anymatch(pagetitle, this.query.getQueryGoal().getExcludeHashes())) + || (QueryParams.anymatch(pageurl.toLowerCase(), this.query.getQueryGoal().getExcludeHashes())) + || (QueryParams.anymatch(pageauthor.toLowerCase(), this.query.getQueryGoal().getExcludeHashes())))) { this.query.misses.add(page.hash()); continue; } // check index-of constraint if ((this.query.constraint != null) && (this.query.constraint.get(Condenser.flag_cat_indexof)) && (!(pagetitle.startsWith("index of")))) { - final Iterator wi = this.query.query_include_hashes.iterator(); + final Iterator wi = this.query.getQueryGoal().getIncludeHashes().iterator(); while ( wi.hasNext() ) { this.query.getSegment().termIndex().removeDelayed(wi.next(), page.hash()); } diff --git a/source/net/yacy/search/query/SecondarySearchSuperviser.java b/source/net/yacy/search/query/SecondarySearchSuperviser.java index 1e651fb01..af613faf2 100644 --- a/source/net/yacy/search/query/SecondarySearchSuperviser.java +++ b/source/net/yacy/search/query/SecondarySearchSuperviser.java @@ -109,7 +109,7 @@ public class SecondarySearchSuperviser extends Thread { } private void prepareSecondarySearch() { - if ( this.abstractsCache == null || this.abstractsCache.size() != this.searchEvent.query.query_include_hashes.size() ) { + if ( this.abstractsCache == null || this.abstractsCache.size() != this.searchEvent.query.getQueryGoal().getIncludeHashes().size() ) { return; // secondary search not possible (yet) } @@ -122,7 +122,7 @@ public class SecondarySearchSuperviser extends Thread { */ // find out if there are enough references for all words that are searched - if ( this.abstractsCache.size() != this.searchEvent.query.query_include_hashes.size() ) { + if ( this.abstractsCache.size() != this.searchEvent.query.getQueryGoal().getIncludeHashes().size() ) { return; } diff --git a/source/net/yacy/search/query/SnippetWorker.java b/source/net/yacy/search/query/SnippetWorker.java index fe4b102d1..a759689d9 100644 --- a/source/net/yacy/search/query/SnippetWorker.java +++ b/source/net/yacy/search/query/SnippetWorker.java @@ -179,7 +179,7 @@ public class SnippetWorker extends Thread { // apply query-in-result matching final HandleSet urlcomph = Word.words2hashesHandles(urlcomps); final HandleSet descrcomph = Word.words2hashesHandles(descrcomps); - final Iterator shi = this.snippetProcess.query.query_include_hashes.iterator(); + final Iterator shi = this.snippetProcess.query.getQueryGoal().getIncludeHashes().iterator(); byte[] queryhash; while (shi.hasNext()) { queryhash = shi.next(); @@ -252,7 +252,7 @@ public class SnippetWorker extends Thread { } final String reason = "no text snippet; errorCode = " + snippet.getErrorCode(); if (this.snippetProcess.deleteIfSnippetFail) { - this.snippetProcess.workTables.failURLsRegisterMissingWord(this.snippetProcess.query.getSegment().termIndex(), page.url(), this.snippetProcess.query.query_include_hashes, reason); + this.snippetProcess.workTables.failURLsRegisterMissingWord(this.snippetProcess.query.getSegment().termIndex(), page.url(), this.snippetProcess.query.getQueryGoal().getIncludeHashes(), reason); } SearchEvent.log.logInfo("sorted out url " + page.url().toNormalform(true) + " during search: " + reason); return null; diff --git a/source/net/yacy/search/snippet/TextSnippet.java b/source/net/yacy/search/snippet/TextSnippet.java index 8c1899941..ab1967a47 100644 --- a/source/net/yacy/search/snippet/TextSnippet.java +++ b/source/net/yacy/search/snippet/TextSnippet.java @@ -56,6 +56,7 @@ import net.yacy.peers.RemoteSearch; import net.yacy.repository.Blacklist.BlacklistType; import net.yacy.repository.LoaderDispatcher; import net.yacy.search.Switchboard; +import net.yacy.search.query.QueryGoal; public class TextSnippet implements Comparable, Comparator { @@ -380,7 +381,8 @@ public class TextSnippet implements Comparable, Comparator