From d44d8996d03ecec0e3c78fb54ab39ae22caef7c1 Mon Sep 17 00:00:00 2001 From: reger Date: Sun, 4 Jan 2015 11:10:45 +0100 Subject: [PATCH] =?UTF-8?q?Added=20a=20=E2=80=9Cdon't=20store=20remote=20s?= =?UTF-8?q?earch=20results=E2=80=9D=20option=20This=20is=20intended=20for?= =?UTF-8?q?=20peers=20who=20want=20to=20participate=20in=20the=20P2P=20net?= =?UTF-8?q?work=20but=20don't=20wish=20to=20load/fill-up=20their=20index?= =?UTF-8?q?=20with=20metadata=20of=20every=20received=20search=20result.?= =?UTF-8?q?=20The=20DHT=20transfer=20is=20not=20affected=20by=20this=20opt?= =?UTF-8?q?ion=20(and=20will=20work=20as=20usual,=20so=20that=20a=20peer?= =?UTF-8?q?=20disabling=20the=20new=20store=20to=20index=20switch=20still?= =?UTF-8?q?=20receives=20and=20holds=20the=20metadata=20according=20to=20D?= =?UTF-8?q?HT=20rules).=20Downside=20for=20the=20local=20peer=20is=20that?= =?UTF-8?q?=20search=20speed=20will=20not=20improve=20if=20search=20terms?= =?UTF-8?q?=20are=20only=20avail.=20remote=20or=20by=20quick=20hits=20in?= =?UTF-8?q?=20local=20index.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit To be able to improve the local index a Click-Servlet option was added additionally. If switched on, all search result links point to this servlet, which forwards the user's browser (by html header) to the desired page and feeds the page to the fulltext-index. 
The servlet accepts a parameter defining the action to perform (see defaults/web.xml, index, crawl, crawllinks) The option check-boxes are placed in ConfigPortal.html --- defaults/web.xml | 16 ++ defaults/yacy.init | 3 + htroot/ConfigPortal.html | 26 +++- htroot/ConfigPortal.java | 10 ++ htroot/yacysearchitem.java | 2 + .../net/yacy/http/servlets/ClickServlet.java | 144 ++++++++++++++++++ source/net/yacy/peers/Protocol.java | 61 +++++--- .../net/yacy/search/SwitchboardConstants.java | 4 + source/net/yacy/search/query/SearchEvent.java | 5 +- .../yacy/search/query/SearchEventCache.java | 3 +- 10 files changed, 248 insertions(+), 26 deletions(-) create mode 100644 source/net/yacy/http/servlets/ClickServlet.java diff --git a/defaults/web.xml b/defaults/web.xml index a13e090aa..91d8323da 100644 --- a/defaults/web.xml +++ b/defaults/web.xml @@ -55,7 +55,23 @@ YaCy stop proxy + + ClickServlet + net.yacy.http.servlets.ClickServlet + + clickaction + index + defines the action to perform with supplied url + + + + + + + ClickServlet + /click + SolrSelectServlet diff --git a/defaults/yacy.init b/defaults/yacy.init index 14d1b426d..626eadf6a 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -826,6 +826,8 @@ search.result.show.proxy = false search.result.show.hostbrowser = true search.result.show.vocabulary = false +search.result.useclickservlet = true + # search navigators: comma-separated list of default values for search navigation. # can be temporary different if search string is given with differen navigation values # assigning no value(s) means that no navigation is shown @@ -857,6 +859,7 @@ search.verify.delete = true # remote search details remotesearch.maxcount = 10 remotesearch.maxtime = 3000 +remotesearch.result.store=true # specifies if yacy should set it's own referer if no referer URL # was set by the client. 
diff --git a/htroot/ConfigPortal.html b/htroot/ConfigPortal.html index c5236e50a..b2f7bcd0a 100644 --- a/htroot/ConfigPortal.html +++ b/htroot/ConfigPortal.html @@ -51,19 +51,31 @@
Snippet Fetch Strategy & Link Verification
ideaSpeed up search results with this option! (use CACHEONLY or FALSE to switch off verification)
- NOCACHE: no use of web cache, load all snippets online
- IFFRESH: use the cache if the cache exists and is fresh otherwise load online
- IFEXIST: use the cache if the cache exist or load online
- If verification fails, delete index reference

- CACHEONLY: never go online, use all content from cache. If no cache entry exist, consider content nevertheless as available and show result without snippet
- FALSE: no link verification and not snippet generation: all search results are valid without verification + NOCACHE: no use of web cache, load all snippets online
+ IFFRESH: use the cache if the cache exists and is fresh otherwise load online
+ IFEXIST: use the cache if the cache exist or load online
+ If verification fails, delete index reference

+ CACHEONLY: never go online, use all content from cache. If no cache entry exist, consider content nevertheless as available and show result without snippet
+ FALSE: no link verification and not snippet generation: all search results are valid without verification
Greedy Learning Mode
- load documents linked in search results, will be deactivated automatically when index size > #[greedylearning.limit.doccount]# (see Heuristics: search-result to use this permanent) + load documents linked in search results, will be deactivated automatically when index size > #[greedylearning.limit.doccount]# (see Heuristics: search-result to use this permanent)
+
Index remote results
+
+ add remote search results to the local index ( default=on, it is recommended to enable this option ! ) +
+ + #(remotesearch.result.store)# +
Use Click-Servlet for search result links
+
+ allows YaCy to perform some actions if user clicks on a search result (by default add the clicked link to the index) +
+ ::#(/remotesearch.result.store)# +
Default Pop-Up Page
Status Page  diff --git a/htroot/ConfigPortal.java b/htroot/ConfigPortal.java index e450b65a7..2c618c730 100644 --- a/htroot/ConfigPortal.java +++ b/htroot/ConfigPortal.java @@ -90,6 +90,11 @@ public class ConfigPortal { sb.setConfig("search.options", post.getBoolean("search.options")); sb.setConfig(SwitchboardConstants.GREEDYLEARNING_ACTIVE, post.getBoolean(SwitchboardConstants.GREEDYLEARNING_ACTIVE)); + + final boolean storeresult = post.getBoolean(SwitchboardConstants.REMOTESEARCH_RESULT_STORE); + sb.setConfig(SwitchboardConstants.REMOTESEARCH_RESULT_STORE, storeresult); + // click servlet only meaningful if result not stored (currently) + sb.setConfig(SwitchboardConstants.SEARCH_USECLICKSERVLET, !storeresult && post.getBoolean(SwitchboardConstants.SEARCH_USECLICKSERVLET)); sb.setConfig(SwitchboardConstants.SEARCH_VERIFY, post.get("search.verify", "ifexist")); sb.setConfig(SwitchboardConstants.SEARCH_VERIFY_DELETE, post.getBoolean("search.verify.delete")); @@ -144,6 +149,8 @@ public class ConfigPortal { sb.setConfig("search.navigation", config.getProperty("search.navigation","hosts,authors,namespace,topics")); sb.setConfig("search.options", config.getProperty("search.options","true")); sb.setConfig(SwitchboardConstants.GREEDYLEARNING_ACTIVE, config.getProperty(SwitchboardConstants.GREEDYLEARNING_ACTIVE)); + sb.setConfig(SwitchboardConstants.REMOTESEARCH_RESULT_STORE, config.getProperty(SwitchboardConstants.REMOTESEARCH_RESULT_STORE)); + sb.setConfig(SwitchboardConstants.REMOTESEARCH_RESULT_STORE+"_"+SwitchboardConstants.SEARCH_USECLICKSERVLET, config.getProperty(SwitchboardConstants.SEARCH_USECLICKSERVLET)); sb.setConfig(SwitchboardConstants.SEARCH_VERIFY, config.getProperty(SwitchboardConstants.SEARCH_VERIFY,"iffresh")); sb.setConfig(SwitchboardConstants.SEARCH_VERIFY_DELETE, config.getProperty(SwitchboardConstants.SEARCH_VERIFY_DELETE,"true")); sb.setConfig("about.headline", config.getProperty("about.headline","")); @@ -165,6 +172,9 @@ public class 
ConfigPortal { prop.put(SwitchboardConstants.GREEDYLEARNING_ACTIVE, sb.getConfigBool(SwitchboardConstants.GREEDYLEARNING_ACTIVE, false) ? 1 : 0); prop.put(SwitchboardConstants.GREEDYLEARNING_LIMIT_DOCCOUNT, sb.getConfig(SwitchboardConstants.GREEDYLEARNING_LIMIT_DOCCOUNT, "0")); + prop.put(SwitchboardConstants.REMOTESEARCH_RESULT_STORE, sb.getConfigBool(SwitchboardConstants.REMOTESEARCH_RESULT_STORE, true) ? 1 : 0); + prop.put(SwitchboardConstants.REMOTESEARCH_RESULT_STORE+"_"+SwitchboardConstants.SEARCH_USECLICKSERVLET, sb.getConfigBool(SwitchboardConstants.SEARCH_USECLICKSERVLET, false) ? 1 : 0); + prop.put("search.navigation.hosts", sb.getConfig("search.navigation", "").indexOf("hosts",0) >= 0 ? 1 : 0); prop.put("search.navigation.authors", sb.getConfig("search.navigation", "").indexOf("authors",0) >= 0 ? 1 : 0); prop.put("search.navigation.collections", sb.getConfig("search.navigation", "").indexOf("collections",0) >= 0 ? 1 : 0); diff --git a/htroot/yacysearchitem.java b/htroot/yacysearchitem.java index eff60c80f..9ac02ddc4 100644 --- a/htroot/yacysearchitem.java +++ b/htroot/yacysearchitem.java @@ -181,6 +181,8 @@ public class yacysearchitem { } } prop.putXML("content_link", modifyURL); // putXML for rss + } else if (sb.getConfigBool(SwitchboardConstants.SEARCH_USECLICKSERVLET, false)) { + prop.putXML("content_link", "click?url="+resultUrlstring); // putXML for rss } else { prop.putXML("content_link", resultUrlstring); // putXML for rss } diff --git a/source/net/yacy/http/servlets/ClickServlet.java b/source/net/yacy/http/servlets/ClickServlet.java new file mode 100644 index 000000000..5e4d40e82 --- /dev/null +++ b/source/net/yacy/http/servlets/ClickServlet.java @@ -0,0 +1,144 @@ +/** + * ClickServlet Copyright 2014 by Michael Peter Christen First released + * 25.12.2014 at http://yacy.net + * + * This library is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the Free + * 
Software Foundation; either version 2.1 of the License, or (at your option) + * any later version. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more + * details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt If not, see + * . + */ +package net.yacy.http.servlets; + +import java.io.IOException; +import java.io.PrintWriter; +import java.util.ArrayList; +import java.util.Collection; + +import javax.servlet.ServletException; +import javax.servlet.ServletRequest; +import javax.servlet.ServletResponse; +import javax.servlet.http.HttpServlet; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; +import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.util.ConcurrentLog; +import net.yacy.search.Switchboard; +import net.yacy.search.SwitchboardConstants; + +/** + * The ClickServlet is used as search result link to perform additional actions + * upon click on the link by user. The actual target url is given as parameter, + * the servlet forwards the user to the target link page and performs additonal + * actions with the target url (basically alternative of using javascript + * href.onClick() ) + * + * Request Parameter: url= the target User browser is forwarded to the url using + * html header or javascript afterwards performs configured actions, + * + * Actions e.g. 
(0- = not implemented yet) + * - crawl/recrawl the url + * - crawl all links on page (with depth) / site + * 0- increase/create rating + * 0- add to a collection + * 0- connect query and url + * 0- learn and classify content - promote rating + * 0- add to click statistic url/cnt (maybe to use for boost) + */ +public class ClickServlet extends HttpServlet { + + private static final long serialVersionUID = 1L; + + // config switches to remember actions to perform + String _actionCode = "index"; + + static final String crawlaction = "crawl"; // actionCode to add url to crawler with crawldepth=0 + static final String indexaction = "index"; // actionCode to add url to index (=default) + static final String crawllinksaction = "crawllinks"; // actionCode to add url to crawler with crawldepth=1 + + @Override + public void init() { + if (this.getInitParameter("clickaction") != null) { + _actionCode = this.getInitParameter("clickaction"); + } + } + + @Override + public void service(ServletRequest request, ServletResponse response) throws IOException, ServletException { + + HttpServletRequest hrequest = (HttpServletRequest) request; + HttpServletResponse hresponse = (HttpServletResponse) response; + + final String strUrl = hrequest.getParameter("url"); + if (strUrl == null) { + hresponse.sendError(HttpServletResponse.SC_NOT_FOUND, "url parameter missing"); + return; + } + + try { + hresponse.setStatus(HttpServletResponse.SC_OK); + /* alternative to use javascript / http-equiv header + hresponse.setStatus(HttpServletResponse.SC_TEMPORARY_REDIRECT); + hresponse.setHeader(HeaderFramework.LOCATION, strUrl); + */ + + // output html forward to url header + PrintWriter pw = response.getWriter(); + response.setContentType("text/html"); + + pw.println(""); + pw.println(""); + + pw.print(""); + + pw.print(""); + + pw.println(""); + pw.close(); + + if (Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.SEARCH_USECLICKSERVLET, false)) { + + // do click event action + if 
(_actionCode != null) { + switch (_actionCode) { + case crawlaction: { + final Collection urls = new ArrayList(); + urls.add(new DigestURL(strUrl)); + Switchboard.getSwitchboard().addToCrawler(urls, false); + break; + } + case indexaction: { + final Collection urls = new ArrayList(); + urls.add(new DigestURL(strUrl)); + + Switchboard.getSwitchboard().addToIndex(urls, null, null, null, true); + break; + } + case crawllinksaction: { + final Collection urls = new ArrayList(); + urls.add(new DigestURL(strUrl)); + Switchboard.getSwitchboard().addToCrawler(urls, false); + Switchboard.getSwitchboard().heuristicSearchResults(strUrl); + break; + } + } + } + } + } catch (Exception e) { + ConcurrentLog.logException(e); + } + } + +} diff --git a/source/net/yacy/peers/Protocol.java b/source/net/yacy/peers/Protocol.java index 012fa969b..15915dbb4 100644 --- a/source/net/yacy/peers/Protocol.java +++ b/source/net/yacy/peers/Protocol.java @@ -652,7 +652,13 @@ public final class Protocol { // insert results to containers int term = count; - Collection storeDocs = new ArrayList(result.links.size()); + Map> snip; + if (event.addResultsToLocalIndex) { + snip = null; + } else { + snip = new HashMap>(); // needed to display nodestack results + } + List storeDocs = new ArrayList(result.links.size()); for ( final URIMetadataNode urlEntry : result.links ) { if ( term-- <= 0 ) { break; // do not process more that requested (in case that evil peers fill us up with rubbish) @@ -713,6 +719,13 @@ public final class Protocol { // instead, they are placed in a snipped-search cache. 
// System.out.println("--- RECEIVED SNIPPET '" + urlEntry.snippet() + "'"); TextSnippet.snippetsCache.put(wordhashes, ASCII.String(urlEntry.hash()), urlEntry.snippet()); + // add snippet for snippethandling for nodestack entries (used if not stored to index) + if (!event.addResultsToLocalIndex) { + // TODO: must have a snippet even to get the snippetcache entry back when adding to nodestack + LinkedHashSet sniptxt = new LinkedHashSet(); + sniptxt.add(urlEntry.snippet()); + snip.put(ASCII.String(urlEntry.hash()), sniptxt); + } } // add the url entry to the word indexes @@ -725,19 +738,25 @@ public final class Protocol { } } } - - for (URIMetadataNode entry: storeDocs) { - try { - event.query.getSegment().fulltext().putMetadata(entry); - } catch (final IOException e) { - ConcurrentLog.logException(e); - } - } // store remote result to local result container // insert one container into the search result buffer // one is enough, only the references are used, not the word - event.addRWIs(container.get(0), false, target.getName() + "/" + target.hash, result.totalCount, time); + if (event.addResultsToLocalIndex) { + for (URIMetadataNode entry : storeDocs) { + try { + event.query.getSegment().fulltext().putMetadata(entry); + } catch (final IOException e) { + ConcurrentLog.logException(e); + } + } + event.addRWIs(container.get(0), false, target.getName() + "/" + target.hash, result.totalCount, time); + } else { + // feed results as nodes (SolrQuery results) which carry metadata, + // to prevent a call to getMetaData for RWI results, which would fail (if no metadata in index and no display of these results) + Map> facets = new HashMap>(); + event.addNodes(storeDocs, facets, snip, false, target.getName() + "/" + target.hash, count); + } event.addFinalize(); event.addExpectedRemoteReferences(-count); @@ -1053,7 +1072,10 @@ public final class Protocol { List container = new ArrayList(); Network.log.info("SEARCH (solr), returned " + docList[0].size() + " out of " + 
docList[0].getNumFound() + " documents and " + facets.size() + " facets " + facets.keySet().toString() + " from " + (target == null ? "shard" : ("peer " + target.hash + ":" + target.getName()))); int term = count; - Collection docs = new ArrayList(docList[0].size()); + Collection docs; + if (event.addResultsToLocalIndex) { // only needed to store remote results + docs = new ArrayList(docList[0].size()); + } else docs = null; for (final SolrDocument doc: docList[0]) { if ( term-- <= 0 ) { break; // do not process more that requested (in case that evil peers fill us up with rubbish) @@ -1092,15 +1114,18 @@ public final class Protocol { event.query.getSegment().setFirstSeenTime(urlEntry.hash(), Math.min(urlEntry.moddate().getTime(), System.currentTimeMillis())); // put the remote documents to the local index. We must convert the solr document to a solr input document: - SolrInputDocument sid = event.query.getSegment().fulltext().getDefaultConfiguration().toSolrInputDocument(doc); + if (event.addResultsToLocalIndex) { + final SolrInputDocument sid = event.query.getSegment().fulltext().getDefaultConfiguration().toSolrInputDocument(doc); + + // the input document stays untouched because it contains top-level cloned objects + if (event.addResultsToLocalIndex) docs.add(sid); + } // after this conversion we can remove the largest and not used field text_t and synonyms_sxt from the document // because that goes into a search cache and would take a lot of memory in the search cache //doc.removeFields(CollectionSchema.text_t.getSolrFieldName()); doc.removeFields(CollectionSchema.synonyms_sxt.getSolrFieldName()); - // the input document stays untouched because it contains top-level cloned objects - docs.add(sid); ResultURLs.stack( ASCII.String(urlEntry.url().hash()), urlEntry.url().getHost(), @@ -1122,10 +1147,12 @@ public final class Protocol { event.addExpectedRemoteReferences(-count); Network.log.info("local search (solr): localpeer sent " + container.size() + "/" + numFound 
+ " references"); } else { - for (SolrInputDocument doc: docs) { - event.query.getSegment().putDocument(doc); + if (event.addResultsToLocalIndex) { + for (SolrInputDocument doc: docs) { + event.query.getSegment().putDocument(doc); + } + docs.clear(); docs = null; } - docs.clear(); docs = null; event.addNodes(container, facets, snippets, false, target.getName() + "/" + target.hash, numFound); event.addFinalize(); event.addExpectedRemoteReferences(-count); diff --git a/source/net/yacy/search/SwitchboardConstants.java b/source/net/yacy/search/SwitchboardConstants.java index c90eeec96..26b476aa1 100644 --- a/source/net/yacy/search/SwitchboardConstants.java +++ b/source/net/yacy/search/SwitchboardConstants.java @@ -331,6 +331,8 @@ public final class SwitchboardConstants { public static final String REMOTESEARCH_MAXCOUNT_USER = "remotesearch.maxcount"; public static final String REMOTESEARCH_MAXTIME_USER = "remotesearch.maxtime"; + public static final String REMOTESEARCH_RESULT_STORE = "remotesearch.result.store"; // add remote results to local index + public static final String FEDERATED_SERVICE_SOLR_INDEXING_ENABLED = "federated.service.solr.indexing.enabled"; public static final String FEDERATED_SERVICE_SOLR_INDEXING_URL = "federated.service.solr.indexing.url"; public static final String FEDERATED_SERVICE_SOLR_INDEXING_SHARDING = "federated.service.solr.indexing.sharding"; @@ -524,6 +526,8 @@ public final class SwitchboardConstants { public static final String SEARCH_VERIFY = "search.verify"; public static final String SEARCH_VERIFY_DELETE = "search.verify.delete"; + public static final String SEARCH_USECLICKSERVLET = "search.result.useclickservlet"; // resultlink via click servlet + /** * ranking+evaluation */ diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java index 6fcf8d378..4c7a2f2f4 100644 --- a/source/net/yacy/search/query/SearchEvent.java +++ b/source/net/yacy/search/query/SearchEvent.java @@ -158,6 +158,7 
@@ public final class SearchEvent { private long snippetComputationAllTime; private ConcurrentHashMap> snippets; private final boolean remote; + public final boolean addResultsToLocalIndex; // add received results to local index (defult=true) private SortedMap> localSearchInclusion; private final ScoreMap ref; // reference score computation for the commonSense heuristic private final long maxtime; @@ -204,7 +205,8 @@ public final class SearchEvent { final LoaderDispatcher loader, final int remote_maxcount, final long remote_maxtime, - final boolean deleteIfSnippetFail) { + final boolean deleteIfSnippetFail, + final boolean addResultsToLocalIdx) { long ab = MemoryControl.available(); if (ab < 1024 * 1024 * 200) { @@ -255,6 +257,7 @@ public final class SearchEvent { this.IAmaxcounthash = null; this.IAneardhthash = null; this.remote = (peers != null && peers.sizeConnected() > 0) && (this.query.domType == QueryParams.Searchdom.CLUSTER || (this.query.domType == QueryParams.Searchdom.GLOBAL && Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.INDEX_RECEIVE_ALLOW_SEARCH, false))); + this.addResultsToLocalIndex = addResultsToLocalIdx; this.local_rwi_available = new AtomicInteger(0); // the number of results in the local peer after filtering this.local_rwi_stored = new AtomicInteger(0); this.local_solr_available = new AtomicInteger(0); diff --git a/source/net/yacy/search/query/SearchEventCache.java b/source/net/yacy/search/query/SearchEventCache.java index 9458bca07..bdae26b28 100644 --- a/source/net/yacy/search/query/SearchEventCache.java +++ b/source/net/yacy/search/query/SearchEventCache.java @@ -171,7 +171,8 @@ public class SearchEventCache { // start a new event Switchboard sb = Switchboard.getSwitchboard(); final boolean delete = sb == null || Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.SEARCH_VERIFY_DELETE, true); - event = new SearchEvent(query, peers, workTables, preselectedPeerHashes, generateAbstracts, loader, remote_maxcount, 
remote_maxtime, delete); + final boolean addToLocalIdx = sb == null || Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.REMOTESEARCH_RESULT_STORE, true); + event = new SearchEvent(query, peers, workTables, preselectedPeerHashes, generateAbstracts, loader, remote_maxcount, remote_maxtime, delete, addToLocalIdx); MemoryControl.request(100 * 1024 * 1024, false); // this may trigger a short memory status which causes a reducing of cache space of other threads }