diff --git a/defaults/web.xml b/defaults/web.xml index a13e090aa..91d8323da 100644 --- a/defaults/web.xml +++ b/defaults/web.xml @@ -55,7 +55,23 @@ YaCy stop proxy + + ClickServlet + net.yacy.http.servlets.ClickServlet + + clickaction + index + defines the action to perform with supplied url + + + + + + + ClickServlet + /click + SolrSelectServlet diff --git a/defaults/yacy.init b/defaults/yacy.init index 14d1b426d..626eadf6a 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -826,6 +826,8 @@ search.result.show.proxy = false search.result.show.hostbrowser = true search.result.show.vocabulary = false +search.result.useclickservlet = true + # search navigators: comma-separated list of default values for search navigation. # can be temporary different if search string is given with differen navigation values # assigning no value(s) means that no navigation is shown @@ -857,6 +859,7 @@ search.verify.delete = true # remote search details remotesearch.maxcount = 10 remotesearch.maxtime = 3000 +remotesearch.result.store=true # specifies if yacy should set it's own referer if no referer URL # was set by the client. diff --git a/htroot/ConfigPortal.html b/htroot/ConfigPortal.html index c5236e50a..b2f7bcd0a 100644 --- a/htroot/ConfigPortal.html +++ b/htroot/ConfigPortal.html @@ -51,19 +51,31 @@
Snippet Fetch Strategy & Link Verification
ideaSpeed up search results with this option! (use CACHEONLY or FALSE to switch off verification)
- NOCACHE: no use of web cache, load all snippets online
- IFFRESH: use the cache if the cache exists and is fresh otherwise load online
- IFEXIST: use the cache if the cache exist or load online
- If verification fails, delete index reference

- CACHEONLY: never go online, use all content from cache. If no cache entry exist, consider content nevertheless as available and show result without snippet
- FALSE: no link verification and not snippet generation: all search results are valid without verification + NOCACHE: no use of web cache, load all snippets online
+ IFFRESH: use the cache if the cache exists and is fresh, otherwise load online
+ IFEXIST: use the cache if the cache exists or load online
+ If verification fails, delete index reference

+ CACHEONLY: never go online, use all content from cache. If no cache entry exists, consider content nevertheless as available and show result without snippet
+ FALSE: no link verification and no snippet generation: all search results are valid without verification
Greedy Learning Mode
- load documents linked in search results, will be deactivated automatically when index size > #[greedylearning.limit.doccount]# (see Heuristics: search-result to use this permanent) + load documents linked in search results, will be deactivated automatically when index size > #[greedylearning.limit.doccount]# (see Heuristics: search-result to use this permanent)
+
Index remote results
+
+ add remote search results to the local index (default=on, it is recommended to enable this option!) +
+ + #(remotesearch.result.store)# +
Use Click-Servlet for search result links
+
+ allows YaCy to perform some actions if a user clicks on a search result (by default, add the clicked link to the index) +
+ ::#(/remotesearch.result.store)# +
Default Pop-Up Page
Status Page  diff --git a/htroot/ConfigPortal.java b/htroot/ConfigPortal.java index e450b65a7..2c618c730 100644 --- a/htroot/ConfigPortal.java +++ b/htroot/ConfigPortal.java @@ -90,6 +90,11 @@ public class ConfigPortal { sb.setConfig("search.options", post.getBoolean("search.options")); sb.setConfig(SwitchboardConstants.GREEDYLEARNING_ACTIVE, post.getBoolean(SwitchboardConstants.GREEDYLEARNING_ACTIVE)); + + final boolean storeresult = post.getBoolean(SwitchboardConstants.REMOTESEARCH_RESULT_STORE); + sb.setConfig(SwitchboardConstants.REMOTESEARCH_RESULT_STORE, storeresult); + // click servlet only meaningful if result not stored (currently) + sb.setConfig(SwitchboardConstants.SEARCH_USECLICKSERVLET, !storeresult && post.getBoolean(SwitchboardConstants.SEARCH_USECLICKSERVLET)); sb.setConfig(SwitchboardConstants.SEARCH_VERIFY, post.get("search.verify", "ifexist")); sb.setConfig(SwitchboardConstants.SEARCH_VERIFY_DELETE, post.getBoolean("search.verify.delete")); @@ -144,6 +149,8 @@ public class ConfigPortal { sb.setConfig("search.navigation", config.getProperty("search.navigation","hosts,authors,namespace,topics")); sb.setConfig("search.options", config.getProperty("search.options","true")); sb.setConfig(SwitchboardConstants.GREEDYLEARNING_ACTIVE, config.getProperty(SwitchboardConstants.GREEDYLEARNING_ACTIVE)); + sb.setConfig(SwitchboardConstants.REMOTESEARCH_RESULT_STORE, config.getProperty(SwitchboardConstants.REMOTESEARCH_RESULT_STORE)); + sb.setConfig(SwitchboardConstants.REMOTESEARCH_RESULT_STORE+"_"+SwitchboardConstants.SEARCH_USECLICKSERVLET, config.getProperty(SwitchboardConstants.SEARCH_USECLICKSERVLET)); sb.setConfig(SwitchboardConstants.SEARCH_VERIFY, config.getProperty(SwitchboardConstants.SEARCH_VERIFY,"iffresh")); sb.setConfig(SwitchboardConstants.SEARCH_VERIFY_DELETE, config.getProperty(SwitchboardConstants.SEARCH_VERIFY_DELETE,"true")); sb.setConfig("about.headline", config.getProperty("about.headline","")); @@ -165,6 +172,9 @@ public class 
ConfigPortal { prop.put(SwitchboardConstants.GREEDYLEARNING_ACTIVE, sb.getConfigBool(SwitchboardConstants.GREEDYLEARNING_ACTIVE, false) ? 1 : 0); prop.put(SwitchboardConstants.GREEDYLEARNING_LIMIT_DOCCOUNT, sb.getConfig(SwitchboardConstants.GREEDYLEARNING_LIMIT_DOCCOUNT, "0")); + prop.put(SwitchboardConstants.REMOTESEARCH_RESULT_STORE, sb.getConfigBool(SwitchboardConstants.REMOTESEARCH_RESULT_STORE, true) ? 1 : 0); + prop.put(SwitchboardConstants.REMOTESEARCH_RESULT_STORE+"_"+SwitchboardConstants.SEARCH_USECLICKSERVLET, sb.getConfigBool(SwitchboardConstants.SEARCH_USECLICKSERVLET, false) ? 1 : 0); + prop.put("search.navigation.hosts", sb.getConfig("search.navigation", "").indexOf("hosts",0) >= 0 ? 1 : 0); prop.put("search.navigation.authors", sb.getConfig("search.navigation", "").indexOf("authors",0) >= 0 ? 1 : 0); prop.put("search.navigation.collections", sb.getConfig("search.navigation", "").indexOf("collections",0) >= 0 ? 1 : 0); diff --git a/htroot/yacysearchitem.java b/htroot/yacysearchitem.java index eff60c80f..9ac02ddc4 100644 --- a/htroot/yacysearchitem.java +++ b/htroot/yacysearchitem.java @@ -181,6 +181,8 @@ public class yacysearchitem { } } prop.putXML("content_link", modifyURL); // putXML for rss + } else if (sb.getConfigBool(SwitchboardConstants.SEARCH_USECLICKSERVLET, false)) { + prop.putXML("content_link", "click?url="+resultUrlstring); // putXML for rss } else { prop.putXML("content_link", resultUrlstring); // putXML for rss } diff --git a/source/net/yacy/http/servlets/ClickServlet.java b/source/net/yacy/http/servlets/ClickServlet.java new file mode 100644 index 000000000..5e4d40e82 --- /dev/null +++ b/source/net/yacy/http/servlets/ClickServlet.java @@ -0,0 +1,144 @@ +/** + * ClickServlet Copyright 2014 by Michael Peter Christen First released + * 25.12.2014 at http://yacy.net + * + * This library is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the Free + * 
Software Foundation; either version 2.1 of the License, or (at your option) + * any later version. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more + * details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt If not, see + * . + */ +package net.yacy.http.servlets; + +import java.io.IOException; +import java.io.PrintWriter; +import java.util.ArrayList; +import java.util.Collection; + +import javax.servlet.ServletException; +import javax.servlet.ServletRequest; +import javax.servlet.ServletResponse; +import javax.servlet.http.HttpServlet; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; +import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.util.ConcurrentLog; +import net.yacy.search.Switchboard; +import net.yacy.search.SwitchboardConstants; + +/** + * The ClickServlet is used as search result link to perform additional actions + * upon click on the link by user. The actual target url is given as parameter, + * the servlet forwards the user to the target link page and performs additonal + * actions with the target url (basically alternative of using javascript + * href.onClick() ) + * + * Request Parameter: url= the target User browser is forwarded to the url using + * html header or javascript afterwards performs configured actions, + * + * Actions e.g. 
(0- = not implemented yet) + * - crawl/recrawl the url + * - crawl all links on page (with depth) / site + * 0- increase/create rating + * 0- add to a collection + * 0- connect query and url + * 0- learn and classify content - promote rating + * 0- add to click statistic url/cnt (maybe to use for boost) + */ +public class ClickServlet extends HttpServlet { + + private static final long serialVersionUID = 1L; + + // config switches to remember actions to perform + String _actionCode = "index"; + + static final String crawlaction = "crawl"; // actionCode to add url to crawler with crawldepth=0 + static final String indexaction = "index"; // actionCode to add url to index (=default) + static final String crawllinksaction = "crawllinks"; // actionCode to add url to crawler with crawldepth=1 + + @Override + public void init() { + if (this.getInitParameter("clickaction") != null) { + _actionCode = this.getInitParameter("clickaction"); + } + } + + @Override + public void service(ServletRequest request, ServletResponse response) throws IOException, ServletException { + + HttpServletRequest hrequest = (HttpServletRequest) request; + HttpServletResponse hresponse = (HttpServletResponse) response; + + final String strUrl = hrequest.getParameter("url"); + if (strUrl == null) { + hresponse.sendError(HttpServletResponse.SC_NOT_FOUND, "url parameter missing"); + return; + } + + try { + hresponse.setStatus(HttpServletResponse.SC_OK); + /* alternative to use javascript / http-equiv header + hresponse.setStatus(HttpServletResponse.SC_TEMPORARY_REDIRECT); + hresponse.setHeader(HeaderFramework.LOCATION, strUrl); + */ + + // output html forward to url header + PrintWriter pw = response.getWriter(); + response.setContentType("text/html"); + + pw.println(""); + pw.println(""); + + pw.print(""); + + pw.print(""); + + pw.println(""); + pw.close(); + + if (Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.SEARCH_USECLICKSERVLET, false)) { + + // do click event action + if 
(_actionCode != null) { + switch (_actionCode) { + case crawlaction: { + final Collection urls = new ArrayList(); + urls.add(new DigestURL(strUrl)); + Switchboard.getSwitchboard().addToCrawler(urls, false); + break; + } + case indexaction: { + final Collection urls = new ArrayList(); + urls.add(new DigestURL(strUrl)); + + Switchboard.getSwitchboard().addToIndex(urls, null, null, null, true); + break; + } + case crawllinksaction: { + final Collection urls = new ArrayList(); + urls.add(new DigestURL(strUrl)); + Switchboard.getSwitchboard().addToCrawler(urls, false); + Switchboard.getSwitchboard().heuristicSearchResults(strUrl); + break; + } + } + } + } + } catch (Exception e) { + ConcurrentLog.logException(e); + } + } + +} diff --git a/source/net/yacy/peers/Protocol.java b/source/net/yacy/peers/Protocol.java index 012fa969b..15915dbb4 100644 --- a/source/net/yacy/peers/Protocol.java +++ b/source/net/yacy/peers/Protocol.java @@ -652,7 +652,13 @@ public final class Protocol { // insert results to containers int term = count; - Collection storeDocs = new ArrayList(result.links.size()); + Map> snip; + if (event.addResultsToLocalIndex) { + snip = null; + } else { + snip = new HashMap>(); // needed to display nodestack results + } + List storeDocs = new ArrayList(result.links.size()); for ( final URIMetadataNode urlEntry : result.links ) { if ( term-- <= 0 ) { break; // do not process more that requested (in case that evil peers fill us up with rubbish) @@ -713,6 +719,13 @@ public final class Protocol { // instead, they are placed in a snipped-search cache. 
// System.out.println("--- RECEIVED SNIPPET '" + urlEntry.snippet() + "'"); TextSnippet.snippetsCache.put(wordhashes, ASCII.String(urlEntry.hash()), urlEntry.snippet()); + // add snippet for snippethandling for nodestack entries (used if not stored to index) + if (!event.addResultsToLocalIndex) { + // TODO: must have a snippet even to get the snippetcache entry back when adding to nodestack + LinkedHashSet sniptxt = new LinkedHashSet(); + sniptxt.add(urlEntry.snippet()); + snip.put(ASCII.String(urlEntry.hash()), sniptxt); + } } // add the url entry to the word indexes @@ -725,19 +738,25 @@ public final class Protocol { } } } - - for (URIMetadataNode entry: storeDocs) { - try { - event.query.getSegment().fulltext().putMetadata(entry); - } catch (final IOException e) { - ConcurrentLog.logException(e); - } - } // store remote result to local result container // insert one container into the search result buffer // one is enough, only the references are used, not the word - event.addRWIs(container.get(0), false, target.getName() + "/" + target.hash, result.totalCount, time); + if (event.addResultsToLocalIndex) { + for (URIMetadataNode entry : storeDocs) { + try { + event.query.getSegment().fulltext().putMetadata(entry); + } catch (final IOException e) { + ConcurrentLog.logException(e); + } + } + event.addRWIs(container.get(0), false, target.getName() + "/" + target.hash, result.totalCount, time); + } else { + // feed results as nodes (SolrQuery results) which carry metadata, + // to prevent a call to getMetaData for RWI results, which would fail (if no metadata in index and no display of these results) + Map> facets = new HashMap>(); + event.addNodes(storeDocs, facets, snip, false, target.getName() + "/" + target.hash, count); + } event.addFinalize(); event.addExpectedRemoteReferences(-count); @@ -1053,7 +1072,10 @@ public final class Protocol { List container = new ArrayList(); Network.log.info("SEARCH (solr), returned " + docList[0].size() + " out of " + 
docList[0].getNumFound() + " documents and " + facets.size() + " facets " + facets.keySet().toString() + " from " + (target == null ? "shard" : ("peer " + target.hash + ":" + target.getName()))); int term = count; - Collection docs = new ArrayList(docList[0].size()); + Collection docs; + if (event.addResultsToLocalIndex) { // only needed to store remote results + docs = new ArrayList(docList[0].size()); + } else docs = null; for (final SolrDocument doc: docList[0]) { if ( term-- <= 0 ) { break; // do not process more that requested (in case that evil peers fill us up with rubbish) @@ -1092,15 +1114,18 @@ public final class Protocol { event.query.getSegment().setFirstSeenTime(urlEntry.hash(), Math.min(urlEntry.moddate().getTime(), System.currentTimeMillis())); // put the remote documents to the local index. We must convert the solr document to a solr input document: - SolrInputDocument sid = event.query.getSegment().fulltext().getDefaultConfiguration().toSolrInputDocument(doc); + if (event.addResultsToLocalIndex) { + final SolrInputDocument sid = event.query.getSegment().fulltext().getDefaultConfiguration().toSolrInputDocument(doc); + + // the input document stays untouched because it contains top-level cloned objects + if (event.addResultsToLocalIndex) docs.add(sid); + } // after this conversion we can remove the largest and not used field text_t and synonyms_sxt from the document // because that goes into a search cache and would take a lot of memory in the search cache //doc.removeFields(CollectionSchema.text_t.getSolrFieldName()); doc.removeFields(CollectionSchema.synonyms_sxt.getSolrFieldName()); - // the input document stays untouched because it contains top-level cloned objects - docs.add(sid); ResultURLs.stack( ASCII.String(urlEntry.url().hash()), urlEntry.url().getHost(), @@ -1122,10 +1147,12 @@ public final class Protocol { event.addExpectedRemoteReferences(-count); Network.log.info("local search (solr): localpeer sent " + container.size() + "/" + numFound 
+ " references"); } else { - for (SolrInputDocument doc: docs) { - event.query.getSegment().putDocument(doc); + if (event.addResultsToLocalIndex) { + for (SolrInputDocument doc: docs) { + event.query.getSegment().putDocument(doc); + } + docs.clear(); docs = null; } - docs.clear(); docs = null; event.addNodes(container, facets, snippets, false, target.getName() + "/" + target.hash, numFound); event.addFinalize(); event.addExpectedRemoteReferences(-count); diff --git a/source/net/yacy/search/SwitchboardConstants.java b/source/net/yacy/search/SwitchboardConstants.java index c90eeec96..26b476aa1 100644 --- a/source/net/yacy/search/SwitchboardConstants.java +++ b/source/net/yacy/search/SwitchboardConstants.java @@ -331,6 +331,8 @@ public final class SwitchboardConstants { public static final String REMOTESEARCH_MAXCOUNT_USER = "remotesearch.maxcount"; public static final String REMOTESEARCH_MAXTIME_USER = "remotesearch.maxtime"; + public static final String REMOTESEARCH_RESULT_STORE = "remotesearch.result.store"; // add remote results to local index + public static final String FEDERATED_SERVICE_SOLR_INDEXING_ENABLED = "federated.service.solr.indexing.enabled"; public static final String FEDERATED_SERVICE_SOLR_INDEXING_URL = "federated.service.solr.indexing.url"; public static final String FEDERATED_SERVICE_SOLR_INDEXING_SHARDING = "federated.service.solr.indexing.sharding"; @@ -524,6 +526,8 @@ public final class SwitchboardConstants { public static final String SEARCH_VERIFY = "search.verify"; public static final String SEARCH_VERIFY_DELETE = "search.verify.delete"; + public static final String SEARCH_USECLICKSERVLET = "search.result.useclickservlet"; // resultlink via click servlet + /** * ranking+evaluation */ diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java index 6fcf8d378..4c7a2f2f4 100644 --- a/source/net/yacy/search/query/SearchEvent.java +++ b/source/net/yacy/search/query/SearchEvent.java @@ -158,6 +158,7 
@@ public final class SearchEvent { private long snippetComputationAllTime; private ConcurrentHashMap> snippets; private final boolean remote; + public final boolean addResultsToLocalIndex; // add received results to local index (default=true) private SortedMap> localSearchInclusion; private final ScoreMap ref; // reference score computation for the commonSense heuristic private final long maxtime; @@ -204,7 +205,8 @@ public final class SearchEvent { final LoaderDispatcher loader, final int remote_maxcount, final long remote_maxtime, - final boolean deleteIfSnippetFail) { + final boolean deleteIfSnippetFail, + final boolean addResultsToLocalIdx) { long ab = MemoryControl.available(); if (ab < 1024 * 1024 * 200) { @@ -255,6 +257,7 @@ public final class SearchEvent { this.IAmaxcounthash = null; this.IAneardhthash = null; this.remote = (peers != null && peers.sizeConnected() > 0) && (this.query.domType == QueryParams.Searchdom.CLUSTER || (this.query.domType == QueryParams.Searchdom.GLOBAL && Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.INDEX_RECEIVE_ALLOW_SEARCH, false))); + this.addResultsToLocalIndex = addResultsToLocalIdx; this.local_rwi_available = new AtomicInteger(0); // the number of results in the local peer after filtering this.local_rwi_stored = new AtomicInteger(0); this.local_solr_available = new AtomicInteger(0); diff --git a/source/net/yacy/search/query/SearchEventCache.java b/source/net/yacy/search/query/SearchEventCache.java index 9458bca07..bdae26b28 100644 --- a/source/net/yacy/search/query/SearchEventCache.java +++ b/source/net/yacy/search/query/SearchEventCache.java @@ -171,7 +171,8 @@ public class SearchEventCache { // start a new event Switchboard sb = Switchboard.getSwitchboard(); final boolean delete = sb == null || Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.SEARCH_VERIFY_DELETE, true); - event = new SearchEvent(query, peers, workTables, preselectedPeerHashes, generateAbstracts, loader, remote_maxcount, 
remote_maxtime, delete); + final boolean addToLocalIdx = sb == null || Switchboard.getSwitchboard().getConfigBool(SwitchboardConstants.REMOTESEARCH_RESULT_STORE, true); + event = new SearchEvent(query, peers, workTables, preselectedPeerHashes, generateAbstracts, loader, remote_maxcount, remote_maxtime, delete, addToLocalIdx); MemoryControl.request(100 * 1024 * 1024, false); // this may trigger a short memory status which causes a reducing of cache space of other threads }