diff --git a/defaults/heuristicopensearch.conf b/defaults/heuristicopensearch.conf new file mode 100644 index 000000000..3a312816a --- /dev/null +++ b/defaults/heuristicopensearch.conf @@ -0,0 +1,23 @@ +## List of search engines used by YaCy heuristic search option +## Format example +## SystemName = http://www.thesystem.org/search?q={searchTerms} +## all opensearch parameters can be used in search url +## {searchTerms} is replaced by search query +## {startIndex?} is replaced by result start +## {count} is replaced by expected number of results +## +## the syntax of this file: +## - all lines beginning with '##' are comments +## - all non-empty lines not beginning with '#' are keyword lines +## - all lines beginning with '#' and where the second character is not '#' are commented-out keyword lines +## + +#Nutch = http://www.search2.net/opensearch?query={searchTerms} # get 20 results from Nutch +#Blekko = http://blekko.com/ws/{searchTerms}+/rss # get 20 results from blekko +#Faroo-Web = http://www.faroo.com/instant.rss?q={searchTerms}&start={startIndex}&length={count}&l=en&src=web # get results from Faroo web-search +#Faroo-News = http://www.faroo.com/instant.rss?q={searchTerms}&start={startIndex}&length=20&l=en&src=news # get results from Faroo news-search +#openBDB = http://www.openbdb.com/b/{searchTerms}.xml # Open Book Database +#Twitter = http://search.twitter.com/search.rss?rpp=20&q={searchTerms} +#WordPress.com = http://en.search.wordpress.com/?q={searchTerms}&f=feed&page={startPage?} #Search WordPress.com Blogs +#Sueddeutsche.de = http://suche.sueddeutsche.de/query/{searchTerms}?output=rss # Sueddeutsche Zeitung Artikel Archiv +#Los Angeles Times = http://framework.latimes.com/?s={searchTerms}&feed=rss2 diff --git a/defaults/yacy.init b/defaults/yacy.init index 9e7b8318d..ac0466a7a 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -1020,6 +1020,7 @@ heuristic.blekko = false heuristic.twitter = false heuristic.searchresults = false 
heuristic.searchresults.crawlglobal = false +heuristic.opensearch = false # colours for generic design color_background = #FFFFFF diff --git a/htroot/ConfigHeuristics_p.html b/htroot/ConfigHeuristics_p.html index 365b32bd2..191bdc0e7 100644 --- a/htroot/ConfigHeuristics_p.html +++ b/htroot/ConfigHeuristics_p.html @@ -97,6 +97,66 @@ +
+
+ + + + +

+ When using this heuristic, every search request line is used for a call to the listed OpenSearch systems until enough results to fill the current search page are available. + 20 results are taken from each remote system and loaded simultaneously, parsed and indexed immediately. + To find out more about OpenSearch see OpenSearch.org +

+
+ +
+
+ Available/Active Opensearch System + + + + + + + + + #{osdcfg}# + + + + + + + + #{/osdcfg}# + + + + + + + +
ActiveTitleCommentUrl (format opensearch Url template syntax)delete
#[title]# #[comment]#
new
+
+
+ + #[osderrmsg]# +
+
+
+
+ +
+ With the button "discover from index" you can search within the metadata of your local index to find systems which support the OpenSearch specification. + The task is started in the background. It may take some minutes before new entries appear (after refreshing the page). + Alternatively, you may copy & paste an example config file located in defaults/heuristicopensearch.conf to the DATA/SETTINGS directory. + For the discover function, the field outboundlinks_tag_txt (and inboundlinks_tag_txt) has to be switched on in the Solr Schema. + #{osdsolrfieldswitch}##{/osdsolrfieldswitch}# +
+
+
+ #%env/templates/footer.template%# diff --git a/htroot/ConfigHeuristics_p.java b/htroot/ConfigHeuristics_p.java index 7abd998dc..669399a5a 100644 --- a/htroot/ConfigHeuristics_p.java +++ b/htroot/ConfigHeuristics_p.java @@ -5,9 +5,9 @@ // // This is a part of YaCy, a peer-to-peer based web search engine // -// $LastChangedDate: 2010-02-09 18:14:16 +0100 (Di, 09 Feb 2010) $ -// $LastChangedRevision: 6658 $ -// $LastChangedBy: lotus $ +// $LastChangedDate: 2012-12-19 $ +// $LastChangedRevision: $ +// $LastChangedBy: reger $ // // LICENSE // @@ -25,9 +25,16 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +import com.google.common.io.Files; +import java.io.File; import net.yacy.cora.protocol.RequestHeader; import net.yacy.data.WorkTables; import net.yacy.search.Switchboard; +import java.io.IOException; +import java.util.Iterator; +import net.yacy.cora.federate.yacy.ConfigurationSet; +import net.yacy.cora.federate.opensearch.OpenSearchConnector; +import net.yacy.cora.federate.solr.YaCySchema; import net.yacy.server.serverObjects; import net.yacy.server.serverSwitch; @@ -38,6 +45,7 @@ public class ConfigHeuristics_p { final Switchboard sb = (Switchboard) env; final serverObjects prop = new serverObjects(); + String osderrmsg = ""; if (post != null) { // store this call as api call @@ -53,14 +61,160 @@ public class ConfigHeuristics_p { if (post.containsKey("blekko_off")) sb.setConfig("heuristic.blekko", false); if (post.containsKey("twitter_on")) sb.setConfig("heuristic.twitter", true); if (post.containsKey("twitter_off")) sb.setConfig("heuristic.twitter", false); + if (post.containsKey("opensearch_on")) { + sb.setConfig("heuristic.opensearch", true); + // re-read config (and create work table) + OpenSearchConnector os = new OpenSearchConnector(sb, true); + if (os.getSize() == 0) { + osderrmsg = "no active search targets are configured"; } + } + if 
(post.containsKey("opensearch_off")) sb.setConfig("heuristic.opensearch", false); + if (post.containsKey("discoverosd")) { + final boolean metafieldNOTavailable = sb.index.fulltext().getSolrScheme().containsDisabled(YaCySchema.outboundlinks_tag_txt.name()); + if (!metafieldNOTavailable) { + OpenSearchConnector osc = new OpenSearchConnector(sb, false); + if (osc.discoverFromSolrIndex(sb)) { + osderrmsg = "started background search for target systems, refresh page after some minutes"; + } else { + osderrmsg = "Solr index needs to be available and field outboundlinks_tag_txt on"; + } + } else { + osderrmsg = "Error: field outboundlinks_tag_txt needs to be activated in Solr index"; + } + } + + final String tmpurl = post.get("ossys_newurl"); + // if user entered new opensearch url but hit the wrong button, simulate "add" button + if (tmpurl != null && !tmpurl.isEmpty()) post.put("addnewosd", 1); + + if (post.containsKey("addnewosd")) { + // add new entry to config file + final String tmpname = post.get("ossys_newtitle"); + if (tmpname != null && tmpurl !=null) { + if (!tmpname.isEmpty() && !tmpurl.isEmpty() && tmpurl.toLowerCase().contains("{searchterms}")) { + final String tmpcomment = post.get("ossys_newcomment"); + OpenSearchConnector osc = new OpenSearchConnector(sb,false); + osc.add (tmpname,tmpurl,false,tmpcomment); + } else osderrmsg = "Url template must contain '{searchTerms}'"; + } + } + + if (post.containsKey("setopensearch")) { + // read index scheme table flags + writeopensearchcfg (sb,post); + } + + if (post.containsKey("switchsolrfieldson")) { + final boolean metafieldNOTavailable = sb.index.fulltext().getSolrScheme().containsDisabled(YaCySchema.outboundlinks_tag_txt.name()); + if (metafieldNOTavailable) { + ConfigurationSet.Entry entry; + entry = sb.index.fulltext().getSolrScheme().get(YaCySchema.outboundlinks_tag_txt.name()); + if (entry != null && !entry.enabled()) { + entry.setEnable(true); + } + entry = 
sb.index.fulltext().getSolrScheme().get(YaCySchema.inboundlinks_tag_txt.name()); + if (entry != null && !entry.enabled()) { + entry.setEnable(true); + } + try { + sb.index.fulltext().getSolrScheme().commit(); + } catch (IOException ex) {} + } + } + + // copy default opensearch heuristic config with sample entries + if (post.containsKey("copydefaultosdconfig")) { + // prepare a solr index profile switch list + final File osdDefaultConfig = new File(sb.getDataPath(), "defaults/heuristicopensearch.conf"); + final File osdConfig = new File(sb.getDataPath(), "DATA/SETTINGS/heuristicopensearch.conf"); + if (!osdConfig.exists() && osdDefaultConfig.exists()) { + try { + Files.copy(osdDefaultConfig, osdConfig); + } catch (IOException ex) { + osderrmsg = "file I/O error during copy"; + } + } else {osderrmsg = "config file exists or default doesn't exist";} + } + } + + final boolean showmetafieldbutton = sb.index.fulltext().getSolrScheme().containsDisabled(YaCySchema.outboundlinks_tag_txt.name()); + if (showmetafieldbutton) prop.put("osdsolrfieldswitch",1); prop.put("site.checked", sb.getConfigBool("heuristic.site", false) ? 1 : 0); prop.put("searchresult.checked", sb.getConfigBool("heuristic.searchresults", false) ? 1 : 0); prop.put("searchresultglobal.checked", sb.getConfigBool("heuristic.searchresults.crawlglobal", false) ? 1 : 0); prop.put("blekko.checked", sb.getConfigBool("heuristic.blekko", false) ? 1 : 0); prop.put("twitter.checked", sb.getConfigBool("heuristic.twitter", false) ? 1 : 0); + prop.put("opensearch.checked", sb.getConfigBool("heuristic.opensearch", false) ? 1 : 0); + + // display config file content + final File f = new File (sb.getDataPath(),"DATA/SETTINGS/heuristicopensearch.conf"); + ConfigurationSet p = new ConfigurationSet(f); + int c = 0; + boolean dark = false; + Iterator i = p.entryIterator(); + while (i.hasNext()) { + ConfigurationSet.Entry e = i.next(); + prop.put("osdcfg_" + c + "_dark", dark ? 
1 : 0); + dark = !dark; + prop.put("osdcfg_" + c + "_checked", e.enabled() ? 1 : 0); + prop.putHTML("osdcfg_" + c + "_title", e.key()); + prop.putHTML("osdcfg_" + c + "_comment", e.getComment() != null ? e.getComment() : ""); + + String tmps = e.getValue(); + prop.putHTML("osdcfg_" + c + "_url", tmps); + tmps = tmps.substring(0,tmps.lastIndexOf("/")); + prop.putHTML("osdcfg_" + c + "_urlhostlink", tmps); + c++; + } + prop.put("osdcfg", c); + prop.putHTML("osderrmsg",osderrmsg); return prop; } + + private static void writeopensearchcfg(final Switchboard sb, final serverObjects post) { + // read index scheme table flags + + final File f = new File(sb.getDataPath(), "DATA/SETTINGS/heuristicopensearch.conf"); + ConfigurationSet cfg = new ConfigurationSet(f); + final Iterator cfgentries = cfg.entryIterator(); + ConfigurationSet.Entry entry; + boolean modified = false; // flag to remember changes + while (cfgentries.hasNext()) { + entry = cfgentries.next(); + final String sfn = post.get("ossys_url_" + entry.key()); + if (sfn != null) { + if (!sfn.equals(entry.getValue())) { + entry.setValue(sfn); + modified = true; +} + } + // set enable flag + String v = post.get("ossys_" + entry.key()); + boolean c = v != null && v.equals("checked"); + if (entry.enabled() != c) { + entry.setEnable(c); + modified = true; + } + // delete entry from config + v = post.get("ossys_del_" + entry.key()); + c = v != null && v.equals("checked"); + if (c) { + cfgentries.remove(); + modified = true; + } + } + if (modified) { // save settings to config file if modified + try { + cfg.commit(); + } catch (IOException ex) { + } + } + // re-read config (and create/update work table) + if (sb.getConfigBool("heuristic.opensearch", true)) { + OpenSearchConnector os = new OpenSearchConnector(sb, true); + } + } } diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index 3b02586fb..cdf45a285 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -1,1049 +1,1053 @@ -// yacysearch.java 
-// ----------------------- -// part of the AnomicHTTPD caching proxy -// (C) by Michael Peter Christen; mc@yacy.net -// first published on http://www.anomic.de -// Frankfurt, Germany, 2004 -// -// $LastChangedDate$ -// $LastChangedRevision$ -// $LastChangedBy$ -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -// -// You must compile this file with -// javac -classpath .:../classes yacysearch.java -// if the shell's current path is HTROOT - -import java.io.IOException; -import java.net.InetAddress; -import java.util.ArrayList; -import java.util.Collection; -import java.util.ConcurrentModificationException; -import java.util.HashMap; -import java.util.Iterator; -import java.util.Map; -import java.util.SortedSet; -import java.util.TreeSet; -import java.util.regex.Pattern; -import java.util.regex.PatternSyntaxException; - -import net.yacy.cora.document.analysis.Classification; -import net.yacy.cora.document.analysis.Classification.ContentDomain; -import net.yacy.cora.document.RSSMessage; -import net.yacy.cora.document.UTF8; -import net.yacy.cora.federate.yacy.CacheStrategy; -import net.yacy.cora.geo.GeoLocation; -import net.yacy.cora.lod.vocabulary.Tagging; -import net.yacy.cora.protocol.Domains; -import net.yacy.cora.protocol.HeaderFramework; -import net.yacy.cora.protocol.RequestHeader; -import 
net.yacy.cora.protocol.ResponseHeader; -import net.yacy.data.DidYouMean; -import net.yacy.data.UserDB; -import net.yacy.data.ymark.YMarkTables; -import net.yacy.document.Condenser; -import net.yacy.document.Document; -import net.yacy.document.LibraryProvider; -import net.yacy.document.Parser; -import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.data.meta.URIMetadataNode; -import net.yacy.kelondro.logging.Log; -import net.yacy.kelondro.util.Bitfield; -import net.yacy.kelondro.util.Formatter; -import net.yacy.kelondro.util.ISO639; -import net.yacy.kelondro.util.MemoryControl; -import net.yacy.kelondro.util.SetTools; -import net.yacy.peers.EventChannel; -import net.yacy.peers.NewsPool; -import net.yacy.peers.graphics.ProfilingGraph; -import net.yacy.repository.Blacklist.BlacklistType; -import net.yacy.search.EventTracker; -import net.yacy.search.Switchboard; -import net.yacy.search.SwitchboardConstants; -import net.yacy.search.index.Segment; -import net.yacy.search.query.AccessTracker; -import net.yacy.search.query.QueryGoal; -import net.yacy.search.query.QueryParams; -import net.yacy.search.query.SearchEvent; -import net.yacy.search.query.SearchEventCache; -import net.yacy.search.query.SearchEventType; -import net.yacy.search.ranking.RankingProfile; -import net.yacy.search.snippet.TextSnippet; -import net.yacy.server.serverCore; -import net.yacy.server.serverObjects; -import net.yacy.server.serverSwitch; -import net.yacy.server.servletProperties; - -public class yacysearch { - - public static serverObjects respond( - final RequestHeader header, - final serverObjects post, - final serverSwitch env) { - final Switchboard sb = (Switchboard) env; - sb.localSearchLastAccess = System.currentTimeMillis(); - - final boolean searchAllowed = - sb.getConfigBool("publicSearchpage", true) || sb.verifyAuthentication(header); - - boolean authenticated = sb.adminAuthenticated(header) >= 2; - if ( !authenticated ) { - final UserDB.Entry user = 
sb.userDB.getUser(header); - authenticated = (user != null && user.hasRight(UserDB.AccessRight.EXTENDED_SEARCH_RIGHT)); - } - final boolean localhostAccess = header.accessFromLocalhost(); - final String promoteSearchPageGreeting = - (env.getConfigBool(SwitchboardConstants.GREETING_NETWORK_NAME, false)) ? env.getConfig( - "network.unit.description", - "") : env.getConfig(SwitchboardConstants.GREETING, ""); - final String client = header.get(HeaderFramework.CONNECTION_PROP_CLIENTIP); // the search client who initiated the search - - // get query - final String originalquerystring = (post == null) ? "" : post.get("query", post.get("search", "")).trim(); - String querystring = originalquerystring.replace('+', ' ').trim(); - CacheStrategy snippetFetchStrategy = (post == null) ? null : CacheStrategy.parse(post.get("verify", sb.getConfig("search.verify", ""))); - if (authenticated && originalquerystring.length() == 0) sb.index.fulltext().commit(); - - final servletProperties prop = new servletProperties(); - prop.put("topmenu", sb.getConfigBool("publicTopmenu", true) ? 1 : 0); - - //get focus option - prop.put("focus", ((post == null) ? true : post.get("focus", "1").equals("1")) ? 
1 : 0); - - // produce vocabulary navigation sidebars - Collection vocabularies = LibraryProvider.autotagging.getVocabularies(); - int j = 0; - for (Tagging v: vocabularies) { - prop.put("sidebarVocabulary_" + j + "_vocabulary", v.getName()); - j++; - } - prop.put("sidebarVocabulary", j); - - // get segment - Segment indexSegment = sb.index; - - final String EXT = header.get("EXT", ""); - final boolean rss = EXT.equals("rss"); - final boolean json = EXT.equals("json"); - prop.put("promoteSearchPageGreeting", promoteSearchPageGreeting); - prop.put( - "promoteSearchPageGreeting.homepage", - sb.getConfig(SwitchboardConstants.GREETING_HOMEPAGE, "")); - prop.put( - "promoteSearchPageGreeting.smallImage", - sb.getConfig(SwitchboardConstants.GREETING_SMALL_IMAGE, "")); - if ( post == null || indexSegment == null || env == null || !searchAllowed ) { - // we create empty entries for template strings - prop.put("searchagain", "0"); - prop.put("former", ""); - prop.put("count", "10"); - prop.put("offset", "0"); - prop.put("resource", "global"); - prop.put("urlmaskfilter", (post == null) ? ".*" : post.get("urlmaskfilter", ".*")); - prop.put("prefermaskfilter", (post == null) ? "" : post.get("prefermaskfilter", "")); - prop.put("tenant", (post == null) ? "" : post.get("tenant", "")); - prop.put("indexof", "off"); - prop.put("constraint", ""); - prop.put("cat", "href"); - prop.put("depth", "0"); - prop.put( - "search.verify", - (post == null) ? sb.getConfig("search.verify", "iffresh") : post.get("verify", "iffresh")); - prop.put( - "search.navigation", - (post == null) ? 
sb.getConfig("search.navigation", "all") : post.get("nav", "all")); - prop.put("contentdom", "text"); - prop.put("contentdomCheckText", "1"); - prop.put("contentdomCheckAudio", "0"); - prop.put("contentdomCheckVideo", "0"); - prop.put("contentdomCheckImage", "0"); - prop.put("contentdomCheckApp", "0"); - prop.put("excluded", "0"); - prop.put("results", ""); - prop.put("resultTable", "0"); - prop.put("num-results", searchAllowed ? "0" : "4"); - prop.put("num-results_totalcount", 0); - prop.put("num-results_offset", 0); - prop.put("num-results_itemsPerPage", 10); - prop.put("geoinfo", "0"); - prop.put("rss_queryenc", ""); - prop.put("meanCount", 5); - return prop; - } - - // check for JSONP - if ( post.containsKey("callback") ) { - final String jsonp = post.get("callback") + "(["; - prop.put("jsonp-start", jsonp); - prop.put("jsonp-end", "])"); - } else { - prop.put("jsonp-start", ""); - prop.put("jsonp-end", ""); - } - - // Adding CORS Access header for yacysearch.rss output - if ( rss ) { - final ResponseHeader outgoingHeader = new ResponseHeader(200); - outgoingHeader.put(HeaderFramework.CORS_ALLOW_ORIGIN, "*"); - prop.setOutgoingHeader(outgoingHeader); - } - - // collect search attributes - - int itemsPerPage = - Math.min( - (authenticated) - ? (snippetFetchStrategy != null && snippetFetchStrategy.isAllowedToFetchOnline() - ? 100 - : 5000) : (snippetFetchStrategy != null - && snippetFetchStrategy.isAllowedToFetchOnline() ? 20 : 1000), - post.getInt("maximumRecords", post.getInt("count", 10))); // SRU syntax with old property as alternative - int startRecord = post.getInt("startRecord", post.getInt("offset", 0)); - - boolean global = post.get("resource", "local").equals("global") && sb.peers.sizeConnected() > 0; - final boolean indexof = (post != null && post.get("indexof", "").equals("on")); - - String prefermask = (post == null) ? 
"" : post.get("prefermaskfilter", ""); - if ( !prefermask.isEmpty() && prefermask.indexOf(".*", 0) < 0 ) { - prefermask = ".*" + prefermask + ".*"; - } - - Bitfield constraint = - (post != null && post.containsKey("constraint") && !post.get("constraint", "").isEmpty()) - ? new Bitfield(4, post.get("constraint", "______")) - : null; - if ( indexof ) { - constraint = new Bitfield(4); - constraint.set(Condenser.flag_cat_indexof, true); - } - - // SEARCH - final boolean clustersearch = sb.isRobinsonMode() && sb.getConfig(SwitchboardConstants.CLUSTER_MODE, "").equals(SwitchboardConstants.CLUSTER_MODE_PUBLIC_CLUSTER); - final boolean indexReceiveGranted = - sb.getConfigBool(SwitchboardConstants.INDEX_RECEIVE_ALLOW, true) - || sb.getConfigBool(SwitchboardConstants.INDEX_RECEIVE_AUTODISABLED, true) - || clustersearch; - global = global && indexReceiveGranted; // if the user does not want indexes from remote peers, it cannot be a global searchnn - final boolean intranetMode = sb.isIntranetMode() || sb.isAllIPMode(); - - // increase search statistic counter - if ( !global ) { - // we count only searches on the local peer here, because global searches - // are counted on the target peer to preserve privacy of the searcher - if ( authenticated ) { - // local or authenticated search requests are counted separately - // because they are not part of a public available peer statistic - sb.searchQueriesRobinsonFromLocal++; - } else { - // robinson-searches from non-authenticated requests are public - // and may be part of the public available statistic - sb.searchQueriesRobinsonFromRemote++; - } - } - - // find search domain - final Classification.ContentDomain contentdom = - ContentDomain.contentdomParser(post == null ? 
"all" : post.get("contentdom", "all")); - - // patch until better search profiles are available - if (contentdom == ContentDomain.IMAGE && (itemsPerPage == 10 || itemsPerPage == 100)) { - itemsPerPage = 64; - } else if ( contentdom != ContentDomain.IMAGE && itemsPerPage > 50 && itemsPerPage < 100 ) { - itemsPerPage = 10; - } - - // check the search tracker - TreeSet trackerHandles = sb.localSearchTracker.get(client); - if ( trackerHandles == null ) { - trackerHandles = new TreeSet(); - } - boolean block = false; - if ( Domains.matchesList(client, sb.networkBlacklist) ) { - global = false; - if ( snippetFetchStrategy != null ) { - snippetFetchStrategy = null; - } - block = true; - Log.logWarning("LOCAL_SEARCH", "ACCESS CONTROL: BLACKLISTED CLIENT FROM " - + client - + " gets no permission to search"); - } else if ( Domains.matchesList(client, sb.networkWhitelist) ) { - Log.logInfo("LOCAL_SEARCH", "ACCESS CONTROL: WHITELISTED CLIENT FROM " - + client - + " gets no search restrictions"); - } else if ( !authenticated && !localhostAccess && !intranetMode ) { - // in case that we do a global search or we want to fetch snippets, we check for DoS cases - synchronized ( trackerHandles ) { - final int accInThreeSeconds = - trackerHandles.tailSet(Long.valueOf(System.currentTimeMillis() - 3000)).size(); - final int accInOneMinute = - trackerHandles.tailSet(Long.valueOf(System.currentTimeMillis() - 60000)).size(); - final int accInTenMinutes = - trackerHandles.tailSet(Long.valueOf(System.currentTimeMillis() - 600000)).size(); - // protections against too strong YaCy network load, reduces remote search - if ( global ) { - if ( accInTenMinutes >= 60 || accInOneMinute >= 6 || accInThreeSeconds >= 1 ) { - global = false; - Log.logWarning("LOCAL_SEARCH", "ACCESS CONTROL: CLIENT FROM " - + client - + ": " - + accInThreeSeconds - + "/3s, " - + accInOneMinute - + "/60s, " - + accInTenMinutes - + "/600s, " - + " requests, disallowed global search"); - } - } - // protection against too 
many remote server snippet loads (protects traffic on server) - if ( snippetFetchStrategy != null && snippetFetchStrategy.isAllowedToFetchOnline() ) { - if ( accInTenMinutes >= 20 || accInOneMinute >= 4 || accInThreeSeconds >= 1 ) { - snippetFetchStrategy = CacheStrategy.CACHEONLY; - Log.logWarning("LOCAL_SEARCH", "ACCESS CONTROL: CLIENT FROM " - + client - + ": " - + accInThreeSeconds - + "/3s, " - + accInOneMinute - + "/60s, " - + accInTenMinutes - + "/600s, " - + " requests, disallowed remote snippet loading"); - } - } - // general load protection - if ( accInTenMinutes >= 3000 || accInOneMinute >= 600 || accInThreeSeconds >= 60 ) { - block = true; - Log.logWarning("LOCAL_SEARCH", "ACCESS CONTROL: CLIENT FROM " - + client - + ": " - + accInThreeSeconds - + "/3s, " - + accInOneMinute - + "/60s, " - + accInTenMinutes - + "/600s, " - + " requests, disallowed search"); - } - } - } - - if ( !block && (post == null || post.get("cat", "href").equals("href")) ) { - String urlmask = null; - String protocol = null; - String tld = null; - String ext = null; - - // check available memory and clean up if necessary - if ( !MemoryControl.request(8000000L, false) ) { - indexSegment.fulltext().clearCache(); - SearchEventCache.cleanupEvents(false); - } - - final RankingProfile ranking = sb.getRanking(); - final StringBuilder modifier = new StringBuilder(20); - - int stp = querystring.indexOf('*'); - if (stp >= 0) { - querystring = querystring.substring(0, stp) + Segment.catchallString + querystring.substring(stp + 1); - } - if ( querystring.indexOf("/near", 0) >= 0 ) { - querystring = querystring.replace("/near", ""); - ranking.allZero(); // switch off all attributes - ranking.coeff_worddistance = RankingProfile.COEFF_MAX; - modifier.append("/near "); - } - if ( querystring.indexOf("/date", 0) >= 0 ) { - querystring = querystring.replace("/date", ""); - ranking.allZero(); // switch off all attributes - ranking.coeff_date = RankingProfile.COEFF_MAX; - modifier.append("/date "); - 
} - if ( querystring.indexOf("/https", 0) >= 0 ) { - querystring = querystring.replace("/https", ""); - protocol = "https"; - modifier.append("/https "); - } else if ( querystring.indexOf("/http", 0) >= 0 ) { - querystring = querystring.replace("/http", ""); - protocol = "http"; - modifier.append("/http "); - } - if ( querystring.indexOf("/ftp", 0) >= 0 ) { - querystring = querystring.replace("/ftp", ""); - protocol = "ftp"; - modifier.append("/ftp "); - } - if ( querystring.indexOf("/smb", 0) >= 0 ) { - querystring = querystring.replace("/smb", ""); - protocol = "smb"; - modifier.append("/smb "); - } - - if ( querystring.indexOf("/file", 0) >= 0 ) { - querystring = querystring.replace("/file", ""); - protocol = "file"; - modifier.append("/file "); - } - - if ( querystring.indexOf("/location", 0) >= 0 ) { - querystring = querystring.replace("/location", ""); - if ( constraint == null ) { - constraint = new Bitfield(4); - } - constraint.set(Condenser.flag_cat_haslocation, true); - modifier.append("/location "); - } - - final int lrp = querystring.indexOf("/language/", 0); - String language = ""; - if ( lrp >= 0 ) { - if ( querystring.length() >= (lrp + 12) ) { - language = querystring.substring(lrp + 10, lrp + 12); - } - querystring = querystring.replace("/language/" + language, ""); - language = language.toLowerCase(); - modifier.append("/language/").append(language).append(' '); - } - - final int inurl = querystring.indexOf("inurl:", 0); - if ( inurl >= 0 ) { - int ftb = querystring.indexOf(' ', inurl); - if ( ftb == -1 ) { - ftb = querystring.length(); - } - final String urlstr = querystring.substring(inurl + 6, ftb); - querystring = querystring.replace("inurl:" + urlstr, ""); - if ( !urlstr.isEmpty() ) { - urlmask = urlmask == null ? 
".*" + urlstr + ".*" : urlmask + urlstr + ".*"; - } - modifier.append("inurl:").append(urlstr).append(' '); - } - - final int filetype = querystring.indexOf("filetype:", 0); - if ( filetype >= 0 ) { - int ftb = querystring.indexOf(' ', filetype); - if ( ftb == -1 ) { - ftb = querystring.length(); - } - ext = querystring.substring(filetype + 9, ftb); - querystring = querystring.replace("filetype:" + ext, ""); - while ( !ext.isEmpty() && ext.charAt(0) == '.' ) { - ext = ext.substring(1); - } - modifier.append("filetype:").append(ext).append(' '); - if (ext.isEmpty()) ext = null; - } - - int voc = 0; - Collection metatags = new ArrayList(1); - while ((voc = querystring.indexOf("/vocabulary/", 0)) >= 0) { - String vocabulary = ""; - int ve = querystring.indexOf(' ', voc + 12); - if (ve < 0) { - vocabulary = querystring.substring(voc); - querystring = querystring.substring(0, voc).trim(); - } else { - vocabulary = querystring.substring(voc, ve); - querystring = querystring.substring(0, voc) + querystring.substring(ve); - } - modifier.append(vocabulary).append(' '); - vocabulary = vocabulary.substring(12); - int p = vocabulary.indexOf('/'); - if (p > 0) { - String k = vocabulary.substring(0, p); - String v = vocabulary.substring(p + 1); - metatags.add(LibraryProvider.autotagging.metatag(k, v)); - } - } - - int radius = 0; - double lon = 0.0d, lat = 0.0d, rad = 0.0d; - if ((radius = querystring.indexOf("/radius/")) >= 0) { - int ve = querystring.indexOf(' ', radius + 8); - String geo = ""; - if (ve < 0) { - geo = querystring.substring(radius); - querystring = querystring.substring(0, radius).trim(); - } else { - geo = querystring.substring(radius, ve); - querystring = querystring.substring(0, radius) + querystring.substring(ve); - } - geo = geo.substring(8); - String[] sp = geo.split("/"); - if (sp.length == 3) try { - lat = Double.parseDouble(sp[0]); - lon = Double.parseDouble(sp[1]); - rad = Double.parseDouble(sp[2]); - } catch (NumberFormatException e) { - lon = 0.0d; 
lat = 0.0d; rad = 0.0d; - } - } - - final int site = querystring.indexOf("site:", 0); - String sitehash = null; - String sitehost = null; - if ( site >= 0 ) { - int ftb = querystring.indexOf(' ', site); - if ( ftb == -1 ) { - ftb = querystring.length(); - } - sitehost = querystring.substring(site + 5, ftb); - querystring = querystring.replace("site:" + sitehost, ""); - while ( sitehost.length() > 0 && sitehost.charAt(0) == '.' ) { - sitehost = sitehost.substring(1); - } - while ( sitehost.endsWith(".") ) { - sitehost = sitehost.substring(0, sitehost.length() - 1); - } - sitehash = DigestURI.hosthash(sitehost); - modifier.append("site:").append(sitehost).append(' '); - } - - final int heuristicBlekko = querystring.indexOf("/heuristic/blekko", 0); - if ( heuristicBlekko >= 0 ) { - querystring = querystring.replace("/heuristic/blekko", ""); - modifier.append("/heuristic/blekko "); - } - - final int heuristicTwitter = querystring.indexOf("/heuristic/twitter", 0); - if ( heuristicBlekko >= 0 ) { - querystring = querystring.replace("/heuristic/twitter", ""); - modifier.append("/heuristic/twitter "); - } - - final int authori = querystring.indexOf("author:", 0); - String author = null; - if ( authori >= 0 ) { - // check if the author was given with single quotes or without - final boolean quotes = (querystring.charAt(authori + 7) == '('); - if ( quotes ) { - int ftb = querystring.indexOf(')', authori + 8); - if (ftb == -1) ftb = querystring.length() + 1; - author = querystring.substring(authori + 8, ftb); - querystring = querystring.replace("author:(" + author + ")", ""); - modifier.append("author:(").append(author).append(") "); - } else { - int ftb = querystring.indexOf(' ', authori); - if ( ftb == -1 ) { - ftb = querystring.length(); - } - author = querystring.substring(authori + 7, ftb); - querystring = querystring.replace("author:" + author, ""); - modifier.append("author:").append(author).append(' '); - } - } - - final int tldp = querystring.indexOf("tld:", 0); - if 
(tldp >= 0) { - int ftb = querystring.indexOf(' ', tldp); - if (ftb == -1) ftb = querystring.length(); - tld = querystring.substring(tldp + 4, ftb); - querystring = querystring.replace("tld:" + tld, ""); - modifier.append("tld:").append(tld).append(' '); - while ( tld.length() > 0 && tld.charAt(0) == '.' ) { - tld = tld.substring(1); - } - if (tld.length() == 0) tld = null; - } - if (urlmask == null || urlmask.isEmpty()) urlmask = ".*"; //if no urlmask was given - - // read the language from the language-restrict option 'lr' - // if no one is given, use the user agent or the system language as default - language = (post == null) ? language : post.get("lr", language); - if ( language.startsWith("lang_") ) { - language = language.substring(5); - } - if ( !ISO639.exists(language) ) { - // find out language of the user by reading of the user-agent string - String agent = header.get(HeaderFramework.ACCEPT_LANGUAGE); - if ( agent == null ) { - agent = System.getProperty("user.language"); - } - language = (agent == null) ? "en" : ISO639.userAgentLanguageDetection(agent); - if ( language == null ) { - language = "en"; - } - } - - // the query - final QueryGoal qg = new QueryGoal(originalquerystring, querystring.trim()); - final int maxDistance = (querystring.indexOf('"', 0) >= 0) ? 
qg.getAllHashes().size() - 1 : Integer.MAX_VALUE; - - // filter out stopwords - final SortedSet filtered = SetTools.joinConstructiveByTest(qg.getIncludeStrings(), Switchboard.stopwords); - if ( !filtered.isEmpty() ) { - SetTools.excludeDestructiveByTestSmallInLarge(qg.getIncludeStrings(), Switchboard.stopwords); - } - - // if a minus-button was hit, remove a special reference first - if ( post != null && post.containsKey("deleteref") ) { - try { - if ( !sb.verifyAuthentication(header) ) { - prop.authenticationRequired(); - return prop; - } - - // delete the index entry locally - final String delHash = post.get("deleteref", ""); // urlhash - indexSegment.termIndex().remove(qg.getIncludeHashes(), delHash.getBytes()); - - // make new news message with negative voting - if ( !sb.isRobinsonMode() ) { - final Map map = new HashMap(); - map.put("urlhash", delHash); - map.put("vote", "negative"); - map.put("refid", ""); - sb.peers.newsPool.publishMyNews( - sb.peers.mySeed(), - NewsPool.CATEGORY_SURFTIPP_VOTE_ADD, - map); - } - - // delete the search history since this still shows the entry - SearchEventCache.delete(delHash); - } catch ( final IOException e ) { - Log.logException(e); - } - } - - // if a plus-button was hit, create new voting message - if ( post != null && post.containsKey("recommendref") ) { - if ( !sb.verifyAuthentication(header) ) { - prop.authenticationRequired(); - return prop; - } - final String recommendHash = post.get("recommendref", ""); // urlhash - final URIMetadataNode urlentry = indexSegment.fulltext().getMetadata(UTF8.getBytes(recommendHash)); - if ( urlentry != null ) { - Document[] documents = null; - try { - documents = - sb.loader.loadDocuments( - sb.loader.request(urlentry.url(), true, false), - CacheStrategy.IFEXIST, - Integer.MAX_VALUE, BlacklistType.SEARCH, TextSnippet.snippetMinLoadDelay); - } catch ( final IOException e ) { - } catch ( final Parser.Failure e ) { - } - if ( documents != null ) { - // create a news message - final Map 
map = new HashMap(); - map.put("url", urlentry.url().toNormalform(true).replace(',', '|')); - map.put("title", urlentry.dc_title().replace(',', ' ')); - map.put("description", documents[0].dc_title().replace(',', ' ')); - map.put("author", documents[0].dc_creator()); - map.put("tags", documents[0].dc_subject(' ')); - sb.peers.newsPool.publishMyNews( - sb.peers.mySeed(), - NewsPool.CATEGORY_SURFTIPP_ADD, - map); - documents[0].close(); - } - } - } - - // if a bookmarks-button was hit, create new bookmark entry - if ( post != null && post.containsKey("bookmarkref") ) { - if ( !sb.verifyAuthentication(header) ) { - prop.authenticationRequired(); - return prop; - } - final String bookmarkHash = post.get("bookmarkref", ""); // urlhash - final DigestURI url = indexSegment.fulltext().getURL(UTF8.getBytes(bookmarkHash)); - if ( url != null ) { - try { - sb.tables.bookmarks.createBookmark( - sb.loader, - url, - YMarkTables.USER_ADMIN, - true, - "searchresult", - "/search"); - } catch ( final Throwable e ) { - } - } - } - - // check filters - try { - Pattern.compile(urlmask); - } catch ( final PatternSyntaxException ex ) { - SearchEvent.log.logWarning("Illegal URL mask, not a valid regex: " + urlmask); - prop.put("urlmaskerror", 1); - prop.putHTML("urlmaskerror_urlmask", urlmask); - urlmask = ".*"; - } - - try { - Pattern.compile(prefermask); - } catch ( final PatternSyntaxException ex ) { - SearchEvent.log.logWarning("Illegal prefer mask, not a valid regex: " + prefermask); - prop.put("prefermaskerror", 1); - prop.putHTML("prefermaskerror_prefermask", prefermask); - prefermask = ""; - } - - // do the search - final QueryParams theQuery = - new QueryParams( - qg, - modifier.toString().trim(), - maxDistance, - prefermask, - contentdom, - language, - metatags, - snippetFetchStrategy, - itemsPerPage, - startRecord, - urlmask, protocol, tld, ext, - clustersearch && global ? QueryParams.Searchdom.CLUSTER : (global && indexReceiveGranted - ? 
QueryParams.Searchdom.GLOBAL - : QueryParams.Searchdom.LOCAL), - 20, - constraint, - true, - sitehash, - sitehost, - DigestURI.hosthashess(sb.getConfig("search.excludehosth", "")), - author, - DigestURI.TLD_any_zone_filter, - client, - authenticated, - indexSegment, - ranking, - header.get(RequestHeader.USER_AGENT, ""), - sb.getConfigBool(SwitchboardConstants.SEARCH_VERIFY_DELETE, false) - && sb.getConfigBool(SwitchboardConstants.NETWORK_SEARCHVERIFY, false) - && sb.peers.mySeed().getFlagAcceptRemoteIndex(), - lat, lon, rad); - EventTracker.delete(EventTracker.EClass.SEARCH); - EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch( - theQuery.id(true), - SearchEventType.INITIALIZATION, - "", - 0, - 0), false); - - // tell all threads to do nothing for a specific time - sb.intermissionAllThreads(3000); - - // filter out words that appear in bluelist - theQuery.getQueryGoal().filterOut(Switchboard.blueList); - - // log - Log.logInfo( - "LOCAL_SEARCH", - "INIT WORD SEARCH: " - + theQuery.getQueryGoal().getOriginalQueryString(false) - + ":" - + QueryParams.hashSet2hashString(theQuery.getQueryGoal().getIncludeHashes()) - + " - " - + theQuery.neededResults() - + " links to be computed, " - + theQuery.itemsPerPage() - + " lines to be displayed"); - EventChannel.channels(EventChannel.LOCALSEARCH).addMessage( - new RSSMessage("Local Search Request", theQuery.getQueryGoal().getOriginalQueryString(false), "")); - final long timestamp = System.currentTimeMillis(); - - // create a new search event - if ( SearchEventCache.getEvent(theQuery.id(false)) == null ) { - theQuery.setOffset(0); // in case that this is a new search, always start without a offset - startRecord = 0; - } - final SearchEvent theSearch = - SearchEventCache.getEvent( - theQuery, - sb.peers, - sb.tables, - (sb.isRobinsonMode()) ? 
sb.clusterhashes : null, - false, - sb.loader, - (int) sb.getConfigLong( - SwitchboardConstants.REMOTESEARCH_MAXCOUNT_USER, - sb.getConfigLong(SwitchboardConstants.REMOTESEARCH_MAXCOUNT_DEFAULT, 10)), - sb.getConfigLong( - SwitchboardConstants.REMOTESEARCH_MAXTIME_USER, - sb.getConfigLong(SwitchboardConstants.REMOTESEARCH_MAXTIME_DEFAULT, 3000)), - (int) sb.getConfigLong(SwitchboardConstants.DHT_BURST_ROBINSON, 0), - (int) sb.getConfigLong(SwitchboardConstants.DHT_BURST_MULTIWORD, 0)); - - if ( startRecord == 0 ) { - if ( sitehost != null && sb.getConfigBool("heuristic.site", false) && authenticated ) { - sb.heuristicSite(theSearch, sitehost); - } - if ( (heuristicBlekko >= 0 || sb.getConfigBool("heuristic.blekko", false)) && authenticated ) { - sb.heuristicRSS("http://blekko.com/ws/$+/rss", theSearch, "blekko"); - } - if ( (heuristicTwitter >= 0 || sb.getConfigBool("heuristic.twitter", false)) && authenticated ) { - sb.heuristicRSS("http://search.twitter.com/search.rss?rpp=50&q=$", theSearch, "twitter"); - } - } - - // log - Log.logInfo("LOCAL_SEARCH", "EXIT WORD SEARCH: " - + theQuery.getQueryGoal().getOriginalQueryString(false) - + " - " - + "local_rwi_available(" + theSearch.query.local_rwi_available.get() + "), " - + "local_rwi_stored(" + theSearch.query.local_rwi_stored.get() + "), " - + "local_solr_available(" + theSearch.query.local_solr_available.get() + "), " - + "local_solr_stored(" + theSearch.query.local_solr_stored.get() + "), " - + "remote_available(" + theSearch.query.remote_available.get() + "), " - + "remote_stored(" + theSearch.query.remote_stored.get() + "), " - + "remote_peerCount(" + theSearch.query.remote_peerCount.get() + "), " - + "local_sortout(" + theSearch.query.misses.size() + "), " - + (System.currentTimeMillis() - timestamp) - + " ms"); - - // prepare search statistics - theQuery.searchtime = System.currentTimeMillis() - timestamp; - theQuery.urlretrievaltime = theSearch.getURLRetrievalTime(); - theQuery.snippetcomputationtime = 
theSearch.getSnippetComputationTime(); - AccessTracker.add(AccessTracker.Location.local, theQuery); - - // check suggestions - final int meanMax = (post != null) ? post.getInt("meanCount", 0) : 0; - - prop.put("meanCount", meanMax); - if ( meanMax > 0 && !json && !rss ) { - final DidYouMean didYouMean = new DidYouMean(indexSegment, new StringBuilder(querystring)); - final Iterator meanIt = didYouMean.getSuggestions(100, 5).iterator(); - int meanCount = 0; - String suggestion; - try { - meanCollect: while ( meanCount < meanMax && meanIt.hasNext() ) { - try { - suggestion = meanIt.next().toString(); - prop.put("didYouMean_suggestions_" + meanCount + "_word", suggestion); - prop.put( - "didYouMean_suggestions_" + meanCount + "_url", - QueryParams.navurl( - "html", - 0, - theQuery, - suggestion).toString()); - prop.put("didYouMean_suggestions_" + meanCount + "_sep", "|"); - meanCount++; - } catch (ConcurrentModificationException e) {break meanCollect;} - } - } catch (ConcurrentModificationException e) {} - prop.put("didYouMean_suggestions_" + (meanCount - 1) + "_sep", ""); - prop.put("didYouMean", meanCount > 0 ? 
1 : 0); - prop.put("didYouMean_suggestions", meanCount); - } else { - prop.put("didYouMean", 0); - } - - // find geographic info - final SortedSet coordinates = LibraryProvider.geoLoc.find(originalquerystring, false); - if ( coordinates == null || coordinates.isEmpty() || startRecord > 0 ) { - prop.put("geoinfo", "0"); - } else { - int i = 0; - for ( final GeoLocation c : coordinates ) { - prop.put("geoinfo_loc_" + i + "_lon", Math.round(c.lon() * 10000.0f) / 10000.0f); - prop.put("geoinfo_loc_" + i + "_lat", Math.round(c.lat() * 10000.0f) / 10000.0f); - prop.put("geoinfo_loc_" + i + "_name", c.getName()); - i++; - if ( i >= 10 ) { - break; - } - } - prop.put("geoinfo_loc", i); - prop.put("geoinfo", "1"); - } - - // update the search tracker - try { - synchronized ( trackerHandles ) { - trackerHandles.add(theQuery.starttime); - while ( trackerHandles.size() > 600 ) { - if ( !trackerHandles.remove(trackerHandles.first()) ) { - break; - } - } - } - sb.localSearchTracker.put(client, trackerHandles); - if ( sb.localSearchTracker.size() > 100 ) { - sb.localSearchTracker.remove(sb.localSearchTracker.keys().nextElement()); - } - if ( MemoryControl.shortStatus() ) { - sb.localSearchTracker.clear(); - } - } catch ( final Exception e ) { - Log.logException(e); - } - - prop.put("num-results_offset", startRecord == 0 ? 0 : startRecord + 1); - prop.put("num-results_itemscount", Formatter.number(startRecord + theSearch.query.itemsPerPage > theSearch.query.getResultCount() ? startRecord + theSearch.query.getResultCount() % theSearch.query.itemsPerPage : startRecord + theSearch.query.itemsPerPage, true)); - prop.put("num-results_itemsPerPage", Formatter.number(itemsPerPage)); - prop.put("num-results_totalcount", Formatter.number(theSearch.query.getResultCount())); - prop.put("num-results_globalresults", global && (indexReceiveGranted || clustersearch) ? 
"1" : "0"); - prop.put("num-results_globalresults_localResourceSize", Formatter.number(theSearch.query.local_rwi_available.get() + theSearch.query.local_solr_available.get(), true)); - prop.put("num-results_globalresults_localMissCount", Formatter.number(theSearch.query.misses.size(), true)); - prop.put("num-results_globalresults_remoteResourceSize", Formatter.number(theSearch.query.remote_available.get(), true)); - prop.put("num-results_globalresults_remoteIndexCount", Formatter.number(theSearch.query.remote_stored.get(), true)); - prop.put("num-results_globalresults_remotePeerCount", Formatter.number(theSearch.query.remote_peerCount.get(), true)); - - // compose page navigation - final StringBuilder resnav = new StringBuilder(200); - final int thispage = startRecord / theQuery.itemsPerPage(); - if ( thispage == 0 ) { - resnav - .append("\"arrowleft\" "); - } else { - resnav.append("\"arrowleft\" "); - } - final int numberofpages = Math.min(10, 1 + ((theSearch.query.getResultCount() - 1) / theQuery.itemsPerPage())); - - for ( int i = 0; i < numberofpages; i++ ) { - if ( i == thispage ) { - resnav.append("\"page"); "); - } else { - resnav.append("\"page"); "); - } - } - if ( thispage >= numberofpages ) { - resnav - .append("\"arrowright\""); - } else { - resnav.append("\"arrowright\""); - } - final String resnavs = resnav.toString(); - prop.put("num-results_resnav", resnavs); - prop.put("pageNavBottom", (theSearch.query.getResultCount() - startRecord > 6) ? 
1 : 0); // if there are more results than may fit on the page we add a navigation at the bottom - prop.put("pageNavBottom_resnav", resnavs); - - // generate the search result lines; the content will be produced by another servlet - for ( int i = 0; i < theQuery.itemsPerPage(); i++ ) { - prop.put("results_" + i + "_item", startRecord + i); - prop.put("results_" + i + "_eventID", theQuery.id(false)); - } - prop.put("results", theQuery.itemsPerPage()); - prop - .put( - "resultTable", - (contentdom == ContentDomain.APP || contentdom == ContentDomain.AUDIO || contentdom == ContentDomain.VIDEO) - ? 1 - : 0); - prop.put("eventID", theQuery.id(false)); // for bottomline - - // process result of search - if ( !filtered.isEmpty() ) { - prop.put("excluded", "1"); - prop.putHTML("excluded_stopwords", filtered.toString()); - } else { - prop.put("excluded", "0"); - } - - if ( prop == null || prop.isEmpty() ) { - if ( post.get("query", post.get("search", "")).length() < 2 ) { - prop.put("num-results", "2"); // no results - at least 2 chars - } else { - prop.put("num-results", "1"); // no results - } - } else { - prop.put("num-results", "3"); - } - - prop.put("cat", "href"); - prop.put("depth", "0"); - - // adding some additional properties needed for the rss feed - String hostName = header.get("Host", Domains.LOCALHOST); - if ( hostName.indexOf(':', 0) == -1 ) { - hostName += ":" + serverCore.getPortNr(env.getConfig("port", "8090")); - } - prop.put("searchBaseURL", "http://" + hostName + "/yacysearch.html"); - prop.put("rssYacyImageURL", "http://" + hostName + "/env/grafics/yacy.gif"); - prop.put("thisaddress", hostName); - } - - prop.put("searchagain", global ? "1" : "0"); - prop.putHTML("former", originalquerystring); - prop.put("count", itemsPerPage); - prop.put("offset", startRecord); - prop.put("resource", global ? "global" : "local"); - prop.putHTML("prefermaskfilter", prefermask); - prop.put("indexof", (indexof) ? 
"on" : "off"); - prop.put("constraint", (constraint == null) ? "" : constraint.exportB64()); - prop.put("search.verify", snippetFetchStrategy == null - ? sb.getConfig("search.verify", "iffresh") - : snippetFetchStrategy.toName()); - prop.put( - "search.navigation", - (post == null) ? sb.getConfig("search.navigation", "all") : post.get("nav", "all")); - prop.put("contentdom", (post == null ? "text" : post.get("contentdom", "text"))); - prop.put( - "searchdomswitches", - sb.getConfigBool("search.text", true) - || sb.getConfigBool("search.audio", true) - || sb.getConfigBool("search.video", true) - || sb.getConfigBool("search.image", true) - || sb.getConfigBool("search.app", true) ? 1 : 0); - prop.put("searchdomswitches_searchtext", sb.getConfigBool("search.text", true) ? 1 : 0); - prop.put("searchdomswitches_searchaudio", sb.getConfigBool("search.audio", true) ? 1 : 0); - prop.put("searchdomswitches_searchvideo", sb.getConfigBool("search.video", true) ? 1 : 0); - prop.put("searchdomswitches_searchimage", sb.getConfigBool("search.image", true) ? 1 : 0); - prop.put("searchdomswitches_searchapp", sb.getConfigBool("search.app", true) ? 1 : 0); - prop.put("searchdomswitches_searchtext_check", (contentdom == ContentDomain.TEXT || contentdom == ContentDomain.ALL) ? "1" : "0"); - prop.put("searchdomswitches_searchaudio_check", (contentdom == ContentDomain.AUDIO) ? "1" : "0"); - prop.put("searchdomswitches_searchvideo_check", (contentdom == ContentDomain.VIDEO) ? "1" : "0"); - prop.put("searchdomswitches_searchimage_check", (contentdom == ContentDomain.IMAGE) ? "1" : "0"); - prop.put("searchdomswitches_searchapp_check", (contentdom == ContentDomain.APP) ? 
"1" : "0"); - - // copy properties for "more options" link - prop.put("searchdomswitches_count", prop.get("count")); - prop.put("searchdomswitches_urlmaskfilter", prop.get("urlmaskfilter")); - prop.put("searchdomswitches_prefermaskfilter", prop.get("prefermaskfilter")); - prop.put("searchdomswitches_cat", prop.get("cat")); - prop.put("searchdomswitches_constraint", prop.get("constraint")); - prop.put("searchdomswitches_contentdom", prop.get("contentdom")); - prop.put("searchdomswitches_former", prop.get("former")); - prop.put("searchdomswitches_meanCount", prop.get("meanCount")); - - // for RSS: don't HTML encode some elements - prop.putXML("rss_query", originalquerystring); - prop.putXML("rss_queryenc", originalquerystring.replace(' ', '+')); - - sb.localSearchLastAccess = System.currentTimeMillis(); - - // hostname and port (assume locahost if nothing helps) - final InetAddress hostIP = Domains.myPublicLocalIP(); - prop.put("myhost", hostIP != null ? hostIP.getHostAddress() : Domains.LOCALHOST); - prop.put("myport", serverCore.getPortNr(sb.getConfig("port", "8090"))); - - // return rewrite properties - return prop; - } -} +// yacysearch.java +// ----------------------- +// part of the AnomicHTTPD caching proxy +// (C) by Michael Peter Christen; mc@yacy.net +// first published on http://www.anomic.de +// Frankfurt, Germany, 2004 +// +// $LastChangedDate$ +// $LastChangedRevision$ +// $LastChangedBy$ +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. 
+// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +// You must compile this file with +// javac -classpath .:../classes yacysearch.java +// if the shell's current path is HTROOT + +import java.io.IOException; +import java.net.InetAddress; +import java.util.ArrayList; +import java.util.Collection; +import java.util.ConcurrentModificationException; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import java.util.SortedSet; +import java.util.TreeSet; +import java.util.regex.Pattern; +import java.util.regex.PatternSyntaxException; + +import net.yacy.cora.document.analysis.Classification; +import net.yacy.cora.document.analysis.Classification.ContentDomain; +import net.yacy.cora.document.RSSMessage; +import net.yacy.cora.document.UTF8; +import net.yacy.cora.federate.opensearch.OpenSearchConnector; +import net.yacy.cora.federate.yacy.CacheStrategy; +import net.yacy.cora.geo.GeoLocation; +import net.yacy.cora.lod.vocabulary.Tagging; +import net.yacy.cora.protocol.Domains; +import net.yacy.cora.protocol.HeaderFramework; +import net.yacy.cora.protocol.RequestHeader; +import net.yacy.cora.protocol.ResponseHeader; +import net.yacy.data.DidYouMean; +import net.yacy.data.UserDB; +import net.yacy.data.ymark.YMarkTables; +import net.yacy.document.Condenser; +import net.yacy.document.Document; +import net.yacy.document.LibraryProvider; +import net.yacy.document.Parser; +import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.kelondro.data.meta.URIMetadataNode; +import net.yacy.kelondro.logging.Log; +import net.yacy.kelondro.util.Bitfield; +import net.yacy.kelondro.util.Formatter; +import net.yacy.kelondro.util.ISO639; +import net.yacy.kelondro.util.MemoryControl; +import net.yacy.kelondro.util.SetTools; +import net.yacy.peers.EventChannel; +import net.yacy.peers.NewsPool; 
+import net.yacy.peers.graphics.ProfilingGraph; +import net.yacy.repository.Blacklist.BlacklistType; +import net.yacy.search.EventTracker; +import net.yacy.search.Switchboard; +import net.yacy.search.SwitchboardConstants; +import net.yacy.search.index.Segment; +import net.yacy.search.query.AccessTracker; +import net.yacy.search.query.QueryGoal; +import net.yacy.search.query.QueryParams; +import net.yacy.search.query.SearchEvent; +import net.yacy.search.query.SearchEventCache; +import net.yacy.search.query.SearchEventType; +import net.yacy.search.ranking.RankingProfile; +import net.yacy.search.snippet.TextSnippet; +import net.yacy.server.serverCore; +import net.yacy.server.serverObjects; +import net.yacy.server.serverSwitch; +import net.yacy.server.servletProperties; + +public class yacysearch { + + public static serverObjects respond( + final RequestHeader header, + final serverObjects post, + final serverSwitch env) { + final Switchboard sb = (Switchboard) env; + sb.localSearchLastAccess = System.currentTimeMillis(); + + final boolean searchAllowed = + sb.getConfigBool("publicSearchpage", true) || sb.verifyAuthentication(header); + + boolean authenticated = sb.adminAuthenticated(header) >= 2; + if ( !authenticated ) { + final UserDB.Entry user = sb.userDB.getUser(header); + authenticated = (user != null && user.hasRight(UserDB.AccessRight.EXTENDED_SEARCH_RIGHT)); + } + final boolean localhostAccess = header.accessFromLocalhost(); + final String promoteSearchPageGreeting = + (env.getConfigBool(SwitchboardConstants.GREETING_NETWORK_NAME, false)) ? env.getConfig( + "network.unit.description", + "") : env.getConfig(SwitchboardConstants.GREETING, ""); + final String client = header.get(HeaderFramework.CONNECTION_PROP_CLIENTIP); // the search client who initiated the search + + // get query + final String originalquerystring = (post == null) ? 
"" : post.get("query", post.get("search", "")).trim(); + String querystring = originalquerystring.replace('+', ' ').trim(); + CacheStrategy snippetFetchStrategy = (post == null) ? null : CacheStrategy.parse(post.get("verify", sb.getConfig("search.verify", ""))); + if (authenticated && originalquerystring.length() == 0) sb.index.fulltext().commit(); + + final servletProperties prop = new servletProperties(); + prop.put("topmenu", sb.getConfigBool("publicTopmenu", true) ? 1 : 0); + + //get focus option + prop.put("focus", ((post == null) ? true : post.get("focus", "1").equals("1")) ? 1 : 0); + + // produce vocabulary navigation sidebars + Collection vocabularies = LibraryProvider.autotagging.getVocabularies(); + int j = 0; + for (Tagging v: vocabularies) { + prop.put("sidebarVocabulary_" + j + "_vocabulary", v.getName()); + j++; + } + prop.put("sidebarVocabulary", j); + + // get segment + Segment indexSegment = sb.index; + + final String EXT = header.get("EXT", ""); + final boolean rss = EXT.equals("rss"); + final boolean json = EXT.equals("json"); + prop.put("promoteSearchPageGreeting", promoteSearchPageGreeting); + prop.put( + "promoteSearchPageGreeting.homepage", + sb.getConfig(SwitchboardConstants.GREETING_HOMEPAGE, "")); + prop.put( + "promoteSearchPageGreeting.smallImage", + sb.getConfig(SwitchboardConstants.GREETING_SMALL_IMAGE, "")); + if ( post == null || indexSegment == null || env == null || !searchAllowed ) { + // we create empty entries for template strings + prop.put("searchagain", "0"); + prop.put("former", ""); + prop.put("count", "10"); + prop.put("offset", "0"); + prop.put("resource", "global"); + prop.put("urlmaskfilter", (post == null) ? ".*" : post.get("urlmaskfilter", ".*")); + prop.put("prefermaskfilter", (post == null) ? "" : post.get("prefermaskfilter", "")); + prop.put("tenant", (post == null) ? 
"" : post.get("tenant", "")); + prop.put("indexof", "off"); + prop.put("constraint", ""); + prop.put("cat", "href"); + prop.put("depth", "0"); + prop.put( + "search.verify", + (post == null) ? sb.getConfig("search.verify", "iffresh") : post.get("verify", "iffresh")); + prop.put( + "search.navigation", + (post == null) ? sb.getConfig("search.navigation", "all") : post.get("nav", "all")); + prop.put("contentdom", "text"); + prop.put("contentdomCheckText", "1"); + prop.put("contentdomCheckAudio", "0"); + prop.put("contentdomCheckVideo", "0"); + prop.put("contentdomCheckImage", "0"); + prop.put("contentdomCheckApp", "0"); + prop.put("excluded", "0"); + prop.put("results", ""); + prop.put("resultTable", "0"); + prop.put("num-results", searchAllowed ? "0" : "4"); + prop.put("num-results_totalcount", 0); + prop.put("num-results_offset", 0); + prop.put("num-results_itemsPerPage", 10); + prop.put("geoinfo", "0"); + prop.put("rss_queryenc", ""); + prop.put("meanCount", 5); + return prop; + } + + // check for JSONP + if ( post.containsKey("callback") ) { + final String jsonp = post.get("callback") + "(["; + prop.put("jsonp-start", jsonp); + prop.put("jsonp-end", "])"); + } else { + prop.put("jsonp-start", ""); + prop.put("jsonp-end", ""); + } + + // Adding CORS Access header for yacysearch.rss output + if ( rss ) { + final ResponseHeader outgoingHeader = new ResponseHeader(200); + outgoingHeader.put(HeaderFramework.CORS_ALLOW_ORIGIN, "*"); + prop.setOutgoingHeader(outgoingHeader); + } + + // collect search attributes + + int itemsPerPage = + Math.min( + (authenticated) + ? (snippetFetchStrategy != null && snippetFetchStrategy.isAllowedToFetchOnline() + ? 100 + : 5000) : (snippetFetchStrategy != null + && snippetFetchStrategy.isAllowedToFetchOnline() ? 
20 : 1000), + post.getInt("maximumRecords", post.getInt("count", 10))); // SRU syntax with old property as alternative + int startRecord = post.getInt("startRecord", post.getInt("offset", 0)); + + boolean global = post.get("resource", "local").equals("global") && sb.peers.sizeConnected() > 0; + final boolean indexof = (post != null && post.get("indexof", "").equals("on")); + + String prefermask = (post == null) ? "" : post.get("prefermaskfilter", ""); + if ( !prefermask.isEmpty() && prefermask.indexOf(".*", 0) < 0 ) { + prefermask = ".*" + prefermask + ".*"; + } + + Bitfield constraint = + (post != null && post.containsKey("constraint") && !post.get("constraint", "").isEmpty()) + ? new Bitfield(4, post.get("constraint", "______")) + : null; + if ( indexof ) { + constraint = new Bitfield(4); + constraint.set(Condenser.flag_cat_indexof, true); + } + + // SEARCH + final boolean clustersearch = sb.isRobinsonMode() && sb.getConfig(SwitchboardConstants.CLUSTER_MODE, "").equals(SwitchboardConstants.CLUSTER_MODE_PUBLIC_CLUSTER); + final boolean indexReceiveGranted = + sb.getConfigBool(SwitchboardConstants.INDEX_RECEIVE_ALLOW, true) + || sb.getConfigBool(SwitchboardConstants.INDEX_RECEIVE_AUTODISABLED, true) + || clustersearch; + global = global && indexReceiveGranted; // if the user does not want indexes from remote peers, it cannot be a global searchnn + final boolean intranetMode = sb.isIntranetMode() || sb.isAllIPMode(); + + // increase search statistic counter + if ( !global ) { + // we count only searches on the local peer here, because global searches + // are counted on the target peer to preserve privacy of the searcher + if ( authenticated ) { + // local or authenticated search requests are counted separately + // because they are not part of a public available peer statistic + sb.searchQueriesRobinsonFromLocal++; + } else { + // robinson-searches from non-authenticated requests are public + // and may be part of the public available statistic + 
sb.searchQueriesRobinsonFromRemote++; + } + } + + // find search domain + final Classification.ContentDomain contentdom = + ContentDomain.contentdomParser(post == null ? "all" : post.get("contentdom", "all")); + + // patch until better search profiles are available + if (contentdom == ContentDomain.IMAGE && (itemsPerPage == 10 || itemsPerPage == 100)) { + itemsPerPage = 64; + } else if ( contentdom != ContentDomain.IMAGE && itemsPerPage > 50 && itemsPerPage < 100 ) { + itemsPerPage = 10; + } + + // check the search tracker + TreeSet trackerHandles = sb.localSearchTracker.get(client); + if ( trackerHandles == null ) { + trackerHandles = new TreeSet(); + } + boolean block = false; + if ( Domains.matchesList(client, sb.networkBlacklist) ) { + global = false; + if ( snippetFetchStrategy != null ) { + snippetFetchStrategy = null; + } + block = true; + Log.logWarning("LOCAL_SEARCH", "ACCESS CONTROL: BLACKLISTED CLIENT FROM " + + client + + " gets no permission to search"); + } else if ( Domains.matchesList(client, sb.networkWhitelist) ) { + Log.logInfo("LOCAL_SEARCH", "ACCESS CONTROL: WHITELISTED CLIENT FROM " + + client + + " gets no search restrictions"); + } else if ( !authenticated && !localhostAccess && !intranetMode ) { + // in case that we do a global search or we want to fetch snippets, we check for DoS cases + synchronized ( trackerHandles ) { + final int accInThreeSeconds = + trackerHandles.tailSet(Long.valueOf(System.currentTimeMillis() - 3000)).size(); + final int accInOneMinute = + trackerHandles.tailSet(Long.valueOf(System.currentTimeMillis() - 60000)).size(); + final int accInTenMinutes = + trackerHandles.tailSet(Long.valueOf(System.currentTimeMillis() - 600000)).size(); + // protections against too strong YaCy network load, reduces remote search + if ( global ) { + if ( accInTenMinutes >= 60 || accInOneMinute >= 6 || accInThreeSeconds >= 1 ) { + global = false; + Log.logWarning("LOCAL_SEARCH", "ACCESS CONTROL: CLIENT FROM " + + client + + ": " + + 
accInThreeSeconds + + "/3s, " + + accInOneMinute + + "/60s, " + + accInTenMinutes + + "/600s, " + + " requests, disallowed global search"); + } + } + // protection against too many remote server snippet loads (protects traffic on server) + if ( snippetFetchStrategy != null && snippetFetchStrategy.isAllowedToFetchOnline() ) { + if ( accInTenMinutes >= 20 || accInOneMinute >= 4 || accInThreeSeconds >= 1 ) { + snippetFetchStrategy = CacheStrategy.CACHEONLY; + Log.logWarning("LOCAL_SEARCH", "ACCESS CONTROL: CLIENT FROM " + + client + + ": " + + accInThreeSeconds + + "/3s, " + + accInOneMinute + + "/60s, " + + accInTenMinutes + + "/600s, " + + " requests, disallowed remote snippet loading"); + } + } + // general load protection + if ( accInTenMinutes >= 3000 || accInOneMinute >= 600 || accInThreeSeconds >= 60 ) { + block = true; + Log.logWarning("LOCAL_SEARCH", "ACCESS CONTROL: CLIENT FROM " + + client + + ": " + + accInThreeSeconds + + "/3s, " + + accInOneMinute + + "/60s, " + + accInTenMinutes + + "/600s, " + + " requests, disallowed search"); + } + } + } + + if ( !block && (post == null || post.get("cat", "href").equals("href")) ) { + String urlmask = null; + String protocol = null; + String tld = null; + String ext = null; + + // check available memory and clean up if necessary + if ( !MemoryControl.request(8000000L, false) ) { + indexSegment.fulltext().clearCache(); + SearchEventCache.cleanupEvents(false); + } + + final RankingProfile ranking = sb.getRanking(); + final StringBuilder modifier = new StringBuilder(20); + + int stp = querystring.indexOf('*'); + if (stp >= 0) { + querystring = querystring.substring(0, stp) + Segment.catchallString + querystring.substring(stp + 1); + } + if ( querystring.indexOf("/near", 0) >= 0 ) { + querystring = querystring.replace("/near", ""); + ranking.allZero(); // switch off all attributes + ranking.coeff_worddistance = RankingProfile.COEFF_MAX; + modifier.append("/near "); + } + if ( querystring.indexOf("/date", 0) >= 0 ) { + 
querystring = querystring.replace("/date", ""); + ranking.allZero(); // switch off all attributes + ranking.coeff_date = RankingProfile.COEFF_MAX; + modifier.append("/date "); + } + if ( querystring.indexOf("/https", 0) >= 0 ) { + querystring = querystring.replace("/https", ""); + protocol = "https"; + modifier.append("/https "); + } else if ( querystring.indexOf("/http", 0) >= 0 ) { + querystring = querystring.replace("/http", ""); + protocol = "http"; + modifier.append("/http "); + } + if ( querystring.indexOf("/ftp", 0) >= 0 ) { + querystring = querystring.replace("/ftp", ""); + protocol = "ftp"; + modifier.append("/ftp "); + } + if ( querystring.indexOf("/smb", 0) >= 0 ) { + querystring = querystring.replace("/smb", ""); + protocol = "smb"; + modifier.append("/smb "); + } + + if ( querystring.indexOf("/file", 0) >= 0 ) { + querystring = querystring.replace("/file", ""); + protocol = "file"; + modifier.append("/file "); + } + + if ( querystring.indexOf("/location", 0) >= 0 ) { + querystring = querystring.replace("/location", ""); + if ( constraint == null ) { + constraint = new Bitfield(4); + } + constraint.set(Condenser.flag_cat_haslocation, true); + modifier.append("/location "); + } + + final int lrp = querystring.indexOf("/language/", 0); + String language = ""; + if ( lrp >= 0 ) { + if ( querystring.length() >= (lrp + 12) ) { + language = querystring.substring(lrp + 10, lrp + 12); + } + querystring = querystring.replace("/language/" + language, ""); + language = language.toLowerCase(); + modifier.append("/language/").append(language).append(' '); + } + + final int inurl = querystring.indexOf("inurl:", 0); + if ( inurl >= 0 ) { + int ftb = querystring.indexOf(' ', inurl); + if ( ftb == -1 ) { + ftb = querystring.length(); + } + final String urlstr = querystring.substring(inurl + 6, ftb); + querystring = querystring.replace("inurl:" + urlstr, ""); + if ( !urlstr.isEmpty() ) { + urlmask = urlmask == null ? 
".*" + urlstr + ".*" : urlmask + urlstr + ".*"; + } + modifier.append("inurl:").append(urlstr).append(' '); + } + + final int filetype = querystring.indexOf("filetype:", 0); + if ( filetype >= 0 ) { + int ftb = querystring.indexOf(' ', filetype); + if ( ftb == -1 ) { + ftb = querystring.length(); + } + ext = querystring.substring(filetype + 9, ftb); + querystring = querystring.replace("filetype:" + ext, ""); + while ( !ext.isEmpty() && ext.charAt(0) == '.' ) { + ext = ext.substring(1); + } + modifier.append("filetype:").append(ext).append(' '); + if (ext.isEmpty()) ext = null; + } + + int voc = 0; + Collection metatags = new ArrayList(1); + while ((voc = querystring.indexOf("/vocabulary/", 0)) >= 0) { + String vocabulary = ""; + int ve = querystring.indexOf(' ', voc + 12); + if (ve < 0) { + vocabulary = querystring.substring(voc); + querystring = querystring.substring(0, voc).trim(); + } else { + vocabulary = querystring.substring(voc, ve); + querystring = querystring.substring(0, voc) + querystring.substring(ve); + } + modifier.append(vocabulary).append(' '); + vocabulary = vocabulary.substring(12); + int p = vocabulary.indexOf('/'); + if (p > 0) { + String k = vocabulary.substring(0, p); + String v = vocabulary.substring(p + 1); + metatags.add(LibraryProvider.autotagging.metatag(k, v)); + } + } + + int radius = 0; + double lon = 0.0d, lat = 0.0d, rad = 0.0d; + if ((radius = querystring.indexOf("/radius/")) >= 0) { + int ve = querystring.indexOf(' ', radius + 8); + String geo = ""; + if (ve < 0) { + geo = querystring.substring(radius); + querystring = querystring.substring(0, radius).trim(); + } else { + geo = querystring.substring(radius, ve); + querystring = querystring.substring(0, radius) + querystring.substring(ve); + } + geo = geo.substring(8); + String[] sp = geo.split("/"); + if (sp.length == 3) try { + lat = Double.parseDouble(sp[0]); + lon = Double.parseDouble(sp[1]); + rad = Double.parseDouble(sp[2]); + } catch (NumberFormatException e) { + lon = 0.0d; 
lat = 0.0d; rad = 0.0d; + } + } + + final int site = querystring.indexOf("site:", 0); + String sitehash = null; + String sitehost = null; + if ( site >= 0 ) { + int ftb = querystring.indexOf(' ', site); + if ( ftb == -1 ) { + ftb = querystring.length(); + } + sitehost = querystring.substring(site + 5, ftb); + querystring = querystring.replace("site:" + sitehost, ""); + while ( sitehost.length() > 0 && sitehost.charAt(0) == '.' ) { + sitehost = sitehost.substring(1); + } + while ( sitehost.endsWith(".") ) { + sitehost = sitehost.substring(0, sitehost.length() - 1); + } + sitehash = DigestURI.hosthash(sitehost); + modifier.append("site:").append(sitehost).append(' '); + } + + final int heuristicBlekko = querystring.indexOf("/heuristic/blekko", 0); + if ( heuristicBlekko >= 0 ) { + querystring = querystring.replace("/heuristic/blekko", ""); + modifier.append("/heuristic/blekko "); + } + + final int heuristicTwitter = querystring.indexOf("/heuristic/twitter", 0); + if ( heuristicBlekko >= 0 ) { + querystring = querystring.replace("/heuristic/twitter", ""); + modifier.append("/heuristic/twitter "); + } + + final int authori = querystring.indexOf("author:", 0); + String author = null; + if ( authori >= 0 ) { + // check if the author was given with single quotes or without + final boolean quotes = (querystring.charAt(authori + 7) == '('); + if ( quotes ) { + int ftb = querystring.indexOf(')', authori + 8); + if (ftb == -1) ftb = querystring.length() + 1; + author = querystring.substring(authori + 8, ftb); + querystring = querystring.replace("author:(" + author + ")", ""); + modifier.append("author:(").append(author).append(") "); + } else { + int ftb = querystring.indexOf(' ', authori); + if ( ftb == -1 ) { + ftb = querystring.length(); + } + author = querystring.substring(authori + 7, ftb); + querystring = querystring.replace("author:" + author, ""); + modifier.append("author:").append(author).append(' '); + } + } + + final int tldp = querystring.indexOf("tld:", 0); + if 
(tldp >= 0) { + int ftb = querystring.indexOf(' ', tldp); + if (ftb == -1) ftb = querystring.length(); + tld = querystring.substring(tldp + 4, ftb); + querystring = querystring.replace("tld:" + tld, ""); + modifier.append("tld:").append(tld).append(' '); + while ( tld.length() > 0 && tld.charAt(0) == '.' ) { + tld = tld.substring(1); + } + if (tld.length() == 0) tld = null; + } + if (urlmask == null || urlmask.isEmpty()) urlmask = ".*"; //if no urlmask was given + + // read the language from the language-restrict option 'lr' + // if no one is given, use the user agent or the system language as default + language = (post == null) ? language : post.get("lr", language); + if ( language.startsWith("lang_") ) { + language = language.substring(5); + } + if ( !ISO639.exists(language) ) { + // find out language of the user by reading of the user-agent string + String agent = header.get(HeaderFramework.ACCEPT_LANGUAGE); + if ( agent == null ) { + agent = System.getProperty("user.language"); + } + language = (agent == null) ? "en" : ISO639.userAgentLanguageDetection(agent); + if ( language == null ) { + language = "en"; + } + } + + // the query + final QueryGoal qg = new QueryGoal(originalquerystring, querystring.trim()); + final int maxDistance = (querystring.indexOf('"', 0) >= 0) ? 
qg.getAllHashes().size() - 1 : Integer.MAX_VALUE; + + // filter out stopwords + final SortedSet filtered = SetTools.joinConstructiveByTest(qg.getIncludeStrings(), Switchboard.stopwords); + if ( !filtered.isEmpty() ) { + SetTools.excludeDestructiveByTestSmallInLarge(qg.getIncludeStrings(), Switchboard.stopwords); + } + + // if a minus-button was hit, remove a special reference first + if ( post != null && post.containsKey("deleteref") ) { + try { + if ( !sb.verifyAuthentication(header) ) { + prop.authenticationRequired(); + return prop; + } + + // delete the index entry locally + final String delHash = post.get("deleteref", ""); // urlhash + indexSegment.termIndex().remove(qg.getIncludeHashes(), delHash.getBytes()); + + // make new news message with negative voting + if ( !sb.isRobinsonMode() ) { + final Map map = new HashMap(); + map.put("urlhash", delHash); + map.put("vote", "negative"); + map.put("refid", ""); + sb.peers.newsPool.publishMyNews( + sb.peers.mySeed(), + NewsPool.CATEGORY_SURFTIPP_VOTE_ADD, + map); + } + + // delete the search history since this still shows the entry + SearchEventCache.delete(delHash); + } catch ( final IOException e ) { + Log.logException(e); + } + } + + // if a plus-button was hit, create new voting message + if ( post != null && post.containsKey("recommendref") ) { + if ( !sb.verifyAuthentication(header) ) { + prop.authenticationRequired(); + return prop; + } + final String recommendHash = post.get("recommendref", ""); // urlhash + final URIMetadataNode urlentry = indexSegment.fulltext().getMetadata(UTF8.getBytes(recommendHash)); + if ( urlentry != null ) { + Document[] documents = null; + try { + documents = + sb.loader.loadDocuments( + sb.loader.request(urlentry.url(), true, false), + CacheStrategy.IFEXIST, + Integer.MAX_VALUE, BlacklistType.SEARCH, TextSnippet.snippetMinLoadDelay); + } catch ( final IOException e ) { + } catch ( final Parser.Failure e ) { + } + if ( documents != null ) { + // create a news message + final Map 
map = new HashMap(); + map.put("url", urlentry.url().toNormalform(true).replace(',', '|')); + map.put("title", urlentry.dc_title().replace(',', ' ')); + map.put("description", documents[0].dc_title().replace(',', ' ')); + map.put("author", documents[0].dc_creator()); + map.put("tags", documents[0].dc_subject(' ')); + sb.peers.newsPool.publishMyNews( + sb.peers.mySeed(), + NewsPool.CATEGORY_SURFTIPP_ADD, + map); + documents[0].close(); + } + } + } + + // if a bookmarks-button was hit, create new bookmark entry + if ( post != null && post.containsKey("bookmarkref") ) { + if ( !sb.verifyAuthentication(header) ) { + prop.authenticationRequired(); + return prop; + } + final String bookmarkHash = post.get("bookmarkref", ""); // urlhash + final DigestURI url = indexSegment.fulltext().getURL(UTF8.getBytes(bookmarkHash)); + if ( url != null ) { + try { + sb.tables.bookmarks.createBookmark( + sb.loader, + url, + YMarkTables.USER_ADMIN, + true, + "searchresult", + "/search"); + } catch ( final Throwable e ) { + } + } + } + + // check filters + try { + Pattern.compile(urlmask); + } catch ( final PatternSyntaxException ex ) { + SearchEvent.log.logWarning("Illegal URL mask, not a valid regex: " + urlmask); + prop.put("urlmaskerror", 1); + prop.putHTML("urlmaskerror_urlmask", urlmask); + urlmask = ".*"; + } + + try { + Pattern.compile(prefermask); + } catch ( final PatternSyntaxException ex ) { + SearchEvent.log.logWarning("Illegal prefer mask, not a valid regex: " + prefermask); + prop.put("prefermaskerror", 1); + prop.putHTML("prefermaskerror_prefermask", prefermask); + prefermask = ""; + } + + // do the search + final QueryParams theQuery = + new QueryParams( + qg, + modifier.toString().trim(), + maxDistance, + prefermask, + contentdom, + language, + metatags, + snippetFetchStrategy, + itemsPerPage, + startRecord, + urlmask, protocol, tld, ext, + clustersearch && global ? QueryParams.Searchdom.CLUSTER : (global && indexReceiveGranted + ? 
QueryParams.Searchdom.GLOBAL + : QueryParams.Searchdom.LOCAL), + 20, + constraint, + true, + sitehash, + sitehost, + DigestURI.hosthashess(sb.getConfig("search.excludehosth", "")), + author, + DigestURI.TLD_any_zone_filter, + client, + authenticated, + indexSegment, + ranking, + header.get(RequestHeader.USER_AGENT, ""), + sb.getConfigBool(SwitchboardConstants.SEARCH_VERIFY_DELETE, false) + && sb.getConfigBool(SwitchboardConstants.NETWORK_SEARCHVERIFY, false) + && sb.peers.mySeed().getFlagAcceptRemoteIndex(), + lat, lon, rad); + EventTracker.delete(EventTracker.EClass.SEARCH); + EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch( + theQuery.id(true), + SearchEventType.INITIALIZATION, + "", + 0, + 0), false); + + // tell all threads to do nothing for a specific time + sb.intermissionAllThreads(3000); + + // filter out words that appear in bluelist + theQuery.getQueryGoal().filterOut(Switchboard.blueList); + + // log + Log.logInfo( + "LOCAL_SEARCH", + "INIT WORD SEARCH: " + + theQuery.getQueryGoal().getOriginalQueryString(false) + + ":" + + QueryParams.hashSet2hashString(theQuery.getQueryGoal().getIncludeHashes()) + + " - " + + theQuery.neededResults() + + " links to be computed, " + + theQuery.itemsPerPage() + + " lines to be displayed"); + EventChannel.channels(EventChannel.LOCALSEARCH).addMessage( + new RSSMessage("Local Search Request", theQuery.getQueryGoal().getOriginalQueryString(false), "")); + final long timestamp = System.currentTimeMillis(); + + // create a new search event + if ( SearchEventCache.getEvent(theQuery.id(false)) == null ) { + theQuery.setOffset(0); // in case that this is a new search, always start without a offset + startRecord = 0; + } + final SearchEvent theSearch = + SearchEventCache.getEvent( + theQuery, + sb.peers, + sb.tables, + (sb.isRobinsonMode()) ? 
sb.clusterhashes : null, + false, + sb.loader, + (int) sb.getConfigLong( + SwitchboardConstants.REMOTESEARCH_MAXCOUNT_USER, + sb.getConfigLong(SwitchboardConstants.REMOTESEARCH_MAXCOUNT_DEFAULT, 10)), + sb.getConfigLong( + SwitchboardConstants.REMOTESEARCH_MAXTIME_USER, + sb.getConfigLong(SwitchboardConstants.REMOTESEARCH_MAXTIME_DEFAULT, 3000)), + (int) sb.getConfigLong(SwitchboardConstants.DHT_BURST_ROBINSON, 0), + (int) sb.getConfigLong(SwitchboardConstants.DHT_BURST_MULTIWORD, 0)); + + if ( startRecord == 0 ) { + if ( sitehost != null && sb.getConfigBool("heuristic.site", false) && authenticated ) { + sb.heuristicSite(theSearch, sitehost); + } + if ( (heuristicBlekko >= 0 || sb.getConfigBool("heuristic.blekko", false)) && authenticated ) { + sb.heuristicRSS("http://blekko.com/ws/$+/rss", theSearch, "blekko"); + } + if ( (heuristicTwitter >= 0 || sb.getConfigBool("heuristic.twitter", false)) && authenticated ) { + sb.heuristicRSS("http://search.twitter.com/search.rss?rpp=50&q=$", theSearch, "twitter"); + } + if (sb.getConfigBool("heuristic.opensearch", false) && authenticated) { + OpenSearchConnector.query(sb, theSearch); + } + } + + // log + Log.logInfo("LOCAL_SEARCH", "EXIT WORD SEARCH: " + + theQuery.getQueryGoal().getOriginalQueryString(false) + + " - " + + "local_rwi_available(" + theSearch.query.local_rwi_available.get() + "), " + + "local_rwi_stored(" + theSearch.query.local_rwi_stored.get() + "), " + + "local_solr_available(" + theSearch.query.local_solr_available.get() + "), " + + "local_solr_stored(" + theSearch.query.local_solr_stored.get() + "), " + + "remote_available(" + theSearch.query.remote_available.get() + "), " + + "remote_stored(" + theSearch.query.remote_stored.get() + "), " + + "remote_peerCount(" + theSearch.query.remote_peerCount.get() + "), " + + "local_sortout(" + theSearch.query.misses.size() + "), " + + (System.currentTimeMillis() - timestamp) + + " ms"); + + // prepare search statistics + theQuery.searchtime = 
System.currentTimeMillis() - timestamp; + theQuery.urlretrievaltime = theSearch.getURLRetrievalTime(); + theQuery.snippetcomputationtime = theSearch.getSnippetComputationTime(); + AccessTracker.add(AccessTracker.Location.local, theQuery); + + // check suggestions + final int meanMax = (post != null) ? post.getInt("meanCount", 0) : 0; + + prop.put("meanCount", meanMax); + if ( meanMax > 0 && !json && !rss ) { + final DidYouMean didYouMean = new DidYouMean(indexSegment, new StringBuilder(querystring)); + final Iterator meanIt = didYouMean.getSuggestions(100, 5).iterator(); + int meanCount = 0; + String suggestion; + try { + meanCollect: while ( meanCount < meanMax && meanIt.hasNext() ) { + try { + suggestion = meanIt.next().toString(); + prop.put("didYouMean_suggestions_" + meanCount + "_word", suggestion); + prop.put( + "didYouMean_suggestions_" + meanCount + "_url", + QueryParams.navurl( + "html", + 0, + theQuery, + suggestion).toString()); + prop.put("didYouMean_suggestions_" + meanCount + "_sep", "|"); + meanCount++; + } catch (ConcurrentModificationException e) {break meanCollect;} + } + } catch (ConcurrentModificationException e) {} + prop.put("didYouMean_suggestions_" + (meanCount - 1) + "_sep", ""); + prop.put("didYouMean", meanCount > 0 ? 
1 : 0); + prop.put("didYouMean_suggestions", meanCount); + } else { + prop.put("didYouMean", 0); + } + + // find geographic info + final SortedSet coordinates = LibraryProvider.geoLoc.find(originalquerystring, false); + if ( coordinates == null || coordinates.isEmpty() || startRecord > 0 ) { + prop.put("geoinfo", "0"); + } else { + int i = 0; + for ( final GeoLocation c : coordinates ) { + prop.put("geoinfo_loc_" + i + "_lon", Math.round(c.lon() * 10000.0f) / 10000.0f); + prop.put("geoinfo_loc_" + i + "_lat", Math.round(c.lat() * 10000.0f) / 10000.0f); + prop.put("geoinfo_loc_" + i + "_name", c.getName()); + i++; + if ( i >= 10 ) { + break; + } + } + prop.put("geoinfo_loc", i); + prop.put("geoinfo", "1"); + } + + // update the search tracker + try { + synchronized ( trackerHandles ) { + trackerHandles.add(theQuery.starttime); + while ( trackerHandles.size() > 600 ) { + if ( !trackerHandles.remove(trackerHandles.first()) ) { + break; + } + } + } + sb.localSearchTracker.put(client, trackerHandles); + if ( sb.localSearchTracker.size() > 100 ) { + sb.localSearchTracker.remove(sb.localSearchTracker.keys().nextElement()); + } + if ( MemoryControl.shortStatus() ) { + sb.localSearchTracker.clear(); + } + } catch ( final Exception e ) { + Log.logException(e); + } + + prop.put("num-results_offset", startRecord == 0 ? 0 : startRecord + 1); + prop.put("num-results_itemscount", Formatter.number(startRecord + theSearch.query.itemsPerPage > theSearch.query.getResultCount() ? startRecord + theSearch.query.getResultCount() % theSearch.query.itemsPerPage : startRecord + theSearch.query.itemsPerPage, true)); + prop.put("num-results_itemsPerPage", Formatter.number(itemsPerPage)); + prop.put("num-results_totalcount", Formatter.number(theSearch.query.getResultCount())); + prop.put("num-results_globalresults", global && (indexReceiveGranted || clustersearch) ? 
"1" : "0"); + prop.put("num-results_globalresults_localResourceSize", Formatter.number(theSearch.query.local_rwi_available.get() + theSearch.query.local_solr_available.get(), true)); + prop.put("num-results_globalresults_localMissCount", Formatter.number(theSearch.query.misses.size(), true)); + prop.put("num-results_globalresults_remoteResourceSize", Formatter.number(theSearch.query.remote_available.get(), true)); + prop.put("num-results_globalresults_remoteIndexCount", Formatter.number(theSearch.query.remote_stored.get(), true)); + prop.put("num-results_globalresults_remotePeerCount", Formatter.number(theSearch.query.remote_peerCount.get(), true)); + + // compose page navigation + final StringBuilder resnav = new StringBuilder(200); + final int thispage = startRecord / theQuery.itemsPerPage(); + if ( thispage == 0 ) { + resnav + .append("\"arrowleft\" "); + } else { + resnav.append("\"arrowleft\" "); + } + final int numberofpages = Math.min(10, 1 + ((theSearch.query.getResultCount() - 1) / theQuery.itemsPerPage())); + + for ( int i = 0; i < numberofpages; i++ ) { + if ( i == thispage ) { + resnav.append("\"page"); "); + } else { + resnav.append("\"page"); "); + } + } + if ( thispage >= numberofpages ) { + resnav + .append("\"arrowright\""); + } else { + resnav.append("\"arrowright\""); + } + final String resnavs = resnav.toString(); + prop.put("num-results_resnav", resnavs); + prop.put("pageNavBottom", (theSearch.query.getResultCount() - startRecord > 6) ? 
1 : 0); // if there are more results than may fit on the page we add a navigation at the bottom + prop.put("pageNavBottom_resnav", resnavs); + + // generate the search result lines; the content will be produced by another servlet + for ( int i = 0; i < theQuery.itemsPerPage(); i++ ) { + prop.put("results_" + i + "_item", startRecord + i); + prop.put("results_" + i + "_eventID", theQuery.id(false)); + } + prop.put("results", theQuery.itemsPerPage()); + prop + .put( + "resultTable", + (contentdom == ContentDomain.APP || contentdom == ContentDomain.AUDIO || contentdom == ContentDomain.VIDEO) + ? 1 + : 0); + prop.put("eventID", theQuery.id(false)); // for bottomline + + // process result of search + if ( !filtered.isEmpty() ) { + prop.put("excluded", "1"); + prop.putHTML("excluded_stopwords", filtered.toString()); + } else { + prop.put("excluded", "0"); + } + + if ( prop == null || prop.isEmpty() ) { + if ( post.get("query", post.get("search", "")).length() < 2 ) { + prop.put("num-results", "2"); // no results - at least 2 chars + } else { + prop.put("num-results", "1"); // no results + } + } else { + prop.put("num-results", "3"); + } + + prop.put("cat", "href"); + prop.put("depth", "0"); + + // adding some additional properties needed for the rss feed + String hostName = header.get("Host", Domains.LOCALHOST); + if ( hostName.indexOf(':', 0) == -1 ) { + hostName += ":" + serverCore.getPortNr(env.getConfig("port", "8090")); + } + prop.put("searchBaseURL", "http://" + hostName + "/yacysearch.html"); + prop.put("rssYacyImageURL", "http://" + hostName + "/env/grafics/yacy.gif"); + prop.put("thisaddress", hostName); + } + + prop.put("searchagain", global ? "1" : "0"); + prop.putHTML("former", originalquerystring); + prop.put("count", itemsPerPage); + prop.put("offset", startRecord); + prop.put("resource", global ? "global" : "local"); + prop.putHTML("prefermaskfilter", prefermask); + prop.put("indexof", (indexof) ? 
"on" : "off"); + prop.put("constraint", (constraint == null) ? "" : constraint.exportB64()); + prop.put("search.verify", snippetFetchStrategy == null + ? sb.getConfig("search.verify", "iffresh") + : snippetFetchStrategy.toName()); + prop.put( + "search.navigation", + (post == null) ? sb.getConfig("search.navigation", "all") : post.get("nav", "all")); + prop.put("contentdom", (post == null ? "text" : post.get("contentdom", "text"))); + prop.put( + "searchdomswitches", + sb.getConfigBool("search.text", true) + || sb.getConfigBool("search.audio", true) + || sb.getConfigBool("search.video", true) + || sb.getConfigBool("search.image", true) + || sb.getConfigBool("search.app", true) ? 1 : 0); + prop.put("searchdomswitches_searchtext", sb.getConfigBool("search.text", true) ? 1 : 0); + prop.put("searchdomswitches_searchaudio", sb.getConfigBool("search.audio", true) ? 1 : 0); + prop.put("searchdomswitches_searchvideo", sb.getConfigBool("search.video", true) ? 1 : 0); + prop.put("searchdomswitches_searchimage", sb.getConfigBool("search.image", true) ? 1 : 0); + prop.put("searchdomswitches_searchapp", sb.getConfigBool("search.app", true) ? 1 : 0); + prop.put("searchdomswitches_searchtext_check", (contentdom == ContentDomain.TEXT || contentdom == ContentDomain.ALL) ? "1" : "0"); + prop.put("searchdomswitches_searchaudio_check", (contentdom == ContentDomain.AUDIO) ? "1" : "0"); + prop.put("searchdomswitches_searchvideo_check", (contentdom == ContentDomain.VIDEO) ? "1" : "0"); + prop.put("searchdomswitches_searchimage_check", (contentdom == ContentDomain.IMAGE) ? "1" : "0"); + prop.put("searchdomswitches_searchapp_check", (contentdom == ContentDomain.APP) ? 
"1" : "0"); + + // copy properties for "more options" link + prop.put("searchdomswitches_count", prop.get("count")); + prop.put("searchdomswitches_urlmaskfilter", prop.get("urlmaskfilter")); + prop.put("searchdomswitches_prefermaskfilter", prop.get("prefermaskfilter")); + prop.put("searchdomswitches_cat", prop.get("cat")); + prop.put("searchdomswitches_constraint", prop.get("constraint")); + prop.put("searchdomswitches_contentdom", prop.get("contentdom")); + prop.put("searchdomswitches_former", prop.get("former")); + prop.put("searchdomswitches_meanCount", prop.get("meanCount")); + + // for RSS: don't HTML encode some elements + prop.putXML("rss_query", originalquerystring); + prop.putXML("rss_queryenc", originalquerystring.replace(' ', '+')); + + sb.localSearchLastAccess = System.currentTimeMillis(); + + // hostname and port (assume locahost if nothing helps) + final InetAddress hostIP = Domains.myPublicLocalIP(); + prop.put("myhost", hostIP != null ? hostIP.getHostAddress() : Domains.LOCALHOST); + prop.put("myport", serverCore.getPortNr(sb.getConfig("port", "8090"))); + + // return rewrite properties + return prop; + } +} diff --git a/source/net/yacy/cora/federate/opensearch/OpenSearchConnector.java b/source/net/yacy/cora/federate/opensearch/OpenSearchConnector.java new file mode 100644 index 000000000..59411c7c3 --- /dev/null +++ b/source/net/yacy/cora/federate/opensearch/OpenSearchConnector.java @@ -0,0 +1,275 @@ +/** + * OpenSearchConnector + * Copyright 2012 by Michael Peter Christen + * First released 03.11.2012 at http://yacy.net + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see <http://www.gnu.org/licenses/>. + */ +package net.yacy.cora.federate.opensearch; + +import java.io.*; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.*; +import net.yacy.cora.federate.solr.YaCySchema; +import net.yacy.cora.federate.solr.connector.EmbeddedSolrConnector; +import net.yacy.cora.federate.yacy.ConfigurationSet; +import net.yacy.cora.federate.yacy.ConfigurationSet.Entry; +import net.yacy.cora.util.SpaceExceededException; +import net.yacy.document.parser.xml.opensearchdescriptionReader; +import net.yacy.kelondro.blob.Tables; +import net.yacy.kelondro.logging.Log; +import net.yacy.search.Switchboard; +import net.yacy.search.query.SearchEvent; +import org.apache.solr.common.SolrDocument; +import org.apache.solr.common.SolrDocumentList; + +/** + * Handling of queries to remote OpenSearch systems. Iterates over a list of + * configured systems until the number of needed results is available. Uses a + * temporary work table to store search template urls for the iteration during + * search. 
+ */
+public class OpenSearchConnector {
+
+    /** Name of the work table that holds the active search template urls. */
+    private static final String WORKTABLE = "opensearchsys";
+
+    /** Number of Solr documents fetched per request by the discover job. */
+    private static final int DISCOVER_CHUNK_SIZE = 20;
+
+    private File confFile = null; // later initialized to DATA/SETTINGS/heuristicopensearch.conf
+    private int size = 0; // remember the size of active opensearch targets
+
+    /**
+     * Creates a connector bound to the heuristicopensearch.conf file below the
+     * data path of the given switchboard.
+     *
+     * @param sb the switchboard; if null the connector stays unconfigured
+     *        (no config file, {@link #add} returns false)
+     * @param createworktable if true the work table is cleared and refilled
+     *        with all enabled entries of the config file
+     */
+    public OpenSearchConnector(Switchboard sb, boolean createworktable) {
+        super();
+        if (sb == null) {
+            return;
+        }
+
+        confFile = new File(sb.getDataPath(), "DATA/SETTINGS/heuristicopensearch.conf");
+
+        if (createworktable) { // read from config file and create worktable
+            sb.tables.clear(WORKTABLE);
+            try {
+                ConfigurationSet cfg = new ConfigurationSet(confFile);
+
+                // copy active opensearch systems to a work table (opensearchsys)
+                Iterator<Entry> cfgentries = cfg.entryIterator();
+                while (cfgentries.hasNext()) {
+                    Entry e = cfgentries.next();
+                    if (e.enabled()) {
+                        String title = e.key(); // get the title
+                        String urlstr = e.getValue(); // get the search template url
+
+                        Tables.Data row = new Tables.Data();
+                        row.put("title", title);
+                        row.put("url", urlstr);
+                        try {
+                            sb.tables.insert(WORKTABLE, row);
+                        } catch (SpaceExceededException ex) {
+                            Log.logException(ex);
+                        }
+                    }
+                }
+                size = sb.tables.size(WORKTABLE);
+            } catch (IOException ex) {
+                Log.logException(ex);
+            }
+        }
+    }
+
+    /**
+     * Sends a search request to the remote systems listed in the work table as
+     * long as the search event contains fewer results than needed. Depending
+     * on the already collected search results none to all configured systems
+     * are queried to complete the available search results.
+     * If the query search domain is LOCAL this procedure does nothing.
+     *
+     * @param sb the switchboard providing the work table; null is ignored
+     * @param theSearch the running search event; null is ignored
+     */
+    static public void query(Switchboard sb, SearchEvent theSearch) {
+        if (theSearch == null || sb == null) {
+            return;
+        }
+        if (theSearch.query.isLocal()) {
+            return;
+        }
+        try {
+            Iterator<Tables.Row> ossysworktable = sb.tables.iterator(WORKTABLE);
+            int needres = theSearch.query.neededResults(); // get number of needed results
+            while (ossysworktable.hasNext() && theSearch.query.getResultCount() < needres) {
+                Tables.Row row = ossysworktable.next();
+                String osurl = row.get("url", "");
+                String name = row.get("title", "");
+                // to reuse existing heuristicRSS procedure replace querystring with "$"
+                // querystring is inserted/replaced inside heuristicRSS
+                sb.heuristicRSS(parseSearchTemplate(osurl, "$", 0, theSearch.query.itemsPerPage), theSearch, "opensearch:" + name);
+            }
+        } catch (IOException ex) {
+            Log.logWarning("OpenSearchConnector.query", "failed reading table opensearchsys");
+        }
+    }
+
+    /**
+     * Replaces the OpenSearch description search template parameters with
+     * actual values. Optional parameters, written as '{param?}', are treated
+     * like their mandatory form '{param}'.
+     *
+     * @param searchurltemplate the url template, e.g. http://host/search?q={searchTerms}&amp;start={startIndex?}
+     * @param query value inserted for {searchTerms}
+     * @param start value inserted for {startIndex}
+     * @param rows value inserted for {count}
+     * @return the template with all known parameters replaced
+     */
+    private static String parseSearchTemplate(String searchurltemplate, String query, int start, int rows) {
+        // strip the 'optional' marker wherever it occurs; the former pattern '?}=' never
+        // matched because in a url a parameter is followed by '&' or the end, not by '='
+        String tmps = searchurltemplate.replace("?}", "}");
+        tmps = tmps.replace("{startIndex}", Integer.toString(start));
+        tmps = tmps.replace("{startPage}", "");
+        tmps = tmps.replace("{count}", Integer.toString(rows));
+        tmps = tmps.replace("{language}", "");
+        tmps = tmps.replace("{inputEncoding}", "UTF-8");
+        tmps = tmps.replace("{outputEncoding}", "UTF-8");
+        return tmps.replace("{searchTerms}", query);
+    }
+
+    /**
+     * Adds an opensearch target system to the config file.
+     *
+     * @param name key of the entry (title of the system); must not be empty
+     * @param url the search template url
+     * @param active whether the entry is enabled
+     * @param comment comment stored with the entry
+     * @return true if the entry was written to the config file; false if the
+     *         connector has no config file, the name is empty or writing failed
+     */
+    public boolean add(String name, String url, boolean active, String comment) {
+        if (confFile == null) {
+            return false;
+        }
+        if (name == null || name.isEmpty()) {
+            return false;
+        }
+
+        ConfigurationSet conf = new ConfigurationSet(confFile);
+        conf.add(name, null, active);
+        Entry e = conf.get(name);
+        e.setValue(url);
+        e.setEnable(active);
+        e.setComment(comment);
+        conf.put(name, e);
+        try {
+            conf.commit();
+        } catch (IOException ex) {
+            Log.logWarning("OpenSearchConnector.add", "config file write error");
+            Log.logException(ex);
+            return false; // nothing was persisted, do not report success
+        }
+        return true;
+    }
+
+    /**
+     * Get the number of active remote opensearch target systems
+     * (rows of the work table as counted when it was built by the constructor).
+     */
+    public int getSize() {
+        return size;
+    }
+
+    /**
+     * Collects the link tag texts of a Solr document from the outbound and the
+     * inbound link field into a fresh list. A missing field is skipped and the
+     * collections owned by the document are not modified.
+     */
+    private static Collection<Object> collectLinkTags(final SolrDocument sdoc) {
+        final Collection<Object> tags = new ArrayList<Object>();
+        final Collection<Object> outbound = sdoc.getFieldValues(YaCySchema.outboundlinks_tag_txt.getSolrFieldName());
+        if (outbound != null) {
+            tags.addAll(outbound);
+        }
+        final Collection<Object> inbound = sdoc.getFieldValues(YaCySchema.inboundlinks_tag_txt.getSolrFieldName());
+        if (inbound != null) {
+            tags.addAll(inbound);
+        }
+        return tags;
+    }
+
+    /**
+     * Extracts the value of the href attribute from a link tag text that
+     * mentions "search", e.g. a tag carrying rel="search" and href="http://host/osd.xml".
+     *
+     * @return the href value, or null if the text has no usable href attribute
+     */
+    private static String extractSearchHref(final String tagtxt) {
+        if (!tagtxt.contains("search")) {
+            return null;
+        }
+        final int hrefpos = tagtxt.indexOf("href=");
+        if (hrefpos <= 0) {
+            return null;
+        }
+        final int urlstart = hrefpos + 6; // skip 'href=' and the opening quote
+        final int urlend = tagtxt.indexOf('"', urlstart); // -1 also if urlstart is behind the end
+        if (urlend < 0) {
+            return null; // unterminated attribute value
+        }
+        return tagtxt.substring(urlstart, urlend);
+    }
+
+    /**
+     * Discover opensearch description links from local (embedded) Solr index using
+     * meta data field 'outboundlinks_tag_txt' and add found systems to the
+     * config file. Found systems are added as inactive entries.
+     * The index is scanned by a background thread which stops after one hour
+     * at the latest.
+     *
+     * @param sb the switchboard giving access to the index
+     * @return true if the background job was started; false if sb is null, the
+     *         embedded Solr index or the needed schema field is not available,
+     *         the initial query failed or it returned no results
+     */
+    public boolean discoverFromSolrIndex(final Switchboard sb) {
+        if (sb == null) {
+            return false;
+        }
+        final EmbeddedSolrConnector connector = (EmbeddedSolrConnector) sb.index.fulltext().getLocalSolr();
+        // check if needed Solr fields are available (selected)
+        if (connector == null) {
+            Log.logSevere("OpenSearchConnector.Discover", "Error on connecting to embedded Solr index");
+            return false;
+        }
+        final boolean metafieldNOTavailable = sb.index.fulltext().getSolrScheme().containsDisabled(YaCySchema.outboundlinks_tag_txt.name());
+        if (metafieldNOTavailable) {
+            Log.logWarning("OpenSearchConnector.Discover", "Solr Schema field outboundlinks_tag_txt must be switched on");
+            return false;
+        }
+        // the solr query
+        final String solrquerystr = YaCySchema.outboundlinks_tag_txt.getSolrFieldName() + ":\"rel=\\\"search\\\"\" OR "
+                + YaCySchema.inboundlinks_tag_txt.getSolrFieldName() + ":\"rel=\\\"search\\\"\"&fl="
+                + YaCySchema.sku.getSolrFieldName() + "," + YaCySchema.outboundlinks_tag_txt.getSolrFieldName() +"," + YaCySchema.inboundlinks_tag_txt.getSolrFieldName();
+        final long numfound;
+        try {
+            SolrDocumentList docList = connector.query(solrquerystr, 0, 1);
+            numfound = docList.getNumFound();
+            if (numfound == 0) {
+                Log.logInfo("OpenSearchConnector.Discover", "no results found, abort discover job");
+                return false;
+            } else {
+                Log.logInfo("OpenSearchConnector.Discover", "start checking " + Long.toString(numfound) + " found index results");
+            }
+        } catch (IOException ex) {
+            Log.logException(ex);
+            return false;
+        }
+
+        final long stoptime = System.currentTimeMillis() + 1000 * 3600; // make sure job doesn't run forever
+
+        // job to iterate through Solr index to find links to opensearchdescriptions
+        // started as background job as connect timeouts may cause it run a long time
+        final Thread job = new Thread() {
+            @Override
+            public void run() {
+                try {
+                    boolean doloop = true;
+                    int loopnr = 0;
+                    Set<String> dblmem = new HashSet<String>(); // temp memory for already checked url
+                    while (doloop) {
+                        Log.logInfo("OpenSearchConnector.Discover", "start Solr query loop at " + Integer.toString(loopnr * DISCOVER_CHUNK_SIZE) + " of " + Long.toString(numfound));
+                        SolrDocumentList docList = connector.query(solrquerystr, loopnr * DISCOVER_CHUNK_SIZE, DISCOVER_CHUNK_SIZE); // check chunk of 20 result documents
+                        loopnr++;
+                        if (stoptime < System.currentTimeMillis()) {// stop after max 1h
+                            doloop = false; // the chunk just loaded is still processed
+                            Log.logInfo("OpenSearchConnector.Discover", "long running discover task aborted");
+                        }
+                        if (docList == null || docList.size() == 0) {
+                            doloop = false; // end of result list reached
+                            continue;
+                        }
+                        for (SolrDocument sdoc : docList) {
+                            for (Object tagvalue : collectLinkTags(sdoc)) {
+                                if (!(tagvalue instanceof String)) {
+                                    continue;
+                                }
+                                // check and extract links to opensearchdescription
+                                String hrefurltxt = extractSearchHref((String) tagvalue);
+                                if (hrefurltxt == null) {
+                                    continue;
+                                }
+                                // hrefurltxt contains now url to opensearchdescription
+                                try {
+                                    URL url = new URL(hrefurltxt);
+                                    //TODO: check Blacklist
+                                    if (dblmem.add(url.getAuthority())) { // use only main path to detect double entries
+                                        opensearchdescriptionReader os = new opensearchdescriptionReader(hrefurltxt);
+                                        if (os.getRSSorAtomUrl() != null) {
+                                            // add found system to config file
+                                            add(os.getShortName(), os.getRSSorAtomUrl(), false, os.getItem("LongName"));
+                                            Log.logInfo("OpenSearchConnector.Discover", "added " + os.getShortName() + " " + hrefurltxt);
+                                        } else {
+                                            Log.logInfo("OpenSearchConnector.Discover", "osd.xml check failed (no RSS or Atom support) for " + hrefurltxt);
+                                        }
+                                    }
+                                } catch (MalformedURLException ignored) {
+                                    // href is not an absolute url (e.g. a relative link); skip it
+                                }
+                            }
+                        }
+                    }
+                    Log.logInfo("OpenSearchConnector.Discover", "finisched Solr query (checked " + Integer.toString(dblmem.size()) + " unique opensearchdescription links found in " + Long.toString(numfound) + " results)");
+                } catch (IOException ex) {
+                    Log.logException(ex);
+                }
+            }
+        };
+        job.start();
+        return true;
+    }
+}
diff --git a/source/net/yacy/document/parser/xml/opensearchdescriptionReader.java b/source/net/yacy/document/parser/xml/opensearchdescriptionReader.java index 0a1777253..a32a4aa14 100644 --- a/source/net/yacy/document/parser/xml/opensearchdescriptionReader.java +++ b/source/net/yacy/document/parser/xml/opensearchdescriptionReader.java @@ -23,38 +23,34 @@ // You should have received a copy of the GNU General Public License // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - + package net.yacy.document.parser.xml; -import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; -import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; - import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; - -import net.yacy.cora.document.UTF8; +import 
net.yacy.cora.protocol.ClientIdentification; +import net.yacy.cora.protocol.http.HTTPClient; import net.yacy.kelondro.logging.Log; -import net.yacy.kelondro.util.ByteBuffer; - import org.xml.sax.Attributes; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; - +/* + * reads opensearchdescription xml document and provides the parsed search url + * templates via get methodes as well as all other tags by getItem(tagname) + */ public class opensearchdescriptionReader extends DefaultHandler { - // statics for item generation and automatic categorization - static int guidcount = 0; //private static final String recordTag = "OpenSearchDescription"; private static final String[] tagsDef = new String[]{ "ShortName", "LongName", - "Image", + // "Image", "Language", "OutputEncoding", "InputEncoding", @@ -97,35 +93,32 @@ public class opensearchdescriptionReader extends DefaultHandler { } // class variables - private Item channel; private final StringBuilder buffer; - private boolean parsingChannel; - private final String imageURL; - private final ArrayList itemsGUID; // a list of GUIDs, so the items can be retrieved by a specific order - private final HashMap items; // a guid:Item map - + private boolean parsingDescription, parsingTextValue; + private final HashMap items; // Opensearchdescription Item map + private String rssurl, atomurl; // search url templates public opensearchdescriptionReader() { - this.itemsGUID = new ArrayList(); - this.items = new HashMap(); + this.items = new HashMap(); this.buffer = new StringBuilder(); - this.channel = null; - this.parsingChannel = false; - this.imageURL = null; + this.parsingDescription = false; + this.parsingTextValue = false; + this.rssurl = null; + this.atomurl = null; } private static final ThreadLocal tlSax = new ThreadLocal(); private static SAXParser getParser() throws SAXException { - SAXParser parser = tlSax.get(); - if (parser == null) { - try { - parser = 
SAXParserFactory.newInstance().newSAXParser(); - } catch (ParserConfigurationException e) { - throw new SAXException(e.getMessage(), e); - } - tlSax.set(parser); - } - return parser; + SAXParser parser = tlSax.get(); + if (parser == null) { + try { + parser = SAXParserFactory.newInstance().newSAXParser(); + } catch (ParserConfigurationException e) { + throw new SAXException(e.getMessage(), e); + } + tlSax.set(parser); + } + return parser; } public opensearchdescriptionReader(final String path) { @@ -148,102 +141,105 @@ public class opensearchdescriptionReader extends DefaultHandler { } } - public static opensearchdescriptionReader parse(final byte[] a) { - - // check integrity of array - if ((a == null) || (a.length == 0)) { - Log.logWarning("opensearchdescriptionReader", "response=null"); - return null; - } - if (a.length < 100) { - Log.logWarning("opensearchdescriptionReader", "response=" + UTF8.String(a)); - return null; - } - if (!ByteBuffer.equals(a, UTF8.getBytes(" map; - - public Item() { - this.map = new HashMap(); - this.map.put("guid", Long.toHexString(System.currentTimeMillis()) + ":" + guidcount++); - } - - public void setValue(final String name, final String value) { - this.map.put(name, value); - } + public int items() { + return this.items.size(); } } \ No newline at end of file