From 24f68a4eb74437cf7abab21f0975feb46077264c Mon Sep 17 00:00:00 2001 From: reger Date: Mon, 19 Jan 2015 03:30:35 +0100 Subject: [PATCH] refactor opensearch heuristic introduce FederateSearchManager handling search heuristic to external systems via specific FederateSearchConnectors, which provide the query() functionality, the translation to YaCy schema .toYaCySchema() and the search() routine to deliver results to searchevents, which is generally implemented in Abstract connector. The manager now enforces a min 15s delay between calls to external systems. Besides the OpensearchConnector a SolrFederateSearchConnector is available. It uses an additional config file for fieldname translation. default heuristicopensearch.conf: - openbdb.com removed - no longer seems to deliver results - config via solrconnector to datacite.org added (large technical library archive) --- defaults/federatecfg/datacite.solr.schema | 32 ++ defaults/heuristicopensearch.conf | 9 +- htroot/ConfigHeuristics_p.java | 23 +- htroot/ConfigNetwork_p.java | 4 +- htroot/yacysearch.java | 6 +- .../AbstractFederateSearchConnector.java | 197 ++++++++ .../federate/FederateSearchConnector.java | 62 +++ .../cora/federate/FederateSearchManager.java | 427 ++++++++++++++++++ .../federate/SolrFederateSearchConnector.java | 119 +++++ .../opensearch/OpenSearchConnector.java | 296 ++++-------- .../kelondro/data/meta/URIMetadataNode.java | 21 +- source/net/yacy/search/Switchboard.java | 13 +- 12 files changed, 971 insertions(+), 238 deletions(-) create mode 100644 defaults/federatecfg/datacite.solr.schema create mode 100644 source/net/yacy/cora/federate/AbstractFederateSearchConnector.java create mode 100644 source/net/yacy/cora/federate/FederateSearchConnector.java create mode 100644 source/net/yacy/cora/federate/FederateSearchManager.java create mode 100644 source/net/yacy/cora/federate/SolrFederateSearchConnector.java diff --git a/defaults/federatecfg/datacite.solr.schema 
b/defaults/federatecfg/datacite.solr.schema new file mode 100644 index 000000000..11823deae --- /dev/null +++ b/defaults/federatecfg/datacite.solr.schema @@ -0,0 +1,32 @@ +## API datacite.org +## This service is also available as an API. We use Solr Search Handler for our API calls, the endpoint is: http://search.datacite.org/api. + +## Please check Solr's common query parameters documentation in order to understand how to use API. +## Examples + +## http://search.datacite.org/api?q=wind simple search for wind +## http://search.datacite.org/api?q=wind&fl=doi,title&rows=5 search for wind, retrieve only doi and title, and return (at max.) 5 results +## http://search.datacite.org/api?q=wind&fl=doi,title&wt=csv csv output +## http://search.datacite.org/api?q=wind&fl=doi,title&wt=json&indent=true json output + +## YaCy solrconnector specific settings +## the basic url to access the system +_baseurl = http://search.datacite.org/ +## Solr core, is appended to the _baseurl +_corename = api +## some systems store an identifier instead of a url for the resource, the prefix is prepended to the identifier in _skufieldname +_skuprefix = http://dx.doi.org/ +## the field name of the url of the resource (in yacy/solr = sku) +_skufieldname = doi + +## field mappings +## YaCyFieldname = remoteFieldname +keywords = subject +author = creator +publisher_t = publisher +title = title +description_txt = description +language_s = language +text_t = description +size_i = size +coordinate_p = geoLocationPoint \ No newline at end of file diff --git a/defaults/heuristicopensearch.conf b/defaults/heuristicopensearch.conf index 143f25a03..712674aba 100644 --- a/defaults/heuristicopensearch.conf +++ b/defaults/heuristicopensearch.conf @@ -14,8 +14,15 @@ #Blekko = http://blekko.com/ws/{searchTerms}+/rss # get 20 results from blekko #Faroo-News = http://www.faroo.com/api?q={searchTerms}&start={startIndex}&length=20&l=en&src=news&f=rss # get results from Faroo news-search -#openBDB = 
http://www.openbdb.com/b/{searchTerms}.xml # Open Book Database #WordPress.com = http://en.search.wordpress.com/?q={searchTerms}&f=feed&page={startPage?} #Search WordPress.com Blogs #Sueddeutsche.de = http://suche.sueddeutsche.de/query/{searchTerms}?output=rss # Sueddeutsche Zeitung Artikel Archiv #Los Angeles Times = http://framework.latimes.com/?s={searchTerms}&feed=rss2 #Archive-It = http://archive-it.org/seam/resource/opensearch?q={searchTerms}&n=20 # archiving cultural heritage on the web + +## In addition to OpenSearch systems other connectors are available to query foreign systems +## the syntax is +## SystemName = cfgfile:_connectortype_:_schemaconfig_ +## where cfgfile: is a fixed prefix (to signal this is not an opensearch url) +## _connectortype_ is the type of connector to use ( available is solrconnector ) +## _schemaconfig_ is the config file with field name mappings (the file has to exist in DATA/SETTINGS/federatecfg) +#datacite.org = cfgfile:solrconnector:datacite.solr.schema # International Consortium for data citation diff --git a/htroot/ConfigHeuristics_p.java b/htroot/ConfigHeuristics_p.java index a171ea078..6bd9768c9 100644 --- a/htroot/ConfigHeuristics_p.java +++ b/htroot/ConfigHeuristics_p.java @@ -25,7 +25,6 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -import com.google.common.io.Files; import java.io.File; @@ -37,9 +36,10 @@ import net.yacy.search.Switchboard; import java.io.IOException; import java.util.Iterator; +import net.yacy.cora.federate.FederateSearchManager; -import net.yacy.cora.federate.opensearch.OpenSearchConnector; import net.yacy.cora.federate.solr.SchemaConfiguration; +import net.yacy.cora.storage.Files; import net.yacy.search.SwitchboardConstants; import net.yacy.search.schema.WebgraphSchema; import net.yacy.server.serverObjects; @@ -66,9 +66,9 @@ public class ConfigHeuristics_p { if (post.containsKey("searchresultglobal_off")) 
sb.setConfig(SwitchboardConstants.HEURISTIC_SEARCHRESULTS_CRAWLGLOBAL, false); if (post.containsKey("opensearch_on")) { sb.setConfig(SwitchboardConstants.HEURISTIC_OPENSEARCH, true); - // re-read config (and create work table) - OpenSearchConnector os = new OpenSearchConnector(sb, true); - if (os.getSize() == 0) { + // re-read config + FederateSearchManager.getManager().init(sb.getDataPath().getAbsolutePath()+ "DATA/SETTINGS/heuristicopensearch.conf"); + if (FederateSearchManager.getManager().getSize() == 0) { osderrmsg = "no active search targets are configured"; } } @@ -77,8 +77,8 @@ public class ConfigHeuristics_p { final boolean metafieldavailable = sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_rel_s.name()) && (sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_protocol_s.name()) && sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_urlstub_s.name())); if (metafieldavailable) { - OpenSearchConnector osc = new OpenSearchConnector(sb, false); - if (osc.discoverFromSolrIndex(sb)) { + //OpenSearchConnector osc = new OpenSearchConnector(sb, false); + if (FederateSearchManager.getManager().discoverFromSolrIndex(sb)) { osderrmsg = "started background search for target systems, refresh page after some minutes"; } else { osderrmsg = "Error: webgraph Solr index not enabled"; @@ -98,8 +98,7 @@ public class ConfigHeuristics_p { if (tmpname != null && tmpurl !=null) { if (!tmpname.isEmpty() && !tmpurl.isEmpty() && tmpurl.toLowerCase().contains("{searchterms}")) { final String tmpcomment = post.get("ossys_newcomment"); - OpenSearchConnector osc = new OpenSearchConnector(sb,false); - osc.add (tmpname,tmpurl,false,tmpcomment); + FederateSearchManager.getManager().addOpenSearchTarget(tmpname,tmpurl,false,tmpcomment); } else osderrmsg = "Url template must contain '{searchTerms}'"; } } @@ -143,6 +142,10 @@ public class ConfigHeuristics_p { if ((post.containsKey("resettodefaultosdlist") || 
!osdConfig.exists()) && osdDefaultConfig.exists()) { try { Files.copy(osdDefaultConfig, osdConfig); + File defdir = new File(sb.dataPath, "DATA/SETTINGS/federatecfg"); + if (!defdir.exists()) { + Files.copy(new File(sb.appPath, "defaults/federatecfg"), defdir); + } } catch (final IOException ex) { osderrmsg = "file I/O error during copy"; } @@ -240,7 +243,7 @@ public class ConfigHeuristics_p { // re-read config (and create/update work table) if (sb.getConfigBool(SwitchboardConstants.HEURISTIC_OPENSEARCH, true)) { - new OpenSearchConnector(sb, true); + FederateSearchManager.getManager().init(f.getAbsolutePath()); } } } diff --git a/htroot/ConfigNetwork_p.java b/htroot/ConfigNetwork_p.java index 2ca5a194b..97e408c8c 100644 --- a/htroot/ConfigNetwork_p.java +++ b/htroot/ConfigNetwork_p.java @@ -127,8 +127,8 @@ public class ConfigNetwork_p sb.peers.mySeed().setPeerTags(MapTools.string2set(normalizedList(post.get("peertags")), ",")); } - sb.setConfig("cluster.mode", post.get(SwitchboardConstants.CLUSTER_MODE, SwitchboardConstants.CLUSTER_MODE_PUBLIC_PEER)); - sb.setConfig("cluster.peers.ipport", checkIPPortList(post.get("cluster.peers.ipport", ""))); + sb.setConfig(SwitchboardConstants.CLUSTER_MODE, post.get(SwitchboardConstants.CLUSTER_MODE, SwitchboardConstants.CLUSTER_MODE_PUBLIC_PEER)); + sb.setConfig(SwitchboardConstants.CLUSTER_PEERS_IPPORT, checkIPPortList(post.get(SwitchboardConstants.CLUSTER_PEERS_IPPORT, ""))); sb.setConfig( "cluster.peers.yacydomain", checkYaCyDomainList(post.get("cluster.peers.yacydomain", ""))); diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index 24e7688d0..4743c59f7 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -45,7 +45,7 @@ import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.document.feed.RSSMessage; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.MultiProtocolURL; -import net.yacy.cora.federate.opensearch.OpenSearchConnector; +import 
net.yacy.cora.federate.FederateSearchManager; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.geo.GeoLocation; import net.yacy.cora.lod.vocabulary.Tagging; @@ -719,10 +719,10 @@ public class yacysearch { sb.heuristicSite(theSearch, modifier.sitehost); } if ( heuristicBlekko >= 0 && authenticated && !stealthmode ) { - OpenSearchConnector.query(sb, theSearch); + FederateSearchManager.getManager().search(theSearch); } if (sb.getConfigBool(SwitchboardConstants.HEURISTIC_OPENSEARCH, false) && authenticated && !stealthmode) { - OpenSearchConnector.query(sb, theSearch); + FederateSearchManager.getManager().search(theSearch); } } diff --git a/source/net/yacy/cora/federate/AbstractFederateSearchConnector.java b/source/net/yacy/cora/federate/AbstractFederateSearchConnector.java new file mode 100644 index 000000000..48fd05134 --- /dev/null +++ b/source/net/yacy/cora/federate/AbstractFederateSearchConnector.java @@ -0,0 +1,197 @@ +/** + * AbstractFederateSearchConnector.java + * Copyright 2015 by Burkhard Buelte + * First released 19.01.2015 at http://yacy.net + * + * This library is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the Free + * Software Foundation; either version 2.1 of the License, or (at your option) + * any later version. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more + * details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt If not, see + * . 
+ */ +package net.yacy.cora.federate; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.Iterator; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import javax.servlet.http.HttpServletResponse; +import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.federate.solr.SchemaConfiguration; +import net.yacy.cora.federate.solr.SchemaDeclaration; +import net.yacy.cora.federate.solr.SolrType; +import net.yacy.cora.sorting.ReversibleScoreMap; +import net.yacy.cora.storage.Configuration; +import net.yacy.cora.util.ConcurrentLog; +import net.yacy.kelondro.data.meta.URIMetadataNode; +import net.yacy.search.Switchboard; +import net.yacy.search.query.SearchEvent; +import net.yacy.search.schema.CollectionSchema; +import org.apache.solr.common.SolrDocument; + +/** + * Base implementation class for Federated Search Connectors providing the basic + * functionality to search non-YaCy systems + * + * Subclasses should/need to override query() and maybe toYaCySchema() if more + * is needed than a basic field mapping + */ +abstract public class AbstractFederateSearchConnector implements FederateSearchConnector { + + public String instancename; // just an identifying name + protected SchemaConfiguration localcfg; // the schema conversion cfg for each fieldname, yacyname = remote fieldname + public long lastaccesstime = -1; // last time accessed, used for search delay calculation + protected String baseurl; + + /** + * Inits the connector with the remote field names and matches to yacy + * schema and other specific settings from config file. Every connector + * needs at least a query target (where to query) and some definition to + * convert the remote search result to the internal result presentation + * (field mapping) + * + * @param instance internal name + * @param cfgFileName e.g. 
DATA/SETTINGS/FEDERATECFG/instanceName.SCHEMA + * @return true if success false if not + */ + @Override + public boolean init(String instance, String cfgFileName) { + this.instancename = instance; + File instanceCfgFile = new File(cfgFileName); + if (instanceCfgFile.exists()) { + try { + this.localcfg = new SchemaConfiguration(instanceCfgFile); + } catch (IOException ex) { + ConcurrentLog.config(this.instancename, "error reading schema " + cfgFileName); + return false; + } + // mandatory to contain a mapping for "sku" or alternatively "cfg_skufieldname" for a conversion to a final url + if (this.localcfg.contains(CollectionSchema.sku) || this.localcfg.contains("_skufieldname")) { + return true; + } else { + ConcurrentLog.config(this.instancename, "mandatory mapping for sku or _skufieldname missing in " + cfgFileName); + return false; + } + } else { + this.localcfg = null; + return false; + } + } + + /** + * queries a remote system and adds the results to the searchevent and to + * the crawler if addResultsToLocalIndex is true + * + * @param theSearch receiving the results + */ + @Override + public void search(final SearchEvent theSearch) { + + final Thread job = new Thread() { + @Override + public void run() { + Thread.currentThread().setName("heuristic:" + instancename); + theSearch.oneFeederStarted(); + List doclist = query(theSearch.getQuery()); + if (doclist != null) { + Map> snippets = new HashMap>(); // add nodes doesn't allow null + Map> facets = new HashMap>(); // add nodes doesn't allow null + theSearch.addNodes(doclist, facets, snippets, false, instancename, doclist.size()); + + for (URIMetadataNode doc : doclist) { + theSearch.addHeuristic(doc.hash(), instancename, false); + } + } + // that's all we need to display serach result + theSearch.oneFeederTerminated(); + + // optional: add to crawler to get the full resource (later) + if (doclist != null && !doclist.isEmpty() && theSearch.addResultsToLocalIndex) { + Collection urls = new ArrayList(); + for 
(URIMetadataNode doc : doclist) { + urls.add(doc.url()); + } + Switchboard.getSwitchboard().addToCrawler(urls, false); + + } + } + }; + job.start(); + } + + /** + * Converts a remote schema result to YaCy schema using the fieldname + * mapping provided as config file + * + * @param remote result (with remote fieldnames) + * @return SolrDocument with field names according to the YaCy schema + */ + protected URIMetadataNode toYaCySchema(final SolrDocument doc) { + // set YaCy id + String urlstr; + if (localcfg.contains("sku")) { + urlstr = (String) doc.getFieldValue(localcfg.get("sku").getValue()); + } else { + urlstr = (String) doc.getFieldValue(localcfg.get("_skufieldname").getValue()); + if (this.localcfg.contains("_skuprefix")) { + String skuprefix = this.localcfg.get("_skuprefix").getValue(); + urlstr = skuprefix + urlstr; + } + } + + URIMetadataNode newdoc = new URIMetadataNode(urlstr); + Iterator it = localcfg.entryIterator(); + while (it.hasNext()) { + Configuration.Entry et = it.next(); + String yacyfieldname = et.key(); // config defines yacyfieldname = remotefieldname + String remotefieldname = et.getValue(); + if (remotefieldname != null && !remotefieldname.isEmpty()) { + if (Switchboard.getSwitchboard().index.fulltext().getDefaultConfiguration().contains(yacyfieldname)) { // check if in local config + + SchemaDeclaration est = CollectionSchema.valueOf(yacyfieldname); + if (est.isMultiValued()) { + if (doc.getFieldValues(remotefieldname) != null) { + newdoc.addField(yacyfieldname, doc.getFieldValues(remotefieldname)); // + } + } else { + if (doc.getFieldValue(remotefieldname) != null) { + Object val = doc.getFirstValue(remotefieldname); + // watch out for type conversion + try { + if (est.getType() == SolrType.num_integer && val instanceof String) { + newdoc.setField(yacyfieldname, Integer.parseInt((String) val)); + } else { + newdoc.setField(yacyfieldname, val); + } + } catch (Exception ex) { + continue; // catch possible parse or type mismatch, skip the 
field + } + } + } + } + } + } + + newdoc.addField(CollectionSchema.httpstatus_i.name(), HttpServletResponse.SC_OK); // yacy required + return newdoc; + } +} diff --git a/source/net/yacy/cora/federate/FederateSearchConnector.java b/source/net/yacy/cora/federate/FederateSearchConnector.java new file mode 100644 index 000000000..4f7ebfa8e --- /dev/null +++ b/source/net/yacy/cora/federate/FederateSearchConnector.java @@ -0,0 +1,62 @@ +/** + * FederateSearchConnector.java + * Copyright 2015 by Burkhard Buelte + * First released 19.01.2015 at http://yacy.net + * + * This library is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the Free + * Software Foundation; either version 2.1 of the License, or (at your option) + * any later version. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more + * details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt If not, see + * . 
+ */ +package net.yacy.cora.federate; + +import java.util.List; +import net.yacy.kelondro.data.meta.URIMetadataNode; +import net.yacy.search.query.QueryParams; +import net.yacy.search.query.SearchEvent; + + +/** + * Interface for a query connector to search and gather query results from + * non-YaCy systems (for the YaCy heuristic options) + */ +public interface FederateSearchConnector { + + /** + * Load the configuration for this connector. Every connector needs at least + * a query target (where to query) and some definition to convert the remote + * search result to the internal result presentation (field mapping) + * + * @param instanceName is also the name of the config file DATA/SETTINGS/instanceName.schema + * @param cfg config parameter + * @return true if success false if not + */ + abstract boolean init(String instanceName, String cfg); + + /** + * Queries a remote system and adds the result metadata to the search event's + * result list. If SearchEvent.addResultsToLocalIndex (=default) result urls + * are added to the crawler. + * @param theSearch + */ + abstract void search(SearchEvent theSearch); + + /** + * Queries a remote system and returns the search result with field names + * according to YaCy schema. 
+ * + * @param query + * @return result (metadata) in YaCy schema format + */ + abstract List query(QueryParams query); + +} diff --git a/source/net/yacy/cora/federate/FederateSearchManager.java b/source/net/yacy/cora/federate/FederateSearchManager.java new file mode 100644 index 000000000..8d2ba017d --- /dev/null +++ b/source/net/yacy/cora/federate/FederateSearchManager.java @@ -0,0 +1,427 @@ +/** + * FederateSearchManager.java + * Copyright 2015 by Burkhard Buelte + * First released 19.01.2015 at http://yacy.net + * + * This library is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the Free + * Software Foundation; either version 2.1 of the License, or (at your option) + * any later version. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more + * details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt If not, see + * . 
+ */ +package net.yacy.cora.federate; + +import net.yacy.cora.federate.opensearch.OpenSearchConnector; +import java.io.File; +import java.io.IOException; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; + +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Set; +import net.yacy.cora.document.analysis.Classification; +import net.yacy.cora.document.id.MultiProtocolURL; +import net.yacy.cora.federate.solr.connector.SolrConnector; +import net.yacy.cora.federate.yacy.CacheStrategy; +import net.yacy.cora.storage.Configuration; +import net.yacy.cora.storage.Configuration.Entry; +import net.yacy.cora.storage.Files; +import net.yacy.cora.util.ConcurrentLog; +import net.yacy.document.parser.xml.opensearchdescriptionReader; +import net.yacy.kelondro.data.meta.URIMetadataNode; +import net.yacy.kelondro.util.Bitfield; +import net.yacy.search.Switchboard; +import net.yacy.search.SwitchboardConstants; +import net.yacy.search.query.QueryGoal; +import net.yacy.search.query.QueryModifier; +import net.yacy.search.query.QueryParams; +import net.yacy.search.query.SearchEvent; +import net.yacy.search.schema.WebgraphSchema; +import org.apache.solr.common.SolrDocument; +import org.apache.solr.common.SolrDocumentList; + +/** + * Handling of queries to configured remote OpenSearch systems. 
+ */ +public class FederateSearchManager { + + private final int accessDelay = 15000; // delay between connects (in ms) + + private File confFile = null; // later initialized to DATA/SETTINGS/heuristicopensearch.conf + private HashSet conlist; // connector list + protected Configuration cfg;//PropertiesConfiguration cfg; + private static FederateSearchManager manager = null; // self referenc for static .getManager() + + public FederateSearchManager(Switchboard sb) { + super(); + this.conlist = new HashSet(); + + // from here we need Switchboard settings + if (sb == null) { + return; + } + // Data needed active name, url(template), desc, rule-when-to-use, specifics + confFile = new File(sb.getDataPath(), "DATA/SETTINGS/heuristicopensearch.conf"); + if (!confFile.exists()) { + try { + Files.copy(new File(sb.appPath, "defaults/heuristicopensearch.conf"), confFile); + File defdir = new File(sb.dataPath, "DATA/SETTINGS/federatecfg"); + if (!defdir.exists()) { + Files.copy(new File(sb.appPath, "defaults/federatecfg"), defdir); + } + } catch (IOException ex) { + } + } + // read settings config file + if (confFile.exists()) { + try { + cfg = new Configuration(confFile); + Iterator it = cfg.entryIterator(); + while (it.hasNext()) { + Entry cfgentry = it.next(); + String url = cfgentry.getValue(); + if (cfgentry.enabled() && url != null && !url.isEmpty()) { + String name = cfgentry.key(); + if (url.startsWith("cfgfile:")) { // is cfgfile with field mappings (no opensearch url) + // format prefix:connectortype:configfilename + // example cfgfile:solrconnector:testsys.solr.schema + String[] parts = url.split(":"); + if (parts[1].equalsIgnoreCase("solrconnector")) { + SolrFederateSearchConnector sfc = new SolrFederateSearchConnector(); + if (sfc.init(name, sb.getDataPath()+ "/DATA/SETTINGS/federatecfg/" + parts[2])) { + conlist.add(sfc); + } + } else { + ConcurrentLog.config("FederateSearchManager", "Error in configuration of: " + url); + } + } else { // handle opensearch url 
template + OpenSearchConnector osc = new OpenSearchConnector(); + if (osc.init(name, url)) { + conlist.add(osc); + } + } + } + } + } catch (IOException ex) { + ConcurrentLog.logException(ex); + } + } + manager = this; // reference for static access via .getManager() + } + + /** + * Get instance of this manager. There should be only one instance running, + * use this to get or initialize the manager. + * + * @return + */ + public static FederateSearchManager getManager() { + if (manager == null) { + manager = new FederateSearchManager(Switchboard.getSwitchboard()); + } + return manager; + } + + /** + * Sends a query request to remote systems configured. + * If search query domain is LOCAL procedure does nothing. + * + * @param theSearch + */ + public void search(SearchEvent theSearch) { + if (theSearch != null) { + if (!theSearch.query.isLocal()) { + Set picklist = getBest(theSearch.getQuery()); + for (AbstractFederateSearchConnector fsc : picklist) { + fsc.search(theSearch); + } + } + } + } + + /** + * Sends a query to configured remote systems. 
+ * + * @param query + * @return list of results according to YaCy schema + */ + public List query(QueryParams query) { + if (query.isLocal()) { + List sdl = new ArrayList(); + Set picklist = getBest(query); + for (AbstractFederateSearchConnector fsc : picklist) { + sdl.addAll(fsc.query(query)); + } + return sdl; + } else { + return null; + } + } + + /** + * Takes a search string, converts it to queryparams and calls the + * query(queryparams) + * + * @param querystr + * @return SolrDocumentlist of remote query results according to YaCy schema + */ + public List query(String querystr) { + + final QueryGoal qg = new QueryGoal(querystr); + final Switchboard sb = Switchboard.getSwitchboard(); + Bitfield filter = new Bitfield(); + final QueryParams query = new QueryParams( + qg, + new QueryModifier(), + Integer.MAX_VALUE, + "", + Classification.ContentDomain.ALL, + "", //lang + null, + CacheStrategy.IFFRESH, + 100, 0, //count, offset + ".*", //urlmask + null, + null, + QueryParams.Searchdom.LOCAL, + filter, + false, + null, + MultiProtocolURL.TLD_any_zone_filter, + "", + false, + sb.index, + sb.getRanking(), + "",//userAgent + false, + false, + 0.0, 0.0, -1.0, + new String[0]); + + return query(query); + } + + /** + * Add a search target system/connector to the config file + * + * @param urlTemplate query template url + * @return successful added + */ + public boolean addOpenSearchTarget(String name, String urlTemplate, boolean active, String comment) { + if (confFile == null) { + return false; + } + + try { + Configuration conf = new Configuration(confFile); + if (name != null && !name.isEmpty()) { + conf.add(name, null, active); + Configuration.Entry e = conf.get(name); + e.setValue(urlTemplate); + e.setEnable(active); + e.setComment(comment); + conf.put(name, e); + try { + conf.commit(); + if (active) { + OpenSearchConnector osd = new OpenSearchConnector(); + if (osd.init(name, urlTemplate)) { + conlist.add(osd); + } + } + } catch (final IOException ex) { + 
ConcurrentLog.warn("FederateSearchManager", "config file write error"); + } + return true; + } + } catch (final IOException e1) { + ConcurrentLog.logException(e1); + return false; + } + return false; + } + + /** + * Get the number of active remote query target systems + */ + public int getSize() { + return conlist.size(); + } + + /** + * Get best systems from configured targets for this search + * + * @param theSearch + * @return list of searchtargetconnectors + */ + protected Set getBest(final QueryParams query) { + HashSet retset = new HashSet(); + // currently only enforces limits (min access delay, frequency) + for (AbstractFederateSearchConnector fsc : conlist) { + // check access time + if (fsc.lastaccesstime + accessDelay < System.currentTimeMillis()) { // enforce 15 sec delay between searches to same system + retset.add(fsc); + } + } + return retset; + } + + /** + * Discover opensearch description links from local (embedded) Solr index + * using meta data field 'outboundlinks_tag_txt' and add found systems to + * the config file + * + * @return true if background discover job was started, false if job not + * started + */ + public boolean discoverFromSolrIndex(Switchboard sb) { + if (sb == null) { + return false; + } + // check if needed Solr fields are available (selected) + if (!sb.index.fulltext().useWebgraph()) { + ConcurrentLog.severe("FederateSearchManager", "Error on connecting to embedded Solr webgraph index"); + return false; + } + final SolrConnector connector = sb.index.fulltext().getWebgraphConnector(); + final boolean metafieldavailable = sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_rel_s.name()) + && (sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_protocol_s.name()) && sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_urlstub_s.name())) + && sb.getConfigBool(SwitchboardConstants.CORE_SERVICE_WEBGRAPH, false); + if (!metafieldavailable) { + 
ConcurrentLog.warn("FederateSearchManager", "webgraph option and webgraph Schema fields target_rel_s, target_protocol_s and target_urlstub_s must be switched on"); + return false; + } + // the solr search + final String webgraphquerystr = WebgraphSchema.target_rel_s.getSolrFieldName() + ":search"; + final String[] webgraphqueryfields = {WebgraphSchema.target_protocol_s.getSolrFieldName(), WebgraphSchema.target_urlstub_s.getSolrFieldName()}; + // alternatively target_protocol_s + "://" +target_host_s + target_path_s + + final long numfound; + try { + SolrDocumentList docList = connector.getDocumentListByQuery(webgraphquerystr, null, 0, 1, webgraphqueryfields); + numfound = docList.getNumFound(); + if (numfound == 0) { + ConcurrentLog.info("FederateSearchManager", "no results found, abort discover job"); + return true; + } + ConcurrentLog.info("FederateSearchManager", "start checking " + Long.toString(numfound) + " found index results"); + } catch (final IOException ex) { + ConcurrentLog.logException(ex); + return false; + } + + final long stoptime = System.currentTimeMillis() + 1000 * 3600; // make sure job doesn't run forever + + // job to iterate through Solr index to find links to opensearchdescriptions + // started as background job as connect timeouts may cause it run a long time + final Thread job = new Thread() { + @Override + public void run() { + try { + boolean doloop = true; + int loopnr = 0; + Set dblmem = new HashSet(); // temp memory for already checked url + while (doloop) { + ConcurrentLog.info("FederateSearchManager", "start Solr query loop at " + Integer.toString(loopnr * 20) + " of " + Long.toString(numfound)); + SolrDocumentList docList = connector.getDocumentListByQuery(webgraphquerystr, null, loopnr * 20, 20, webgraphqueryfields); // check chunk of 20 result documents + loopnr++; + if (stoptime < System.currentTimeMillis()) {// stop after max 1h + doloop = false; + ConcurrentLog.info("FederateSearchManager", "long running discover task 
aborted"); + } + if (docList != null && docList.size() > 0) { + Iterator docidx = docList.iterator(); + while (docidx.hasNext()) { + SolrDocument sdoc = docidx.next(); + + String hrefurltxt = sdoc.getFieldValue(WebgraphSchema.target_protocol_s.getSolrFieldName()) + "://" + sdoc.getFieldValue(WebgraphSchema.target_urlstub_s.getSolrFieldName()); + try { + URL url = new URL(hrefurltxt); + //TODO: check Blacklist + if (dblmem.add(url.getAuthority())) { // use only main path to detect double entries + opensearchdescriptionReader os = new opensearchdescriptionReader(hrefurltxt); + if (os.getRSSorAtomUrl() != null) { + // add found system to config file + addOpenSearchTarget(os.getShortName(), os.getRSSorAtomUrl(), false, os.getItem("LongName")); + ConcurrentLog.info("FederateSearchManager", "added " + os.getShortName() + " " + hrefurltxt); + } else { + ConcurrentLog.info("FederateSearchManager", "osd.xml check failed (no RSS or Atom support) for " + hrefurltxt); + } + } + } catch (final MalformedURLException ex) { + } + } + } else { + doloop = false; + } + } + ConcurrentLog.info("FederateSearchManager", "finisched Solr query (checked " + Integer.toString(dblmem.size()) + " unique opensearchdescription links found in " + Long.toString(numfound) + " results)"); + } catch (final IOException ex) { + ConcurrentLog.logException(ex); + } + } + }; + job.start(); + return true; + } + + /** + * Read or reread opensearch config file and initialize connectors + * + * @param cfgFileName + * @return true if successful + */ + public boolean init(String cfgFileName) { + confFile = new File(cfgFileName); + if (confFile.exists()) { + try { + cfg = new Configuration(confFile); + if (!this.conlist.isEmpty()) this.conlist.clear(); // prevent double entries + Iterator it = cfg.entryIterator(); + while (it.hasNext()) { + Entry cfgentry = it.next(); + if (cfgentry.enabled()) { // hold only enabled in memory + String name = cfgentry.key(); + String url = cfgentry.getValue(); + if (url != null && 
!url.isEmpty()) { + if (url.startsWith("cfgfile:")) { // is cfgfile with field mappings (no opensearch url) + // config entry has 3 parts separated by : 1=cfgfile 2=connectortype 3=relative path to connector-cfg-file + // example cfgfile:solrconnector:testsys.solr.schema + String[] parts = url.split(":"); + if (parts[1].equalsIgnoreCase("solrconnector")) { + SolrFederateSearchConnector sfc = new SolrFederateSearchConnector(); + if (sfc.init(name, confFile.getParent()+"/federatecfg/"+parts[2])) { + conlist.add(sfc); + } + } else { + ConcurrentLog.config("FederateSearchManager", "Init error in configuration of: " + url); + } + } else { // handle opensearch url template + OpenSearchConnector osd; + osd = new OpenSearchConnector(); + if (osd.init(name, url)) { + conlist.add(osd); + } + } + } + } + } + } catch (IOException ex) { + ConcurrentLog.logException(ex); + } + } + return true; + } + +} diff --git a/source/net/yacy/cora/federate/SolrFederateSearchConnector.java b/source/net/yacy/cora/federate/SolrFederateSearchConnector.java new file mode 100644 index 000000000..7e9fceaaa --- /dev/null +++ b/source/net/yacy/cora/federate/SolrFederateSearchConnector.java @@ -0,0 +1,119 @@ +/** + * SolrFederateSearchConnector.java + * Copyright 2015 by Burkhard Buelte + * First released 19.01.2015 at http://yacy.net + * + * This library is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the Free + * Software Foundation; either version 2.1 of the License, or (at your option) + * any later version. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more + * details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt If not, see + * . 
+ */ +package net.yacy.cora.federate; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import net.yacy.cora.federate.solr.connector.RemoteSolrConnector; +import net.yacy.cora.federate.solr.connector.SolrConnector; +import net.yacy.cora.federate.solr.instance.RemoteInstance; +import net.yacy.cora.util.ConcurrentLog; +import net.yacy.kelondro.data.meta.URIMetadataNode; +import net.yacy.search.query.QueryParams; +import org.apache.solr.client.solrj.SolrQuery; +import org.apache.solr.common.SolrDocument; +import org.apache.solr.common.SolrDocumentList; +import org.apache.solr.common.SolrException; +import org.apache.solr.common.params.CommonParams; +import org.apache.solr.common.params.ModifiableSolrParams; + +/** + * Search connecter to collect query results from remote Solr systems which + * provide results as Solr documents + */ +public class SolrFederateSearchConnector extends AbstractFederateSearchConnector { + + private String corename; + + @Override + public boolean init(String instance, String cfgFileName) { + boolean initResult = super.init(instance, cfgFileName); // init local schema cfg + if (initResult) { + if (this.localcfg.contains("_baseurl")) { + setBaseurl(this.localcfg.get("_baseurl").getValue()); + } else { + ConcurrentLog.config(instance, "no _baseurl given in config file "+cfgFileName); + initResult = false; + } + if (this.localcfg.contains("_corename")) { + setCoreName(this.localcfg.get("_corename").getValue()); + } else { + ConcurrentLog.config(instance, "no _corename given in config file "); // not mandatory + this.corename = ""; + } + } + return initResult; + } + + public void setBaseurl(String url) { + if (url.endsWith("/")) { + this.baseurl = url; + } else { + this.baseurl = url + "/"; + } + } + + public void setCoreName(String core) { + this.corename = core; + } + + /** + * Core query implementation + * all query and search routines will use this routine to query the remote 
system + * + * @param query + * @return list of solr documents (metadata) accordng to local YaCy internal schema + */ + @Override + public List query(QueryParams query) { + + List docs = new ArrayList(); + Collection remotecorename = new ArrayList(); + remotecorename.add(corename); + ModifiableSolrParams msp = new SolrQuery(query.getQueryGoal().getQueryString(false)); + msp.add(CommonParams.QT, "/"); // important to override default append of /select + msp.add(CommonParams.ROWS, Integer.toString(query.itemsPerPage)); + try { + RemoteInstance instance = new RemoteInstance(baseurl, remotecorename, corename, 20000); + try { + SolrConnector solrConnector = new RemoteSolrConnector(instance, false, null); + try { + this.lastaccesstime = System.currentTimeMillis(); + SolrDocumentList docList = solrConnector.getDocumentListByParams(msp); + // convert to YaCy schema documentlist + for (SolrDocument doc : docList) { + URIMetadataNode anew = toYaCySchema(doc); + docs.add(anew); + } + } catch (IOException | SolrException e) { + } finally { + solrConnector.close(); + } + } catch (Throwable ee) { + } finally { + instance.close(); + } + } catch (IOException eee) { + } + return docs; + } +} diff --git a/source/net/yacy/cora/federate/opensearch/OpenSearchConnector.java b/source/net/yacy/cora/federate/opensearch/OpenSearchConnector.java index 68fee2161..defc0e1cc 100644 --- a/source/net/yacy/cora/federate/opensearch/OpenSearchConnector.java +++ b/source/net/yacy/cora/federate/opensearch/OpenSearchConnector.java @@ -19,107 +19,45 @@ */ package net.yacy.cora.federate.opensearch; -import java.io.File; import java.io.IOException; import java.net.MalformedURLException; -import java.net.URL; -import java.util.HashSet; -import java.util.Iterator; -import java.util.Set; - -import net.yacy.cora.federate.solr.connector.SolrConnector; -import net.yacy.cora.storage.Configuration; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import 
net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.feed.RSSFeed; +import net.yacy.cora.document.feed.RSSMessage; +import net.yacy.cora.document.feed.RSSReader; +import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.document.id.MultiProtocolURL; +import net.yacy.cora.federate.AbstractFederateSearchConnector; +import net.yacy.cora.federate.FederateSearchConnector; +import net.yacy.cora.protocol.ClientIdentification; +import net.yacy.cora.protocol.http.HTTPClient; import net.yacy.cora.util.ConcurrentLog; -import net.yacy.cora.util.SpaceExceededException; -import net.yacy.document.parser.xml.opensearchdescriptionReader; -import net.yacy.kelondro.blob.Tables; -import net.yacy.search.Switchboard; -import net.yacy.search.SwitchboardConstants; -import net.yacy.search.query.SearchEvent; -import net.yacy.search.schema.WebgraphSchema; - -import org.apache.solr.common.SolrDocument; -import org.apache.solr.common.SolrDocumentList; +import net.yacy.document.TextParser; +import net.yacy.kelondro.data.meta.URIMetadataNode; +import net.yacy.search.query.QueryParams; +import net.yacy.search.schema.CollectionSchema; /** * Handling of queries to remote OpenSearch systems. Iterates to a list of - * configured systems until number of needed results are available. Uses a - * temporary work table to store search template urls for the iteration during - * search. + * configured systems until number of needed results are available. 
*/ -public class OpenSearchConnector { - - private File confFile = null; // later initialized to DATA/SETTINGS/heuristicopensearch.conf - private int size = 0; // remember the size of active opensearch targets - - public OpenSearchConnector(Switchboard sb, boolean createworktable) { - super(); - if (sb == null) { - return; - } - - confFile = new File(sb.getDataPath(), "DATA/SETTINGS/heuristicopensearch.conf"); - - if (createworktable) { // read from config file and create worktable - sb.tables.clear("opensearchsys"); - try { - Configuration cfg = new Configuration(confFile); - - // copy active opensearch systems to a work table (opensearchsys) - Iterator cfgentries = cfg.entryIterator(); - while (cfgentries.hasNext()) { - Configuration.Entry e = cfgentries.next(); - if (e.enabled()) { - String title = e.key(); // get the title - String urlstr = e.getValue(); // get the search template url +public class OpenSearchConnector extends AbstractFederateSearchConnector implements FederateSearchConnector { - Tables.Data row = new Tables.Data(); - row.put("title", title); - row.put("url", urlstr); - try { - sb.tables.insert("opensearchsys", row); - } catch (final SpaceExceededException ex) { - ConcurrentLog.logException(ex); - } - } - } - size = sb.tables.size("opensearchsys"); - } catch (final IOException ex) { - ConcurrentLog.logException(ex); - } - } - } - - /** - * Sends a search request to remote systems listed in worktable until the - * searchevent contains less than needed results. Depending on already - * collected search results none to all configured systems are queried to - * complete available search results. - * if query search domain is LOCAL procedure does nothing. 
- */ - static public void query(Switchboard sb, SearchEvent theSearch) { - if (theSearch != null && sb != null) { - if (!theSearch.query.isLocal()) { - try { - Iterator ossysworktable = sb.tables.iterator("opensearchsys"); - //int needres = theSearch.query.neededResults(); // get number of needed results - while (ossysworktable.hasNext() /*&& theSearch.query.getResultCount() < needres*/) { - Tables.Row row = ossysworktable.next(); - String osurl = row.get("url", ""); - String name = row.get("title", ""); - sb.heuristicRSS(parseSearchTemplate(osurl, theSearch.query.getQueryGoal().getQueryString(false), 0, theSearch.query.itemsPerPage), theSearch, name); - } - } catch (final IOException ex) { - ConcurrentLog.warn("OpenSearchConnector.query", "failed reading table opensearchsys"); - } - } - } + @Override + public boolean init(final String name, final String urltemplate) { + this.baseurl = urltemplate; + this.instancename = name; + this.localcfg = null; // no field mapping needed + return true; } /** * replace Opensearchdescription search template parameter with actual values */ - private static String parseSearchTemplate(String searchurltemplate, String query, int start, int rows) { + private String parseSearchTemplate(String searchurltemplate, String query, int start, int rows) { String tmps = searchurltemplate.replaceAll("\\?}", "}"); // some optional parameters may include question mark '{param?}=' tmps = tmps.replace("{startIndex}", Integer.toString(start)); tmps = tmps.replace("{startPage}", ""); @@ -131,138 +69,76 @@ public class OpenSearchConnector { } /** - * add a opensearch target system to the config file + * queries remote system and returns the resultlist (waits until results + * transmitted or timeout) This is the main access routine used for the + * serach and query operation For internal access delay time, also the + * this.lastaccessed time needs to be set here. 
+ * + * @return query results (metadata) with fields according to YaCy schema */ - public boolean add(String name, String url, boolean active, String comment) { - if (confFile == null) { - return false; - } + @Override + public List query(QueryParams query) { + List docs = new ArrayList(); + // see http://www.loc.gov/standards/sru/ + String searchurl = this.parseSearchTemplate(baseurl, query.getQueryGoal().getQueryString(false), 0, query.itemsPerPage); try { - Configuration conf = new Configuration(confFile); - if (name != null && !name.isEmpty()) { - conf.add(name, null, active); - Configuration.Entry e = conf.get(name); - e.setValue(url); - e.setEnable(active); - e.setComment(comment); - conf.put(name, e); - try { - conf.commit(); - } catch (final IOException ex) { - ConcurrentLog.warn("OpenSearchConnector.add", "config file write error"); - } - return true; - } - } catch (final IOException e1) { - ConcurrentLog.logException(e1); - return false; - } - return false; - } - - /** - * Get the number of active remote opensearch target systems - */ - public int getSize() { - return size; - } - - /** - * Discover opensearch description links from local (embedded) Solr index using - * meta data field 'outboundlinks_tag_txt' and add found systems to the - * config file - * - * @return true if background discover job was started, false if job not started - */ - public boolean discoverFromSolrIndex(final Switchboard sb) { - if (sb == null) { - return false; - } - // check if needed Solr fields are available (selected) - if (!sb.index.fulltext().useWebgraph()) { - ConcurrentLog.severe("OpenSearchConnector.Discover", "Error on connecting to embedded Solr webgraph index"); - return false; - } - final SolrConnector connector = sb.index.fulltext().getWebgraphConnector(); - final boolean metafieldavailable = sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_rel_s.name()) - && ( 
sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_protocol_s.name()) && sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_urlstub_s.name()) ) - && sb.getConfigBool(SwitchboardConstants.CORE_SERVICE_WEBGRAPH, false); - if (!metafieldavailable) { - ConcurrentLog.warn("OpenSearchConnector.Discover", "webgraph option and webgraph Schema fields target_rel_s, target_protocol_s and target_urlstub_s must be switched on"); - return false; - } - // the solr query - final String webgraphquerystr = WebgraphSchema.target_rel_s.getSolrFieldName() + ":search"; - final String[] webgraphqueryfields = { WebgraphSchema.target_protocol_s.getSolrFieldName() , WebgraphSchema.target_urlstub_s.getSolrFieldName()}; - // alternatively target_protocol_s + "://" +target_host_s + target_path_s - - final long numfound; - try { - SolrDocumentList docList = connector.getDocumentListByQuery(webgraphquerystr, null, 0, 1, webgraphqueryfields); - numfound = docList.getNumFound(); - if (numfound == 0) { - ConcurrentLog.info("OpenSearchConnector.Discover", "no results found, abort discover job"); - return true; - } - ConcurrentLog.info("OpenSearchConnector.Discover", "start checking " + Long.toString(numfound) + " found index results"); - } catch (final IOException ex) { - ConcurrentLog.logException(ex); - return false; - } + MultiProtocolURL aurl = new MultiProtocolURL(MultiProtocolURL.unescape(searchurl)); + try { + this.lastaccesstime = System.currentTimeMillis(); + final HTTPClient httpClient = new HTTPClient(ClientIdentification.yacyIntranetCrawlerAgent); + byte[] result = httpClient.GETbytes(aurl, null, null, false); + RSSReader rssReader = RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, result); + if (rssReader != null) { + final RSSFeed feed = rssReader.getFeed(); + if (feed != null) { + for (final RSSMessage item : feed) { + try { + DigestURL uri = new DigestURL(item.getLink()); + + URIMetadataNode doc = new URIMetadataNode(uri); + 
doc.setField(CollectionSchema.charset_s.getSolrFieldName(), UTF8.charset.name()); + doc.setField(CollectionSchema.author.getSolrFieldName(), item.getAuthor()); + doc.setField(CollectionSchema.title.getSolrFieldName(), item.getTitle()); + doc.setField(CollectionSchema.language_s.getSolrFieldName(), item.getLanguage()); + doc.setField(CollectionSchema.last_modified.getSolrFieldName(), item.getPubDate()); + final String mime = TextParser.mimeOf(uri); + if (mime != null) { + doc.setField(CollectionSchema.content_type.getSolrFieldName(), mime); + } + if (item.getCategory().isEmpty()) { + doc.setField(CollectionSchema.keywords.getSolrFieldName(), Arrays.toString(item.getSubject())); + } else { + doc.setField(CollectionSchema.keywords.getSolrFieldName(), Arrays.toString(item.getSubject()) + " " + item.getCategory()); + } + doc.setField(CollectionSchema.publisher_t.getSolrFieldName(), item.getCopyright()); - final long stoptime = System.currentTimeMillis() + 1000 * 3600; // make sure job doesn't run forever + // TODO: we likely got only a search related snippet (take is as text content) + // we need a way to differentiate metadata from full crawl data in the index (would be also good for rwi transferred/received metadata) + // or considere to add this to snippet cache, without adding text_t + doc.setField(CollectionSchema.text_t.getSolrFieldName(), item.getDescriptions()); - // job to iterate through Solr index to find links to opensearchdescriptions - // started as background job as connect timeouts may cause it run a long time - final Thread job = new Thread() { - @Override - public void run() { - try { - boolean doloop = true; - int loopnr = 0; - Set dblmem = new HashSet(); // temp memory for already checked url - while (doloop) { - ConcurrentLog.info("OpenSearchConnector.Discover", "start Solr query loop at " + Integer.toString(loopnr * 20) + " of " + Long.toString(numfound)); - SolrDocumentList docList = connector.getDocumentListByQuery(webgraphquerystr, null, loopnr 
* 20, 20,webgraphqueryfields); // check chunk of 20 result documents - loopnr++; - if (stoptime < System.currentTimeMillis()) {// stop after max 1h - doloop = false; - ConcurrentLog.info("OpenSearchConnector.Discover", "long running discover task aborted"); - } - if (docList != null && docList.size() > 0) { - Iterator docidx = docList.iterator(); - while (docidx.hasNext()) { - SolrDocument sdoc = docidx.next(); - - String hrefurltxt = sdoc.getFieldValue(WebgraphSchema.target_protocol_s.getSolrFieldName()) + "://" + sdoc.getFieldValue(WebgraphSchema.target_urlstub_s.getSolrFieldName()); - try { - URL url = new URL(hrefurltxt); - //TODO: check Blacklist - if (dblmem.add(url.getAuthority())) { // use only main path to detect double entries - opensearchdescriptionReader os = new opensearchdescriptionReader(hrefurltxt); - if (os.getRSSorAtomUrl() != null) { - // add found system to config file - add(os.getShortName(), os.getRSSorAtomUrl(), false, os.getItem("LongName")); - ConcurrentLog.info("OpenSearchConnector.Discover", "added " + os.getShortName() + " " + hrefurltxt); - } else { - ConcurrentLog.info("OpenSearchConnector.Discover", "osd.xml check failed (no RSS or Atom support) for " + hrefurltxt); - } - } - } catch (final MalformedURLException ex) { + if (item.getLat() != 0.0 && item.getLon() != 0.0) { + doc.setField(CollectionSchema.coordinate_p.getSolrFieldName(), item.getLat() + "," + item.getLon()); + } + if (item.getSize() > 0) { + doc.setField(CollectionSchema.size_i.getSolrFieldName(), item.getSize()); } + + docs.add(doc); + } catch (final MalformedURLException e) { } - } else { - doloop = false; } + ConcurrentLog.info("OpenSerachConnector", "received " + docs.size() + " results from " + this.instancename); } - ConcurrentLog.info("OpenSearchConnector.Discover", "finisched Solr query (checked " + Integer.toString(dblmem.size()) + " unique opensearchdescription links found in " + Long.toString(numfound) + " results)"); - } catch (final IOException ex) { - 
ConcurrentLog.logException(ex); } + } catch (IOException ex) { + ConcurrentLog.logException(ex); + ConcurrentLog.info("OpenSearchConnector", "no connection to " + searchurl); } - }; - job.start(); - return true; + } catch (MalformedURLException ee) { + ConcurrentLog.warn("OpenSearchConnector", "malformed url " + searchurl); + } + return docs; } } diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java index 644763ff7..e2f53f647 100644 --- a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java +++ b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java @@ -69,7 +69,7 @@ public class URIMetadataNode extends SolrDocument { private static final long serialVersionUID = -256046934741561968L; protected String keywords = null; - protected DigestURL url = null; + protected DigestURL url; protected Bitfield flags = null; protected int imagec = -1, audioc = -1, videoc = -1, appc = -1; protected double lat = Double.NaN, lon = Double.NaN; @@ -150,7 +150,6 @@ public class URIMetadataNode extends SolrDocument { for (String name : doc.getFieldNames()) { this.addField(name, doc.getFieldValue(name)); } - this.snippet = ""; Float scorex = (Float) doc.getFieldValue("score"); // this is a special field containing the ranking score of a search result this.score = scorex == null ? 0.0f : scorex.floatValue(); final byte[] hash = ASCII.getBytes(getString(CollectionSchema.id)); // TODO: can we trust this id ? 
@@ -169,6 +168,24 @@ public class URIMetadataNode extends SolrDocument { this.score = scorex; } + public URIMetadataNode (final String urlstr) { + super(); + try { + url = new DigestURL(urlstr); + this.setField(CollectionSchema.sku.name(), url.toNormalform(true)); + this.setField(CollectionSchema.id.name(), ASCII.String(url.hash())); + } catch (final MalformedURLException e) { + ConcurrentLog.logException(e); + this.url = null; + } + } + public URIMetadataNode(DigestURL theurl) { + super(); + url = theurl; + this.setField(CollectionSchema.sku.name(), url.toNormalform(true)); + this.setField(CollectionSchema.id.name(), ASCII.String(url.hash())); + } + /** * Get the content domain of a document. This tries to get the content domain from the mime type * and if this fails it uses alternatively the content domain from the file extension. diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index d735dba3a..084faae18 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -714,16 +714,7 @@ public final class Switchboard extends serverSwitch { this.log.info("surrogates.out Path = " + this.surrogatesOutPath.getAbsolutePath()); this.surrogatesOutPath.mkdirs(); */ - // copy opensearch heuristic config (if not exist) - final File osdConfig = new File(getDataPath(), "DATA/SETTINGS/heuristicopensearch.conf"); - if (!osdConfig.exists()) { - final File osdDefaultConfig = new File(getAppPath(), "defaults/heuristicopensearch.conf"); - this.log.info("heuristic.opensearch list Path = " + osdDefaultConfig.getAbsolutePath()); - try { - Files.copy(osdDefaultConfig, osdConfig); - } catch (final IOException ex) { } - } - + // create the release download directory this.releasePath = getDataPath(SwitchboardConstants.RELEASE_PATH, SwitchboardConstants.RELEASE_PATH_DEFAULT); @@ -3615,7 +3606,9 @@ public final class Switchboard extends serverSwitch { * @param urlpattern the search query url (e.g. 
http://search.org?query=searchword) * @param searchEvent * @param feedName short/internal name of the remote system + * @deprecated use FederateSearchManager(SearchEvent) instead */ + @Deprecated // not used (since 2015-01-18, v1.81) public final void heuristicRSS( final String urlpattern, final SearchEvent searchEvent,