refactor opensearch heuristic

Introduce FederateSearchManager, which handles the search heuristic for external systems via specific FederateSearchConnectors.
A connector provides the query() functionality, the translation to the YaCy schema (toYaCySchema()) and the search() routine that delivers results to search events; search() is generally implemented in the abstract connector.
The manager now enforces a minimum delay of 15s between calls to external systems.
Besides the OpenSearchConnector, a SolrFederateSearchConnector is available. It uses an additional config file for field name translation.

default heuristicopensearch.conf: 
- openbdb.com removed - it no longer seems to deliver results
- config via solrconnector for datacite.org added (large technical library archive)
pull/1/head
reger 10 years ago
parent 3b51636ecb
commit 24f68a4eb7

@ -0,0 +1,32 @@
## API datacite.org
## This service is also available as an API. We use Solr Search Handler for our API calls, the endpoint is: http://search.datacite.org/api.
## Please check Solr's common query parameters documentation in order to understand how to use API.
## Examples
## http://search.datacite.org/api?q=wind simple search for wind
## http://search.datacite.org/api?q=wind&fl=doi,title&rows=5 search for wind, retrieve only doi and title, and return (at max.) 5 results
## http://search.datacite.org/api?q=wind&fl=doi,title&wt=csv csv output
## http://search.datacite.org/api?q=wind&fl=doi,title&wt=json&indent=true json output
## YaCy solrconnector specific settings
## the base URL to access the system
_baseurl = http://search.datacite.org/
## Solr core, is appended to the _baseurl
_corename = api
## some systems store an identifier instead of a URL for the resource; this prefix is prepended to the identifier found in _skufieldname
_skuprefix = http://dx.doi.org/
## the field name of the url of resource (in yacy/solr = sku)
_skufieldname = doi
## field mappings
## YaCyFieldname = remoteFieldname
keywords = subject
author = creator
publisher_t = publisher
title = title
description_txt = description
language_s = language
text_t = description
size_i = size
coordinate_p = geoLocationPoint

@ -14,8 +14,15 @@
#Blekko = http://blekko.com/ws/{searchTerms}+/rss # get 20 results from blekko #Blekko = http://blekko.com/ws/{searchTerms}+/rss # get 20 results from blekko
#Faroo-News = http://www.faroo.com/api?q={searchTerms}&start={startIndex}&length=20&l=en&src=news&f=rss # get results from Faroo news-search #Faroo-News = http://www.faroo.com/api?q={searchTerms}&start={startIndex}&length=20&l=en&src=news&f=rss # get results from Faroo news-search
#openBDB = http://www.openbdb.com/b/{searchTerms}.xml # Open Book Database
#WordPress.com = http://en.search.wordpress.com/?q={searchTerms}&f=feed&page={startPage?} #Search WordPress.com Blogs #WordPress.com = http://en.search.wordpress.com/?q={searchTerms}&f=feed&page={startPage?} #Search WordPress.com Blogs
#Sueddeutsche.de = http://suche.sueddeutsche.de/query/{searchTerms}?output=rss # Sueddeutsche Zeitung Artikel Archiv #Sueddeutsche.de = http://suche.sueddeutsche.de/query/{searchTerms}?output=rss # Sueddeutsche Zeitung Artikel Archiv
#Los Angeles Times = http://framework.latimes.com/?s={searchTerms}&feed=rss2 #Los Angeles Times = http://framework.latimes.com/?s={searchTerms}&feed=rss2
#Archive-It = http://archive-it.org/seam/resource/opensearch?q={searchTerms}&n=20 # archiving cultural heritage on the web #Archive-It = http://archive-it.org/seam/resource/opensearch?q={searchTerms}&n=20 # archiving cultural heritage on the web
## In addition to OpenSearch systems other connectors are available to query foreign systems
## the syntax is
## SystemName = cfgfile:_connectortype_:_schemaconfig_
## where cfgfile: is a fixed prefix (to signal this is not an opensearch url)
## _connectortype_ is the type of connector to use (currently available: solrconnector)
## _schemaconfig_ is the config file with the field name mappings (the file has to exist in DATA/SETTINGS/federatecfg)
#datacite.org = cfgfile:solrconnector:datacite.solr.schema # International Consortium for data citation

@ -25,7 +25,6 @@
// along with this program; if not, write to the Free Software // along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import com.google.common.io.Files;
import java.io.File; import java.io.File;
@ -37,9 +36,10 @@ import net.yacy.search.Switchboard;
import java.io.IOException; import java.io.IOException;
import java.util.Iterator; import java.util.Iterator;
import net.yacy.cora.federate.FederateSearchManager;
import net.yacy.cora.federate.opensearch.OpenSearchConnector;
import net.yacy.cora.federate.solr.SchemaConfiguration; import net.yacy.cora.federate.solr.SchemaConfiguration;
import net.yacy.cora.storage.Files;
import net.yacy.search.SwitchboardConstants; import net.yacy.search.SwitchboardConstants;
import net.yacy.search.schema.WebgraphSchema; import net.yacy.search.schema.WebgraphSchema;
import net.yacy.server.serverObjects; import net.yacy.server.serverObjects;
@ -66,9 +66,9 @@ public class ConfigHeuristics_p {
if (post.containsKey("searchresultglobal_off")) sb.setConfig(SwitchboardConstants.HEURISTIC_SEARCHRESULTS_CRAWLGLOBAL, false); if (post.containsKey("searchresultglobal_off")) sb.setConfig(SwitchboardConstants.HEURISTIC_SEARCHRESULTS_CRAWLGLOBAL, false);
if (post.containsKey("opensearch_on")) { if (post.containsKey("opensearch_on")) {
sb.setConfig(SwitchboardConstants.HEURISTIC_OPENSEARCH, true); sb.setConfig(SwitchboardConstants.HEURISTIC_OPENSEARCH, true);
// re-read config (and create work table) // re-read config
OpenSearchConnector os = new OpenSearchConnector(sb, true); FederateSearchManager.getManager().init(sb.getDataPath().getAbsolutePath()+ "DATA/SETTINGS/heuristicopensearch.conf");
if (os.getSize() == 0) { if (FederateSearchManager.getManager().getSize() == 0) {
osderrmsg = "no active search targets are configured"; osderrmsg = "no active search targets are configured";
} }
} }
@ -77,8 +77,8 @@ public class ConfigHeuristics_p {
final boolean metafieldavailable = sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_rel_s.name()) final boolean metafieldavailable = sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_rel_s.name())
&& (sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_protocol_s.name()) && sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_urlstub_s.name())); && (sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_protocol_s.name()) && sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_urlstub_s.name()));
if (metafieldavailable) { if (metafieldavailable) {
OpenSearchConnector osc = new OpenSearchConnector(sb, false); //OpenSearchConnector osc = new OpenSearchConnector(sb, false);
if (osc.discoverFromSolrIndex(sb)) { if (FederateSearchManager.getManager().discoverFromSolrIndex(sb)) {
osderrmsg = "started background search for target systems, refresh page after some minutes"; osderrmsg = "started background search for target systems, refresh page after some minutes";
} else { } else {
osderrmsg = "Error: webgraph Solr index not enabled"; osderrmsg = "Error: webgraph Solr index not enabled";
@ -98,8 +98,7 @@ public class ConfigHeuristics_p {
if (tmpname != null && tmpurl !=null) { if (tmpname != null && tmpurl !=null) {
if (!tmpname.isEmpty() && !tmpurl.isEmpty() && tmpurl.toLowerCase().contains("{searchterms}")) { if (!tmpname.isEmpty() && !tmpurl.isEmpty() && tmpurl.toLowerCase().contains("{searchterms}")) {
final String tmpcomment = post.get("ossys_newcomment"); final String tmpcomment = post.get("ossys_newcomment");
OpenSearchConnector osc = new OpenSearchConnector(sb,false); FederateSearchManager.getManager().addOpenSearchTarget(tmpname,tmpurl,false,tmpcomment);
osc.add (tmpname,tmpurl,false,tmpcomment);
} else osderrmsg = "Url template must contain '{searchTerms}'"; } else osderrmsg = "Url template must contain '{searchTerms}'";
} }
} }
@ -143,6 +142,10 @@ public class ConfigHeuristics_p {
if ((post.containsKey("resettodefaultosdlist") || !osdConfig.exists()) && osdDefaultConfig.exists()) { if ((post.containsKey("resettodefaultosdlist") || !osdConfig.exists()) && osdDefaultConfig.exists()) {
try { try {
Files.copy(osdDefaultConfig, osdConfig); Files.copy(osdDefaultConfig, osdConfig);
File defdir = new File(sb.dataPath, "DATA/SETTINGS/federatecfg");
if (!defdir.exists()) {
Files.copy(new File(sb.appPath, "defaults/federatecfg"), defdir);
}
} catch (final IOException ex) { } catch (final IOException ex) {
osderrmsg = "file I/O error during copy"; osderrmsg = "file I/O error during copy";
} }
@ -240,7 +243,7 @@ public class ConfigHeuristics_p {
// re-read config (and create/update work table) // re-read config (and create/update work table)
if (sb.getConfigBool(SwitchboardConstants.HEURISTIC_OPENSEARCH, true)) { if (sb.getConfigBool(SwitchboardConstants.HEURISTIC_OPENSEARCH, true)) {
new OpenSearchConnector(sb, true); FederateSearchManager.getManager().init(f.getAbsolutePath());
} }
} }
} }

@ -127,8 +127,8 @@ public class ConfigNetwork_p
sb.peers.mySeed().setPeerTags(MapTools.string2set(normalizedList(post.get("peertags")), ",")); sb.peers.mySeed().setPeerTags(MapTools.string2set(normalizedList(post.get("peertags")), ","));
} }
sb.setConfig("cluster.mode", post.get(SwitchboardConstants.CLUSTER_MODE, SwitchboardConstants.CLUSTER_MODE_PUBLIC_PEER)); sb.setConfig(SwitchboardConstants.CLUSTER_MODE, post.get(SwitchboardConstants.CLUSTER_MODE, SwitchboardConstants.CLUSTER_MODE_PUBLIC_PEER));
sb.setConfig("cluster.peers.ipport", checkIPPortList(post.get("cluster.peers.ipport", ""))); sb.setConfig(SwitchboardConstants.CLUSTER_PEERS_IPPORT, checkIPPortList(post.get(SwitchboardConstants.CLUSTER_PEERS_IPPORT, "")));
sb.setConfig( sb.setConfig(
"cluster.peers.yacydomain", "cluster.peers.yacydomain",
checkYaCyDomainList(post.get("cluster.peers.yacydomain", ""))); checkYaCyDomainList(post.get("cluster.peers.yacydomain", "")));

@ -45,7 +45,7 @@ import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.feed.RSSMessage; import net.yacy.cora.document.feed.RSSMessage;
import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.opensearch.OpenSearchConnector; import net.yacy.cora.federate.FederateSearchManager;
import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.geo.GeoLocation; import net.yacy.cora.geo.GeoLocation;
import net.yacy.cora.lod.vocabulary.Tagging; import net.yacy.cora.lod.vocabulary.Tagging;
@ -719,10 +719,10 @@ public class yacysearch {
sb.heuristicSite(theSearch, modifier.sitehost); sb.heuristicSite(theSearch, modifier.sitehost);
} }
if ( heuristicBlekko >= 0 && authenticated && !stealthmode ) { if ( heuristicBlekko >= 0 && authenticated && !stealthmode ) {
OpenSearchConnector.query(sb, theSearch); FederateSearchManager.getManager().search(theSearch);
} }
if (sb.getConfigBool(SwitchboardConstants.HEURISTIC_OPENSEARCH, false) && authenticated && !stealthmode) { if (sb.getConfigBool(SwitchboardConstants.HEURISTIC_OPENSEARCH, false) && authenticated && !stealthmode) {
OpenSearchConnector.query(sb, theSearch); FederateSearchManager.getManager().search(theSearch);
} }
} }

@ -0,0 +1,197 @@
/**
* AbstractFederateSearchConnector.java
* Copyright 2015 by Burkhard Buelte
* First released 19.01.2015 at http://yacy.net
*
* This library is free software; you can redistribute it and/or modify it under
* the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 2.1 of the License, or (at your option)
* any later version.
*
* This library is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
* details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt If not, see
* <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.federate;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import javax.servlet.http.HttpServletResponse;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.solr.SchemaConfiguration;
import net.yacy.cora.federate.solr.SchemaDeclaration;
import net.yacy.cora.federate.solr.SolrType;
import net.yacy.cora.sorting.ReversibleScoreMap;
import net.yacy.cora.storage.Configuration;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.search.Switchboard;
import net.yacy.search.query.SearchEvent;
import net.yacy.search.schema.CollectionSchema;
import org.apache.solr.common.SolrDocument;
/**
 * Base implementation class for federated search connectors, providing the
 * basic functionality to search non-YaCy systems:
 * <ul>
 * <li> init() reads the field mapping config file
 * <li> toYaCySchema() converts a remote result to YaCy internal schema field
 * names, intended to be called by query()
 * <li> query() needs to be implemented in the specific connector
 * <li> search() calls query() in a thread and adds the results to the search event
 * </ul>
 * Subclasses need to implement query() and may override toYaCySchema() if more
 * than a basic field mapping is needed.
 */
abstract public class AbstractFederateSearchConnector implements FederateSearchConnector {

    /** Identifying name of this connector instance; also used as thread name suffix and log tag. */
    public String instancename;

    /** Schema conversion config, one entry per field: yacyfieldname = remotefieldname. Null if the config file does not exist. */
    protected SchemaConfiguration localcfg;

    /**
     * Last time the remote system was accessed, used for search delay calculation.
     * NOTE(review): not updated anywhere in this base class; presumably the
     * concrete connector sets it in query() - confirm.
     */
    public long lastaccesstime = -1;

    protected String baseurl;

    /**
     * Inits the connector with the remote field names and their mapping to the
     * YaCy schema, plus other connector specific settings, from a config file.
     * The config must contain a mapping for "sku" or, alternatively, a
     * "_skufieldname" entry, because one of them is required to build the
     * result url.
     *
     * @param instance internal name of this connector
     * @param cfgFileName path of the schema config file, e.g. DATA/SETTINGS/federatecfg/instanceName.schema
     * @return true if the config was read and contains the mandatory sku mapping, false otherwise
     */
    @Override
    public boolean init(String instance, String cfgFileName) {
        this.instancename = instance;
        File instanceCfgFile = new File(cfgFileName);
        if (!instanceCfgFile.exists()) {
            this.localcfg = null;
            return false;
        }
        try {
            this.localcfg = new SchemaConfiguration(instanceCfgFile);
        } catch (IOException ex) {
            ConcurrentLog.config(this.instancename, "error reading schema " + cfgFileName);
            return false;
        }
        // mandatory to contain a mapping for "sku" or alternatively "_skufieldname" for a conversion to a final url
        if (this.localcfg.contains(CollectionSchema.sku) || this.localcfg.contains("_skufieldname")) {
            return true;
        }
        ConcurrentLog.config(this.instancename, "mandatory mapping for sku or _skufieldname missing in " + cfgFileName);
        return false;
    }

    /**
     * Queries the remote system in a background thread and adds the results to
     * the search event. If theSearch.addResultsToLocalIndex is true the result
     * urls are also handed to the crawler.
     *
     * @param theSearch search event receiving the results
     */
    @Override
    public void search(final SearchEvent theSearch) {
        final Thread job = new Thread() {
            @Override
            public void run() {
                Thread.currentThread().setName("heuristic:" + instancename);
                theSearch.oneFeederStarted();
                List<URIMetadataNode> doclist = null;
                try {
                    doclist = query(theSearch.getQuery());
                    if (doclist != null) {
                        Map<String, LinkedHashSet<String>> snippets = new HashMap<String, LinkedHashSet<String>>(); // add nodes doesn't allow null
                        Map<String, ReversibleScoreMap<String>> facets = new HashMap<String, ReversibleScoreMap<String>>(); // add nodes doesn't allow null
                        theSearch.addNodes(doclist, facets, snippets, false, instancename, doclist.size());
                        for (URIMetadataNode doc : doclist) {
                            theSearch.addHeuristic(doc.hash(), instancename, false);
                        }
                    }
                } finally {
                    // always balance oneFeederStarted(), even if the remote query fails
                    // with a runtime exception; otherwise the search event would keep
                    // counting a feeder that never finishes
                    theSearch.oneFeederTerminated();
                }
                // that's all we need to display the search result

                // optional: add to crawler to get the full resource (later)
                if (doclist != null && !doclist.isEmpty() && theSearch.addResultsToLocalIndex) {
                    Collection<DigestURL> urls = new ArrayList<DigestURL>();
                    for (URIMetadataNode doc : doclist) {
                        urls.add(doc.url());
                    }
                    Switchboard.getSwitchboard().addToCrawler(urls, false);
                }
            }
        };
        job.start();
    }

    /**
     * Converts a remote schema result to the YaCy schema using the field name
     * mapping provided by the config file.
     *
     * @param doc remote result (with remote field names)
     * @return metadata node with field names according to the YaCy schema
     */
    protected URIMetadataNode toYaCySchema(final SolrDocument doc) {
        // set YaCy id: either a mapped "sku" field holds the url, or the field named
        // by "_skufieldname" holds an identifier that is optionally prefixed by "_skuprefix"
        // NOTE(review): a remote document without that field yields a null (or "<prefix>null") url - confirm how callers handle this
        String urlstr;
        if (localcfg.contains("sku")) {
            urlstr = (String) doc.getFieldValue(localcfg.get("sku").getValue());
        } else {
            urlstr = (String) doc.getFieldValue(localcfg.get("_skufieldname").getValue());
            if (this.localcfg.contains("_skuprefix")) {
                String skuprefix = this.localcfg.get("_skuprefix").getValue();
                urlstr = skuprefix + urlstr;
            }
        }

        URIMetadataNode newdoc = new URIMetadataNode(urlstr);
        Iterator<Configuration.Entry> it = localcfg.entryIterator();
        while (it.hasNext()) {
            Configuration.Entry et = it.next();
            String yacyfieldname = et.key(); // config defines yacyfieldname = remotefieldname
            String remotefieldname = et.getValue();
            if (remotefieldname != null && !remotefieldname.isEmpty()) {
                if (Switchboard.getSwitchboard().index.fulltext().getDefaultConfiguration().contains(yacyfieldname)) { // check if in local config
                    SchemaDeclaration est = CollectionSchema.valueOf(yacyfieldname);
                    if (est.isMultiValued()) {
                        if (doc.getFieldValues(remotefieldname) != null) {
                            newdoc.addField(yacyfieldname, doc.getFieldValues(remotefieldname));
                        }
                    } else {
                        if (doc.getFieldValue(remotefieldname) != null) {
                            Object val = doc.getFirstValue(remotefieldname);
                            // watch out for type conversion
                            try {
                                if (est.getType() == SolrType.num_integer && val instanceof String) {
                                    newdoc.setField(yacyfieldname, Integer.parseInt((String) val));
                                } else {
                                    newdoc.setField(yacyfieldname, val);
                                }
                            } catch (Exception ignored) {
                                // possible parse error or type mismatch: skip this field, keep the rest of the document
                            }
                        }
                    }
                }
            }
        }
        newdoc.addField(CollectionSchema.httpstatus_i.name(), HttpServletResponse.SC_OK); // yacy required
        return newdoc;
    }
}

@ -0,0 +1,62 @@
/**
* FederateSearchConnector.java
* Copyright 2015 by Burkhard Buelte
* First released 19.01.2015 at http://yacy.net
*
* This library is free software; you can redistribute it and/or modify it under
* the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 2.1 of the License, or (at your option)
* any later version.
*
* This library is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
* details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt If not, see
* <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.federate;
import java.util.List;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.search.query.QueryParams;
import net.yacy.search.query.SearchEvent;
/**
 * Contract of a query connector that searches a non-YaCy system and gathers
 * its results (used by the YaCy heuristic options).
 */
public interface FederateSearchConnector {

    /**
     * Loads the configuration of this connector. Every connector needs at
     * least a query target (where to query) and a definition of how to convert
     * the remote search result to the internal result presentation (field
     * mapping).
     *
     * @param instanceName name identifying this connector instance
     * @param cfg connector specific config parameter
     * @return true on success, false otherwise
     */
    boolean init(String instanceName, String cfg);

    /**
     * Queries the remote system and adds the result metadata to the result
     * list of the given search event. If SearchEvent.addResultsToLocalIndex is
     * set (the default), the result urls are added to the crawler.
     *
     * @param theSearch search event receiving the results
     */
    void search(SearchEvent theSearch);

    /**
     * Queries the remote system and returns the search result with field
     * names according to the YaCy schema.
     *
     * @param query the query to send
     * @return result (metadata) in YaCy schema format
     */
    List<URIMetadataNode> query(QueryParams query);
}

@ -0,0 +1,427 @@
/**
* FederateSearchManager.java
* Copyright 2015 by Burkhard Buelte
* First released 19.01.2015 at http://yacy.net
*
* This library is free software; you can redistribute it and/or modify it under
* the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 2.1 of the License, or (at your option)
* any later version.
*
* This library is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
* details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt If not, see
* <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.federate;
import net.yacy.cora.federate.opensearch.OpenSearchConnector;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import net.yacy.cora.document.analysis.Classification;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.storage.Configuration;
import net.yacy.cora.storage.Configuration.Entry;
import net.yacy.cora.storage.Files;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.parser.xml.opensearchdescriptionReader;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.util.Bitfield;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
import net.yacy.search.query.QueryGoal;
import net.yacy.search.query.QueryModifier;
import net.yacy.search.query.QueryParams;
import net.yacy.search.query.SearchEvent;
import net.yacy.search.schema.WebgraphSchema;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
/**
* Handling of queries to configured remote OpenSearch systems.
*/
public class FederateSearchManager {
private final int accessDelay = 15000; // minimum delay between two requests to the same remote system (in ms)
private File confFile = null; // set by the constructor to DATA/SETTINGS/heuristicopensearch.conf below the data path, or by init(cfgFileName)
private HashSet<AbstractFederateSearchConnector> conlist; // connectors created from the enabled config entries
protected Configuration cfg; // parsed content of confFile
private static FederateSearchManager manager = null; // self reference handed out by the static getManager()
/**
 * Creates the manager and one connector for every enabled entry of
 * DATA/SETTINGS/heuristicopensearch.conf. If that file does not exist yet,
 * the defaults (config file and federatecfg directory) are copied from the
 * application path first. The created instance registers itself as the
 * instance returned by getManager().
 *
 * @param sb switchboard providing data and application paths; if null an
 * empty manager without config file is created (and not registered)
 */
public FederateSearchManager(Switchboard sb) {
    super();
    this.conlist = new HashSet<AbstractFederateSearchConnector>();

    // from here we need Switchboard settings
    if (sb == null) {
        return;
    }
    // Data needed active name, url(template), desc, rule-when-to-use, specifics
    confFile = new File(sb.getDataPath(), "DATA/SETTINGS/heuristicopensearch.conf");
    if (!confFile.exists()) {
        try {
            Files.copy(new File(sb.appPath, "defaults/heuristicopensearch.conf"), confFile);
            File defdir = new File(sb.dataPath, "DATA/SETTINGS/federatecfg");
            if (!defdir.exists()) {
                Files.copy(new File(sb.appPath, "defaults/federatecfg"), defdir);
            }
        } catch (IOException ex) {
            // best effort: the manager still works (without targets), but leave a trace why
            ConcurrentLog.warn("FederateSearchManager", "could not copy default config to " + confFile.getAbsolutePath() + ": " + ex.getMessage());
        }
    }
    // read settings config file
    if (confFile.exists()) {
        try {
            cfg = new Configuration(confFile);
            Iterator<Entry> it = cfg.entryIterator();
            while (it.hasNext()) {
                Entry cfgentry = it.next();
                String url = cfgentry.getValue();
                if (cfgentry.enabled() && url != null && !url.isEmpty()) {
                    String name = cfgentry.key();
                    if (url.startsWith("cfgfile:")) { // is cfgfile with field mappings (no opensearch url)
                        // format prefix:connectortype:configfilename
                        // example cfgfile:solrconnector:testsys.solr.schema
                        String[] parts = url.split(":");
                        // a truncated entry (e.g. "cfgfile:solrconnector") must be reported, not crash the manager
                        if (parts.length >= 3 && parts[1].equalsIgnoreCase("solrconnector")) {
                            SolrFederateSearchConnector sfc = new SolrFederateSearchConnector();
                            if (sfc.init(name, sb.getDataPath() + "/DATA/SETTINGS/federatecfg/" + parts[2])) {
                                conlist.add(sfc);
                            }
                        } else {
                            ConcurrentLog.config("FederateSearchManager", "Error in configuration of: " + url);
                        }
                    } else { // handle opensearch url template
                        OpenSearchConnector osc = new OpenSearchConnector();
                        if (osc.init(name, url)) {
                            conlist.add(osc);
                        }
                    }
                }
            }
        } catch (IOException ex) {
            ConcurrentLog.logException(ex);
        }
    }
    manager = this; // reference for static access via .getManager()
}
/**
 * Returns the manager instance, creating it from the current Switchboard on
 * first use. There should be only one instance running, so use this method
 * to get or initialize the manager.
 *
 * @return the manager instance
 */
public static FederateSearchManager getManager() {
    if (manager != null) {
        return manager;
    }
    manager = new FederateSearchManager(Switchboard.getSwitchboard());
    return manager;
}
/**
 * Passes the search event to every connector currently selected by
 * getBest(). Does nothing if the event is null or its query domain is
 * LOCAL.
 *
 * @param theSearch search event receiving the remote results
 */
public void search(SearchEvent theSearch) {
    if (theSearch == null || theSearch.query.isLocal()) {
        return;
    }
    for (AbstractFederateSearchConnector connector : getBest(theSearch.getQuery())) {
        connector.search(theSearch);
    }
}
/**
 * Sends a query to the configured remote systems selected by getBest() and
 * collects their results.
 * NOTE(review): this method only queries when query.isLocal() is true, while
 * search(SearchEvent) only queries when it is false; query(String) relies on
 * the current condition (it builds a LOCAL query) - confirm the intent.
 *
 * @param query the query to send
 * @return list of results according to YaCy schema (possibly empty), or
 * null if query.isLocal() is false
 */
public List<URIMetadataNode> query(QueryParams query) {
    if (query.isLocal()) {
        List<URIMetadataNode> sdl = new ArrayList<URIMetadataNode>();
        Set<AbstractFederateSearchConnector> picklist = getBest(query);
        for (AbstractFederateSearchConnector fsc : picklist) {
            // a connector may deliver no list at all (search() of the connector
            // checks for null as well); addAll(null) would throw
            List<URIMetadataNode> part = fsc.query(query);
            if (part != null) {
                sdl.addAll(part);
            }
        }
        return sdl;
    } else {
        return null;
    }
}
/**
 * Convenience entry point: wraps a plain search string into QueryParams
 * (search domain LOCAL, content domain ALL, 100 results from offset 0) and
 * delegates to query(QueryParams).
 *
 * @param querystr the search string
 * @return list of remote query results according to YaCy schema
 */
public List<URIMetadataNode> query(String querystr) {
    final QueryGoal goal = new QueryGoal(querystr);
    final Switchboard switchboard = Switchboard.getSwitchboard();
    final Bitfield constraint = new Bitfield();
    final QueryParams params = new QueryParams(
            goal,
            new QueryModifier(),
            Integer.MAX_VALUE,
            "",
            Classification.ContentDomain.ALL,
            "", // lang
            null,
            CacheStrategy.IFFRESH,
            100, 0, // count, offset
            ".*", // urlmask
            null,
            null,
            QueryParams.Searchdom.LOCAL,
            constraint,
            false,
            null,
            MultiProtocolURL.TLD_any_zone_filter,
            "",
            false,
            switchboard.index,
            switchboard.getRanking(),
            "", // userAgent
            false,
            false,
            0.0, 0.0, -1.0,
            new String[0]);
    return query(params);
}
/**
 * Adds an OpenSearch target system to the config file and, if it is active
 * and its connector initializes, to the in-memory connector list.
 *
 * @param name name of the target system (config key); must not be null or empty
 * @param urlTemplate query template url
 * @param active whether the entry is enabled
 * @param comment comment stored with the config entry
 * @return true if the entry was written to the config file; false if no
 * config file is set, the name is missing, or the config file could not be
 * read or written
 */
public boolean addOpenSearchTarget(String name, String urlTemplate, boolean active, String comment) {
    if (confFile == null || name == null || name.isEmpty()) {
        return false;
    }
    try {
        Configuration conf = new Configuration(confFile);
        conf.add(name, null, active);
        Configuration.Entry e = conf.get(name);
        e.setValue(urlTemplate);
        e.setEnable(active);
        e.setComment(comment);
        conf.put(name, e);
        try {
            conf.commit();
        } catch (final IOException ex) {
            // nothing was persisted and no connector was activated: report the failure to the caller
            ConcurrentLog.warn("FederateSearchManager", "config file write error");
            ConcurrentLog.logException(ex);
            return false;
        }
        if (active) {
            OpenSearchConnector osd = new OpenSearchConnector();
            if (osd.init(name, urlTemplate)) {
                conlist.add(osd);
            }
        }
        return true;
    } catch (final IOException e1) {
        ConcurrentLog.logException(e1);
        return false;
    }
}
/**
 * Reports how many connectors (active remote query target systems) are
 * currently held by this manager.
 *
 * @return number of entries in the connector list
 */
public int getSize() {
    return this.conlist.size();
}
/**
 * Selects the connectors to use for a search. Currently the query itself is
 * not evaluated; the only criterion is the minimum access delay, i.e. a
 * connector is picked if its last access lies more than accessDelay
 * milliseconds in the past.
 *
 * @param query the query to find targets for
 * @return set of connectors that may be queried now
 */
protected Set<AbstractFederateSearchConnector> getBest(final QueryParams query) {
    final HashSet<AbstractFederateSearchConnector> eligible = new HashSet<AbstractFederateSearchConnector>();
    for (final AbstractFederateSearchConnector candidate : conlist) {
        // enforce 15 sec delay between searches to same system
        final long earliestNextAccess = candidate.lastaccesstime + accessDelay;
        if (earliestNextAccess < System.currentTimeMillis()) {
            eligible.add(candidate);
        }
    }
    return eligible;
}
/**
 * Discovers opensearch description links from the local (embedded) Solr
 * webgraph index by querying for links with target_rel_s:search, and adds
 * the systems found (as inactive entries) to the config file. The scan runs
 * as a background thread in chunks of 20 documents and stops after at most
 * one hour.
 *
 * @param sb switchboard giving access to the index and config
 * @return true if the background discover job was started or the index
 * contains no candidate links; false if sb is null, the webgraph index or
 * the required schema fields are not enabled, or the initial query failed
 */
public boolean discoverFromSolrIndex(Switchboard sb) {
    if (sb == null) {
        return false;
    }
    // check if needed Solr fields are available (selected)
    if (!sb.index.fulltext().useWebgraph()) {
        ConcurrentLog.severe("FederateSearchManager", "Error on connecting to embedded Solr webgraph index");
        return false;
    }
    final SolrConnector connector = sb.index.fulltext().getWebgraphConnector();
    final boolean metafieldavailable = sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_rel_s.name())
            && (sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_protocol_s.name()) && sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_urlstub_s.name()))
            && sb.getConfigBool(SwitchboardConstants.CORE_SERVICE_WEBGRAPH, false);
    if (!metafieldavailable) {
        ConcurrentLog.warn("FederateSearchManager", "webgraph option and webgraph Schema fields target_rel_s, target_protocol_s and target_urlstub_s must be switched on");
        return false;
    }
    // the solr search
    final String webgraphquerystr = WebgraphSchema.target_rel_s.getSolrFieldName() + ":search";
    final String[] webgraphqueryfields = {WebgraphSchema.target_protocol_s.getSolrFieldName(), WebgraphSchema.target_urlstub_s.getSolrFieldName()};
    // alternatively target_protocol_s + "://" +target_host_s + target_path_s

    // probe with a single row first, just to learn the total number of hits
    final long numfound;
    try {
        SolrDocumentList docList = connector.getDocumentListByQuery(webgraphquerystr, null, 0, 1, webgraphqueryfields);
        numfound = docList.getNumFound();
        if (numfound == 0) {
            ConcurrentLog.info("FederateSearchManager", "no results found, abort discover job");
            return true;
        }
        ConcurrentLog.info("FederateSearchManager", "start checking " + Long.toString(numfound) + " found index results");
    } catch (final IOException ex) {
        ConcurrentLog.logException(ex);
        return false;
    }

    final long stoptime = System.currentTimeMillis() + 1000 * 3600; // make sure job doesn't run forever

    // job to iterate through Solr index to find links to opensearchdescriptions
    // started as background job as connect timeouts may cause it run a long time
    final Thread job = new Thread() {
        @Override
        public void run() {
            try {
                boolean doloop = true;
                int loopnr = 0;
                Set<String> dblmem = new HashSet<String>(); // temp memory for already checked url
                while (doloop) {
                    ConcurrentLog.info("FederateSearchManager", "start Solr query loop at " + Integer.toString(loopnr * 20) + " of " + Long.toString(numfound));
                    SolrDocumentList docList = connector.getDocumentListByQuery(webgraphquerystr, null, loopnr * 20, 20, webgraphqueryfields); // check chunk of 20 result documents
                    loopnr++;
                    if (stoptime < System.currentTimeMillis()) { // stop after max 1h (the chunk just fetched is still processed)
                        doloop = false;
                        ConcurrentLog.info("FederateSearchManager", "long running discover task aborted");
                    }
                    if (docList != null && docList.size() > 0) {
                        Iterator<SolrDocument> docidx = docList.iterator();
                        while (docidx.hasNext()) {
                            SolrDocument sdoc = docidx.next();
                            String hrefurltxt = sdoc.getFieldValue(WebgraphSchema.target_protocol_s.getSolrFieldName()) + "://" + sdoc.getFieldValue(WebgraphSchema.target_urlstub_s.getSolrFieldName());
                            try {
                                URL url = new URL(hrefurltxt);
                                //TODO: check Blacklist
                                if (dblmem.add(url.getAuthority())) { // use only main path to detect double entries
                                    opensearchdescriptionReader os = new opensearchdescriptionReader(hrefurltxt);
                                    final String feedurl = os.getRSSorAtomUrl();
                                    if (feedurl != null) {
                                        // add found system to config file; only report it as added if that really worked
                                        final String shortname = os.getShortName();
                                        if (addOpenSearchTarget(shortname, feedurl, false, os.getItem("LongName"))) {
                                            ConcurrentLog.info("FederateSearchManager", "added " + shortname + " " + hrefurltxt);
                                        } else {
                                            ConcurrentLog.warn("FederateSearchManager", "could not add " + shortname + " " + hrefurltxt + " to config file");
                                        }
                                    } else {
                                        ConcurrentLog.info("FederateSearchManager", "osd.xml check failed (no RSS or Atom support) for " + hrefurltxt);
                                    }
                                }
                            } catch (final MalformedURLException ignored) {
                                // index entry does not form a valid url: nothing to check, continue with next document
                            }
                        }
                    } else {
                        doloop = false;
                    }
                }
                ConcurrentLog.info("FederateSearchManager", "finisched Solr query (checked " + Integer.toString(dblmem.size()) + " unique opensearchdescription links found in " + Long.toString(numfound) + " results)");
            } catch (final IOException ex) {
                ConcurrentLog.logException(ex);
            }
        }
    };
    job.start();
    return true;
}
/**
 * Read or reread the opensearch config file and initialize the connectors.
 * <p>
 * Only enabled config entries with a non-empty value are turned into
 * connectors. A value is either an opensearch url template or a reference
 * to a connector specific config file in the form
 * {@code cfgfile:<connectortype>:<relative path to connector-cfg-file>}
 * (example: {@code cfgfile:solrconnector:testsys.solr.schema}). The path is
 * resolved relative to the {@code federatecfg} directory next to the config file.
 * <p>
 * A malformed {@code cfgfile:} entry is logged and skipped, it does not
 * abort the initialization of the remaining entries.
 *
 * @param cfgFileName path of the config file
 * @return currently always {@code true}; a missing config file is ignored
 *         (the connector list is left untouched) and read errors are only logged
 */
public boolean init(String cfgFileName) {
    confFile = new File(cfgFileName);
    if (confFile.exists()) {
        try {
            cfg = new Configuration(confFile);
            this.conlist.clear(); // prevent double entries on reread
            Iterator<Entry> it = cfg.entryIterator();
            while (it.hasNext()) {
                Entry cfgentry = it.next();
                if (!cfgentry.enabled()) {
                    continue; // hold only enabled in memory
                }
                String name = cfgentry.key();
                String url = cfgentry.getValue();
                if (url == null || url.isEmpty()) {
                    continue;
                }
                if (url.startsWith("cfgfile:")) { // is cfgfile with field mappings (no opensearch url)
                    // config entry has 3 parts separated by : 1=cfgfile 2=connectortype 3=relative path to connector-cfg-file
                    // limit of 3 keeps a path containing ':' intact and keeps an empty 3rd part detectable
                    String[] parts = url.split(":", 3);
                    if (parts.length == 3 && !parts[2].isEmpty() && parts[1].equalsIgnoreCase("solrconnector")) {
                        SolrFederateSearchConnector sfc = new SolrFederateSearchConnector();
                        if (sfc.init(name, confFile.getParent() + "/federatecfg/" + parts[2])) {
                            conlist.add(sfc);
                        }
                    } else {
                        // unknown connector type or incomplete entry
                        ConcurrentLog.config("FederateSearchManager", "Init error in configuration of: " + url);
                    }
                } else { // handle opensearch url template
                    OpenSearchConnector osd = new OpenSearchConnector();
                    if (osd.init(name, url)) {
                        conlist.add(osd);
                    }
                }
            }
        } catch (IOException ex) {
            ConcurrentLog.logException(ex);
        }
    }
    return true;
}
}

@ -0,0 +1,119 @@
/**
* SolrFederateSearchConnector.java
* Copyright 2015 by Burkhard Buelte
* First released 19.01.2015 at http://yacy.net
*
* This library is free software; you can redistribute it and/or modify it under
* the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 2.1 of the License, or (at your option)
* any later version.
*
* This library is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
* details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt If not, see
* <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.federate;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import net.yacy.cora.federate.solr.connector.RemoteSolrConnector;
import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.cora.federate.solr.instance.RemoteInstance;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.search.query.QueryParams;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.ModifiableSolrParams;
/**
* Search connecter to collect query results from remote Solr systems which
* provide results as Solr documents
*/
public class SolrFederateSearchConnector extends AbstractFederateSearchConnector {
private String corename;
@Override
public boolean init(String instance, String cfgFileName) {
boolean initResult = super.init(instance, cfgFileName); // init local schema cfg
if (initResult) {
if (this.localcfg.contains("_baseurl")) {
setBaseurl(this.localcfg.get("_baseurl").getValue());
} else {
ConcurrentLog.config(instance, "no _baseurl given in config file "+cfgFileName);
initResult = false;
}
if (this.localcfg.contains("_corename")) {
setCoreName(this.localcfg.get("_corename").getValue());
} else {
ConcurrentLog.config(instance, "no _corename given in config file "); // not mandatory
this.corename = "";
}
}
return initResult;
}
public void setBaseurl(String url) {
if (url.endsWith("/")) {
this.baseurl = url;
} else {
this.baseurl = url + "/";
}
}
public void setCoreName(String core) {
this.corename = core;
}
/**
* Core query implementation
* all query and search routines will use this routine to query the remote system
*
* @param query
* @return list of solr documents (metadata) accordng to local YaCy internal schema
*/
@Override
public List<URIMetadataNode> query(QueryParams query) {
List<URIMetadataNode> docs = new ArrayList<URIMetadataNode>();
Collection<String> remotecorename = new ArrayList<String>();
remotecorename.add(corename);
ModifiableSolrParams msp = new SolrQuery(query.getQueryGoal().getQueryString(false));
msp.add(CommonParams.QT, "/"); // important to override default append of /select
msp.add(CommonParams.ROWS, Integer.toString(query.itemsPerPage));
try {
RemoteInstance instance = new RemoteInstance(baseurl, remotecorename, corename, 20000);
try {
SolrConnector solrConnector = new RemoteSolrConnector(instance, false, null);
try {
this.lastaccesstime = System.currentTimeMillis();
SolrDocumentList docList = solrConnector.getDocumentListByParams(msp);
// convert to YaCy schema documentlist
for (SolrDocument doc : docList) {
URIMetadataNode anew = toYaCySchema(doc);
docs.add(anew);
}
} catch (IOException | SolrException e) {
} finally {
solrConnector.close();
}
} catch (Throwable ee) {
} finally {
instance.close();
}
} catch (IOException eee) {
}
return docs;
}
}

@ -19,107 +19,45 @@
*/ */
package net.yacy.cora.federate.opensearch; package net.yacy.cora.federate.opensearch;
import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import java.net.URL; import java.util.ArrayList;
import java.util.HashSet; import java.util.Arrays;
import java.util.Iterator; import java.util.List;
import java.util.Set; import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.feed.RSSFeed;
import net.yacy.cora.federate.solr.connector.SolrConnector; import net.yacy.cora.document.feed.RSSMessage;
import net.yacy.cora.storage.Configuration; import net.yacy.cora.document.feed.RSSReader;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.AbstractFederateSearchConnector;
import net.yacy.cora.federate.FederateSearchConnector;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.SpaceExceededException; import net.yacy.document.TextParser;
import net.yacy.document.parser.xml.opensearchdescriptionReader; import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.blob.Tables; import net.yacy.search.query.QueryParams;
import net.yacy.search.Switchboard; import net.yacy.search.schema.CollectionSchema;
import net.yacy.search.SwitchboardConstants;
import net.yacy.search.query.SearchEvent;
import net.yacy.search.schema.WebgraphSchema;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
/** /**
* Handling of queries to remote OpenSearch systems. Iterates to a list of * Handling of queries to remote OpenSearch systems. Iterates to a list of
* configured systems until number of needed results are available. Uses a * configured systems until number of needed results are available.
* temporary work table to store search template urls for the iteration during
* search.
*/ */
public class OpenSearchConnector { public class OpenSearchConnector extends AbstractFederateSearchConnector implements FederateSearchConnector {
private File confFile = null; // later initialized to DATA/SETTINGS/heuristicopensearch.conf
private int size = 0; // remember the size of active opensearch targets
public OpenSearchConnector(Switchboard sb, boolean createworktable) {
super();
if (sb == null) {
return;
}
confFile = new File(sb.getDataPath(), "DATA/SETTINGS/heuristicopensearch.conf");
if (createworktable) { // read from config file and create worktable
sb.tables.clear("opensearchsys");
try {
Configuration cfg = new Configuration(confFile);
// copy active opensearch systems to a work table (opensearchsys)
Iterator<Configuration.Entry> cfgentries = cfg.entryIterator();
while (cfgentries.hasNext()) {
Configuration.Entry e = cfgentries.next();
if (e.enabled()) {
String title = e.key(); // get the title
String urlstr = e.getValue(); // get the search template url
Tables.Data row = new Tables.Data(); @Override
row.put("title", title); public boolean init(final String name, final String urltemplate) {
row.put("url", urlstr); this.baseurl = urltemplate;
try { this.instancename = name;
sb.tables.insert("opensearchsys", row); this.localcfg = null; // no field mapping needed
} catch (final SpaceExceededException ex) { return true;
ConcurrentLog.logException(ex);
}
}
}
size = sb.tables.size("opensearchsys");
} catch (final IOException ex) {
ConcurrentLog.logException(ex);
}
}
}
/**
* Sends a search request to remote systems listed in worktable until the
* searchevent contains less than needed results. Depending on already
* collected search results none to all configured systems are queried to
* complete available search results.
* if query search domain is LOCAL procedure does nothing.
*/
static public void query(Switchboard sb, SearchEvent theSearch) {
if (theSearch != null && sb != null) {
if (!theSearch.query.isLocal()) {
try {
Iterator<Tables.Row> ossysworktable = sb.tables.iterator("opensearchsys");
//int needres = theSearch.query.neededResults(); // get number of needed results
while (ossysworktable.hasNext() /*&& theSearch.query.getResultCount() < needres*/) {
Tables.Row row = ossysworktable.next();
String osurl = row.get("url", "");
String name = row.get("title", "");
sb.heuristicRSS(parseSearchTemplate(osurl, theSearch.query.getQueryGoal().getQueryString(false), 0, theSearch.query.itemsPerPage), theSearch, name);
}
} catch (final IOException ex) {
ConcurrentLog.warn("OpenSearchConnector.query", "failed reading table opensearchsys");
}
}
}
} }
/** /**
* replace Opensearchdescription search template parameter with actual values * replace Opensearchdescription search template parameter with actual values
*/ */
private static String parseSearchTemplate(String searchurltemplate, String query, int start, int rows) { private String parseSearchTemplate(String searchurltemplate, String query, int start, int rows) {
String tmps = searchurltemplate.replaceAll("\\?}", "}"); // some optional parameters may include question mark '{param?}=' String tmps = searchurltemplate.replaceAll("\\?}", "}"); // some optional parameters may include question mark '{param?}='
tmps = tmps.replace("{startIndex}", Integer.toString(start)); tmps = tmps.replace("{startIndex}", Integer.toString(start));
tmps = tmps.replace("{startPage}", ""); tmps = tmps.replace("{startPage}", "");
@ -131,138 +69,76 @@ public class OpenSearchConnector {
} }
/** /**
* add a opensearch target system to the config file * queries remote system and returns the resultlist (waits until results
* transmitted or timeout) This is the main access routine used for the
* serach and query operation For internal access delay time, also the
* this.lastaccessed time needs to be set here.
*
* @return query results (metadata) with fields according to YaCy schema
*/ */
public boolean add(String name, String url, boolean active, String comment) { @Override
if (confFile == null) { public List<URIMetadataNode> query(QueryParams query) {
return false; List<URIMetadataNode> docs = new ArrayList<URIMetadataNode>();
}
// see http://www.loc.gov/standards/sru/
String searchurl = this.parseSearchTemplate(baseurl, query.getQueryGoal().getQueryString(false), 0, query.itemsPerPage);
try { try {
Configuration conf = new Configuration(confFile); MultiProtocolURL aurl = new MultiProtocolURL(MultiProtocolURL.unescape(searchurl));
if (name != null && !name.isEmpty()) { try {
conf.add(name, null, active); this.lastaccesstime = System.currentTimeMillis();
Configuration.Entry e = conf.get(name); final HTTPClient httpClient = new HTTPClient(ClientIdentification.yacyIntranetCrawlerAgent);
e.setValue(url); byte[] result = httpClient.GETbytes(aurl, null, null, false);
e.setEnable(active); RSSReader rssReader = RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, result);
e.setComment(comment); if (rssReader != null) {
conf.put(name, e); final RSSFeed feed = rssReader.getFeed();
try { if (feed != null) {
conf.commit(); for (final RSSMessage item : feed) {
} catch (final IOException ex) { try {
ConcurrentLog.warn("OpenSearchConnector.add", "config file write error"); DigestURL uri = new DigestURL(item.getLink());
}
return true; URIMetadataNode doc = new URIMetadataNode(uri);
} doc.setField(CollectionSchema.charset_s.getSolrFieldName(), UTF8.charset.name());
} catch (final IOException e1) { doc.setField(CollectionSchema.author.getSolrFieldName(), item.getAuthor());
ConcurrentLog.logException(e1); doc.setField(CollectionSchema.title.getSolrFieldName(), item.getTitle());
return false; doc.setField(CollectionSchema.language_s.getSolrFieldName(), item.getLanguage());
} doc.setField(CollectionSchema.last_modified.getSolrFieldName(), item.getPubDate());
return false; final String mime = TextParser.mimeOf(uri);
} if (mime != null) {
doc.setField(CollectionSchema.content_type.getSolrFieldName(), mime);
/** }
* Get the number of active remote opensearch target systems if (item.getCategory().isEmpty()) {
*/ doc.setField(CollectionSchema.keywords.getSolrFieldName(), Arrays.toString(item.getSubject()));
public int getSize() { } else {
return size; doc.setField(CollectionSchema.keywords.getSolrFieldName(), Arrays.toString(item.getSubject()) + " " + item.getCategory());
} }
doc.setField(CollectionSchema.publisher_t.getSolrFieldName(), item.getCopyright());
/**
* Discover opensearch description links from local (embedded) Solr index using
* meta data field 'outboundlinks_tag_txt' and add found systems to the
* config file
*
* @return true if background discover job was started, false if job not started
*/
public boolean discoverFromSolrIndex(final Switchboard sb) {
if (sb == null) {
return false;
}
// check if needed Solr fields are available (selected)
if (!sb.index.fulltext().useWebgraph()) {
ConcurrentLog.severe("OpenSearchConnector.Discover", "Error on connecting to embedded Solr webgraph index");
return false;
}
final SolrConnector connector = sb.index.fulltext().getWebgraphConnector();
final boolean metafieldavailable = sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_rel_s.name())
&& ( sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_protocol_s.name()) && sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_urlstub_s.name()) )
&& sb.getConfigBool(SwitchboardConstants.CORE_SERVICE_WEBGRAPH, false);
if (!metafieldavailable) {
ConcurrentLog.warn("OpenSearchConnector.Discover", "webgraph option and webgraph Schema fields target_rel_s, target_protocol_s and target_urlstub_s must be switched on");
return false;
}
// the solr query
final String webgraphquerystr = WebgraphSchema.target_rel_s.getSolrFieldName() + ":search";
final String[] webgraphqueryfields = { WebgraphSchema.target_protocol_s.getSolrFieldName() , WebgraphSchema.target_urlstub_s.getSolrFieldName()};
// alternatively target_protocol_s + "://" +target_host_s + target_path_s
final long numfound;
try {
SolrDocumentList docList = connector.getDocumentListByQuery(webgraphquerystr, null, 0, 1, webgraphqueryfields);
numfound = docList.getNumFound();
if (numfound == 0) {
ConcurrentLog.info("OpenSearchConnector.Discover", "no results found, abort discover job");
return true;
}
ConcurrentLog.info("OpenSearchConnector.Discover", "start checking " + Long.toString(numfound) + " found index results");
} catch (final IOException ex) {
ConcurrentLog.logException(ex);
return false;
}
final long stoptime = System.currentTimeMillis() + 1000 * 3600; // make sure job doesn't run forever // TODO: we likely got only a search related snippet (take is as text content)
// we need a way to differentiate metadata from full crawl data in the index (would be also good for rwi transferred/received metadata)
// or considere to add this to snippet cache, without adding text_t
doc.setField(CollectionSchema.text_t.getSolrFieldName(), item.getDescriptions());
// job to iterate through Solr index to find links to opensearchdescriptions if (item.getLat() != 0.0 && item.getLon() != 0.0) {
// started as background job as connect timeouts may cause it run a long time doc.setField(CollectionSchema.coordinate_p.getSolrFieldName(), item.getLat() + "," + item.getLon());
final Thread job = new Thread() { }
@Override if (item.getSize() > 0) {
public void run() { doc.setField(CollectionSchema.size_i.getSolrFieldName(), item.getSize());
try {
boolean doloop = true;
int loopnr = 0;
Set<String> dblmem = new HashSet<String>(); // temp memory for already checked url
while (doloop) {
ConcurrentLog.info("OpenSearchConnector.Discover", "start Solr query loop at " + Integer.toString(loopnr * 20) + " of " + Long.toString(numfound));
SolrDocumentList docList = connector.getDocumentListByQuery(webgraphquerystr, null, loopnr * 20, 20,webgraphqueryfields); // check chunk of 20 result documents
loopnr++;
if (stoptime < System.currentTimeMillis()) {// stop after max 1h
doloop = false;
ConcurrentLog.info("OpenSearchConnector.Discover", "long running discover task aborted");
}
if (docList != null && docList.size() > 0) {
Iterator<SolrDocument> docidx = docList.iterator();
while (docidx.hasNext()) {
SolrDocument sdoc = docidx.next();
String hrefurltxt = sdoc.getFieldValue(WebgraphSchema.target_protocol_s.getSolrFieldName()) + "://" + sdoc.getFieldValue(WebgraphSchema.target_urlstub_s.getSolrFieldName());
try {
URL url = new URL(hrefurltxt);
//TODO: check Blacklist
if (dblmem.add(url.getAuthority())) { // use only main path to detect double entries
opensearchdescriptionReader os = new opensearchdescriptionReader(hrefurltxt);
if (os.getRSSorAtomUrl() != null) {
// add found system to config file
add(os.getShortName(), os.getRSSorAtomUrl(), false, os.getItem("LongName"));
ConcurrentLog.info("OpenSearchConnector.Discover", "added " + os.getShortName() + " " + hrefurltxt);
} else {
ConcurrentLog.info("OpenSearchConnector.Discover", "osd.xml check failed (no RSS or Atom support) for " + hrefurltxt);
}
}
} catch (final MalformedURLException ex) {
} }
docs.add(doc);
} catch (final MalformedURLException e) {
} }
} else {
doloop = false;
} }
ConcurrentLog.info("OpenSerachConnector", "received " + docs.size() + " results from " + this.instancename);
} }
ConcurrentLog.info("OpenSearchConnector.Discover", "finisched Solr query (checked " + Integer.toString(dblmem.size()) + " unique opensearchdescription links found in " + Long.toString(numfound) + " results)");
} catch (final IOException ex) {
ConcurrentLog.logException(ex);
} }
} catch (IOException ex) {
ConcurrentLog.logException(ex);
ConcurrentLog.info("OpenSearchConnector", "no connection to " + searchurl);
} }
}; } catch (MalformedURLException ee) {
job.start(); ConcurrentLog.warn("OpenSearchConnector", "malformed url " + searchurl);
return true; }
return docs;
} }
} }

@ -69,7 +69,7 @@ public class URIMetadataNode extends SolrDocument {
private static final long serialVersionUID = -256046934741561968L; private static final long serialVersionUID = -256046934741561968L;
protected String keywords = null; protected String keywords = null;
protected DigestURL url = null; protected DigestURL url;
protected Bitfield flags = null; protected Bitfield flags = null;
protected int imagec = -1, audioc = -1, videoc = -1, appc = -1; protected int imagec = -1, audioc = -1, videoc = -1, appc = -1;
protected double lat = Double.NaN, lon = Double.NaN; protected double lat = Double.NaN, lon = Double.NaN;
@ -150,7 +150,6 @@ public class URIMetadataNode extends SolrDocument {
for (String name : doc.getFieldNames()) { for (String name : doc.getFieldNames()) {
this.addField(name, doc.getFieldValue(name)); this.addField(name, doc.getFieldValue(name));
} }
this.snippet = "";
Float scorex = (Float) doc.getFieldValue("score"); // this is a special field containing the ranking score of a search result Float scorex = (Float) doc.getFieldValue("score"); // this is a special field containing the ranking score of a search result
this.score = scorex == null ? 0.0f : scorex.floatValue(); this.score = scorex == null ? 0.0f : scorex.floatValue();
final byte[] hash = ASCII.getBytes(getString(CollectionSchema.id)); // TODO: can we trust this id ? final byte[] hash = ASCII.getBytes(getString(CollectionSchema.id)); // TODO: can we trust this id ?
@ -169,6 +168,24 @@ public class URIMetadataNode extends SolrDocument {
this.score = scorex; this.score = scorex;
} }
public URIMetadataNode (final String urlstr) {
super();
try {
url = new DigestURL(urlstr);
this.setField(CollectionSchema.sku.name(), url.toNormalform(true));
this.setField(CollectionSchema.id.name(), ASCII.String(url.hash()));
} catch (final MalformedURLException e) {
ConcurrentLog.logException(e);
this.url = null;
}
}
public URIMetadataNode(DigestURL theurl) {
super();
url = theurl;
this.setField(CollectionSchema.sku.name(), url.toNormalform(true));
this.setField(CollectionSchema.id.name(), ASCII.String(url.hash()));
}
/** /**
* Get the content domain of a document. This tries to get the content domain from the mime type * Get the content domain of a document. This tries to get the content domain from the mime type
* and if this fails it uses alternatively the content domain from the file extension. * and if this fails it uses alternatively the content domain from the file extension.

@ -714,16 +714,7 @@ public final class Switchboard extends serverSwitch {
this.log.info("surrogates.out Path = " + this.surrogatesOutPath.getAbsolutePath()); this.log.info("surrogates.out Path = " + this.surrogatesOutPath.getAbsolutePath());
this.surrogatesOutPath.mkdirs(); this.surrogatesOutPath.mkdirs();
*/ */
// copy opensearch heuristic config (if not exist)
final File osdConfig = new File(getDataPath(), "DATA/SETTINGS/heuristicopensearch.conf");
if (!osdConfig.exists()) {
final File osdDefaultConfig = new File(getAppPath(), "defaults/heuristicopensearch.conf");
this.log.info("heuristic.opensearch list Path = " + osdDefaultConfig.getAbsolutePath());
try {
Files.copy(osdDefaultConfig, osdConfig);
} catch (final IOException ex) { }
}
// create the release download directory // create the release download directory
this.releasePath = this.releasePath =
getDataPath(SwitchboardConstants.RELEASE_PATH, SwitchboardConstants.RELEASE_PATH_DEFAULT); getDataPath(SwitchboardConstants.RELEASE_PATH, SwitchboardConstants.RELEASE_PATH_DEFAULT);
@ -3615,7 +3606,9 @@ public final class Switchboard extends serverSwitch {
* @param urlpattern the search query url (e.g. http://search.org?query=searchword) * @param urlpattern the search query url (e.g. http://search.org?query=searchword)
* @param searchEvent * @param searchEvent
* @param feedName short/internal name of the remote system * @param feedName short/internal name of the remote system
* @deprecated use FederateSearchManager(SearchEvent) instead
*/ */
@Deprecated // not used (since 2015-01-18, v1.81)
public final void heuristicRSS( public final void heuristicRSS(
final String urlpattern, final String urlpattern,
final SearchEvent searchEvent, final SearchEvent searchEvent,

Loading…
Cancel
Save