Introduce FederateSearchManager, which handles search heuristics against external systems via specific FederateSearchConnectors. A connector provides the query() functionality, the translation to the YaCy schema via toYaCySchema(), and the search() routine that delivers results to search events, which is generally implemented in the abstract connector. The manager now enforces a minimum delay of 15 s between calls to external systems. Besides the OpenSearchConnector, a SolrFederateSearchConnector is available. It uses an additional config file for field name translation. Default heuristicopensearch.conf: - openbdb.com removed (it no longer seems to deliver results) - config via solrconnector for datacite.org added (large technical library archive)pull/1/head
parent
3b51636ecb
commit
24f68a4eb7
@ -0,0 +1,32 @@
|
||||
## API datacite.org
|
||||
## This service is also available as an API. We use Solr Search Handler for our API calls, the endpoint is: http://search.datacite.org/api.
|
||||
|
||||
## Please check Solr's common query parameters documentation in order to understand how to use API.
|
||||
## Examples
|
||||
|
||||
## http://search.datacite.org/api?q=wind simple search for wind
|
||||
## http://search.datacite.org/api?q=wind&fl=doi,title&rows=5 search for wind, retrieve only doi and title, and return (at max.) 5 results
|
||||
## http://search.datacite.org/api?q=wind&fl=doi,title&wt=csv csv output
|
||||
## http://search.datacite.org/api?q=wind&fl=doi,title&wt=json&indent=true json output
|
||||
|
||||
## YaCy solrconnector specific settings
|
||||
## the base url to access the system
|
||||
_baseurl = http://search.datacite.org/
|
||||
## Solr core, is appended to the _baseurl
|
||||
_corename = api
|
||||
## some systems store an identifier instead of a url for the resource; the prefix is prepended to the identifier found in _skufieldname
|
||||
_skuprefix = http://dx.doi.org/
|
||||
## the field name of the url of resource (in yacy/solr = sku)
|
||||
_skufieldname = doi
|
||||
|
||||
## field mappings
|
||||
## YaCyFieldname = remoteFieldname
|
||||
keywords = subject
|
||||
author = creator
|
||||
publisher_t = publisher
|
||||
title = title
|
||||
description_txt = description
|
||||
language_s = language
|
||||
text_t = description
|
||||
size_i = size
|
||||
coordinate_p = geoLocationPoint
|
@ -0,0 +1,197 @@
|
||||
/**
|
||||
* AbstractFederateSearchConnector.java
|
||||
* Copyright 2015 by Burkhard Buelte
|
||||
* First released 19.01.2015 at http://yacy.net
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or modify it under
|
||||
* the terms of the GNU Lesser General Public License as published by the Free
|
||||
* Software Foundation; either version 2.1 of the License, or (at your option)
|
||||
* any later version.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
||||
* FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
|
||||
* details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with this program in the file lgpl21.txt If not, see
|
||||
* <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package net.yacy.cora.federate;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedHashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import javax.servlet.http.HttpServletResponse;
|
||||
import net.yacy.cora.document.id.DigestURL;
|
||||
import net.yacy.cora.federate.solr.SchemaConfiguration;
|
||||
import net.yacy.cora.federate.solr.SchemaDeclaration;
|
||||
import net.yacy.cora.federate.solr.SolrType;
|
||||
import net.yacy.cora.sorting.ReversibleScoreMap;
|
||||
import net.yacy.cora.storage.Configuration;
|
||||
import net.yacy.cora.util.ConcurrentLog;
|
||||
import net.yacy.kelondro.data.meta.URIMetadataNode;
|
||||
import net.yacy.search.Switchboard;
|
||||
import net.yacy.search.query.SearchEvent;
|
||||
import net.yacy.search.schema.CollectionSchema;
|
||||
import org.apache.solr.common.SolrDocument;
|
||||
|
||||
/**
|
||||
* Base implementation class for Federated Search Connectors providing the basic
|
||||
 * functionality to search non-YaCy systems
|
||||
* <ul>
|
||||
* <li> init() to read config file
|
||||
* <li> toYaCySchema() to convert remote schema fields to YaCy internal schema
|
||||
* names, called by query()
|
||||
* <li> query() needs to be implemented in specific connectors
|
||||
 * <li> search() calls query() in a thread and adds results to internal search request.
|
||||
* </ul>
|
||||
* Subclasses should/need to override query() and maybe toYaCySchema() if more
|
||||
 * is needed than a basic field mapping
|
||||
*/
|
||||
abstract public class AbstractFederateSearchConnector implements FederateSearchConnector {
|
||||
|
||||
public String instancename; // just a identifying name
|
||||
protected SchemaConfiguration localcfg; // the schema conversion cfg for each fieldname, yacyname = remote fieldname
|
||||
public long lastaccesstime = -1; // last time accessed, used for search delay calculation
|
||||
protected String baseurl;
|
||||
|
||||
/**
|
||||
* Inits the connector with the remote field names and matches to yacy
|
||||
* schema and other specific settings from config file. Every connector
|
||||
* needs at least a query target (where to query) and some definition to
|
||||
* convert the remote serch result to the internal result presentation
|
||||
* (field mapping)
|
||||
*
|
||||
* @param instanceName internal name
|
||||
* @param cfgFileName e.g. DATA/SETTINGS/FEDERATECFG/instanceName.SCHEMA
|
||||
* @return true if success false if not
|
||||
*/
|
||||
@Override
|
||||
public boolean init(String instance, String cfgFileName) {
|
||||
this.instancename = instance;
|
||||
File instanceCfgFile = new File(cfgFileName);
|
||||
if (instanceCfgFile.exists()) {
|
||||
try {
|
||||
this.localcfg = new SchemaConfiguration(instanceCfgFile);
|
||||
} catch (IOException ex) {
|
||||
ConcurrentLog.config(this.instancename, "error reading schema " + cfgFileName);
|
||||
return false;
|
||||
}
|
||||
// mandatory to contain a mapping for "sku" or alternatively "cfg_skufieldname" for a conversion to a final url
|
||||
if (this.localcfg.contains(CollectionSchema.sku) || this.localcfg.contains("_skufieldname")) {
|
||||
return true;
|
||||
} else {
|
||||
ConcurrentLog.config(this.instancename, "mandatory mapping for sku or _skufieldname missing in " + cfgFileName);
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
this.localcfg = null;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* queries a remote system and adds the results to the searchevent and to
|
||||
* the crawler if addResultsToLocalIndex is true
|
||||
*
|
||||
* @param theSearch receiving the results
|
||||
*/
|
||||
@Override
|
||||
public void search(final SearchEvent theSearch) {
|
||||
|
||||
final Thread job = new Thread() {
|
||||
@Override
|
||||
public void run() {
|
||||
Thread.currentThread().setName("heuristic:" + instancename);
|
||||
theSearch.oneFeederStarted();
|
||||
List<URIMetadataNode> doclist = query(theSearch.getQuery());
|
||||
if (doclist != null) {
|
||||
Map<String, LinkedHashSet<String>> snippets = new HashMap<String, LinkedHashSet<String>>(); // add nodes doesn't allow null
|
||||
Map<String, ReversibleScoreMap<String>> facets = new HashMap<String, ReversibleScoreMap<String>>(); // add nodes doesn't allow null
|
||||
theSearch.addNodes(doclist, facets, snippets, false, instancename, doclist.size());
|
||||
|
||||
for (URIMetadataNode doc : doclist) {
|
||||
theSearch.addHeuristic(doc.hash(), instancename, false);
|
||||
}
|
||||
}
|
||||
// that's all we need to display serach result
|
||||
theSearch.oneFeederTerminated();
|
||||
|
||||
// optional: add to crawler to get the full resource (later)
|
||||
if (doclist != null && !doclist.isEmpty() && theSearch.addResultsToLocalIndex) {
|
||||
Collection<DigestURL> urls = new ArrayList<DigestURL>();
|
||||
for (URIMetadataNode doc : doclist) {
|
||||
urls.add(doc.url());
|
||||
}
|
||||
Switchboard.getSwitchboard().addToCrawler(urls, false);
|
||||
|
||||
}
|
||||
}
|
||||
};
|
||||
job.start();
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts a remote schema result to YaCy schema using the fieldname
|
||||
* mapping provided as config file
|
||||
*
|
||||
* @param remote result (with remote fieldnames)
|
||||
* @return SolrDocument with field names according to the YaCy schema
|
||||
*/
|
||||
protected URIMetadataNode toYaCySchema(final SolrDocument doc) {
|
||||
// set YaCy id
|
||||
String urlstr;
|
||||
if (localcfg.contains("sku")) {
|
||||
urlstr = (String) doc.getFieldValue(localcfg.get("sku").getValue());
|
||||
} else {
|
||||
urlstr = (String) doc.getFieldValue(localcfg.get("_skufieldname").getValue());
|
||||
if (this.localcfg.contains("_skuprefix")) {
|
||||
String skuprefix = this.localcfg.get("_skuprefix").getValue();
|
||||
urlstr = skuprefix + urlstr;
|
||||
}
|
||||
}
|
||||
|
||||
URIMetadataNode newdoc = new URIMetadataNode(urlstr);
|
||||
Iterator<Configuration.Entry> it = localcfg.entryIterator();
|
||||
while (it.hasNext()) {
|
||||
Configuration.Entry et = it.next();
|
||||
String yacyfieldname = et.key(); // config defines yacyfieldname = remotefieldname
|
||||
String remotefieldname = et.getValue();
|
||||
if (remotefieldname != null && !remotefieldname.isEmpty()) {
|
||||
if (Switchboard.getSwitchboard().index.fulltext().getDefaultConfiguration().contains(yacyfieldname)) { // check if in local config
|
||||
|
||||
SchemaDeclaration est = CollectionSchema.valueOf(yacyfieldname);
|
||||
if (est.isMultiValued()) {
|
||||
if (doc.getFieldValues(remotefieldname) != null) {
|
||||
newdoc.addField(yacyfieldname, doc.getFieldValues(remotefieldname)); //
|
||||
}
|
||||
} else {
|
||||
if (doc.getFieldValue(remotefieldname) != null) {
|
||||
Object val = doc.getFirstValue(remotefieldname);
|
||||
// watch out for type conversion
|
||||
try {
|
||||
if (est.getType() == SolrType.num_integer && val instanceof String) {
|
||||
newdoc.setField(yacyfieldname, Integer.parseInt((String) val));
|
||||
} else {
|
||||
newdoc.setField(yacyfieldname, val);
|
||||
}
|
||||
} catch (Exception ex) {
|
||||
continue; // catch possible parse or type mismatch, skip the field
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
newdoc.addField(CollectionSchema.httpstatus_i.name(), HttpServletResponse.SC_OK); // yacy required
|
||||
return newdoc;
|
||||
}
|
||||
}
|
@ -0,0 +1,62 @@
|
||||
/**
|
||||
* FederateSearchConnector.java
|
||||
* Copyright 2015 by Burkhard Buelte
|
||||
* First released 19.01.2015 at http://yacy.net
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or modify it under
|
||||
* the terms of the GNU Lesser General Public License as published by the Free
|
||||
* Software Foundation; either version 2.1 of the License, or (at your option)
|
||||
* any later version.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
||||
* FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
|
||||
* details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with this program in the file lgpl21.txt If not, see
|
||||
* <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package net.yacy.cora.federate;
|
||||
|
||||
import java.util.List;
|
||||
import net.yacy.kelondro.data.meta.URIMetadataNode;
|
||||
import net.yacy.search.query.QueryParams;
|
||||
import net.yacy.search.query.SearchEvent;
|
||||
|
||||
|
||||
/**
|
||||
* Interface for a query connector to search and gather query results from none
|
||||
* YaCy systems (for the YaCy heuristic options)
|
||||
*/
|
||||
public interface FederateSearchConnector {
|
||||
|
||||
/**
|
||||
* Load the configuration for this connector every connector needs at least
|
||||
* a query target (where to query) and some definition to convert the remote
|
||||
* serch result to the internal result presentation (field mapping)
|
||||
*
|
||||
* @param instanceName is also the name of the config file DATA/SETTINGS/instanceName.schema
|
||||
* @param cfg config parameter
|
||||
* @return true if success false if not
|
||||
*/
|
||||
abstract boolean init(String instanceName, String cfg);
|
||||
|
||||
/**
|
||||
* Queries a remote system and adds the result metadata to the search events
|
||||
* result list. If SearchEvent.addResultsToLocalIndex (=default) result urls
|
||||
* are added to the crawler.
|
||||
* @param theSearch
|
||||
*/
|
||||
abstract void search(SearchEvent theSearch);
|
||||
|
||||
/**
|
||||
* Queries a remote system and returns the search result with field names
|
||||
* according to YaCy schema.
|
||||
*
|
||||
* @param query
|
||||
* @return result (metadata) in YaCy schema format
|
||||
*/
|
||||
abstract List<URIMetadataNode> query(QueryParams query);
|
||||
|
||||
}
|
@ -0,0 +1,427 @@
|
||||
/**
|
||||
* FederateSearchManager.java
|
||||
* Copyright 2015 by Burkhard Buelte
|
||||
* First released 19.01.2015 at http://yacy.net
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or modify it under
|
||||
* the terms of the GNU Lesser General Public License as published by the Free
|
||||
* Software Foundation; either version 2.1 of the License, or (at your option)
|
||||
* any later version.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
||||
* FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
|
||||
* details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with this program in the file lgpl21.txt If not, see
|
||||
* <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package net.yacy.cora.federate;
|
||||
|
||||
import net.yacy.cora.federate.opensearch.OpenSearchConnector;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.util.ArrayList;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import net.yacy.cora.document.analysis.Classification;
|
||||
import net.yacy.cora.document.id.MultiProtocolURL;
|
||||
import net.yacy.cora.federate.solr.connector.SolrConnector;
|
||||
import net.yacy.cora.federate.yacy.CacheStrategy;
|
||||
import net.yacy.cora.storage.Configuration;
|
||||
import net.yacy.cora.storage.Configuration.Entry;
|
||||
import net.yacy.cora.storage.Files;
|
||||
import net.yacy.cora.util.ConcurrentLog;
|
||||
import net.yacy.document.parser.xml.opensearchdescriptionReader;
|
||||
import net.yacy.kelondro.data.meta.URIMetadataNode;
|
||||
import net.yacy.kelondro.util.Bitfield;
|
||||
import net.yacy.search.Switchboard;
|
||||
import net.yacy.search.SwitchboardConstants;
|
||||
import net.yacy.search.query.QueryGoal;
|
||||
import net.yacy.search.query.QueryModifier;
|
||||
import net.yacy.search.query.QueryParams;
|
||||
import net.yacy.search.query.SearchEvent;
|
||||
import net.yacy.search.schema.WebgraphSchema;
|
||||
import org.apache.solr.common.SolrDocument;
|
||||
import org.apache.solr.common.SolrDocumentList;
|
||||
|
||||
/**
|
||||
* Handling of queries to configured remote OpenSearch systems.
|
||||
*/
|
||||
public class FederateSearchManager {
|
||||
|
||||
private final int accessDelay = 15000; // delay between connects (in ms)
|
||||
|
||||
private File confFile = null; // later initialized to DATA/SETTINGS/heuristicopensearch.conf
|
||||
private HashSet<AbstractFederateSearchConnector> conlist; // connector list
|
||||
protected Configuration cfg;//PropertiesConfiguration cfg;
|
||||
private static FederateSearchManager manager = null; // self referenc for static .getManager()
|
||||
|
||||
public FederateSearchManager(Switchboard sb) {
|
||||
super();
|
||||
this.conlist = new HashSet<AbstractFederateSearchConnector>();
|
||||
|
||||
// from here we need Switchboard settings
|
||||
if (sb == null) {
|
||||
return;
|
||||
}
|
||||
// Data needed active name, url(template), desc, rule-when-to-use, specifics
|
||||
confFile = new File(sb.getDataPath(), "DATA/SETTINGS/heuristicopensearch.conf");
|
||||
if (!confFile.exists()) {
|
||||
try {
|
||||
Files.copy(new File(sb.appPath, "defaults/heuristicopensearch.conf"), confFile);
|
||||
File defdir = new File(sb.dataPath, "DATA/SETTINGS/federatecfg");
|
||||
if (!defdir.exists()) {
|
||||
Files.copy(new File(sb.appPath, "defaults/federatecfg"), defdir);
|
||||
}
|
||||
} catch (IOException ex) {
|
||||
}
|
||||
}
|
||||
// read settings config file
|
||||
if (confFile.exists()) {
|
||||
try {
|
||||
cfg = new Configuration(confFile);
|
||||
Iterator<Entry> it = cfg.entryIterator();
|
||||
while (it.hasNext()) {
|
||||
Entry cfgentry = it.next();
|
||||
String url = cfgentry.getValue();
|
||||
if (cfgentry.enabled() && url != null && !url.isEmpty()) {
|
||||
String name = cfgentry.key();
|
||||
if (url.startsWith("cfgfile:")) { // is cfgfile with field mappings (no opensearch url)
|
||||
// format prefix:connectortype:configfilename
|
||||
// example cfgfile:solrconnector:testsys.solr.schema
|
||||
String[] parts = url.split(":");
|
||||
if (parts[1].equalsIgnoreCase("solrconnector")) {
|
||||
SolrFederateSearchConnector sfc = new SolrFederateSearchConnector();
|
||||
if (sfc.init(name, sb.getDataPath()+ "/DATA/SETTINGS/federatecfg/" + parts[2])) {
|
||||
conlist.add(sfc);
|
||||
}
|
||||
} else {
|
||||
ConcurrentLog.config("FederateSearchManager", "Error in configuration of: " + url);
|
||||
}
|
||||
} else { // handle opensearch url template
|
||||
OpenSearchConnector osc = new OpenSearchConnector();
|
||||
if (osc.init(name, url)) {
|
||||
conlist.add(osc);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (IOException ex) {
|
||||
ConcurrentLog.logException(ex);
|
||||
}
|
||||
}
|
||||
manager = this; // reference for static access via .getManager()
|
||||
}
|
||||
|
||||
/**
|
||||
* Get instance of this manager. There should be only one instance running,
|
||||
* use this to get or initialize the manager.
|
||||
*
|
||||
* @return
|
||||
*/
|
||||
public static FederateSearchManager getManager() {
|
||||
if (manager == null) {
|
||||
manager = new FederateSearchManager(Switchboard.getSwitchboard());
|
||||
}
|
||||
return manager;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sends a query request to remote systems configured.
|
||||
* If search query domain is LOCAL procedure does nothing.
|
||||
*
|
||||
* @param theSearch
|
||||
*/
|
||||
public void search(SearchEvent theSearch) {
|
||||
if (theSearch != null) {
|
||||
if (!theSearch.query.isLocal()) {
|
||||
Set<AbstractFederateSearchConnector> picklist = getBest(theSearch.getQuery());
|
||||
for (AbstractFederateSearchConnector fsc : picklist) {
|
||||
fsc.search(theSearch);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Sends a query to configured remote systems.
|
||||
*
|
||||
* @param query
|
||||
* @return list of results according to YaCy schema
|
||||
*/
|
||||
public List<URIMetadataNode> query(QueryParams query) {
|
||||
if (query.isLocal()) {
|
||||
List<URIMetadataNode> sdl = new ArrayList<URIMetadataNode>();
|
||||
Set<AbstractFederateSearchConnector> picklist = getBest(query);
|
||||
for (AbstractFederateSearchConnector fsc : picklist) {
|
||||
sdl.addAll(fsc.query(query));
|
||||
}
|
||||
return sdl;
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Takes a search string, converts it to queryparams and calls the
|
||||
* query(queryparams)
|
||||
*
|
||||
* @param querystr
|
||||
* @return SolrDocumentlist of remote query results according to YaCy schema
|
||||
*/
|
||||
public List<URIMetadataNode> query(String querystr) {
|
||||
|
||||
final QueryGoal qg = new QueryGoal(querystr);
|
||||
final Switchboard sb = Switchboard.getSwitchboard();
|
||||
Bitfield filter = new Bitfield();
|
||||
final QueryParams query = new QueryParams(
|
||||
qg,
|
||||
new QueryModifier(),
|
||||
Integer.MAX_VALUE,
|
||||
"",
|
||||
Classification.ContentDomain.ALL,
|
||||
"", //lang
|
||||
null,
|
||||
CacheStrategy.IFFRESH,
|
||||
100, 0, //count, offset
|
||||
".*", //urlmask
|
||||
null,
|
||||
null,
|
||||
QueryParams.Searchdom.LOCAL,
|
||||
filter,
|
||||
false,
|
||||
null,
|
||||
MultiProtocolURL.TLD_any_zone_filter,
|
||||
"",
|
||||
false,
|
||||
sb.index,
|
||||
sb.getRanking(),
|
||||
"",//userAgent
|
||||
false,
|
||||
false,
|
||||
0.0, 0.0, -1.0,
|
||||
new String[0]);
|
||||
|
||||
return query(query);
|
||||
}
|
||||
|
||||
/**
|
||||
* Add a search target system/connector to the config file
|
||||
*
|
||||
* @param urlTemplate query template url
|
||||
* @return successful added
|
||||
*/
|
||||
public boolean addOpenSearchTarget(String name, String urlTemplate, boolean active, String comment) {
|
||||
if (confFile == null) {
|
||||
return false;
|
||||
}
|
||||
|
||||
try {
|
||||
Configuration conf = new Configuration(confFile);
|
||||
if (name != null && !name.isEmpty()) {
|
||||
conf.add(name, null, active);
|
||||
Configuration.Entry e = conf.get(name);
|
||||
e.setValue(urlTemplate);
|
||||
e.setEnable(active);
|
||||
e.setComment(comment);
|
||||
conf.put(name, e);
|
||||
try {
|
||||
conf.commit();
|
||||
if (active) {
|
||||
OpenSearchConnector osd = new OpenSearchConnector();
|
||||
if (osd.init(name, urlTemplate)) {
|
||||
conlist.add(osd);
|
||||
}
|
||||
}
|
||||
} catch (final IOException ex) {
|
||||
ConcurrentLog.warn("FederateSearchManager", "config file write error");
|
||||
}
|
||||
return true;
|
||||
}
|
||||
} catch (final IOException e1) {
|
||||
ConcurrentLog.logException(e1);
|
||||
return false;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the number of active remote query target systems
|
||||
*/
|
||||
public int getSize() {
|
||||
return conlist.size();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get best systems from configured targets for this search
|
||||
*
|
||||
* @param theSearch
|
||||
* @return list of searchtargetconnectors
|
||||
*/
|
||||
protected Set<AbstractFederateSearchConnector> getBest(final QueryParams query) {
|
||||
HashSet<AbstractFederateSearchConnector> retset = new HashSet<AbstractFederateSearchConnector>();
|
||||
// currently only enforces limits (min access delay, frequency)
|
||||
for (AbstractFederateSearchConnector fsc : conlist) {
|
||||
// check access time
|
||||
if (fsc.lastaccesstime + accessDelay < System.currentTimeMillis()) { // enforce 15 sec delay between searches to same system
|
||||
retset.add(fsc);
|
||||
}
|
||||
}
|
||||
return retset;
|
||||
}
|
||||
|
||||
/**
|
||||
* Discover opensearch description links from local (embedded) Solr index
|
||||
* using meta data field 'outboundlinks_tag_txt' and add found systems to
|
||||
* the config file
|
||||
*
|
||||
* @return true if background discover job was started, false if job not
|
||||
* started
|
||||
*/
|
||||
public boolean discoverFromSolrIndex(Switchboard sb) {
|
||||
if (sb == null) {
|
||||
return false;
|
||||
}
|
||||
// check if needed Solr fields are available (selected)
|
||||
if (!sb.index.fulltext().useWebgraph()) {
|
||||
ConcurrentLog.severe("FederateSearchManager", "Error on connecting to embedded Solr webgraph index");
|
||||
return false;
|
||||
}
|
||||
final SolrConnector connector = sb.index.fulltext().getWebgraphConnector();
|
||||
final boolean metafieldavailable = sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_rel_s.name())
|
||||
&& (sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_protocol_s.name()) && sb.index.fulltext().getWebgraphConfiguration().contains(WebgraphSchema.target_urlstub_s.name()))
|
||||
&& sb.getConfigBool(SwitchboardConstants.CORE_SERVICE_WEBGRAPH, false);
|
||||
if (!metafieldavailable) {
|
||||
ConcurrentLog.warn("FederateSearchManager", "webgraph option and webgraph Schema fields target_rel_s, target_protocol_s and target_urlstub_s must be switched on");
|
||||
return false;
|
||||
}
|
||||
// the solr search
|
||||
final String webgraphquerystr = WebgraphSchema.target_rel_s.getSolrFieldName() + ":search";
|
||||
final String[] webgraphqueryfields = {WebgraphSchema.target_protocol_s.getSolrFieldName(), WebgraphSchema.target_urlstub_s.getSolrFieldName()};
|
||||
// alternatively target_protocol_s + "://" +target_host_s + target_path_s
|
||||
|
||||
final long numfound;
|
||||
try {
|
||||
SolrDocumentList docList = connector.getDocumentListByQuery(webgraphquerystr, null, 0, 1, webgraphqueryfields);
|
||||
numfound = docList.getNumFound();
|
||||
if (numfound == 0) {
|
||||
ConcurrentLog.info("FederateSearchManager", "no results found, abort discover job");
|
||||
return true;
|
||||
}
|
||||
ConcurrentLog.info("FederateSearchManager", "start checking " + Long.toString(numfound) + " found index results");
|
||||
} catch (final IOException ex) {
|
||||
ConcurrentLog.logException(ex);
|
||||
return false;
|
||||
}
|
||||
|
||||
final long stoptime = System.currentTimeMillis() + 1000 * 3600; // make sure job doesn't run forever
|
||||
|
||||
// job to iterate through Solr index to find links to opensearchdescriptions
|
||||
// started as background job as connect timeouts may cause it run a long time
|
||||
final Thread job = new Thread() {
|
||||
@Override
|
||||
public void run() {
|
||||
try {
|
||||
boolean doloop = true;
|
||||
int loopnr = 0;
|
||||
Set<String> dblmem = new HashSet<String>(); // temp memory for already checked url
|
||||
while (doloop) {
|
||||
ConcurrentLog.info("FederateSearchManager", "start Solr query loop at " + Integer.toString(loopnr * 20) + " of " + Long.toString(numfound));
|
||||
SolrDocumentList docList = connector.getDocumentListByQuery(webgraphquerystr, null, loopnr * 20, 20, webgraphqueryfields); // check chunk of 20 result documents
|
||||
loopnr++;
|
||||
if (stoptime < System.currentTimeMillis()) {// stop after max 1h
|
||||
doloop = false;
|
||||
ConcurrentLog.info("FederateSearchManager", "long running discover task aborted");
|
||||
}
|
||||
if (docList != null && docList.size() > 0) {
|
||||
Iterator<SolrDocument> docidx = docList.iterator();
|
||||
while (docidx.hasNext()) {
|
||||
SolrDocument sdoc = docidx.next();
|
||||
|
||||
String hrefurltxt = sdoc.getFieldValue(WebgraphSchema.target_protocol_s.getSolrFieldName()) + "://" + sdoc.getFieldValue(WebgraphSchema.target_urlstub_s.getSolrFieldName());
|
||||
try {
|
||||
URL url = new URL(hrefurltxt);
|
||||
//TODO: check Blacklist
|
||||
if (dblmem.add(url.getAuthority())) { // use only main path to detect double entries
|
||||
opensearchdescriptionReader os = new opensearchdescriptionReader(hrefurltxt);
|
||||
if (os.getRSSorAtomUrl() != null) {
|
||||
// add found system to config file
|
||||
addOpenSearchTarget(os.getShortName(), os.getRSSorAtomUrl(), false, os.getItem("LongName"));
|
||||
ConcurrentLog.info("FederateSearchManager", "added " + os.getShortName() + " " + hrefurltxt);
|
||||
} else {
|
||||
ConcurrentLog.info("FederateSearchManager", "osd.xml check failed (no RSS or Atom support) for " + hrefurltxt);
|
||||
}
|
||||
}
|
||||
} catch (final MalformedURLException ex) {
|
||||
}
|
||||
}
|
||||
} else {
|
||||
doloop = false;
|
||||
}
|
||||
}
|
||||
ConcurrentLog.info("FederateSearchManager", "finisched Solr query (checked " + Integer.toString(dblmem.size()) + " unique opensearchdescription links found in " + Long.toString(numfound) + " results)");
|
||||
} catch (final IOException ex) {
|
||||
ConcurrentLog.logException(ex);
|
||||
}
|
||||
}
|
||||
};
|
||||
job.start();
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Read or reread opensearch config file and initialize connectors
|
||||
*
|
||||
* @param cfgFileName
|
||||
* @return true if successful
|
||||
*/
|
||||
public boolean init(String cfgFileName) {
|
||||
confFile = new File(cfgFileName);
|
||||
if (confFile.exists()) {
|
||||
try {
|
||||
cfg = new Configuration(confFile);
|
||||
if (!this.conlist.isEmpty()) this.conlist.clear(); // prevent double entries
|
||||
Iterator<Entry> it = cfg.entryIterator();
|
||||
while (it.hasNext()) {
|
||||
Entry cfgentry = it.next();
|
||||
if (cfgentry.enabled()) { // hold only enabled in memory
|
||||
String name = cfgentry.key();
|
||||
String url = cfgentry.getValue();
|
||||
if (url != null && !url.isEmpty()) {
|
||||
if (url.startsWith("cfgfile:")) { // is cfgfile with field mappings (no opensearch url)
|
||||
// config entry has 3 parts separated by : 1=cfgfile 2=connectortype 3=relative path to connector-cfg-file
|
||||
// example cfgfile:solrconnector:testsys.solr.schema
|
||||
String[] parts = url.split(":");
|
||||
if (parts[1].equalsIgnoreCase("solrconnector")) {
|
||||
SolrFederateSearchConnector sfc = new SolrFederateSearchConnector();
|
||||
if (sfc.init(name, confFile.getParent()+"/federatecfg/"+parts[2])) {
|
||||
conlist.add(sfc);
|
||||
}
|
||||
} else {
|
||||
ConcurrentLog.config("FederateSearchManager", "Init error in configuration of: " + url);
|
||||
}
|
||||
} else { // handle opensearch url template
|
||||
OpenSearchConnector osd;
|
||||
osd = new OpenSearchConnector();
|
||||
if (osd.init(name, url)) {
|
||||
conlist.add(osd);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (IOException ex) {
|
||||
ConcurrentLog.logException(ex);
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,119 @@
|
||||
/**
|
||||
* SolrFederateSearchConnector.java
|
||||
* Copyright 2015 by Burkhard Buelte
|
||||
* First released 19.01.2015 at http://yacy.net
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or modify it under
|
||||
* the terms of the GNU Lesser General Public License as published by the Free
|
||||
* Software Foundation; either version 2.1 of the License, or (at your option)
|
||||
* any later version.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
||||
* FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
|
||||
* details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with this program in the file lgpl21.txt If not, see
|
||||
* <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package net.yacy.cora.federate;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import net.yacy.cora.federate.solr.connector.RemoteSolrConnector;
|
||||
import net.yacy.cora.federate.solr.connector.SolrConnector;
|
||||
import net.yacy.cora.federate.solr.instance.RemoteInstance;
|
||||
import net.yacy.cora.util.ConcurrentLog;
|
||||
import net.yacy.kelondro.data.meta.URIMetadataNode;
|
||||
import net.yacy.search.query.QueryParams;
|
||||
import org.apache.solr.client.solrj.SolrQuery;
|
||||
import org.apache.solr.common.SolrDocument;
|
||||
import org.apache.solr.common.SolrDocumentList;
|
||||
import org.apache.solr.common.SolrException;
|
||||
import org.apache.solr.common.params.CommonParams;
|
||||
import org.apache.solr.common.params.ModifiableSolrParams;
|
||||
|
||||
/**
|
||||
 * Search connector to collect query results from remote Solr systems which
|
||||
* provide results as Solr documents
|
||||
*/
|
||||
public class SolrFederateSearchConnector extends AbstractFederateSearchConnector {
|
||||
|
||||
private String corename;
|
||||
|
||||
@Override
|
||||
public boolean init(String instance, String cfgFileName) {
|
||||
boolean initResult = super.init(instance, cfgFileName); // init local schema cfg
|
||||
if (initResult) {
|
||||
if (this.localcfg.contains("_baseurl")) {
|
||||
setBaseurl(this.localcfg.get("_baseurl").getValue());
|
||||
} else {
|
||||
ConcurrentLog.config(instance, "no _baseurl given in config file "+cfgFileName);
|
||||
initResult = false;
|
||||
}
|
||||
if (this.localcfg.contains("_corename")) {
|
||||
setCoreName(this.localcfg.get("_corename").getValue());
|
||||
} else {
|
||||
ConcurrentLog.config(instance, "no _corename given in config file "); // not mandatory
|
||||
this.corename = "";
|
||||
}
|
||||
}
|
||||
return initResult;
|
||||
}
|
||||
|
||||
public void setBaseurl(String url) {
|
||||
if (url.endsWith("/")) {
|
||||
this.baseurl = url;
|
||||
} else {
|
||||
this.baseurl = url + "/";
|
||||
}
|
||||
}
|
||||
|
||||
public void setCoreName(String core) {
|
||||
this.corename = core;
|
||||
}
|
||||
|
||||
/**
|
||||
* Core query implementation
|
||||
* all query and search routines will use this routine to query the remote system
|
||||
*
|
||||
* @param query
|
||||
* @return list of solr documents (metadata) accordng to local YaCy internal schema
|
||||
*/
|
||||
@Override
|
||||
public List<URIMetadataNode> query(QueryParams query) {
|
||||
|
||||
List<URIMetadataNode> docs = new ArrayList<URIMetadataNode>();
|
||||
Collection<String> remotecorename = new ArrayList<String>();
|
||||
remotecorename.add(corename);
|
||||
ModifiableSolrParams msp = new SolrQuery(query.getQueryGoal().getQueryString(false));
|
||||
msp.add(CommonParams.QT, "/"); // important to override default append of /select
|
||||
msp.add(CommonParams.ROWS, Integer.toString(query.itemsPerPage));
|
||||
try {
|
||||
RemoteInstance instance = new RemoteInstance(baseurl, remotecorename, corename, 20000);
|
||||
try {
|
||||
SolrConnector solrConnector = new RemoteSolrConnector(instance, false, null);
|
||||
try {
|
||||
this.lastaccesstime = System.currentTimeMillis();
|
||||
SolrDocumentList docList = solrConnector.getDocumentListByParams(msp);
|
||||
// convert to YaCy schema documentlist
|
||||
for (SolrDocument doc : docList) {
|
||||
URIMetadataNode anew = toYaCySchema(doc);
|
||||
docs.add(anew);
|
||||
}
|
||||
} catch (IOException | SolrException e) {
|
||||
} finally {
|
||||
solrConnector.close();
|
||||
}
|
||||
} catch (Throwable ee) {
|
||||
} finally {
|
||||
instance.close();
|
||||
}
|
||||
} catch (IOException eee) {
|
||||
}
|
||||
return docs;
|
||||
}
|
||||
}
|
Loading…
Reference in new issue