Added robots.txt support for heuristics federated search.

As noticed by @reger24, abusive use of OpenSearch systems should be
prevented, especially when their HTML results are parsed and reused.
The robots.txt file is now checked before requesting an external OpenSearch
system, in order to respect the host exclusions and any crawl-delay value.
The check is also performed when adding a new OpenSearch URL
template through the /ConfigHeuristics_p.html admin page.
pull/110/head
luccioman 8 years ago
parent 7e6e14a406
commit 6e89d125f2

@ -28,14 +28,19 @@
import java.io.File;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.storage.Configuration;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.robots.RobotsTxtEntry;
import net.yacy.data.WorkTables;
import net.yacy.search.Switchboard;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.Iterator;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.FederateSearchManager;
import net.yacy.cora.federate.solr.SchemaConfiguration;
@ -97,11 +102,31 @@ public class ConfigHeuristics_p {
final String tmpname = post.get("ossys_newtitle");
if (tmpname != null && tmpurl !=null) {
if (!tmpname.isEmpty() && !tmpurl.isEmpty() && tmpurl.toLowerCase().contains("{searchterms}")) {
final String tmpcomment = post.get("ossys_newcomment");
FederateSearchManager.getManager().addOpenSearchTarget(tmpname,tmpurl,false,tmpcomment);
} else osderrmsg = "Url template must contain '{searchTerms}'";
/*
 * Check the robots.txt policy of the target host, if any, before registering
 * the new OpenSearch URL template.
 */
try {
    final MultiProtocolURL templateURL = new MultiProtocolURL(tmpurl);
    /*
     * When sb.robots is null, or when it returns no entry for this URL,
     * no policy can be checked and the template is accepted.
     */
    final RobotsTxtEntry robotsEntry = (sb.robots == null) ? null
            : sb.robots.getEntry(templateURL, ClientIdentification.yacyInternetCrawlerAgent);
    if (robotsEntry != null && robotsEntry.isDisallowed(templateURL)) {
        /* Reported back through the error message of the admin page */
        osderrmsg = "URL template is disallowed by the host robots.txt";
    } else {
        final String tmpcomment = post.get("ossys_newcomment");
        FederateSearchManager.getManager().addOpenSearchTarget(tmpname, tmpurl, false, tmpcomment);
    }
} catch (final MalformedURLException ex) {
    /* The template could not be parsed as a URL: nothing is registered */
    osderrmsg = "URL template is malformed.";
}
} else {
osderrmsg = "Url template must contain '{searchTerms}'";
}
}
}
if (post.containsKey("setopensearch")) {
// read index schema table flags

@ -37,10 +37,12 @@ import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.opensearch.OpenSearchConnector;
import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.storage.Configuration;
import net.yacy.cora.storage.Configuration.Entry;
import net.yacy.cora.storage.Files;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.robots.RobotsTxtEntry;
import net.yacy.document.parser.xml.opensearchdescriptionReader;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.util.Bitfield;
@ -58,13 +60,26 @@ import net.yacy.search.schema.WebgraphSchema;
*/
public class FederateSearchManager {
private final int accessDelay = 15000; // delay between connects (in ms)
/** Delay between connects (in ms) */
private final int accessDelay = 15000;
private File confFile = null; // later initialized to DATA/SETTINGS/heuristicopensearch.conf
private HashSet<AbstractFederateSearchConnector> conlist; // connector list
protected Configuration cfg;//PropertiesConfiguration cfg;
private static FederateSearchManager manager = null; // self referenc for static .getManager()
/** Connectors list */
private HashSet<AbstractFederateSearchConnector> conlist;
/** PropertiesConfiguration cfg */
protected Configuration cfg;
/** Switchboard instance */
private Switchboard switchboard;
/** Self reference for static .getManager() */
private static FederateSearchManager manager = null;
/**
* @param sb switchboard instance. Must not be null.
*/
public FederateSearchManager(Switchboard sb) {
super();
this.conlist = new HashSet<AbstractFederateSearchConnector>();
@ -73,6 +88,7 @@ public class FederateSearchManager {
if (sb == null) {
return;
}
this.switchboard = sb;
// Data needed active name, url(template), desc, rule-when-to-use, specifics
confFile = new File(sb.getDataPath(), "DATA/SETTINGS/heuristicopensearch.conf");
if (!confFile.exists()) {
@ -180,7 +196,6 @@ public class FederateSearchManager {
public List<URIMetadataNode> query(String querystr) {
final QueryGoal qg = new QueryGoal(querystr);
final Switchboard sb = Switchboard.getSwitchboard();
Bitfield filter = new Bitfield();
final QueryParams query = new QueryParams(
qg,
@ -203,8 +218,8 @@ public class FederateSearchManager {
MultiProtocolURL.TLD_any_zone_filter,
"",
false,
sb.index,
sb.getRanking(),
this.switchboard.index,
this.switchboard.getRanking(),
"",//userAgent
0.0d, 0.0d, 0.0d,
new String[0]);
@ -236,11 +251,7 @@ public class FederateSearchManager {
conf.commit();
if (active) {
OpenSearchConnector osd = new OpenSearchConnector(urlTemplate);
String htmlMappingFile = null;
Switchboard sb = Switchboard.getSwitchboard();
if(sb != null) {
htmlMappingFile = sb.getDataPath()+ "/DATA/SETTINGS/federatecfg/" + OpenSearchConnector.htmlMappingFileName(name);
}
String htmlMappingFile = this.switchboard.getDataPath()+ "/DATA/SETTINGS/federatecfg/" + OpenSearchConnector.htmlMappingFileName(name);
if (osd.init(name, htmlMappingFile)) {
conlist.add(osd);
}
@ -272,12 +283,38 @@ public class FederateSearchManager {
*/
protected Set<AbstractFederateSearchConnector> getBest(final QueryParams query) {
HashSet<AbstractFederateSearchConnector> retset = new HashSet<AbstractFederateSearchConnector>();
// currently only enforces limits (min access delay, frequency)
MultiProtocolURL connectorURL;
for (AbstractFederateSearchConnector fsc : conlist) {
try {
connectorURL = new MultiProtocolURL(fsc.baseurl);
} catch (MalformedURLException e) {
ConcurrentLog.warn("FederateSearchManager", "Malformed connector URL : " + fsc.baseurl);
continue;
}
RobotsTxtEntry robotsEntry = null;
int robotsDelay = 0;
if (this.switchboard != null && this.switchboard.robots != null) {
robotsEntry = this.switchboard.robots.getEntry(connectorURL,
ClientIdentification.yacyInternetCrawlerAgent);
if(robotsEntry != null) {
robotsDelay = robotsEntry.getCrawlDelayMillis();
}
}
// check access time
if (fsc.lastaccesstime + accessDelay < System.currentTimeMillis()) { // enforce 15 sec delay between searches to same system
retset.add(fsc);
long currentTime = System.currentTimeMillis();
if ((fsc.lastaccesstime + accessDelay < currentTime)
&& (fsc.lastaccesstime + robotsDelay < currentTime) ) {
// enforce a 15 sec delay between searches to the same system, and also honor the robots.txt Crawl-delay directive, if any
if (robotsEntry == null || !robotsEntry.isDisallowed(connectorURL)) {
// also check robots.txt exclusion
retset.add(fsc);
} else {
ConcurrentLog.warn("FederateSearchManager",
"Connector URL is disallowed by robots.txt : " + fsc.baseurl);
}
}
}
return retset;
}
@ -290,7 +327,7 @@ public class FederateSearchManager {
* @return true if background discover job was started, false if job not
* started
*/
public boolean discoverFromSolrIndex(Switchboard sb) {
public boolean discoverFromSolrIndex(final Switchboard sb) {
if (sb == null) {
return false;
}
@ -351,20 +388,40 @@ public class FederateSearchManager {
SolrDocument sdoc = docidx.next();
String hrefurltxt = sdoc.getFieldValue(WebgraphSchema.target_protocol_s.getSolrFieldName()) + "://" + sdoc.getFieldValue(WebgraphSchema.target_urlstub_s.getSolrFieldName());
URL url;
try {
URL url = new URL(hrefurltxt);
//TODO: check Blacklist
if (dblmem.add(url.getAuthority())) { // use only main path to detect double entries
opensearchdescriptionReader os = new opensearchdescriptionReader(hrefurltxt);
if (os.getRSSorAtomUrl() != null) {
// add found system to config file
url = new URL(hrefurltxt);
} catch (final MalformedURLException ex) {
ConcurrentLog.warn("FederateSearchManager", "OpenSearch description URL is malformed : " + hrefurltxt);
continue;
}
//TODO: check Blacklist
if (dblmem.add(url.getAuthority())) { // use only main path to detect double entries
opensearchdescriptionReader os = new opensearchdescriptionReader(hrefurltxt);
if (os.getRSSorAtomUrl() != null) {
/* Check the robots.txt policy, if any */
RobotsTxtEntry robotsEntry = null;
MultiProtocolURL templateURL;
try {
templateURL = new MultiProtocolURL(os.getRSSorAtomUrl());
} catch (final MalformedURLException ex) {
ConcurrentLog.warn("FederateSearchManager", "OpenSearch description URL is malformed : " + hrefurltxt);
continue;
}
if(sb.robots != null) {
robotsEntry = sb.robots.getEntry(templateURL, ClientIdentification.yacyInternetCrawlerAgent);
}
if(robotsEntry != null && robotsEntry.isDisallowed(templateURL)) {
ConcurrentLog.info("FederateSearchManager", "OpenSearch description template URL is disallowed by robots.xt");
} else {
// add found system to config file
addOpenSearchTarget(os.getShortName(), os.getRSSorAtomUrl(), false, os.getItem("LongName"));
ConcurrentLog.info("FederateSearchManager", "added " + os.getShortName() + " " + hrefurltxt);
} else {
ConcurrentLog.info("FederateSearchManager", "osd.xml check failed (no RSS or Atom support) for " + hrefurltxt);
}
}
} else {
ConcurrentLog.info("FederateSearchManager", "osd.xml check failed (no RSS or Atom support) for " + hrefurltxt);
}
} catch (final MalformedURLException ex) {
}
}
} else {

Loading…
Cancel
Save