Added robots.txt support for heuristics federated search.

As noted by @reger24, abusive use of OpenSearch systems should be prevented, especially when their HTML results can be parsed and reused. The robots.txt file is now checked before an external OpenSearch system is queried, so that the host's exclusions and any Crawl-delay value are respected. The check is also performed when a new OpenSearch URL template is added through the /ConfigHeuristics_p.html admin page.
8 years ago · 6e89d125f2
parent 7e6e14a406
commit 6e89d125f2
2 changed files with 111 additions and 29 deletions
--- a/htroot/ConfigHeuristics_p.java
+++ b/htroot/ConfigHeuristics_p.java
@ -28,14 +28,19 @@

 import java.io.File;

+import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.storage.Configuration;
 import net.yacy.cora.util.ConcurrentLog;
+import net.yacy.crawler.robots.RobotsTxtEntry;
 import net.yacy.data.WorkTables;
 import net.yacy.search.Switchboard;

 import java.io.IOException;
+import java.net.MalformedURLException;
 import java.util.Iterator;
+
+import net.yacy.cora.document.id.MultiProtocolURL;
 import net.yacy.cora.federate.FederateSearchManager;

 import net.yacy.cora.federate.solr.SchemaConfiguration;
@ -97,11 +102,31 @@ public class ConfigHeuristics_p {
                final String tmpname = post.get("ossys_newtitle");
                if (tmpname != null && tmpurl !=null) {
                    if (!tmpname.isEmpty() && !tmpurl.isEmpty() && tmpurl.toLowerCase().contains("{searchterms}")) {
-                        final String tmpcomment = post.get("ossys_newcomment");
-                        FederateSearchManager.getManager().addOpenSearchTarget(tmpname,tmpurl,false,tmpcomment);
-                    } else osderrmsg = "Url template must contain '{searchTerms}'";
+                     	/* Check eventual robots.txt policy */
+                      	RobotsTxtEntry robotsEntry = null;
+						try {
+							MultiProtocolURL templateURL = new MultiProtocolURL(tmpurl);
+
+							if (sb.robots != null) {
+								robotsEntry = sb.robots.getEntry(templateURL,
+										ClientIdentification.yacyInternetCrawlerAgent);
+							}
+
+							if (robotsEntry != null && robotsEntry.isDisallowed(templateURL)) {
+								osderrmsg = "URL template is disallowed by the host robots.xt";
+							} else {
+								final String tmpcomment = post.get("ossys_newcomment");
+								FederateSearchManager.getManager().addOpenSearchTarget(tmpname, tmpurl, false,
+										tmpcomment);
+							}
+						} catch (final MalformedURLException ex) {
+							osderrmsg = "URL template is malformed.";
+						}
+                    } else {
+                    	osderrmsg = "Url template must contain '{searchTerms}'";
                    }
                }
+            }

            if (post.containsKey("setopensearch")) {
                // read index schema table flags
--- a/source/net/yacy/cora/federate/FederateSearchManager.java
+++ b/source/net/yacy/cora/federate/FederateSearchManager.java
@ -37,10 +37,12 @@ import net.yacy.cora.document.id.MultiProtocolURL;
 import net.yacy.cora.federate.opensearch.OpenSearchConnector;
 import net.yacy.cora.federate.solr.connector.SolrConnector;
 import net.yacy.cora.federate.yacy.CacheStrategy;
+import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.storage.Configuration;
 import net.yacy.cora.storage.Configuration.Entry;
 import net.yacy.cora.storage.Files;
 import net.yacy.cora.util.ConcurrentLog;
+import net.yacy.crawler.robots.RobotsTxtEntry;
 import net.yacy.document.parser.xml.opensearchdescriptionReader;
 import net.yacy.kelondro.data.meta.URIMetadataNode;
 import net.yacy.kelondro.util.Bitfield;
@ -58,13 +60,26 @@ import net.yacy.search.schema.WebgraphSchema;
 */
 public class FederateSearchManager {

-    private final int accessDelay = 15000; // delay between connects (in ms)
+	/** Delay between connects (in ms) */
+    private final int accessDelay = 15000;

    private File confFile = null; // later initialized to DATA/SETTINGS/heuristicopensearch.conf
-    private HashSet<AbstractFederateSearchConnector> conlist; // connector list
-    protected Configuration cfg;//PropertiesConfiguration cfg;
-    private static FederateSearchManager manager = null; // self referenc for static .getManager()
+    
+    /** Connectors list */
+    private HashSet<AbstractFederateSearchConnector> conlist;
+    
+    /** PropertiesConfiguration cfg */
+    protected Configuration cfg;
+    
+    /** Switchboard instance */
+    private Switchboard switchboard;
+    
+    /** Self reference for static .getManager() */
+    private static FederateSearchManager manager = null;

+    /**
+     * @param sb switchboard instance. Must not be null.
+     */
    public FederateSearchManager(Switchboard sb) {
        super();
        this.conlist = new HashSet<AbstractFederateSearchConnector>();
@ -73,6 +88,7 @@ public class FederateSearchManager {
        if (sb == null) {
            return;
        }
+        this.switchboard = sb;
        // Data needed  active  name, url(template), desc, rule-when-to-use, specifics
        confFile = new File(sb.getDataPath(), "DATA/SETTINGS/heuristicopensearch.conf");
        if (!confFile.exists()) {
@ -180,7 +196,6 @@ public class FederateSearchManager {
    public List<URIMetadataNode> query(String querystr) {

        final QueryGoal qg = new QueryGoal(querystr);
-        final Switchboard sb = Switchboard.getSwitchboard();
        Bitfield filter = new Bitfield();
        final QueryParams query = new QueryParams(
                qg,
@ -203,8 +218,8 @@ public class FederateSearchManager {
                MultiProtocolURL.TLD_any_zone_filter,
                "",
                false,
-                sb.index,
-                sb.getRanking(),
+                this.switchboard.index,
+                this.switchboard.getRanking(),
                "",//userAgent
                0.0d, 0.0d, 0.0d,
                new String[0]);
@ -236,11 +251,7 @@ public class FederateSearchManager {
                    conf.commit();
                    if (active) {
                        OpenSearchConnector osd = new OpenSearchConnector(urlTemplate);
-                        String htmlMappingFile = null;
-                        Switchboard sb = Switchboard.getSwitchboard();
-                        if(sb != null) {
-                        	htmlMappingFile = sb.getDataPath()+ "/DATA/SETTINGS/federatecfg/" + OpenSearchConnector.htmlMappingFileName(name);
-                        }
+                        String htmlMappingFile = this.switchboard.getDataPath()+ "/DATA/SETTINGS/federatecfg/" + OpenSearchConnector.htmlMappingFileName(name);
                        if (osd.init(name, htmlMappingFile)) {
                            conlist.add(osd);
                        }
@ -272,12 +283,38 @@ public class FederateSearchManager {
     */
    protected Set<AbstractFederateSearchConnector> getBest(final QueryParams query) {
        HashSet<AbstractFederateSearchConnector> retset = new HashSet<AbstractFederateSearchConnector>();
-        // currently only enforces limits (min access delay, frequency)
+        MultiProtocolURL connectorURL;
        for (AbstractFederateSearchConnector fsc : conlist) {
+        	try {
+				connectorURL = new MultiProtocolURL(fsc.baseurl);
+			} catch (MalformedURLException e) {
+				ConcurrentLog.warn("FederateSearchManager", "Malformed connector URL : " + fsc.baseurl);
+				continue;
+			}
+        	RobotsTxtEntry robotsEntry = null;
+        	int robotsDelay = 0;
+			if (this.switchboard != null && this.switchboard.robots != null) {
+				robotsEntry = this.switchboard.robots.getEntry(connectorURL,
+						ClientIdentification.yacyInternetCrawlerAgent);
+				if(robotsEntry != null) {
+					robotsDelay = robotsEntry.getCrawlDelayMillis();
+				}
+			}
+        	
            // check access time
-            if (fsc.lastaccesstime + accessDelay < System.currentTimeMillis()) { // enforce 15 sec delay between searches to same system
-                retset.add(fsc);
+			long currentTime = System.currentTimeMillis();
+            if ((fsc.lastaccesstime + accessDelay < currentTime) 
+            		&& (fsc.lastaccesstime + robotsDelay < currentTime) ) { 
+            	// enforce 15 sec delay between searches to same system, and also check any eventual robots.txt Crawl-delay directive
+    			if (robotsEntry == null || !robotsEntry.isDisallowed(connectorURL)) {
+                    // also check robots.txt exclusion
+    				retset.add(fsc);
+    			} else {
+    				ConcurrentLog.warn("FederateSearchManager",
+    						"Connector URL is disallowed by robots.txt : " + fsc.baseurl);
+    			}
            }
+
        }
        return retset;
    }
@ -290,7 +327,7 @@ public class FederateSearchManager {
     * @return true if background discover job was started, false if job not
     * started
     */
-    public boolean discoverFromSolrIndex(Switchboard sb) {
+    public boolean discoverFromSolrIndex(final Switchboard sb) {
        if (sb == null) {
            return false;
        }
@ -351,20 +388,40 @@ public class FederateSearchManager {
                                SolrDocument sdoc = docidx.next();

                                String hrefurltxt = sdoc.getFieldValue(WebgraphSchema.target_protocol_s.getSolrFieldName()) + "://" + sdoc.getFieldValue(WebgraphSchema.target_urlstub_s.getSolrFieldName());
+                                URL url;
                                try {
-                                    URL url = new URL(hrefurltxt);
-                                    //TODO: check Blacklist
-                                    if (dblmem.add(url.getAuthority())) { // use only main path to detect double entries
-                                        opensearchdescriptionReader os = new opensearchdescriptionReader(hrefurltxt);
-                                        if (os.getRSSorAtomUrl() != null) {
-                                            // add found system to config file
+                                    url = new URL(hrefurltxt);
+                                } catch (final MalformedURLException ex) {
+                                	ConcurrentLog.warn("FederateSearchManager", "OpenSearch description URL is malformed : " + hrefurltxt);
+                                	continue;
+                                }
+                                //TODO: check Blacklist
+                                if (dblmem.add(url.getAuthority())) { // use only main path to detect double entries
+                                    opensearchdescriptionReader os = new opensearchdescriptionReader(hrefurltxt);
+                                    if (os.getRSSorAtomUrl() != null) {
+                                     	/* Check eventual robots.txt policy */
+                                      	RobotsTxtEntry robotsEntry = null;
+                                      	MultiProtocolURL templateURL;
+                                       	try {
+                                       		templateURL = new MultiProtocolURL(os.getRSSorAtomUrl());
+                                       	} catch (final MalformedURLException ex) {
+                                           	ConcurrentLog.warn("FederateSearchManager", "OpenSearch description URL is malformed : " + hrefurltxt);
+                                           	continue;
+                                        }
+                                       	if(sb.robots != null) {
+                                       		robotsEntry = sb.robots.getEntry(templateURL, ClientIdentification.yacyInternetCrawlerAgent);
+                                       	}
+
+                                   		if(robotsEntry != null && robotsEntry.isDisallowed(templateURL)) {
+                                   			ConcurrentLog.info("FederateSearchManager", "OpenSearch description template URL is disallowed by robots.xt");
+                                   		} else {
+                                   			// add found system to config file
                                            addOpenSearchTarget(os.getShortName(), os.getRSSorAtomUrl(), false, os.getItem("LongName"));
                                            ConcurrentLog.info("FederateSearchManager", "added " + os.getShortName() + " " + hrefurltxt);
-                                        } else {
-                                            ConcurrentLog.info("FederateSearchManager", "osd.xml check failed (no RSS or Atom support) for " + hrefurltxt);
-                                        }
+                                    	}
+                                    } else {
+                                    	ConcurrentLog.info("FederateSearchManager", "osd.xml check failed (no RSS or Atom support) for " + hrefurltxt);
                                    }
-                                } catch (final MalformedURLException ex) {
                                }
                            }
                        } else {