New optional crawl filter on the URL a document must match to crawl its links

This gives finer control over which parsed documents may add their links to
the crawl stack, complementing the existing crawl depth parameter.
pull/303/head
luccioman 6 years ago
parent 8d3e029247
commit 6b45cd5799
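
In short, the new pair of filters is applied to the URL of the document whose parsed links are about to be stacked: the links are only added when that URL matches the must-match expression and does not match the must-not-match expression. A minimal, self-contained sketch of this check follows; the class and method names are illustrative and not part of this commit.

import java.util.regex.Pattern;

/* Illustration only (not YaCy code): decide whether the links embedded in a
 * parsed document may be added to the crawl stack, based on the document's own URL. */
public class OriginFilterSketch {

    /** True when originUrl passes both the must-match and the must-not-match filter. */
    static boolean mayFollowLinks(final String originUrl,
                                  final Pattern mustMatch,
                                  final Pattern mustNotMatch) {
        return mustMatch.matcher(originUrl).matches()
                && !mustNotMatch.matcher(originUrl).matches();
    }

    public static void main(final String[] args) {
        // Filters compiled the same way the profile getters below compile them.
        final Pattern mustMatch = Pattern.compile(".*example\\.org.*", Pattern.CASE_INSENSITIVE);
        final Pattern mustNotMatch = Pattern.compile(".*example\\.org/private/.*", Pattern.CASE_INSENSITIVE);

        System.out.println(mayFollowLinks("https://www.example.org/index.html", mustMatch, mustNotMatch));     // true
        System.out.println(mayFollowLinks("https://www.example.org/private/a.html", mustMatch, mustNotMatch)); // false
        System.out.println(mayFollowLinks("https://other.net/page.html", mustMatch, mustNotMatch));            // false
    }
}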

@@ -23,6 +23,8 @@
<crawlerAlwaysCheckMediaType>#(crawlerAlwaysCheckMediaType)#false::true#(/crawlerAlwaysCheckMediaType)#</crawlerAlwaysCheckMediaType>
<crawlerURLMustMatch>#[crawlerURLMustMatch]#</crawlerURLMustMatch>
<crawlerURLMustNotMatch>#[crawlerURLMustNotMatch]#</crawlerURLMustNotMatch>
<crawlerOriginURLMustMatch>#[crawlerOriginURLMustMatch]#</crawlerOriginURLMustMatch>
<crawlerOriginURLMustNotMatch>#[crawlerOriginURLMustNotMatch]#</crawlerOriginURLMustNotMatch>
<crawlerIPMustMatch>#[crawlerIPMustMatch]#</crawlerIPMustMatch>
<crawlerIPMustNotMatch>#[crawlerIPMustNotMatch]#</crawlerIPMustNotMatch>
<crawlerCountryMustMatch>#[crawlerCountryMustMatch]#</crawlerCountryMustMatch>

@@ -174,6 +174,7 @@
// remove if MATCH_NEVER_STRING
disableIf('mustnotmatch', defaultMatchNone);
disableIf('crawlerOriginURLMustNotMatch', defaultMatchNone);
disableIf('ipMustnotmatch', defaultMatchNone);
disableIf('indexmustnotmatch', defaultMatchNone);
disableIf('indexcontentmustnotmatch', defaultMatchNone);
@@ -183,6 +184,7 @@
// remove if MATCH_ALL_STRING
disableIf('mustmatch', defaultMatchAll);
disableIf('crawlerOriginURLMustMatch', defaultMatchAll);
disableIf('ipMustmatch', defaultMatchAll);
disableIf('indexmustmatch', defaultMatchAll);
disableIf('indexcontentmustmatch', defaultMatchAll);
@@ -354,6 +356,29 @@
<tr><td><img src="env/grafics/minus.gif" alt=""> must-not-match</td><td><input name="mustnotmatch" id="mustnotmatch" type="text" size="55" maxlength="100000" value="#[mustnotmatch]#" /></td></tr>
</table>
</dd>
<dt>Load Filter on URL origin of links</dt>
<dd>
<span class="info" style="float:right">
<img src="env/grafics/i16.gif" width="16" height="16" alt="info"/>
<span style="right:0px;">
The filter is a <b><a href="https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html" target="_blank">regular expression</a></b>.
Example: to allow loading only links from pages on the example.org domain, set the must-match filter to '.*example.org.*'.
Attention: you can test the functionality of your regular expressions using the <a href="RegexTest.html">Regular Expression Tester</a> within YaCy.
</span>
</span>
<table style="border-width: 0px">
<tr>
<td style="width:110px"><img src="env/grafics/plus.gif" alt=""> must-match</td>
<td><input name="crawlerOriginURLMustMatch" id="crawlerOriginURLMustMatch" type="text" size="55" maxlength="100000" value="#[crawlerOriginURLMustMatch]#" onblur="if (this.value=='') this.value='.*';"/> (must not be empty)</td>
</tr>
<tr>
<td><img src="env/grafics/minus.gif" alt=""> must-not-match</td>
<td><input name="crawlerOriginURLMustNotMatch" id="crawlerOriginURLMustNotMatch" type="text" size="55" maxlength="100000" value="#[crawlerOriginURLMustNotMatch]#" /></td>
</tr>
</table>
</dd>
<dt>Load Filter on IPs</dt>
<dd>
<table style="border-width: 0px">
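
One subtlety of the regular expression in the help text above: the dot is a regex wildcard, so '.*example.org.*' also matches hosts that merely resemble example.org, while escaping the dot keeps the filter literal. A small standalone check, given purely as an illustration:

import java.util.regex.Pattern;

// Illustration only: effect of escaping the dot in the help-text example.
public class OriginRegexEscaping {
    public static void main(final String[] args) {
        final Pattern loose  = Pattern.compile(".*example.org.*", Pattern.CASE_INSENSITIVE);
        final Pattern strict = Pattern.compile(".*example\\.org.*", Pattern.CASE_INSENSITIVE);

        final String unintended = "https://examplesorg.net/page.html";
        System.out.println(loose.matcher(unintended).matches());  // true, the wildcard dot matches 's'
        System.out.println(strict.matcher(unintended).matches()); // false, a literal dot is required
    }
}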

@@ -249,6 +249,22 @@ public class CrawlStartExpert {
prop.put("mustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
}
// Filter on URL origin of links: must match
if (post != null && post.containsKey(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key)) {
prop.put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key,
post.get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING));
} else {
prop.put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING);
}
// Filter on URL origin of links: must-not-match
if (post != null && post.containsKey(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key)) {
prop.put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key,
post.get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING));
} else {
prop.put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING);
}
// Load Filter on IPs: must match
if (post != null && post.containsKey("ipMustmatch")) {
prop.put("ipMustmatch", post.get("ipMustmatch", ""));

@@ -627,6 +627,10 @@ public class Crawler_p {
new VocabularyScraper(vocabulary_scraper),
timezoneOffset);
profile.put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key,
post.get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING));
profile.put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key, post
.get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING));
profile.put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key,
post.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING));
profile.put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, post

@@ -99,6 +99,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
CRAWLER_ALWAYS_CHECK_MEDIA_TYPE("crawlerAlwaysCheckMediaType", false, CrawlAttribute.BOOLEAN, "Always cross check file extension against actual Media Type"),
CRAWLER_URL_MUSTMATCH ("crawlerURLMustMatch", false, CrawlAttribute.STRING, "URL Must-Match Filter"),
CRAWLER_URL_MUSTNOTMATCH ("crawlerURLMustNotMatch", false, CrawlAttribute.STRING, "URL Must-Not-Match Filter"),
CRAWLER_ORIGIN_URL_MUSTMATCH ("crawlerOriginURLMustMatch", false, CrawlAttribute.STRING, "Links Origin URL Must-Match Filter"),
CRAWLER_ORIGIN_URL_MUSTNOTMATCH ("crawlerOriginURLMustNotMatch", false, CrawlAttribute.STRING, "Links Origin URL Must-Not-Match Filter"),
CRAWLER_IP_MUSTMATCH ("crawlerIPMustMatch", false, CrawlAttribute.STRING, "IP Must-Match Filter"),
CRAWLER_IP_MUSTNOTMATCH ("crawlerIPMustNotMatch", false, CrawlAttribute.STRING, "IP Must-Not-Match Filter"),
CRAWLER_COUNTRY_MUSTMATCH ("crawlerCountryMustMatch", false, CrawlAttribute.STRING, "Country Must-Match Filter"),
@@ -148,6 +150,13 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
private Pattern crawlerurlmustmatch = null, crawlerurlmustnotmatch = null;
/** Pattern on the URL a document must match to allow adding its embedded links to the crawl stack */
private Pattern crawlerOriginUrlMustMatch = null;
/** Pattern on the URL a document must not match to allow adding its embedded links to the crawl stack */
private Pattern crawlerOriginUrlMustNotMatch = null;
private Pattern crawleripmustmatch = null, crawleripmustnotmatch = null;
private Pattern crawlernodepthlimitmatch = null;
private Pattern indexurlmustmatch = null, indexurlmustnotmatch = null;
@@ -243,6 +252,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
put(CrawlAttribute.CRAWLER_ALWAYS_CHECK_MEDIA_TYPE.key, true);
put(CrawlAttribute.CRAWLER_URL_MUSTMATCH.key, (crawlerUrlMustMatch == null) ? CrawlProfile.MATCH_ALL_STRING : crawlerUrlMustMatch);
put(CrawlAttribute.CRAWLER_URL_MUSTNOTMATCH.key, (crawlerUrlMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerUrlMustNotMatch);
put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING);
put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING);
put(CrawlAttribute.CRAWLER_IP_MUSTMATCH.key, (crawlerIpMustMatch == null) ? CrawlProfile.MATCH_ALL_STRING : crawlerIpMustMatch);
put(CrawlAttribute.CRAWLER_IP_MUSTNOTMATCH.key, (crawlerIpMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerIpMustNotMatch);
put(CrawlAttribute.CRAWLER_COUNTRY_MUSTMATCH.key, (crawlerCountryMustMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerCountryMustMatch);
@@ -502,6 +513,50 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
return this.crawlerurlmustnotmatch;
}
/**
* Get the pattern on the URL a document must match to allow adding its embedded links to the crawl stack
*
* @return a {@link Pattern} instance, defaulting to
* {@link CrawlProfile#MATCH_ALL_PATTERN} when the regular expression
* string is not set or its syntax is incorrect
*/
public Pattern getCrawlerOriginUrlMustMatchPattern() {
if (this.crawlerOriginUrlMustMatch == null) {
/* Cache the compiled pattern for faster next calls */
final String patternStr = get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key);
try {
this.crawlerOriginUrlMustMatch = (patternStr == null
|| patternStr.equals(CrawlProfile.MATCH_ALL_STRING)) ? CrawlProfile.MATCH_ALL_PATTERN
: Pattern.compile(patternStr, Pattern.CASE_INSENSITIVE);
} catch (final PatternSyntaxException e) {
this.crawlerOriginUrlMustMatch = CrawlProfile.MATCH_ALL_PATTERN;
}
}
return this.crawlerOriginUrlMustMatch;
}
/**
* Get the pattern on the URL a document must not match to allow adding its embedded links to the crawl stack
*
* @return a {@link Pattern} instance, defaulting to
* {@link CrawlProfile#MATCH_NEVER_PATTERN} when the regular expression
* string is not set or its syntax is incorrect
*/
public Pattern getCrawlerOriginUrlMustNotMatchPattern() {
if (this.crawlerOriginUrlMustNotMatch == null) {
/* Cache the compiled pattern for faster next calls */
final String patternStr = get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key);
try {
this.crawlerOriginUrlMustNotMatch = (patternStr == null
|| patternStr.equals(CrawlProfile.MATCH_NEVER_STRING)) ? CrawlProfile.MATCH_NEVER_PATTERN
: Pattern.compile(patternStr, Pattern.CASE_INSENSITIVE);
} catch (final PatternSyntaxException e) {
this.crawlerOriginUrlMustNotMatch = CrawlProfile.MATCH_NEVER_PATTERN;
}
}
return this.crawlerOriginUrlMustNotMatch;
}
/**
* Gets the regex which must be matched by IPs in order to be crawled.
* @return regex which must be matched
@@ -926,6 +981,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlerAlwaysCheckMediaType", this.isCrawlerAlwaysCheckMediaType());
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerURLMustMatch", this.get(CrawlAttribute.CRAWLER_URL_MUSTMATCH.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerURLMustNotMatch", this.get(CrawlAttribute.CRAWLER_URL_MUSTNOTMATCH.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerOriginURLMustMatch", this.get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerOriginURLMustNotMatch", this.get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerIPMustMatch", this.get(CrawlAttribute.CRAWLER_IP_MUSTMATCH.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerIPMustNotMatch", this.get(CrawlAttribute.CRAWLER_IP_MUSTNOTMATCH.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerCountryMustMatch", this.get(CrawlAttribute.CRAWLER_COUNTRY_MUSTMATCH.key));
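
Both new getters above follow the compile-once idiom already used for the other profile filters: the expression string is read from the profile map, compiled on first access with CASE_INSENSITIVE, cached in a field, and replaced by a safe default pattern when it is missing or syntactically invalid. A simplified standalone sketch of that idiom, with illustrative names and default values:

import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

/* Illustration only: lazy compile-and-cache of a filter expression with a safe fallback. */
public class LazyFilterPattern {

    /** Fallback used when the expression is absent or malformed (here: accept everything). */
    private static final Pattern MATCH_ALL = Pattern.compile(".*");

    private final String expression;
    private Pattern compiled; // cached after the first call

    public LazyFilterPattern(final String expression) {
        this.expression = expression;
    }

    public Pattern pattern() {
        if (this.compiled == null) {
            try {
                this.compiled = (this.expression == null || ".*".equals(this.expression))
                        ? MATCH_ALL
                        : Pattern.compile(this.expression, Pattern.CASE_INSENSITIVE);
            } catch (final PatternSyntaxException e) {
                this.compiled = MATCH_ALL; // never abort a crawl because of a bad filter string
            }
        }
        return this.compiled;
    }
}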

@@ -3111,7 +3111,23 @@ public final class Switchboard extends serverSwitch {
)
) {
for (Document d: documents) d.setDepth(response.depth());
final Pattern crawlerOriginUrlMustMatch = response.profile().getCrawlerOriginUrlMustMatchPattern();
final Pattern crawlerOriginUrlMustNotMatch = response.profile().getCrawlerOriginUrlMustNotMatchPattern();
if (!(crawlerOriginUrlMustMatch == CrawlProfile.MATCH_ALL_PATTERN
|| crawlerOriginUrlMustMatch.matcher(response.url().toNormalform(true)).matches())
|| (crawlerOriginUrlMustNotMatch != CrawlProfile.MATCH_NEVER_PATTERN
&& crawlerOriginUrlMustNotMatch.matcher(response.url().toNormalform(true)).matches())) {
if (this.log.isInfo()) {
this.log.info("CRAWL: Ignored links from document at " + response.url().toNormalform(true)
+ " : prevented by regular expression on URL origin of links, "
+ CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH + " = " + crawlerOriginUrlMustMatch.pattern()
+ ", " + CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH + " = "
+ crawlerOriginUrlMustNotMatch.pattern());
}
} else {
for (Document d: documents) {
d.setDepth(response.depth());
}
// get the hyperlinks
final Map<AnchorURL, String> hl = Document.getHyperlinks(documents, !response.profile().obeyHtmlRobotsNofollow());
@@ -3202,6 +3218,7 @@ public final class Switchboard extends serverSwitch {
+ (parsingEndTime - parsingStartTime));
}
}
}
return documents;
}
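
The stacking decision in the Switchboard hunk above can be read as a single predicate: the identity comparisons against the shared default patterns are only a shortcut that avoids running the regular expressions when the filters are left at their defaults. Below is a standalone restatement of that logic; the default pattern values are assumptions mirroring CrawlProfile, not copied from it.

import java.util.regex.Pattern;

/* Illustration only: the origin-URL check applied before a document's links are stacked. */
public class OriginStackingDecision {

    // Assumed stand-ins for CrawlProfile.MATCH_ALL_PATTERN / MATCH_NEVER_PATTERN.
    static final Pattern MATCH_ALL_PATTERN = Pattern.compile(".*");
    static final Pattern MATCH_NEVER_PATTERN = Pattern.compile("");

    /** True when the links of the document at originUrl may be added to the crawl stack. */
    static boolean acceptOrigin(final String originUrl,
                                final Pattern mustMatch,
                                final Pattern mustNotMatch) {
        // Identity checks against the default patterns skip the regex evaluation entirely.
        final boolean required = mustMatch == MATCH_ALL_PATTERN
                || mustMatch.matcher(originUrl).matches();
        final boolean forbidden = mustNotMatch != MATCH_NEVER_PATTERN
                && mustNotMatch.matcher(originUrl).matches();
        return required && !forbidden;
    }
}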
