From 6b45cd579922574059e5385153b84be3ca07533b Mon Sep 17 00:00:00 2001
From: luccioman
Date: Wed, 1 May 2019 08:54:19 +0200
Subject: [PATCH] New optional crawl filter on the URL a doc must match to
 crawl its links

This gives finer control over which parsed documents may add their links
to the crawl stack, complementing the existing crawl depth parameter.
---
 htroot/CrawlProfileEditor_p.xml             |   2 +
 htroot/CrawlStartExpert.html                |  25 +++
 htroot/CrawlStartExpert.java                |  16 ++
 htroot/Crawler_p.java                       |   6 +-
 .../net/yacy/crawler/data/CrawlProfile.java |  57 ++++++
 source/net/yacy/search/Switchboard.java     | 187 ++++++++++--------
 6 files changed, 207 insertions(+), 86 deletions(-)

diff --git a/htroot/CrawlProfileEditor_p.xml b/htroot/CrawlProfileEditor_p.xml
index 4ab2b2534..0b880ac3f 100644
--- a/htroot/CrawlProfileEditor_p.xml
+++ b/htroot/CrawlProfileEditor_p.xml
@@ -23,6 +23,8 @@
   <crawlerAlwaysCheckMediaType>#(crawlerAlwaysCheckMediaType)#false::true#(/crawlerAlwaysCheckMediaType)#</crawlerAlwaysCheckMediaType>
   <crawlerURLMustMatch>#[crawlerURLMustMatch]#</crawlerURLMustMatch>
   <crawlerURLMustNotMatch>#[crawlerURLMustNotMatch]#</crawlerURLMustNotMatch>
+  <crawlerOriginURLMustMatch>#[crawlerOriginURLMustMatch]#</crawlerOriginURLMustMatch>
+  <crawlerOriginURLMustNotMatch>#[crawlerOriginURLMustNotMatch]#</crawlerOriginURLMustNotMatch>
   <crawlerIPMustMatch>#[crawlerIPMustMatch]#</crawlerIPMustMatch>
   <crawlerIPMustNotMatch>#[crawlerIPMustNotMatch]#</crawlerIPMustNotMatch>
   <crawlerCountryMustMatch>#[crawlerCountryMustMatch]#</crawlerCountryMustMatch>
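Note (illustration, not part of the patch): the #[...]# markers above are
YaCy servlet template placeholders. They are substituted with the profile
attribute values written via prop.putXML() in CrawlProfile.putProfileEntry()
further down in this patch, roughly like this self-contained sketch:

    // Minimal sketch of placeholder substitution (hypothetical class, not
    // YaCy's actual template engine)
    public class TemplateDemo {
        public static void main(String[] args) {
            String template = "<crawlerOriginURLMustMatch>#[crawlerOriginURLMustMatch]#</crawlerOriginURLMustMatch>";
            String value = ".*example\\.org.*"; // value set via prop.putXML(...)
            System.out.println(template.replace("#[crawlerOriginURLMustMatch]#", value));
        }
    }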
diff --git a/htroot/CrawlStartExpert.html b/htroot/CrawlStartExpert.html
index 909b45680..7c644ac11 100644
--- a/htroot/CrawlStartExpert.html
+++ b/htroot/CrawlStartExpert.html
@@ -174,6 +174,7 @@
 
 			// remove if MATCH_NEVER_STRING
 			disableIf('mustnotmatch', defaultMatchNone);
+			disableIf('crawlerOriginURLMustNotMatch', defaultMatchNone);
 			disableIf('ipMustnotmatch', defaultMatchNone);
 			disableIf('indexmustnotmatch', defaultMatchNone);
 			disableIf('indexcontentmustnotmatch', defaultMatchNone);
@@ -183,6 +184,7 @@
 
 			// remove if MATCH_ALL_STRING
 			disableIf('mustmatch', defaultMatchAll);
+			disableIf('crawlerOriginURLMustMatch', defaultMatchAll);
 			disableIf('ipMustmatch', defaultMatchAll);
 			disableIf('indexmustmatch', defaultMatchAll);
 			disableIf('indexcontentmustmatch', defaultMatchAll);
@@ -354,6 +356,29 @@
 					must-not-match
 				</tr>
 			</table>
 		</fieldset>
+		<fieldset>
+			<legend>Load Filter on URL origin of links</legend>
+			<table>
+				<tr>
+					<td colspan="2">
+						<img src="env/grafik/i16.gif" width="16" height="16" alt="info"/>
+						The filter is a regular expression.
+						Example: to allow loading only links from pages on the example.org domain, set the must-match filter to '.*example.org.*'.
+						Attention: you can test the functionality of your regular expressions using the Regular Expression Tester within YaCy.
+					</td>
+				</tr>
+				<tr>
+					<td>must-match</td>
+					<td><input name="crawlerOriginURLMustMatch" id="crawlerOriginURLMustMatch" type="text" value="#[crawlerOriginURLMustMatch]#"/> (must not be empty)</td>
+				</tr>
+				<tr>
+					<td>must-not-match</td>
+					<td><input name="crawlerOriginURLMustNotMatch" id="crawlerOriginURLMustNotMatch" type="text" value="#[crawlerOriginURLMustNotMatch]#"/></td>
+				</tr>
+			</table>
+		</fieldset>
 		<fieldset>
 			<legend>Load Filter on IPs</legend>
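Note (illustration, not part of the patch): as the CrawlProfile changes below
show, these filters are compiled with Pattern.CASE_INSENSITIVE and applied
with Matcher.matches(), which must cover the whole normalized URL. That is why
the help text's example filter is wrapped in '.*'. A self-contained demo:

    import java.util.regex.Pattern;

    public class OriginFilterDemo {
        public static void main(String[] args) {
            // Same flag as the patch: filters are compiled case-insensitively.
            Pattern mustMatch = Pattern.compile(".*example.org.*", Pattern.CASE_INSENSITIVE);

            // matches() must cover the WHOLE normalized URL, hence the
            // leading and trailing '.*' in the example above.
            System.out.println(mustMatch.matcher("https://www.EXAMPLE.org/page.html").matches()); // true
            System.out.println(mustMatch.matcher("https://other.net/").matches());                // false

            // An unescaped '.' also matches e.g. 'exampleXorg'; use
            // '.*example\\.org.*' for a stricter filter.
        }
    }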
diff --git a/htroot/CrawlStartExpert.java b/htroot/CrawlStartExpert.java
index 54cc6b234..d73b70c43 100644
--- a/htroot/CrawlStartExpert.java
+++ b/htroot/CrawlStartExpert.java
@@ -248,6 +248,22 @@ public class CrawlStartExpert {
         } else {
             prop.put("mustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
         }
+
+        // Filter on URL origin of links: must-match
+        if (post != null && post.containsKey(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key)) {
+            prop.put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key,
+                    post.get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING));
+        } else {
+            prop.put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING);
+        }
+
+        // Filter on URL origin of links: must-not-match
+        if (post != null && post.containsKey(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key)) {
+            prop.put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key,
+                    post.get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING));
+        } else {
+            prop.put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING);
+        }
 
         // Load Filter on IPs: must match
         if (post != null && post.containsKey("ipMustmatch")) {
diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java
index 343ecfed5..0e5bd8e74 100644
--- a/htroot/Crawler_p.java
+++ b/htroot/Crawler_p.java
@@ -626,7 +626,11 @@ public class Crawler_p {
                     ignoreclassname,
                     new VocabularyScraper(vocabulary_scraper),
                     timezoneOffset);
-            
+
+            profile.put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key,
+                    post.get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING));
+            profile.put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key, post
+                    .get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING));
             profile.put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key,
                     post.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING));
             profile.put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, post
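Note (illustration, not part of the patch): in both servlets the filter
defaults to match-all / match-never when the form does not submit the field,
so the new filter is inactive unless explicitly set. A minimal sketch of that
"value or default" lookup, using a hypothetical helper in place of YaCy's
serverObjects (and assuming MATCH_ALL_STRING is the match-all regex ".*"):

    import java.util.HashMap;
    import java.util.Map;

    class FormDefaults {
        // Mirrors the post.get(key, dflt) fallback the servlet code relies on.
        static String getOrDefault(Map<String, String> post, String key, String dflt) {
            String v = (post == null) ? null : post.get(key);
            return (v == null || v.isEmpty()) ? dflt : v;
        }

        public static void main(String[] args) {
            Map<String, String> post = new HashMap<>(); // form did not submit the field
            // Falls back to match-all, leaving the new filter inactive.
            System.out.println(getOrDefault(post, "crawlerOriginURLMustMatch", ".*"));
        }
    }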
diff --git a/source/net/yacy/crawler/data/CrawlProfile.java b/source/net/yacy/crawler/data/CrawlProfile.java
index ded1b764b..ae264b6ed 100644
--- a/source/net/yacy/crawler/data/CrawlProfile.java
+++ b/source/net/yacy/crawler/data/CrawlProfile.java
@@ -99,6 +99,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String>
         CRAWLER_ALWAYS_CHECK_MEDIA_TYPE("crawlerAlwaysCheckMediaType", false, CrawlAttribute.BOOLEAN, "Always cross check file extension against actual Media Type"),
         CRAWLER_URL_MUSTMATCH           ("crawlerURLMustMatch", false, CrawlAttribute.STRING, "URL Must-Match Filter"),
         CRAWLER_URL_MUSTNOTMATCH        ("crawlerURLMustNotMatch", false, CrawlAttribute.STRING, "URL Must-Not-Match Filter"),
+        CRAWLER_ORIGIN_URL_MUSTMATCH    ("crawlerOriginURLMustMatch", false, CrawlAttribute.STRING, "Links Origin URL Must-Match Filter"),
+        CRAWLER_ORIGIN_URL_MUSTNOTMATCH ("crawlerOriginURLMustNotMatch", false, CrawlAttribute.STRING, "Links Origin URL Must-Not-Match Filter"),
         CRAWLER_IP_MUSTMATCH            ("crawlerIPMustMatch", false, CrawlAttribute.STRING, "IP Must-Match Filter"),
         CRAWLER_IP_MUSTNOTMATCH         ("crawlerIPMustNotMatch", false, CrawlAttribute.STRING, "IP Must-Not-Match Filter"),
         CRAWLER_COUNTRY_MUSTMATCH       ("crawlerCountryMustMatch", false, CrawlAttribute.STRING, "Country Must-Match Filter"),
@@ -148,6 +150,13 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String>
 
     private Pattern crawlerurlmustmatch = null, crawlerurlmustnotmatch = null;
 
+
+    /** Pattern on the URL a document must match to allow adding its embedded links to the crawl stack */
+    private Pattern crawlerOriginUrlMustMatch = null;
+
+    /** Pattern on the URL a document must not match to allow adding its embedded links to the crawl stack */
+    private Pattern crawlerOriginUrlMustNotMatch = null;
+
     private Pattern crawleripmustmatch = null, crawleripmustnotmatch = null;
     private Pattern crawlernodepthlimitmatch = null;
     private Pattern indexurlmustmatch = null, indexurlmustnotmatch = null;
@@ -243,6 +252,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String>
         put(CrawlAttribute.CRAWLER_ALWAYS_CHECK_MEDIA_TYPE.key, true);
         put(CrawlAttribute.CRAWLER_URL_MUSTMATCH.key, (crawlerUrlMustMatch == null) ? CrawlProfile.MATCH_ALL_STRING : crawlerUrlMustMatch);
         put(CrawlAttribute.CRAWLER_URL_MUSTNOTMATCH.key, (crawlerUrlMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerUrlMustNotMatch);
+        put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING);
+        put(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING);
         put(CrawlAttribute.CRAWLER_IP_MUSTMATCH.key, (crawlerIpMustMatch == null) ? CrawlProfile.MATCH_ALL_STRING : crawlerIpMustMatch);
         put(CrawlAttribute.CRAWLER_IP_MUSTNOTMATCH.key, (crawlerIpMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerIpMustNotMatch);
         put(CrawlAttribute.CRAWLER_COUNTRY_MUSTMATCH.key, (crawlerCountryMustMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerCountryMustMatch);
@@ -501,6 +512,50 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String>
         }
         return this.crawlerurlmustnotmatch;
     }
+
+    /**
+     * Get the pattern on the URL a document must match to allow adding its embedded links to the crawl stack
+     *
+     * @return a {@link Pattern} instance, defaulting to
+     *         {@link CrawlProfile#MATCH_ALL_PATTERN} when the regular expression
+     *         string is not set or its syntax is incorrect
+     */
+    public Pattern getCrawlerOriginUrlMustMatchPattern() {
+        if (this.crawlerOriginUrlMustMatch == null) {
+            /* Cache the compiled pattern for faster next calls */
+            final String patternStr = get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key);
+            try {
+                this.crawlerOriginUrlMustMatch = (patternStr == null
+                        || patternStr.equals(CrawlProfile.MATCH_ALL_STRING)) ? CrawlProfile.MATCH_ALL_PATTERN
+                                : Pattern.compile(patternStr, Pattern.CASE_INSENSITIVE);
+            } catch (final PatternSyntaxException e) {
+                this.crawlerOriginUrlMustMatch = CrawlProfile.MATCH_ALL_PATTERN;
+            }
+        }
+        return this.crawlerOriginUrlMustMatch;
+    }
+
+    /**
+     * Get the pattern on the URL a document must not match to allow adding its embedded links to the crawl stack
+     *
+     * @return a {@link Pattern} instance, defaulting to
+     *         {@link CrawlProfile#MATCH_NEVER_PATTERN} when the regular expression
+     *         string is not set or its syntax is incorrect
+     */
+    public Pattern getCrawlerOriginUrlMustNotMatchPattern() {
+        if (this.crawlerOriginUrlMustNotMatch == null) {
+            /* Cache the compiled pattern for faster next calls */
+            final String patternStr = get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key);
+            try {
+                this.crawlerOriginUrlMustNotMatch = (patternStr == null
+                        || patternStr.equals(CrawlProfile.MATCH_NEVER_STRING)) ? CrawlProfile.MATCH_NEVER_PATTERN
+                                : Pattern.compile(patternStr, Pattern.CASE_INSENSITIVE);
+            } catch (final PatternSyntaxException e) {
+                this.crawlerOriginUrlMustNotMatch = CrawlProfile.MATCH_NEVER_PATTERN;
+            }
+        }
+        return this.crawlerOriginUrlMustNotMatch;
+    }
 
     /**
      * Gets the regex which must be matched by IPs in order to be crawled.
@@ -926,6 +981,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String>
         prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlerAlwaysCheckMediaType", this.isCrawlerAlwaysCheckMediaType());
         prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerURLMustMatch", this.get(CrawlAttribute.CRAWLER_URL_MUSTMATCH.key));
         prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerURLMustNotMatch", this.get(CrawlAttribute.CRAWLER_URL_MUSTNOTMATCH.key));
+        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerOriginURLMustMatch", this.get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH.key));
+        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerOriginURLMustNotMatch", this.get(CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH.key));
         prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerIPMustMatch", this.get(CrawlAttribute.CRAWLER_IP_MUSTMATCH.key));
         prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerIPMustNotMatch", this.get(CrawlAttribute.CRAWLER_IP_MUSTNOTMATCH.key));
         prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerCountryMustMatch", this.get(CrawlAttribute.CRAWLER_COUNTRY_MUSTMATCH.key));
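Note (illustration, not part of the patch): the two getters above follow a
lazy compile-and-cache idiom with safe fallbacks: the must-match getter fails
open (match-all) and the must-not-match getter fails closed (match-never), so
a syntactically broken filter never silently blocks a crawl. A generic,
self-contained sketch of the same idiom (hypothetical class):

    import java.util.regex.Pattern;
    import java.util.regex.PatternSyntaxException;

    final class LazyFilter {
        private final String regex;
        private final Pattern fallback;
        private volatile Pattern compiled;

        LazyFilter(String regex, Pattern fallback) {
            this.regex = regex;
            this.fallback = fallback;
        }

        Pattern pattern() {
            Pattern p = this.compiled;
            if (p == null) {
                try {
                    // Compile once, case-insensitively, then cache.
                    p = (this.regex == null) ? this.fallback
                            : Pattern.compile(this.regex, Pattern.CASE_INSENSITIVE);
                } catch (final PatternSyntaxException e) {
                    p = this.fallback; // keep crawling instead of failing
                }
                this.compiled = p;
            }
            return p;
        }
    }

The sketch uses a volatile field; the patch itself relies on a benign race,
which is harmless here because recompilation is idempotent.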
diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java
index f436b6e66..218f7b8f5 100644
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -3111,95 +3111,112 @@ public final class Switchboard extends serverSwitch {
                 )
             ) {
 
-            for (Document d: documents) d.setDepth(response.depth());
-
-            // get the hyperlinks
-            final Map<AnchorURL, String> hl = Document.getHyperlinks(documents, !response.profile().obeyHtmlRobotsNofollow());
-
-            final boolean addAllLinksToCrawlStack = response.profile().isIndexNonParseableUrls() /* unsupported resources have to be indexed as pure links if no parser support them */
-                    || response.profile().isCrawlerAlwaysCheckMediaType() /* the crawler must always load resources to double-check the actual Media Type even on unsupported file extensions */;
-
-            /* Handle media links */
-
-            for (Map.Entry<DigestURL, String> entry : Document.getImagelinks(documents).entrySet()) {
-                if (addAllLinksToCrawlStack
-                        || (response.profile().indexMedia() && TextParser.supportsExtension(entry.getKey()) == null)) {
-                    hl.put(new AnchorURL(entry.getKey()), entry.getValue());
-                }
-            }
-
-            for (Map.Entry<DigestURL, String> entry : Document.getApplinks(documents).entrySet()) {
-                if (addAllLinksToCrawlStack
-                        || (response.profile().indexMedia() && TextParser.supportsExtension(entry.getKey()) == null)) {
-                    hl.put(new AnchorURL(entry.getKey()), entry.getValue());
-                }
-            }
-
-            for (Map.Entry<DigestURL, String> entry : Document.getVideolinks(documents).entrySet()) {
-                if (addAllLinksToCrawlStack
-                        || (response.profile().indexMedia() && TextParser.supportsExtension(entry.getKey()) == null)) {
-                    hl.put(new AnchorURL(entry.getKey()), entry.getValue());
-                }
-            }
-
-            for (Map.Entry<DigestURL, String> entry : Document.getAudiolinks(documents).entrySet()) {
-                if (addAllLinksToCrawlStack
-                        || (response.profile().indexMedia() && TextParser.supportsExtension(entry.getKey()) == null)) {
-                    hl.put(new AnchorURL(entry.getKey()), entry.getValue());
-                }
-            }
-
-            // insert those hyperlinks to the crawler
-            MultiProtocolURL nextUrl;
-            for ( final Map.Entry<AnchorURL, String> nextEntry : hl.entrySet() ) {
-                // check for interruption
-                checkInterruption();
-
-                // process the next hyperlink
-                nextUrl = nextEntry.getKey();
-                String u = nextUrl.toNormalform(true, true);
-                if ( !(u.startsWith("http://")
-                        || u.startsWith("https://")
-                        || u.startsWith("ftp://")
-                        || u.startsWith("smb://") || u.startsWith("file://")) ) {
-                    continue;
-                }
-
-                // rewrite the url
-                String u0 = LibraryProvider.urlRewriter.apply(u);
-                if (!u.equals(u0)) {
-                    log.info("REWRITE of url = \"" + u + "\" to \"" + u0 + "\"");
-                    u = u0;
-                }
-                //Matcher m = rewritePattern.matcher(u);
-                //if (m.matches()) u = m.replaceAll("");
-
-                // enqueue the hyperlink into the pre-notice-url db
-                int nextdepth = nextEntry.getValue() != null && nextEntry.getValue().equals(Document.CANONICAL_MARKER) ? response.depth() : response.depth() + 1; // canonical documents are on the same depth
-                try {
-                    this.crawlStacker.enqueueEntry(new Request(
-                            response.initiator(),
-                            new DigestURL(u),
-                            response.url().hash(),
-                            nextEntry.getValue(),
-                            new Date(),
-                            response.profile().handle(),
-                            nextdepth,
-                            response.profile().timezoneOffset()));
-                } catch (final MalformedURLException e ) {
-                    ConcurrentLog.logException(e);
-                }
-            }
-            final long stackEndTime = System.currentTimeMillis();
-            if ( this.log.isInfo() ) {
-                this.log.info("CRAWL: ADDED "
-                        + hl.size()
-                        + " LINKS FROM "
-                        + response.url().toNormalform(true)
-                        + ", STACKING TIME = "
-                        + (stackEndTime - stackStartTime)
-                        + ", PARSING TIME = "
-                        + (parsingEndTime - parsingStartTime));
-            }
+            final Pattern crawlerOriginUrlMustMatch = response.profile().getCrawlerOriginUrlMustMatchPattern();
+            final Pattern crawlerOriginUrlMustNotMatch = response.profile().getCrawlerOriginUrlMustNotMatchPattern();
+            if (!(crawlerOriginUrlMustMatch == CrawlProfile.MATCH_ALL_PATTERN
+                    || crawlerOriginUrlMustMatch.matcher(response.url().toNormalform(true)).matches())
+                    || (crawlerOriginUrlMustNotMatch != CrawlProfile.MATCH_NEVER_PATTERN
+                            && crawlerOriginUrlMustNotMatch.matcher(response.url().toNormalform(true)).matches())) {
+                if (this.log.isInfo()) {
+                    this.log.info("CRAWL: Ignored links from document at " + response.url().toNormalform(true)
+                            + " : prevented by regular expression on URL origin of links, "
+                            + CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTMATCH + " = " + crawlerOriginUrlMustMatch.pattern()
+                            + ", " + CrawlAttribute.CRAWLER_ORIGIN_URL_MUSTNOTMATCH + " = "
+                            + crawlerOriginUrlMustNotMatch.pattern());
+                }
+            } else {
+                for (Document d: documents) {
+                    d.setDepth(response.depth());
+                }
+
+                // get the hyperlinks
+                final Map<AnchorURL, String> hl = Document.getHyperlinks(documents, !response.profile().obeyHtmlRobotsNofollow());
+
+                final boolean addAllLinksToCrawlStack = response.profile().isIndexNonParseableUrls() /* unsupported resources have to be indexed as pure links if no parser support them */
+                        || response.profile().isCrawlerAlwaysCheckMediaType() /* the crawler must always load resources to double-check the actual Media Type even on unsupported file extensions */;
+
+                /* Handle media links */
+
+                for (Map.Entry<DigestURL, String> entry : Document.getImagelinks(documents).entrySet()) {
+                    if (addAllLinksToCrawlStack
+                            || (response.profile().indexMedia() && TextParser.supportsExtension(entry.getKey()) == null)) {
+                        hl.put(new AnchorURL(entry.getKey()), entry.getValue());
+                    }
+                }
+
+                for (Map.Entry<DigestURL, String> entry : Document.getApplinks(documents).entrySet()) {
+                    if (addAllLinksToCrawlStack
+                            || (response.profile().indexMedia() && TextParser.supportsExtension(entry.getKey()) == null)) {
+                        hl.put(new AnchorURL(entry.getKey()), entry.getValue());
+                    }
+                }
+
+                for (Map.Entry<DigestURL, String> entry : Document.getVideolinks(documents).entrySet()) {
+                    if (addAllLinksToCrawlStack
+                            || (response.profile().indexMedia() && TextParser.supportsExtension(entry.getKey()) == null)) {
+                        hl.put(new AnchorURL(entry.getKey()), entry.getValue());
+                    }
+                }
+
+                for (Map.Entry<DigestURL, String> entry : Document.getAudiolinks(documents).entrySet()) {
+                    if (addAllLinksToCrawlStack
+                            || (response.profile().indexMedia() && TextParser.supportsExtension(entry.getKey()) == null)) {
+                        hl.put(new AnchorURL(entry.getKey()), entry.getValue());
+                    }
+                }
+
+                // insert those hyperlinks to the crawler
+                MultiProtocolURL nextUrl;
+                for ( final Map.Entry<AnchorURL, String> nextEntry : hl.entrySet() ) {
+                    // check for interruption
+                    checkInterruption();
+
+                    // process the next hyperlink
+                    nextUrl = nextEntry.getKey();
+                    String u = nextUrl.toNormalform(true, true);
+                    if ( !(u.startsWith("http://")
+                            || u.startsWith("https://")
+                            || u.startsWith("ftp://")
+                            || u.startsWith("smb://") || u.startsWith("file://")) ) {
+                        continue;
+                    }
+
+                    // rewrite the url
+                    String u0 = LibraryProvider.urlRewriter.apply(u);
+                    if (!u.equals(u0)) {
+                        log.info("REWRITE of url = \"" + u + "\" to \"" + u0 + "\"");
+                        u = u0;
+                    }
+                    //Matcher m = rewritePattern.matcher(u);
+                    //if (m.matches()) u = m.replaceAll("");
+
+                    // enqueue the hyperlink into the pre-notice-url db
+                    int nextdepth = nextEntry.getValue() != null && nextEntry.getValue().equals(Document.CANONICAL_MARKER) ? response.depth() : response.depth() + 1; // canonical documents are on the same depth
+                    try {
+                        this.crawlStacker.enqueueEntry(new Request(
+                                response.initiator(),
+                                new DigestURL(u),
+                                response.url().hash(),
+                                nextEntry.getValue(),
+                                new Date(),
+                                response.profile().handle(),
+                                nextdepth,
+                                response.profile().timezoneOffset()));
+                    } catch (final MalformedURLException e ) {
+                        ConcurrentLog.logException(e);
+                    }
+                }
+
+                final long stackEndTime = System.currentTimeMillis();
+                if ( this.log.isInfo() ) {
+                    this.log.info("CRAWL: ADDED "
+                            + hl.size()
+                            + " LINKS FROM "
+                            + response.url().toNormalform(true)
+                            + ", STACKING TIME = "
+                            + (stackEndTime - stackStartTime)
+                            + ", PARSING TIME = "
+                            + (parsingEndTime - parsingStartTime));
+                }
+            }
         }
         return documents;
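Note (illustration, not part of the patch): the gating condition added to
Switchboard compares against the shared default patterns by identity, so no
regex is evaluated at all when the filters are left at their defaults. A
standalone sketch of the same logic (MATCH_NEVER here is a stand-in for
YaCy's constant):

    import java.util.regex.Pattern;

    public class OriginGateDemo {
        static final Pattern MATCH_ALL = Pattern.compile(".*");
        static final Pattern MATCH_NEVER = Pattern.compile(""); // stand-in constant

        // Links of a parsed document are stacked only when the document's own
        // URL passes BOTH origin filters; identity checks skip regex work.
        static boolean ignoreLinks(String docUrl, Pattern mustMatch, Pattern mustNotMatch) {
            return !(mustMatch == MATCH_ALL || mustMatch.matcher(docUrl).matches())
                    || (mustNotMatch != MATCH_NEVER && mustNotMatch.matcher(docUrl).matches());
        }

        public static void main(String[] args) {
            Pattern mustMatch = Pattern.compile(".*example\\.org.*", Pattern.CASE_INSENSITIVE);
            System.out.println(ignoreLinks("https://example.org/a", mustMatch, MATCH_NEVER)); // false: links are stacked
            System.out.println(ignoreLinks("https://other.net/b", mustMatch, MATCH_NEVER));   // true: links are ignored
        }
    }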