From 97930a6aad9f5421dd57cb252beb7fe019b1b7be Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Fri, 8 May 2015 13:46:27 +0200 Subject: [PATCH] added must-not-match filter to snapshot generation. also: fixed some bugs --- htroot/CrawlStartExpert.html | 4 +++- htroot/CrawlStartExpert.java | 9 ++++----- htroot/Crawler_p.java | 2 ++ htroot/QuickCrawlLink_p.java | 2 +- source/net/yacy/crawler/CrawlSwitchboard.java | 18 +++++++++--------- source/net/yacy/crawler/data/CrawlProfile.java | 18 ++++++++++++++++++ source/net/yacy/crawler/data/CrawlQueues.java | 1 + .../net/yacy/data/ymark/YMarkCrawlStart.java | 2 +- source/net/yacy/search/index/Segment.java | 3 ++- 9 files changed, 41 insertions(+), 18 deletions(-) diff --git a/htroot/CrawlStartExpert.html b/htroot/CrawlStartExpert.html index 90b83ef59..464a9f907 100644 --- a/htroot/CrawlStartExpert.html +++ b/htroot/CrawlStartExpert.html @@ -503,11 +503,13 @@ replace old snapshots with new one    add new versions for each crawl +
+
#(snapshotEnableImages)# ::
- +
#(/snapshotEnableImages)# diff --git a/htroot/CrawlStartExpert.java b/htroot/CrawlStartExpert.java index 3d8dad06d..79939bd54 100644 --- a/htroot/CrawlStartExpert.java +++ b/htroot/CrawlStartExpert.java @@ -531,13 +531,12 @@ public class CrawlStartExpert { // ---------- Snapshot generation boolean wkhtmltopdfAvailable = Html2Image.wkhtmltopdfAvailable(); - boolean convertAvailable = Html2Image.convertAvailable(); + //boolean convertAvailable = Html2Image.convertAvailable(); prop.put("snapshotsMaxDepth", post == null ? "-1" : post.get("snapshotsMaxDepth", "-1")); - if (sb.getConfigBool("isTransparentProxy", false) && - sb.getConfigBool("proxyAlwaysFresh", false) && - wkhtmltopdfAvailable && convertAvailable) { + prop.put("snapshotsMustnotmatch", post == null ? "" : post.get("snapshotsMustnotmatch", "")); + if (wkhtmltopdfAvailable) { prop.put("snapshotEnableImages", 1); - prop.put("snapshotEnableImages_snapshotsLoadImageChecked", post == null ? 0 : post.getBoolean("snapshotsLoadImage") ? 1 : 0); + prop.put("snapshotEnableImages_snapshotsLoadImageChecked", post == null ? 1 : post.getBoolean("snapshotsLoadImage") ? 1 : 0); } else { prop.put("snapshotEnableImages", 0); } diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index 8b0e39801..92882cd67 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -448,6 +448,7 @@ public class Crawler_p { int snapshotsMaxDepth = Integer.parseInt(snapshotsMaxDepthString); boolean snapshotsLoadImage = post.getBoolean("snapshotsLoadImage"); boolean snapshotsReplaceOld = post.getBoolean("snapshotsReplaceOld"); + String snapshotsMustnotmatch = post.get("snapshotsMustnotmatch", ""); // get vocabulary scraper info JSONObject vocabulary_scraper = new JSONObject(); // key = vocabulary_name, value = properties with key = type (i.e. 
'class') and value = keyword in context @@ -501,6 +502,7 @@ public class Crawler_p { snapshotsMaxDepth, snapshotsLoadImage, snapshotsReplaceOld, + snapshotsMustnotmatch, cachePolicy, collection, agentName, diff --git a/htroot/QuickCrawlLink_p.java b/htroot/QuickCrawlLink_p.java index 2b0b599b8..da3e81cd8 100644 --- a/htroot/QuickCrawlLink_p.java +++ b/htroot/QuickCrawlLink_p.java @@ -153,7 +153,7 @@ public class QuickCrawlLink_p { obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow, indexText, indexMedia, storeHTCache, remoteIndexing, - -1, false, true, + -1, false, true, CrawlProfile.MATCH_NEVER_STRING, CacheStrategy.IFFRESH, collection, ClientIdentification.yacyIntranetCrawlerAgentName, diff --git a/source/net/yacy/crawler/CrawlSwitchboard.java b/source/net/yacy/crawler/CrawlSwitchboard.java index fcce03c4b..6a1c5476b 100644 --- a/source/net/yacy/crawler/CrawlSwitchboard.java +++ b/source/net/yacy/crawler/CrawlSwitchboard.java @@ -292,7 +292,7 @@ public final class CrawlSwitchboard { sb.getConfigBool(SwitchboardConstants.PROXY_INDEXING_LOCAL_MEDIA, true), true, sb.getConfigBool(SwitchboardConstants.PROXY_INDEXING_REMOTE, false), - -1, false, true, + -1, false, true, CrawlProfile.MATCH_NEVER_STRING, CacheStrategy.IFFRESH, "robot_" + CRAWL_PROFILE_PROXY, ClientIdentification.yacyProxyAgentName, @@ -324,7 +324,7 @@ public final class CrawlSwitchboard { true, false, false, - -1, false, true, + -1, false, true, CrawlProfile.MATCH_NEVER_STRING, CacheStrategy.IFFRESH, "robot_" + CRAWL_PROFILE_REMOTE, ClientIdentification.yacyInternetCrawlerAgentName, @@ -356,7 +356,7 @@ public final class CrawlSwitchboard { false, true, false, - -1, false, true, + -1, false, true, CrawlProfile.MATCH_NEVER_STRING, CacheStrategy.IFEXIST, "robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, ClientIdentification.yacyIntranetCrawlerAgentName, @@ -388,7 +388,7 @@ public final class CrawlSwitchboard { true, true, false, - -1, false, true, + -1, false, true, CrawlProfile.MATCH_NEVER_STRING, 
CacheStrategy.IFEXIST, "robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, ClientIdentification.yacyIntranetCrawlerAgentName, @@ -421,7 +421,7 @@ public final class CrawlSwitchboard { false, true, false, - -1, false, true, + -1, false, true, CrawlProfile.MATCH_NEVER_STRING, CacheStrategy.IFEXIST, "robot_" + CRAWL_PROFILE_GREEDY_LEARNING_TEXT, ClientIdentification.browserAgentName, @@ -453,7 +453,7 @@ public final class CrawlSwitchboard { false, true, false, - -1, false, true, + -1, false, true, CrawlProfile.MATCH_NEVER_STRING, CacheStrategy.IFEXIST, "robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, ClientIdentification.yacyIntranetCrawlerAgentName, @@ -485,7 +485,7 @@ public final class CrawlSwitchboard { true, true, false, - -1, false, true, + -1, false, true, CrawlProfile.MATCH_NEVER_STRING, CacheStrategy.IFEXIST, "robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, ClientIdentification.yacyIntranetCrawlerAgentName, @@ -517,7 +517,7 @@ public final class CrawlSwitchboard { false, false, false, - -1, false, true, + -1, false, true, CrawlProfile.MATCH_NEVER_STRING, CacheStrategy.NOCACHE, "robot_" + CRAWL_PROFILE_SURROGATE, ClientIdentification.yacyIntranetCrawlerAgentName, @@ -552,7 +552,7 @@ public final class CrawlSwitchboard { true, false, false, - -1, false, true, + -1, false, true, CrawlProfile.MATCH_NEVER_STRING, CacheStrategy.NOCACHE, collection, ClientIdentification.yacyIntranetCrawlerAgentName, diff --git a/source/net/yacy/crawler/data/CrawlProfile.java b/source/net/yacy/crawler/data/CrawlProfile.java index aeafc8b3e..324850ecb 100644 --- a/source/net/yacy/crawler/data/CrawlProfile.java +++ b/source/net/yacy/crawler/data/CrawlProfile.java @@ -94,12 +94,14 @@ public class CrawlProfile extends ConcurrentHashMap implements M public static final String SNAPSHOTS_MAXDEPTH = "snapshotsMaxDepth"; // if previews shall be loaded, this is positive and denotes the maximum depth; if not this is -1 public static final String SNAPSHOTS_REPLACEOLD = "snapshotsReplaceOld"; // if this is 
set to true, only one version of a snapshot per day is stored, otherwise we store also different versions per day public static final String SNAPSHOTS_LOADIMAGE = "snapshotsLoadImage"; // if true, an image is loaded + public static final String SNAPSHOTS_MUSTNOTMATCH = "snapshotsMustnotmatch"; private Pattern crawlerurlmustmatch = null, crawlerurlmustnotmatch = null; private Pattern crawleripmustmatch = null, crawleripmustnotmatch = null; private Pattern crawlernodepthlimitmatch = null; private Pattern indexurlmustmatch = null, indexurlmustnotmatch = null; private Pattern indexcontentmustmatch = null, indexcontentmustnotmatch = null; + private Pattern snapshotsMustnotmatch = null; private final Map doms; private final VocabularyScraper scraper; @@ -127,6 +129,10 @@ public class CrawlProfile extends ConcurrentHashMap implements M * @param indexMedia true if media content of URL shall be indexed * @param storeHTCache true if content chall be kept in cache after indexing * @param remoteIndexing true if part of the crawl job shall be distributed + * @param snapshotsMaxDepth if the current crawl depth is equal to or below the given depth, a snapshot is generated + * @param snapshotsLoadImage true if graphical (== pdf) snapshots shall be made + * @param snapshotsReplaceOld true if only one version of a snapshot per day shall be stored (no history of versions) + * @param snapshotsMustnotmatch a regular expression; if it matches on the url, the snapshot is not generated * @param xsstopw true if static stop words shall be ignored * @param xdstopw true if dynamic stop words shall be ignored * @param xpstopw true if parent stop words shall be ignored @@ -156,6 +162,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M final int snapshotsMaxDepth, final boolean snapshotsLoadImage, final boolean snapshotsReplaceOld, + final String snapshotsMustnotmatch, final CacheStrategy cacheStrategy, final String collections, final String userAgentName, @@ -196,6 +203,7 @@ public class CrawlProfile extends
ConcurrentHashMap implements M put(SNAPSHOTS_MAXDEPTH, snapshotsMaxDepth); put(SNAPSHOTS_LOADIMAGE, snapshotsLoadImage); put(SNAPSHOTS_REPLACEOLD, snapshotsReplaceOld); + put(SNAPSHOTS_MUSTNOTMATCH, snapshotsMustnotmatch); put(CACHE_STRAGEGY, cacheStrategy.toString()); put(COLLECTIONS, CommonPattern.SPACE.matcher(collections.trim()).replaceAll("")); // we transform the scraper information into a JSON Array @@ -628,6 +636,16 @@ public class CrawlProfile extends ConcurrentHashMap implements M if (r == null) return false; return (r.equals(Boolean.TRUE.toString())); } + + public Pattern snapshotsMustnotmatch() { // compiled on first use and cached; a URL matching this pattern gets no snapshot + if (this.snapshotsMustnotmatch == null) { + final String r = get(SNAPSHOTS_MUSTNOTMATCH); + try { + this.snapshotsMustnotmatch = (r == null || r.equals(CrawlProfile.MATCH_NEVER_STRING)) ? CrawlProfile.MATCH_NEVER_PATTERN : Pattern.compile(r, Pattern.CASE_INSENSITIVE); // a missing value means "no filter", so it must never match + } catch (final PatternSyntaxException e) { this.snapshotsMustnotmatch = CrawlProfile.MATCH_NEVER_PATTERN; } + } + return this.snapshotsMustnotmatch; + } public int timezoneOffset() { final String timezoneOffset = get(TIMEZONEOFFSET); diff --git a/source/net/yacy/crawler/data/CrawlQueues.java b/source/net/yacy/crawler/data/CrawlQueues.java index 5a9b0c4a1..68b36c013 100644 --- a/source/net/yacy/crawler/data/CrawlQueues.java +++ b/source/net/yacy/crawler/data/CrawlQueues.java @@ -666,6 +666,7 @@ public class CrawlQueues { if (CrawlQueues.log.isFine()) { CrawlQueues.log.fine("problem loading " + request.url().toString() + ": " + e.getMessage()); } + e.printStackTrace(); error = "load error - " + e.getMessage(); } diff --git a/source/net/yacy/data/ymark/YMarkCrawlStart.java b/source/net/yacy/data/ymark/YMarkCrawlStart.java index 562a9703f..6eac8bf87 100644 --- a/source/net/yacy/data/ymark/YMarkCrawlStart.java +++ b/source/net/yacy/data/ymark/YMarkCrawlStart.java @@ -186,7 +186,7 @@ public class YMarkCrawlStart extends HashMap{ crawlingQ, true, true, true, false, true, true, false, - -1, false, true, +
-1, false, true, CrawlProfile.MATCH_NEVER_STRING, CacheStrategy.IFFRESH, "robot_" + CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, ClientIdentification.yacyIntranetCrawlerAgentName, diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index 2c4f82a5c..200bb852d 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -575,7 +575,8 @@ public class Segment { // CREATE SNAPSHOT if ((url.getProtocol().equals("http") || url.getProtocol().equals("https")) && - crawlProfile != null && document.getDepth() <= crawlProfile.snapshotMaxdepth()) { + crawlProfile != null && document.getDepth() <= crawlProfile.snapshotMaxdepth() && + !crawlProfile.snapshotsMustnotmatch().matcher(urlNormalform).matches()) { // load pdf in case that is wanted. This can later be used to compute a web page preview in the search results String ext = MultiProtocolURL.getFileExtension(url.getFile()).toLowerCase(); if (ext.length() == 0 || url.getFile().length() <= 1 || htmlParser.htmlExtensionsSet.contains(ext)) {