From 97930a6aad9f5421dd57cb252beb7fe019b1b7be Mon Sep 17 00:00:00 2001
From: Michael Peter Christen <mc@yacy.net>
Date: Fri, 8 May 2015 13:46:27 +0200
Subject: [PATCH] added must-not-match filter to snapshot generation. also:
 fixed some bugs

---
 htroot/CrawlStartExpert.html                   |  4 +++-
 htroot/CrawlStartExpert.java                   |  9 ++++-----
 htroot/Crawler_p.java                          |  2 ++
 htroot/QuickCrawlLink_p.java                   |  2 +-
 source/net/yacy/crawler/CrawlSwitchboard.java  | 18 +++++++++---------
 source/net/yacy/crawler/data/CrawlProfile.java | 18 ++++++++++++++++++
 source/net/yacy/crawler/data/CrawlQueues.java  |  1 +
 .../net/yacy/data/ymark/YMarkCrawlStart.java   |  2 +-
 source/net/yacy/search/index/Segment.java      |  3 ++-
 9 files changed, 41 insertions(+), 18 deletions(-)
diff --git a/htroot/CrawlStartExpert.html b/htroot/CrawlStartExpert.html
index 90b83ef59..464a9f907 100644
--- a/htroot/CrawlStartExpert.html
+++ b/htroot/CrawlStartExpert.html
@@ -503,11 +503,13 @@
             <input type="radio" name="snapshotsReplaceOld" value="on" checked="checked"/> replace old snapshots with new one&nbsp;&nbsp;&nbsp;
 			<input type="radio" name="snapshotsReplaceOld" value="off" /> add new versions for each crawl
           </dd>
+          <dt><label for="snapshotsMustnotmatch">must-not-match filter for snapshot generation</label></dt>
+          <dd><input name="snapshotsMustnotmatch" id="snapshotsMustnotmatch" type="text" size="55" maxlength="100000" value="#[snapshotsMustnotmatch]#" /></dd>
           #(snapshotEnableImages)#
           <input type="hidden" name="snapshotsLoadImage" id="snapshotsLoadImage" value="false"/>::
           <dt><label for="snapshotImage">Image Creation</label></dt>
           <dd>
-            <input type="checkbox" name="snapshotsLoadImage" id="snapshotsLoadImage" #(snapshotsLoadImageChecked)#::checked="checked"#(/snapshotsLoadImageChecked)# />
+            <input type="checkbox" name="snapshotsLoadImage" id="snapshotsLoadImage"#(snapshotsLoadImageChecked)#:: checked="checked"#(/snapshotsLoadImageChecked)#/>
           </dd>
 	      #(/snapshotEnableImages)#
 	      </dl>
diff --git a/htroot/CrawlStartExpert.java b/htroot/CrawlStartExpert.java
index 3d8dad06d..79939bd54 100644
--- a/htroot/CrawlStartExpert.java
+++ b/htroot/CrawlStartExpert.java
@@ -531,13 +531,12 @@ public class CrawlStartExpert {
         
         // ---------- Snapshot generation
         boolean wkhtmltopdfAvailable = Html2Image.wkhtmltopdfAvailable();
-        boolean convertAvailable = Html2Image.convertAvailable();
+        //boolean convertAvailable = Html2Image.convertAvailable();
         prop.put("snapshotsMaxDepth", post == null ? "-1" : post.get("snapshotsMaxDepth", "-1"));
-        if (sb.getConfigBool("isTransparentProxy", false) &&
-            sb.getConfigBool("proxyAlwaysFresh", false) &&
-             wkhtmltopdfAvailable && convertAvailable) {
+        prop.put("snapshotsMustnotmatch", post == null ? "" : post.get("snapshotsMustnotmatch", ""));
+        if (wkhtmltopdfAvailable) {
             prop.put("snapshotEnableImages", 1);
-            prop.put("snapshotEnableImages_snapshotsLoadImageChecked", post == null ? 0 : post.getBoolean("snapshotsLoadImage") ? 1 : 0);
+            prop.put("snapshotEnableImages_snapshotsLoadImageChecked", post == null ? 1 : post.getBoolean("snapshotsLoadImage") ? 1 : 0);
         } else {
             prop.put("snapshotEnableImages", 0);
         }
diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java
index 8b0e39801..92882cd67 100644
--- a/htroot/Crawler_p.java
+++ b/htroot/Crawler_p.java
@@ -448,6 +448,7 @@ public class Crawler_p {
                 int snapshotsMaxDepth = Integer.parseInt(snapshotsMaxDepthString);
                 boolean snapshotsLoadImage = post.getBoolean("snapshotsLoadImage");
                 boolean snapshotsReplaceOld = post.getBoolean("snapshotsReplaceOld");
+                String snapshotsMustnotmatch = post.get("snapshotsMustnotmatch", "");
                 
                 // get vocabulary scraper info
                 JSONObject vocabulary_scraper = new JSONObject(); // key = vocabulary_name, value = properties with key = type (i.e. 'class') and value = keyword in context
@@ -501,6 +502,7 @@ public class Crawler_p {
                             snapshotsMaxDepth,
                             snapshotsLoadImage,
                             snapshotsReplaceOld,
+                            snapshotsMustnotmatch,
                             cachePolicy,
                             collection,
                             agentName,
diff --git a/htroot/QuickCrawlLink_p.java b/htroot/QuickCrawlLink_p.java
index 2b0b599b8..da3e81cd8 100644
--- a/htroot/QuickCrawlLink_p.java
+++ b/htroot/QuickCrawlLink_p.java
@@ -153,7 +153,7 @@ public class QuickCrawlLink_p {
                         obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
                         indexText, indexMedia,
                         storeHTCache, remoteIndexing,
-                        -1, false, true,
+                        -1, false, true, CrawlProfile.MATCH_NEVER_STRING,
                         CacheStrategy.IFFRESH,
                         collection,
                         ClientIdentification.yacyIntranetCrawlerAgentName,
diff --git a/source/net/yacy/crawler/CrawlSwitchboard.java b/source/net/yacy/crawler/CrawlSwitchboard.java
index fcce03c4b..6a1c5476b 100644
--- a/source/net/yacy/crawler/CrawlSwitchboard.java
+++ b/source/net/yacy/crawler/CrawlSwitchboard.java
@@ -292,7 +292,7 @@ public final class CrawlSwitchboard {
                 sb.getConfigBool(SwitchboardConstants.PROXY_INDEXING_LOCAL_MEDIA, true),
                 true,
                 sb.getConfigBool(SwitchboardConstants.PROXY_INDEXING_REMOTE, false),
-                -1, false, true,
+                -1, false, true, CrawlProfile.MATCH_NEVER_STRING,
                 CacheStrategy.IFFRESH,
                 "robot_" + CRAWL_PROFILE_PROXY,
                 ClientIdentification.yacyProxyAgentName,
@@ -324,7 +324,7 @@ public final class CrawlSwitchboard {
                 true,
                 false,
                 false,
-                -1, false, true,
+                -1, false, true, CrawlProfile.MATCH_NEVER_STRING,
                 CacheStrategy.IFFRESH,
                 "robot_" + CRAWL_PROFILE_REMOTE,
                 ClientIdentification.yacyInternetCrawlerAgentName,
@@ -356,7 +356,7 @@ public final class CrawlSwitchboard {
                 false,
                 true,
                 false,
-                -1, false, true,
+                -1, false, true, CrawlProfile.MATCH_NEVER_STRING,
                 CacheStrategy.IFEXIST,
                 "robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_TEXT,
                 ClientIdentification.yacyIntranetCrawlerAgentName,
@@ -388,7 +388,7 @@ public final class CrawlSwitchboard {
                 true,
                 true,
                 false,
-                -1, false, true,
+                -1, false, true, CrawlProfile.MATCH_NEVER_STRING,
                 CacheStrategy.IFEXIST,
                 "robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT,
                 ClientIdentification.yacyIntranetCrawlerAgentName,
@@ -421,7 +421,7 @@ public final class CrawlSwitchboard {
                 false,
                 true,
                 false,
-                -1, false, true,
+                -1, false, true, CrawlProfile.MATCH_NEVER_STRING,
                 CacheStrategy.IFEXIST,
                 "robot_" + CRAWL_PROFILE_GREEDY_LEARNING_TEXT,
                 ClientIdentification.browserAgentName,
@@ -453,7 +453,7 @@ public final class CrawlSwitchboard {
                 false,
                 true,
                 false,
-                -1, false, true,
+                -1, false, true, CrawlProfile.MATCH_NEVER_STRING,
                 CacheStrategy.IFEXIST,
                 "robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA,
                 ClientIdentification.yacyIntranetCrawlerAgentName,
@@ -485,7 +485,7 @@ public final class CrawlSwitchboard {
                 true,
                 true,
                 false,
-                -1, false, true,
+                -1, false, true, CrawlProfile.MATCH_NEVER_STRING,
                 CacheStrategy.IFEXIST,
                 "robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA,
                 ClientIdentification.yacyIntranetCrawlerAgentName,
@@ -517,7 +517,7 @@ public final class CrawlSwitchboard {
                 false,
                 false,
                 false,
-                -1, false, true,
+                -1, false, true, CrawlProfile.MATCH_NEVER_STRING,
                 CacheStrategy.NOCACHE,
                 "robot_" + CRAWL_PROFILE_SURROGATE,
                 ClientIdentification.yacyIntranetCrawlerAgentName,
@@ -552,7 +552,7 @@ public final class CrawlSwitchboard {
                 true,
                 false,
                 false,
-                -1, false, true,
+                -1, false, true, CrawlProfile.MATCH_NEVER_STRING,
                 CacheStrategy.NOCACHE,
                 collection,
                 ClientIdentification.yacyIntranetCrawlerAgentName,
diff --git a/source/net/yacy/crawler/data/CrawlProfile.java b/source/net/yacy/crawler/data/CrawlProfile.java
index aeafc8b3e..324850ecb 100644
--- a/source/net/yacy/crawler/data/CrawlProfile.java
+++ b/source/net/yacy/crawler/data/CrawlProfile.java
@@ -94,12 +94,14 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
     public static final String SNAPSHOTS_MAXDEPTH            = "snapshotsMaxDepth"; // if previews shall be loaded, this is positive and denotes the maximum depth; if not this is -1
     public static final String SNAPSHOTS_REPLACEOLD          = "snapshotsReplaceOld"; // if this is set to true, only one version of a snapshot per day is stored, otherwise we store also different versions per day
     public static final String SNAPSHOTS_LOADIMAGE           = "snapshotsLoadImage"; // if true, an image is loaded
+    public static final String SNAPSHOTS_MUSTNOTMATCH        = "snapshotsMustnotmatch"; // a regular expression; no snapshot is generated for urls which it matches
     
     private Pattern crawlerurlmustmatch = null, crawlerurlmustnotmatch = null;
     private Pattern crawleripmustmatch = null, crawleripmustnotmatch = null;
     private Pattern crawlernodepthlimitmatch = null;
     private Pattern indexurlmustmatch = null, indexurlmustnotmatch = null;
     private Pattern indexcontentmustmatch = null, indexcontentmustnotmatch = null;
+    private Pattern snapshotsMustnotmatch = null;
 
     private final Map<String, AtomicInteger> doms;
     private final VocabularyScraper scraper;
@@ -127,6 +129,10 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
      * @param indexMedia true if media content of URL shall be indexed
      * @param storeHTCache true if content chall be kept in cache after indexing
      * @param remoteIndexing true if part of the crawl job shall be distributed
+     * @param snapshotsMaxDepth if the current crawl depth is equal to or below this depth, a snapshot is generated
+     * @param snapshotsLoadImage true if graphical (== pdf) snapshots shall be made
+     * @param snapshotsReplaceOld true if a new snapshot shall replace the old one instead of being stored as an additional version
+     * @param snapshotsMustnotmatch a regular expression; if it matches the url, the snapshot is not generated
      * @param xsstopw true if static stop words shall be ignored
      * @param xdstopw true if dynamic stop words shall be ignored
      * @param xpstopw true if parent stop words shall be ignored
@@ -156,6 +162,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
                  final int snapshotsMaxDepth,
                  final boolean snapshotsLoadImage,
                  final boolean snapshotsReplaceOld,
+                 final String snapshotsMustnotmatch,
                  final CacheStrategy cacheStrategy,
                  final String collections,
                  final String userAgentName,
@@ -196,6 +203,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
         put(SNAPSHOTS_MAXDEPTH, snapshotsMaxDepth);
         put(SNAPSHOTS_LOADIMAGE, snapshotsLoadImage);
         put(SNAPSHOTS_REPLACEOLD, snapshotsReplaceOld);
+        put(SNAPSHOTS_MUSTNOTMATCH, snapshotsMustnotmatch);
         put(CACHE_STRAGEGY,   cacheStrategy.toString());
         put(COLLECTIONS,      CommonPattern.SPACE.matcher(collections.trim()).replaceAll(""));
         // we transform the scraper information into a JSON Array
@@ -628,6 +636,16 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
         if (r == null) return false;
         return (r.equals(Boolean.TRUE.toString()));
     }
+    
+    /** Returns the cached snapshot exclusion filter (a URL it matches gets no snapshot); a missing filter or an invalid regex yields MATCH_NEVER_PATTERN. */
+    public Pattern snapshotsMustnotmatch() {
+        if (this.snapshotsMustnotmatch != null) return this.snapshotsMustnotmatch; // compiled on an earlier call
+        final String r = get(SNAPSHOTS_MUSTNOTMATCH); // null when the profile carries no such attribute
+        try {
+            this.snapshotsMustnotmatch = (r == null || r.equals(CrawlProfile.MATCH_NEVER_STRING)) ? CrawlProfile.MATCH_NEVER_PATTERN : Pattern.compile(r, Pattern.CASE_INSENSITIVE);
+        } catch (final PatternSyntaxException e) { this.snapshotsMustnotmatch = CrawlProfile.MATCH_NEVER_PATTERN; }
+        return this.snapshotsMustnotmatch;
+    }
 
     public int timezoneOffset() {
         final String timezoneOffset = get(TIMEZONEOFFSET);
diff --git a/source/net/yacy/crawler/data/CrawlQueues.java b/source/net/yacy/crawler/data/CrawlQueues.java
index 5a9b0c4a1..68b36c013 100644
--- a/source/net/yacy/crawler/data/CrawlQueues.java
+++ b/source/net/yacy/crawler/data/CrawlQueues.java
@@ -666,6 +666,7 @@ public class CrawlQueues {
                                 if (CrawlQueues.log.isFine()) {
                                     CrawlQueues.log.fine("problem loading " + request.url().toString() + ": " + e.getMessage());
                                 }
+                                e.printStackTrace();
                                 error = "load error - " + e.getMessage();
                             }
    
diff --git a/source/net/yacy/data/ymark/YMarkCrawlStart.java b/source/net/yacy/data/ymark/YMarkCrawlStart.java
index 562a9703f..6eac8bf87 100644
--- a/source/net/yacy/data/ymark/YMarkCrawlStart.java
+++ b/source/net/yacy/data/ymark/YMarkCrawlStart.java
@@ -186,7 +186,7 @@ public class YMarkCrawlStart extends HashMap<String,String>{
 		                crawlingQ,
 		                true, true, true, false,
 		                true, true, false,
-		                -1, false, true,
+		                -1, false, true, CrawlProfile.MATCH_NEVER_STRING,
 		                CacheStrategy.IFFRESH,
 		                "robot_" + CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA,
 		                ClientIdentification.yacyIntranetCrawlerAgentName,
diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java
index 2c4f82a5c..200bb852d 100644
--- a/source/net/yacy/search/index/Segment.java
+++ b/source/net/yacy/search/index/Segment.java
@@ -575,7 +575,8 @@ public class Segment {
         
         // CREATE SNAPSHOT
         if ((url.getProtocol().equals("http") || url.getProtocol().equals("https")) &&
-                crawlProfile != null && document.getDepth() <= crawlProfile.snapshotMaxdepth()) {
+                crawlProfile != null && document.getDepth() <= crawlProfile.snapshotMaxdepth() &&
+                !crawlProfile.snapshotsMustnotmatch().matcher(urlNormalform).matches()) {
             // load pdf in case that is wanted. This can later be used to compute a web page preview in the search results
             String ext = MultiProtocolURL.getFileExtension(url.getFile()).toLowerCase();
             if (ext.length() == 0 || url.getFile().length() <= 1 || htmlParser.htmlExtensionsSet.contains(ext)) {