added must-not-match filter to snapshot generation.

also: fixed some bugs (snapshot image option handling in CrawlStartExpert, error logging in CrawlQueues)
pull/8/head
Michael Peter Christen 10 years ago
parent 9d8f426890
commit 97930a6aad

@@ -503,6 +503,8 @@
             <input type="radio" name="snapshotsReplaceOld" value="on" checked="checked"/> replace old snapshots with new one&nbsp;&nbsp;&nbsp;
             <input type="radio" name="snapshotsReplaceOld" value="off" /> add new versions for each crawl
         </dd>
+        <dt><label for="snapshotsMustnotmatch">must-not-match filter for snapshot generation</label></dt>
+        <dd><input name="snapshotsMustnotmatch" id="snapshotsMustnotmatch" type="text" size="55" maxlength="100000" value="#[snapshotsMustnotmatch]#" /></dd>
         #(snapshotEnableImages)#
         <input type="hidden" name="snapshotsLoadImage" id="snapshotsLoadImage" value="false"/>::
         <dt><label for="snapshotImage">Image Creation</label></dt>

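A note on the #[snapshotsMustnotmatch]# placeholder above: YaCy's servlet templates substitute #[key]# markers with values the corresponding servlet sets via prop.put(key, value), and #(key)#...::...#(/key)# blocks switch between alternatives. A minimal stand-in sketch of the substitution step (simplified demo, not the actual template engine):

    import java.util.HashMap;
    import java.util.Map;

    public class TemplateSketch {
        // replace every "#[key]#" placeholder with the value the servlet put under that key
        static String render(String template, Map<String, String> prop) {
            for (Map.Entry<String, String> e : prop.entrySet()) {
                template = template.replace("#[" + e.getKey() + "]#", e.getValue());
            }
            return template;
        }

        public static void main(String[] args) {
            Map<String, String> prop = new HashMap<>();
            prop.put("snapshotsMustnotmatch", ".*\\.php.*");
            String html = "<input name=\"snapshotsMustnotmatch\" value=\"#[snapshotsMustnotmatch]#\" />";
            System.out.println(render(html, prop));
            // prints: <input name="snapshotsMustnotmatch" value=".*\.php.*" />
        }
    }
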
@@ -531,13 +531,12 @@ public class CrawlStartExpert {
         // ---------- Snapshot generation
         boolean wkhtmltopdfAvailable = Html2Image.wkhtmltopdfAvailable();
-        boolean convertAvailable = Html2Image.convertAvailable();
+        //boolean convertAvailable = Html2Image.convertAvailable();
         prop.put("snapshotsMaxDepth", post == null ? "-1" : post.get("snapshotsMaxDepth", "-1"));
-        if (sb.getConfigBool("isTransparentProxy", false) &&
-            sb.getConfigBool("proxyAlwaysFresh", false) &&
-            wkhtmltopdfAvailable && convertAvailable) {
+        prop.put("snapshotsMustnotmatch", post == null ? "" : post.get("snapshotsMustnotmatch", ""));
+        if (wkhtmltopdfAvailable) {
             prop.put("snapshotEnableImages", 1);
-            prop.put("snapshotEnableImages_snapshotsLoadImageChecked", post == null ? 0 : post.getBoolean("snapshotsLoadImage") ? 1 : 0);
+            prop.put("snapshotEnableImages_snapshotsLoadImageChecked", post == null ? 1 : post.getBoolean("snapshotsLoadImage") ? 1 : 0);
         } else {
             prop.put("snapshotEnableImages", 0);
         }

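Two behavior changes ride along in the hunk above: image snapshots now only require wkhtmltopdf (the transparent-proxy configuration checks and the convert-tool availability check were dropped), and the "load image" option now defaults to checked when the form is first rendered (post == null now yields 1 instead of 0). The nested conditional used there is right-associative, which this minimal sketch illustrates:

    public class TernarySketch {
        public static void main(String[] args) {
            boolean postIsNull = true;  // fresh form: no request parameters arrived yet
            boolean loadImage = false;  // value that would otherwise come from the form
            // parses as: postIsNull ? 1 : (loadImage ? 1 : 0)
            int checked = postIsNull ? 1 : loadImage ? 1 : 0;
            System.out.println(checked); // 1 -> the checkbox is rendered as checked
        }
    }
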
@@ -448,6 +448,7 @@ public class Crawler_p {
         int snapshotsMaxDepth = Integer.parseInt(snapshotsMaxDepthString);
         boolean snapshotsLoadImage = post.getBoolean("snapshotsLoadImage");
         boolean snapshotsReplaceOld = post.getBoolean("snapshotsReplaceOld");
+        String snapshotsMustnotmatch = post.get("snapshotsMustnotmatch", "");
 
         // get vocabulary scraper info
         JSONObject vocabulary_scraper = new JSONObject(); // key = vocabulary_name, value = properties with key = type (i.e. 'class') and value = keyword in context
@@ -501,6 +502,7 @@ public class Crawler_p {
                     snapshotsMaxDepth,
                     snapshotsLoadImage,
                     snapshotsReplaceOld,
+                    snapshotsMustnotmatch,
                     cachePolicy,
                     collection,
                     agentName,

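Crawler_p reads the new field with an empty-string default and hands it to the CrawlProfile constructor. The empty default acts as "no filter" because an empty regular expression matches only the empty string, and a normalized URL is never empty; a quick sketch:

    import java.util.regex.Pattern;

    public class EmptyFilterSketch {
        public static void main(String[] args) {
            Pattern p = Pattern.compile("", Pattern.CASE_INSENSITIVE);
            System.out.println(p.matcher("").matches());                    // true
            System.out.println(p.matcher("http://example.com/").matches()); // false
        }
    }
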
@@ -153,7 +153,7 @@ public class QuickCrawlLink_p {
                     obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
                     indexText, indexMedia,
                     storeHTCache, remoteIndexing,
-                    -1, false, true,
+                    -1, false, true, CrawlProfile.MATCH_NEVER_STRING,
                     CacheStrategy.IFFRESH,
                     collection,
                     ClientIdentification.yacyIntranetCrawlerAgentName,

@@ -292,7 +292,7 @@ public final class CrawlSwitchboard {
                 sb.getConfigBool(SwitchboardConstants.PROXY_INDEXING_LOCAL_MEDIA, true),
                 true,
                 sb.getConfigBool(SwitchboardConstants.PROXY_INDEXING_REMOTE, false),
-                -1, false, true,
+                -1, false, true, CrawlProfile.MATCH_NEVER_STRING,
                 CacheStrategy.IFFRESH,
                 "robot_" + CRAWL_PROFILE_PROXY,
                 ClientIdentification.yacyProxyAgentName,
@@ -324,7 +324,7 @@
                 true,
                 false,
                 false,
-                -1, false, true,
+                -1, false, true, CrawlProfile.MATCH_NEVER_STRING,
                 CacheStrategy.IFFRESH,
                 "robot_" + CRAWL_PROFILE_REMOTE,
                 ClientIdentification.yacyInternetCrawlerAgentName,
@@ -356,7 +356,7 @@
                 false,
                 true,
                 false,
-                -1, false, true,
+                -1, false, true, CrawlProfile.MATCH_NEVER_STRING,
                 CacheStrategy.IFEXIST,
                 "robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_TEXT,
                 ClientIdentification.yacyIntranetCrawlerAgentName,
@@ -388,7 +388,7 @@
                 true,
                 true,
                 false,
-                -1, false, true,
+                -1, false, true, CrawlProfile.MATCH_NEVER_STRING,
                 CacheStrategy.IFEXIST,
                 "robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT,
                 ClientIdentification.yacyIntranetCrawlerAgentName,
@@ -421,7 +421,7 @@
                 false,
                 true,
                 false,
-                -1, false, true,
+                -1, false, true, CrawlProfile.MATCH_NEVER_STRING,
                 CacheStrategy.IFEXIST,
                 "robot_" + CRAWL_PROFILE_GREEDY_LEARNING_TEXT,
                 ClientIdentification.browserAgentName,
@@ -453,7 +453,7 @@
                 false,
                 true,
                 false,
-                -1, false, true,
+                -1, false, true, CrawlProfile.MATCH_NEVER_STRING,
                 CacheStrategy.IFEXIST,
                 "robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA,
                 ClientIdentification.yacyIntranetCrawlerAgentName,
@@ -485,7 +485,7 @@
                 true,
                 true,
                 false,
-                -1, false, true,
+                -1, false, true, CrawlProfile.MATCH_NEVER_STRING,
                 CacheStrategy.IFEXIST,
                 "robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA,
                 ClientIdentification.yacyIntranetCrawlerAgentName,
@@ -517,7 +517,7 @@
                 false,
                 false,
                 false,
-                -1, false, true,
+                -1, false, true, CrawlProfile.MATCH_NEVER_STRING,
                 CacheStrategy.NOCACHE,
                 "robot_" + CRAWL_PROFILE_SURROGATE,
                 ClientIdentification.yacyIntranetCrawlerAgentName,
@@ -552,7 +552,7 @@
                 true,
                 false,
                 false,
-                -1, false, true,
+                -1, false, true, CrawlProfile.MATCH_NEVER_STRING,
                 CacheStrategy.NOCACHE,
                 collection,
                 ClientIdentification.yacyIntranetCrawlerAgentName,

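All built-in robot profiles above pass CrawlProfile.MATCH_NEVER_STRING for the new parameter, an expression meant to match no URL, so the filter never suppresses a snapshot for them (and with snapshotsMaxDepth fixed at -1, these profiles do not generate snapshots in the first place). A minimal sketch of never-match semantics, using "(?!)" as an illustrative stand-in since the concrete value of MATCH_NEVER_STRING is not shown in this diff:

    import java.util.regex.Pattern;

    public class NeverMatchSketch {
        public static void main(String[] args) {
            Pattern never = Pattern.compile("(?!)"); // a lookahead that always fails
            System.out.println(never.matcher("http://example.com/").matches()); // false
            System.out.println(never.matcher("").matches());                    // false
        }
    }
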
@@ -94,12 +94,14 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
     public static final String SNAPSHOTS_MAXDEPTH = "snapshotsMaxDepth"; // if previews shall be loaded, this is positive and denotes the maximum depth; if not this is -1
     public static final String SNAPSHOTS_REPLACEOLD = "snapshotsReplaceOld"; // if this is set to true, only one version of a snapshot per day is stored, otherwise we store also different versions per day
     public static final String SNAPSHOTS_LOADIMAGE = "snapshotsLoadImage"; // if true, an image is loaded
+    public static final String SNAPSHOTS_MUSTNOTMATCH = "snapshotsMustnotmatch";
 
     private Pattern crawlerurlmustmatch = null, crawlerurlmustnotmatch = null;
     private Pattern crawleripmustmatch = null, crawleripmustnotmatch = null;
     private Pattern crawlernodepthlimitmatch = null;
     private Pattern indexurlmustmatch = null, indexurlmustnotmatch = null;
     private Pattern indexcontentmustmatch = null, indexcontentmustnotmatch = null;
+    private Pattern snapshotsMustnotmatch = null;
 
     private final Map<String, AtomicInteger> doms;
     private final VocabularyScraper scraper;
@@ -127,6 +129,10 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
      * @param indexMedia true if media content of URL shall be indexed
      * @param storeHTCache true if content shall be kept in cache after indexing
      * @param remoteIndexing true if part of the crawl job shall be distributed
+     * @param snapshotsMaxDepth if the current crawl depth is equal to or below this depth, a snapshot is generated
+     * @param snapshotsLoadImage true if graphical (== pdf) snapshots shall be made
+     * @param snapshotsReplaceOld true if snapshots shall not be historized
+     * @param snapshotsMustnotmatch a regular expression; if it matches the url, the snapshot is not generated
      * @param xsstopw true if static stop words shall be ignored
      * @param xdstopw true if dynamic stop words shall be ignored
      * @param xpstopw true if parent stop words shall be ignored
@@ -156,6 +162,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
                  final int snapshotsMaxDepth,
                  final boolean snapshotsLoadImage,
                  final boolean snapshotsReplaceOld,
+                 final String snapshotsMustnotmatch,
                  final CacheStrategy cacheStrategy,
                  final String collections,
                  final String userAgentName,
@@ -196,6 +203,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
         put(SNAPSHOTS_MAXDEPTH, snapshotsMaxDepth);
         put(SNAPSHOTS_LOADIMAGE, snapshotsLoadImage);
         put(SNAPSHOTS_REPLACEOLD, snapshotsReplaceOld);
+        put(SNAPSHOTS_MUSTNOTMATCH, snapshotsMustnotmatch);
         put(CACHE_STRAGEGY, cacheStrategy.toString());
         put(COLLECTIONS, CommonPattern.SPACE.matcher(collections.trim()).replaceAll(""));
         // we transform the scraper information into a JSON Array
@@ -629,6 +637,16 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
         return (r.equals(Boolean.TRUE.toString()));
     }
 
+    public Pattern snapshotsMustnotmatch() {
+        if (this.snapshotsMustnotmatch == null) {
+            final String r = get(SNAPSHOTS_MUSTNOTMATCH);
+            try {
+                this.snapshotsMustnotmatch = (r == null || r.equals(CrawlProfile.MATCH_ALL_STRING)) ? CrawlProfile.MATCH_ALL_PATTERN : Pattern.compile(r, Pattern.CASE_INSENSITIVE);
+            } catch (final PatternSyntaxException e) { this.snapshotsMustnotmatch = CrawlProfile.MATCH_NEVER_PATTERN; }
+        }
+        return this.snapshotsMustnotmatch;
+    }
+
     public int timezoneOffset() {
         final String timezoneOffset = get(TIMEZONEOFFSET);
         if (timezoneOffset == null) return 0;

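The new getter compiles the stored expression lazily, caches the compiled Pattern in the snapshotsMustnotmatch field, and falls back to CrawlProfile.MATCH_NEVER_PATTERN when the expression does not compile, so a malformed filter suppresses nothing instead of breaking the crawl. Note also that a missing key (r == null) or an expression equal to MATCH_ALL_STRING yields MATCH_ALL_PATTERN, which suppresses every snapshot; the servlets in this commit always store at least an empty string, so that branch applies only when the filter explicitly matches everything. A self-contained sketch of the caching-with-fallback idiom (illustrative names, not the YaCy API):

    import java.util.regex.Pattern;
    import java.util.regex.PatternSyntaxException;

    public class FilterCacheSketch {
        private final String expression;
        private Pattern compiled = null;

        FilterCacheSketch(String expression) { this.expression = expression; }

        // compile once, cache, and degrade to a never-matching pattern on bad input
        Pattern pattern() {
            if (this.compiled == null) {
                try {
                    this.compiled = Pattern.compile(this.expression, Pattern.CASE_INSENSITIVE);
                } catch (PatternSyntaxException e) {
                    this.compiled = Pattern.compile("(?!)"); // stand-in for MATCH_NEVER_PATTERN
                }
            }
            return this.compiled;
        }

        boolean suppresses(String url) {
            return pattern().matcher(url).matches();
        }

        public static void main(String[] args) {
            System.out.println(new FilterCacheSketch(".*\\.php.*").suppresses("http://example.com/index.PHP")); // true
            System.out.println(new FilterCacheSketch("[broken").suppresses("http://example.com/"));             // false (fallback)
        }
    }
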
@@ -666,6 +666,7 @@ public class CrawlQueues {
                 if (CrawlQueues.log.isFine()) {
                     CrawlQueues.log.fine("problem loading " + request.url().toString() + ": " + e.getMessage());
                 }
+                e.printStackTrace();
                 error = "load error - " + e.getMessage();
             }

@@ -186,7 +186,7 @@ public class YMarkCrawlStart extends HashMap<String,String>{
                 crawlingQ,
                 true, true, true, false,
                 true, true, false,
-                -1, false, true,
+                -1, false, true, CrawlProfile.MATCH_NEVER_STRING,
                 CacheStrategy.IFFRESH,
                 "robot_" + CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA,
                 ClientIdentification.yacyIntranetCrawlerAgentName,

@@ -575,7 +575,8 @@ public class Segment {
             // CREATE SNAPSHOT
             if ((url.getProtocol().equals("http") || url.getProtocol().equals("https")) &&
-                crawlProfile != null && document.getDepth() <= crawlProfile.snapshotMaxdepth()) {
+                crawlProfile != null && document.getDepth() <= crawlProfile.snapshotMaxdepth() &&
+                !crawlProfile.snapshotsMustnotmatch().matcher(urlNormalform).matches()) {
                 // load pdf in case that is wanted. This can later be used to compute a web page preview in the search results
                 String ext = MultiProtocolURL.getFileExtension(url.getFile()).toLowerCase();
                 if (ext.length() == 0 || url.getFile().length() <= 1 || htmlParser.htmlExtensionsSet.contains(ext)) {

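Putting the pieces together, the snapshot gate in Segment now requires three things: an http(s) URL, a crawl depth within the profile's snapshot depth, and a URL that the must-not-match filter does not match. A condensed sketch of that decision (illustrative names, not the YaCy method):

    import java.util.regex.Pattern;

    public class SnapshotGateSketch {
        // true if a snapshot should be generated for this URL
        static boolean wantSnapshot(String protocol, int depth, int snapshotsMaxDepth,
                                    Pattern mustNotMatch, String urlNormalform) {
            return ("http".equals(protocol) || "https".equals(protocol))
                    && depth <= snapshotsMaxDepth
                    && !mustNotMatch.matcher(urlNormalform).matches();
        }

        public static void main(String[] args) {
            Pattern filter = Pattern.compile(".*\\.php.*", Pattern.CASE_INSENSITIVE);
            System.out.println(wantSnapshot("https", 1, 2, filter, "https://example.com/a.html")); // true
            System.out.println(wantSnapshot("https", 1, 2, filter, "https://example.com/a.php"));  // false (filter)
            System.out.println(wantSnapshot("https", 3, 2, filter, "https://example.com/a.html")); // false (too deep)
        }
    }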