diff --git a/htroot/CrawlStartExpert.html b/htroot/CrawlStartExpert.html
index 90b83ef59..464a9f907 100644
--- a/htroot/CrawlStartExpert.html
+++ b/htroot/CrawlStartExpert.html
@@ -503,11 +503,13 @@
replace old snapshots with new one
add new versions for each crawl
+
+
#(snapshotEnableImages)#
::
-
+
#(/snapshotEnableImages)#
diff --git a/htroot/CrawlStartExpert.java b/htroot/CrawlStartExpert.java
index 3d8dad06d..79939bd54 100644
--- a/htroot/CrawlStartExpert.java
+++ b/htroot/CrawlStartExpert.java
@@ -531,13 +531,12 @@ public class CrawlStartExpert {
// ---------- Snapshot generation
boolean wkhtmltopdfAvailable = Html2Image.wkhtmltopdfAvailable();
- boolean convertAvailable = Html2Image.convertAvailable();
+ //boolean convertAvailable = Html2Image.convertAvailable();
prop.put("snapshotsMaxDepth", post == null ? "-1" : post.get("snapshotsMaxDepth", "-1"));
- if (sb.getConfigBool("isTransparentProxy", false) &&
- sb.getConfigBool("proxyAlwaysFresh", false) &&
- wkhtmltopdfAvailable && convertAvailable) {
+ prop.put("snapshotsMustnotmatch", post == null ? "" : post.get("snapshotsMustnotmatch", ""));
+ if (wkhtmltopdfAvailable) {
prop.put("snapshotEnableImages", 1);
- prop.put("snapshotEnableImages_snapshotsLoadImageChecked", post == null ? 0 : post.getBoolean("snapshotsLoadImage") ? 1 : 0);
+ prop.put("snapshotEnableImages_snapshotsLoadImageChecked", post == null ? 1 : post.getBoolean("snapshotsLoadImage") ? 1 : 0);
} else {
prop.put("snapshotEnableImages", 0);
}
diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java
index 8b0e39801..92882cd67 100644
--- a/htroot/Crawler_p.java
+++ b/htroot/Crawler_p.java
@@ -448,6 +448,7 @@ public class Crawler_p {
int snapshotsMaxDepth = Integer.parseInt(snapshotsMaxDepthString);
boolean snapshotsLoadImage = post.getBoolean("snapshotsLoadImage");
boolean snapshotsReplaceOld = post.getBoolean("snapshotsReplaceOld");
+ String snapshotsMustnotmatch = post.get("snapshotsMustnotmatch", "");
// get vocabulary scraper info
JSONObject vocabulary_scraper = new JSONObject(); // key = vocabulary_name, value = properties with key = type (i.e. 'class') and value = keyword in context
@@ -501,6 +502,7 @@ public class Crawler_p {
snapshotsMaxDepth,
snapshotsLoadImage,
snapshotsReplaceOld,
+ snapshotsMustnotmatch,
cachePolicy,
collection,
agentName,
diff --git a/htroot/QuickCrawlLink_p.java b/htroot/QuickCrawlLink_p.java
index 2b0b599b8..da3e81cd8 100644
--- a/htroot/QuickCrawlLink_p.java
+++ b/htroot/QuickCrawlLink_p.java
@@ -153,7 +153,7 @@ public class QuickCrawlLink_p {
obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
indexText, indexMedia,
storeHTCache, remoteIndexing,
- -1, false, true,
+ -1, false, true, CrawlProfile.MATCH_NEVER_STRING,
CacheStrategy.IFFRESH,
collection,
ClientIdentification.yacyIntranetCrawlerAgentName,
diff --git a/source/net/yacy/crawler/CrawlSwitchboard.java b/source/net/yacy/crawler/CrawlSwitchboard.java
index fcce03c4b..6a1c5476b 100644
--- a/source/net/yacy/crawler/CrawlSwitchboard.java
+++ b/source/net/yacy/crawler/CrawlSwitchboard.java
@@ -292,7 +292,7 @@ public final class CrawlSwitchboard {
sb.getConfigBool(SwitchboardConstants.PROXY_INDEXING_LOCAL_MEDIA, true),
true,
sb.getConfigBool(SwitchboardConstants.PROXY_INDEXING_REMOTE, false),
- -1, false, true,
+ -1, false, true, CrawlProfile.MATCH_NEVER_STRING,
CacheStrategy.IFFRESH,
"robot_" + CRAWL_PROFILE_PROXY,
ClientIdentification.yacyProxyAgentName,
@@ -324,7 +324,7 @@ public final class CrawlSwitchboard {
true,
false,
false,
- -1, false, true,
+ -1, false, true, CrawlProfile.MATCH_NEVER_STRING,
CacheStrategy.IFFRESH,
"robot_" + CRAWL_PROFILE_REMOTE,
ClientIdentification.yacyInternetCrawlerAgentName,
@@ -356,7 +356,7 @@ public final class CrawlSwitchboard {
false,
true,
false,
- -1, false, true,
+ -1, false, true, CrawlProfile.MATCH_NEVER_STRING,
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_TEXT,
ClientIdentification.yacyIntranetCrawlerAgentName,
@@ -388,7 +388,7 @@ public final class CrawlSwitchboard {
true,
true,
false,
- -1, false, true,
+ -1, false, true, CrawlProfile.MATCH_NEVER_STRING,
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT,
ClientIdentification.yacyIntranetCrawlerAgentName,
@@ -421,7 +421,7 @@ public final class CrawlSwitchboard {
false,
true,
false,
- -1, false, true,
+ -1, false, true, CrawlProfile.MATCH_NEVER_STRING,
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_GREEDY_LEARNING_TEXT,
ClientIdentification.browserAgentName,
@@ -453,7 +453,7 @@ public final class CrawlSwitchboard {
false,
true,
false,
- -1, false, true,
+ -1, false, true, CrawlProfile.MATCH_NEVER_STRING,
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA,
ClientIdentification.yacyIntranetCrawlerAgentName,
@@ -485,7 +485,7 @@ public final class CrawlSwitchboard {
true,
true,
false,
- -1, false, true,
+ -1, false, true, CrawlProfile.MATCH_NEVER_STRING,
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA,
ClientIdentification.yacyIntranetCrawlerAgentName,
@@ -517,7 +517,7 @@ public final class CrawlSwitchboard {
false,
false,
false,
- -1, false, true,
+ -1, false, true, CrawlProfile.MATCH_NEVER_STRING,
CacheStrategy.NOCACHE,
"robot_" + CRAWL_PROFILE_SURROGATE,
ClientIdentification.yacyIntranetCrawlerAgentName,
@@ -552,7 +552,7 @@ public final class CrawlSwitchboard {
true,
false,
false,
- -1, false, true,
+ -1, false, true, CrawlProfile.MATCH_NEVER_STRING,
CacheStrategy.NOCACHE,
collection,
ClientIdentification.yacyIntranetCrawlerAgentName,
diff --git a/source/net/yacy/crawler/data/CrawlProfile.java b/source/net/yacy/crawler/data/CrawlProfile.java
index aeafc8b3e..324850ecb 100644
--- a/source/net/yacy/crawler/data/CrawlProfile.java
+++ b/source/net/yacy/crawler/data/CrawlProfile.java
@@ -94,12 +94,14 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String>
public static final String SNAPSHOTS_MAXDEPTH = "snapshotsMaxDepth"; // if previews shall be loaded, this is positive and denotes the maximum depth; if not this is -1
public static final String SNAPSHOTS_REPLACEOLD = "snapshotsReplaceOld"; // if this is set to true, only one version of a snapshot per day is stored, otherwise we store also different versions per day
public static final String SNAPSHOTS_LOADIMAGE = "snapshotsLoadImage"; // if true, an image is loaded
+ public static final String SNAPSHOTS_MUSTNOTMATCH = "snapshotsMustnotmatch";
private Pattern crawlerurlmustmatch = null, crawlerurlmustnotmatch = null;
private Pattern crawleripmustmatch = null, crawleripmustnotmatch = null;
private Pattern crawlernodepthlimitmatch = null;
private Pattern indexurlmustmatch = null, indexurlmustnotmatch = null;
private Pattern indexcontentmustmatch = null, indexcontentmustnotmatch = null;
+ private Pattern snapshotsMustnotmatch = null;
private final Map<String, AtomicInteger> doms;
private final VocabularyScraper scraper;
@@ -127,6 +129,10 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String>
* @param indexMedia true if media content of URL shall be indexed
* @param storeHTCache true if content chall be kept in cache after indexing
* @param remoteIndexing true if part of the crawl job shall be distributed
+ * @param snapshotsMaxDepth if the current crawl depth is equal or below that given depth, a snapshot is generated
+ * @param snapshotsLoadImage true if graphical (== pdf) shapshots shall be made
+ * @param snapshotsReplaceOld true if snapshots shall not be historized
+ * @param snapshotsMustnotmatch a regular expression; if it matches on the url, the snapshot is not generated
* @param xsstopw true if static stop words shall be ignored
* @param xdstopw true if dynamic stop words shall be ignored
* @param xpstopw true if parent stop words shall be ignored
@@ -156,6 +162,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String>
final int snapshotsMaxDepth,
final boolean snapshotsLoadImage,
final boolean snapshotsReplaceOld,
+ final String snapshotsMustnotmatch,
final CacheStrategy cacheStrategy,
final String collections,
final String userAgentName,
@@ -196,6 +203,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String>
put(SNAPSHOTS_MAXDEPTH, snapshotsMaxDepth);
put(SNAPSHOTS_LOADIMAGE, snapshotsLoadImage);
put(SNAPSHOTS_REPLACEOLD, snapshotsReplaceOld);
+ put(SNAPSHOTS_MUSTNOTMATCH, snapshotsMustnotmatch);
put(CACHE_STRAGEGY, cacheStrategy.toString());
put(COLLECTIONS, CommonPattern.SPACE.matcher(collections.trim()).replaceAll(""));
// we transform the scraper information into a JSON Array
@@ -628,6 +636,16 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String>
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
+
+ public Pattern snapshotsMustnotmatch() {
+ if (this.snapshotsMustnotmatch == null) {
+ final String r = get(SNAPSHOTS_MUSTNOTMATCH);
+ try {
+ this.snapshotsMustnotmatch = (r == null || r.equals(CrawlProfile.MATCH_ALL_STRING)) ? CrawlProfile.MATCH_ALL_PATTERN : Pattern.compile(r, Pattern.CASE_INSENSITIVE);
+ } catch (final PatternSyntaxException e) { this.snapshotsMustnotmatch = CrawlProfile.MATCH_NEVER_PATTERN; }
+ }
+ return this.snapshotsMustnotmatch;
+ }
public int timezoneOffset() {
final String timezoneOffset = get(TIMEZONEOFFSET);
diff --git a/source/net/yacy/crawler/data/CrawlQueues.java b/source/net/yacy/crawler/data/CrawlQueues.java
index 5a9b0c4a1..68b36c013 100644
--- a/source/net/yacy/crawler/data/CrawlQueues.java
+++ b/source/net/yacy/crawler/data/CrawlQueues.java
@@ -666,6 +666,7 @@ public class CrawlQueues {
if (CrawlQueues.log.isFine()) {
CrawlQueues.log.fine("problem loading " + request.url().toString() + ": " + e.getMessage());
}
+ e.printStackTrace();
error = "load error - " + e.getMessage();
}
diff --git a/source/net/yacy/data/ymark/YMarkCrawlStart.java b/source/net/yacy/data/ymark/YMarkCrawlStart.java
index 562a9703f..6eac8bf87 100644
--- a/source/net/yacy/data/ymark/YMarkCrawlStart.java
+++ b/source/net/yacy/data/ymark/YMarkCrawlStart.java
@@ -186,7 +186,7 @@ public class YMarkCrawlStart extends HashMap<String, String>{
crawlingQ,
true, true, true, false,
true, true, false,
- -1, false, true,
+ -1, false, true, CrawlProfile.MATCH_NEVER_STRING,
CacheStrategy.IFFRESH,
"robot_" + CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA,
ClientIdentification.yacyIntranetCrawlerAgentName,
diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java
index 2c4f82a5c..200bb852d 100644
--- a/source/net/yacy/search/index/Segment.java
+++ b/source/net/yacy/search/index/Segment.java
@@ -575,7 +575,8 @@ public class Segment {
// CREATE SNAPSHOT
if ((url.getProtocol().equals("http") || url.getProtocol().equals("https")) &&
- crawlProfile != null && document.getDepth() <= crawlProfile.snapshotMaxdepth()) {
+ crawlProfile != null && document.getDepth() <= crawlProfile.snapshotMaxdepth() &&
+ !crawlProfile.snapshotsMustnotmatch().matcher(urlNormalform).matches()) {
// load pdf in case that is wanted. This can later be used to compute a web page preview in the search results
String ext = MultiProtocolURL.getFileExtension(url.getFile()).toLowerCase();
if (ext.length() == 0 || url.getFile().length() <= 1 || htmlParser.htmlExtensionsSet.contains(ext)) {