added must-not-match filter to snapshot generation.

also: fixed some bugs
pull/8/head
Michael Peter Christen 10 years ago
parent 9d8f426890
commit 97930a6aad

@ -503,11 +503,13 @@
<input type="radio" name="snapshotsReplaceOld" value="on" checked="checked"/> replace old snapshots with new one&nbsp;&nbsp;&nbsp; <input type="radio" name="snapshotsReplaceOld" value="on" checked="checked"/> replace old snapshots with new one&nbsp;&nbsp;&nbsp;
<input type="radio" name="snapshotsReplaceOld" value="off" /> add new versions for each crawl <input type="radio" name="snapshotsReplaceOld" value="off" /> add new versions for each crawl
</dd> </dd>
<dt><label for="snapshotsMustnotmatch">must-not-match filter for snapshot generation</label></dt>
<dd><input name="snapshotsMustnotmatch" id="snapshotsMustnotmatch" type="text" size="55" maxlength="100000" value="#[snapshotsMustnotmatch]#" /></dd>
#(snapshotEnableImages)# #(snapshotEnableImages)#
<input type="hidden" name="snapshotsLoadImage" id="snapshotsLoadImage" value="false"/>:: <input type="hidden" name="snapshotsLoadImage" id="snapshotsLoadImage" value="false"/>::
<dt><label for="snapshotImage">Image Creation</label></dt> <dt><label for="snapshotImage">Image Creation</label></dt>
<dd> <dd>
<input type="checkbox" name="snapshotsLoadImage" id="snapshotsLoadImage" #(snapshotsLoadImageChecked)#::checked="checked"#(/snapshotsLoadImageChecked)# /> <input type="checkbox" name="snapshotsLoadImage" id="snapshotsLoadImage"#(snapshotsLoadImageChecked)#:: checked="checked"#(/snapshotsLoadImageChecked)#/>
</dd> </dd>
#(/snapshotEnableImages)# #(/snapshotEnableImages)#
</dl> </dl>

@ -531,13 +531,12 @@ public class CrawlStartExpert {
// ---------- Snapshot generation // ---------- Snapshot generation
boolean wkhtmltopdfAvailable = Html2Image.wkhtmltopdfAvailable(); boolean wkhtmltopdfAvailable = Html2Image.wkhtmltopdfAvailable();
boolean convertAvailable = Html2Image.convertAvailable(); //boolean convertAvailable = Html2Image.convertAvailable();
prop.put("snapshotsMaxDepth", post == null ? "-1" : post.get("snapshotsMaxDepth", "-1")); prop.put("snapshotsMaxDepth", post == null ? "-1" : post.get("snapshotsMaxDepth", "-1"));
if (sb.getConfigBool("isTransparentProxy", false) && prop.put("snapshotsMustnotmatch", post == null ? "" : post.get("snapshotsMustnotmatch", ""));
sb.getConfigBool("proxyAlwaysFresh", false) && if (wkhtmltopdfAvailable) {
wkhtmltopdfAvailable && convertAvailable) {
prop.put("snapshotEnableImages", 1); prop.put("snapshotEnableImages", 1);
prop.put("snapshotEnableImages_snapshotsLoadImageChecked", post == null ? 0 : post.getBoolean("snapshotsLoadImage") ? 1 : 0); prop.put("snapshotEnableImages_snapshotsLoadImageChecked", post == null ? 1 : post.getBoolean("snapshotsLoadImage") ? 1 : 0);
} else { } else {
prop.put("snapshotEnableImages", 0); prop.put("snapshotEnableImages", 0);
} }

@ -448,6 +448,7 @@ public class Crawler_p {
int snapshotsMaxDepth = Integer.parseInt(snapshotsMaxDepthString); int snapshotsMaxDepth = Integer.parseInt(snapshotsMaxDepthString);
boolean snapshotsLoadImage = post.getBoolean("snapshotsLoadImage"); boolean snapshotsLoadImage = post.getBoolean("snapshotsLoadImage");
boolean snapshotsReplaceOld = post.getBoolean("snapshotsReplaceOld"); boolean snapshotsReplaceOld = post.getBoolean("snapshotsReplaceOld");
String snapshotsMustnotmatch = post.get("snapshotsMustnotmatch", "");
// get vocabulary scraper info // get vocabulary scraper info
JSONObject vocabulary_scraper = new JSONObject(); // key = vocabulary_name, value = properties with key = type (i.e. 'class') and value = keyword in context JSONObject vocabulary_scraper = new JSONObject(); // key = vocabulary_name, value = properties with key = type (i.e. 'class') and value = keyword in context
@ -501,6 +502,7 @@ public class Crawler_p {
snapshotsMaxDepth, snapshotsMaxDepth,
snapshotsLoadImage, snapshotsLoadImage,
snapshotsReplaceOld, snapshotsReplaceOld,
snapshotsMustnotmatch,
cachePolicy, cachePolicy,
collection, collection,
agentName, agentName,

@ -153,7 +153,7 @@ public class QuickCrawlLink_p {
obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
indexText, indexMedia, indexText, indexMedia,
storeHTCache, remoteIndexing, storeHTCache, remoteIndexing,
-1, false, true, -1, false, true, CrawlProfile.MATCH_NEVER_STRING,
CacheStrategy.IFFRESH, CacheStrategy.IFFRESH,
collection, collection,
ClientIdentification.yacyIntranetCrawlerAgentName, ClientIdentification.yacyIntranetCrawlerAgentName,

@ -292,7 +292,7 @@ public final class CrawlSwitchboard {
sb.getConfigBool(SwitchboardConstants.PROXY_INDEXING_LOCAL_MEDIA, true), sb.getConfigBool(SwitchboardConstants.PROXY_INDEXING_LOCAL_MEDIA, true),
true, true,
sb.getConfigBool(SwitchboardConstants.PROXY_INDEXING_REMOTE, false), sb.getConfigBool(SwitchboardConstants.PROXY_INDEXING_REMOTE, false),
-1, false, true, -1, false, true, CrawlProfile.MATCH_NEVER_STRING,
CacheStrategy.IFFRESH, CacheStrategy.IFFRESH,
"robot_" + CRAWL_PROFILE_PROXY, "robot_" + CRAWL_PROFILE_PROXY,
ClientIdentification.yacyProxyAgentName, ClientIdentification.yacyProxyAgentName,
@ -324,7 +324,7 @@ public final class CrawlSwitchboard {
true, true,
false, false,
false, false,
-1, false, true, -1, false, true, CrawlProfile.MATCH_NEVER_STRING,
CacheStrategy.IFFRESH, CacheStrategy.IFFRESH,
"robot_" + CRAWL_PROFILE_REMOTE, "robot_" + CRAWL_PROFILE_REMOTE,
ClientIdentification.yacyInternetCrawlerAgentName, ClientIdentification.yacyInternetCrawlerAgentName,
@ -356,7 +356,7 @@ public final class CrawlSwitchboard {
false, false,
true, true,
false, false,
-1, false, true, -1, false, true, CrawlProfile.MATCH_NEVER_STRING,
CacheStrategy.IFEXIST, CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, "robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_TEXT,
ClientIdentification.yacyIntranetCrawlerAgentName, ClientIdentification.yacyIntranetCrawlerAgentName,
@ -388,7 +388,7 @@ public final class CrawlSwitchboard {
true, true,
true, true,
false, false,
-1, false, true, -1, false, true, CrawlProfile.MATCH_NEVER_STRING,
CacheStrategy.IFEXIST, CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, "robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT,
ClientIdentification.yacyIntranetCrawlerAgentName, ClientIdentification.yacyIntranetCrawlerAgentName,
@ -421,7 +421,7 @@ public final class CrawlSwitchboard {
false, false,
true, true,
false, false,
-1, false, true, -1, false, true, CrawlProfile.MATCH_NEVER_STRING,
CacheStrategy.IFEXIST, CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_GREEDY_LEARNING_TEXT, "robot_" + CRAWL_PROFILE_GREEDY_LEARNING_TEXT,
ClientIdentification.browserAgentName, ClientIdentification.browserAgentName,
@ -453,7 +453,7 @@ public final class CrawlSwitchboard {
false, false,
true, true,
false, false,
-1, false, true, -1, false, true, CrawlProfile.MATCH_NEVER_STRING,
CacheStrategy.IFEXIST, CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, "robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA,
ClientIdentification.yacyIntranetCrawlerAgentName, ClientIdentification.yacyIntranetCrawlerAgentName,
@ -485,7 +485,7 @@ public final class CrawlSwitchboard {
true, true,
true, true,
false, false,
-1, false, true, -1, false, true, CrawlProfile.MATCH_NEVER_STRING,
CacheStrategy.IFEXIST, CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, "robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA,
ClientIdentification.yacyIntranetCrawlerAgentName, ClientIdentification.yacyIntranetCrawlerAgentName,
@ -517,7 +517,7 @@ public final class CrawlSwitchboard {
false, false,
false, false,
false, false,
-1, false, true, -1, false, true, CrawlProfile.MATCH_NEVER_STRING,
CacheStrategy.NOCACHE, CacheStrategy.NOCACHE,
"robot_" + CRAWL_PROFILE_SURROGATE, "robot_" + CRAWL_PROFILE_SURROGATE,
ClientIdentification.yacyIntranetCrawlerAgentName, ClientIdentification.yacyIntranetCrawlerAgentName,
@ -552,7 +552,7 @@ public final class CrawlSwitchboard {
true, true,
false, false,
false, false,
-1, false, true, -1, false, true, CrawlProfile.MATCH_NEVER_STRING,
CacheStrategy.NOCACHE, CacheStrategy.NOCACHE,
collection, collection,
ClientIdentification.yacyIntranetCrawlerAgentName, ClientIdentification.yacyIntranetCrawlerAgentName,

@ -94,12 +94,14 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
public static final String SNAPSHOTS_MAXDEPTH = "snapshotsMaxDepth"; // if previews shall be loaded, this is positive and denotes the maximum depth; if not this is -1 public static final String SNAPSHOTS_MAXDEPTH = "snapshotsMaxDepth"; // if previews shall be loaded, this is positive and denotes the maximum depth; if not this is -1
public static final String SNAPSHOTS_REPLACEOLD = "snapshotsReplaceOld"; // if this is set to true, only one version of a snapshot per day is stored, otherwise we store also different versions per day public static final String SNAPSHOTS_REPLACEOLD = "snapshotsReplaceOld"; // if this is set to true, only one version of a snapshot per day is stored, otherwise we store also different versions per day
public static final String SNAPSHOTS_LOADIMAGE = "snapshotsLoadImage"; // if true, an image is loaded public static final String SNAPSHOTS_LOADIMAGE = "snapshotsLoadImage"; // if true, an image is loaded
public static final String SNAPSHOTS_MUSTNOTMATCH = "snapshotsMustnotmatch";
private Pattern crawlerurlmustmatch = null, crawlerurlmustnotmatch = null; private Pattern crawlerurlmustmatch = null, crawlerurlmustnotmatch = null;
private Pattern crawleripmustmatch = null, crawleripmustnotmatch = null; private Pattern crawleripmustmatch = null, crawleripmustnotmatch = null;
private Pattern crawlernodepthlimitmatch = null; private Pattern crawlernodepthlimitmatch = null;
private Pattern indexurlmustmatch = null, indexurlmustnotmatch = null; private Pattern indexurlmustmatch = null, indexurlmustnotmatch = null;
private Pattern indexcontentmustmatch = null, indexcontentmustnotmatch = null; private Pattern indexcontentmustmatch = null, indexcontentmustnotmatch = null;
private Pattern snapshotsMustnotmatch = null;
private final Map<String, AtomicInteger> doms; private final Map<String, AtomicInteger> doms;
private final VocabularyScraper scraper; private final VocabularyScraper scraper;
@ -127,6 +129,10 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
* @param indexMedia true if media content of URL shall be indexed * @param indexMedia true if media content of URL shall be indexed
* @param storeHTCache true if content shall be kept in cache after indexing * @param storeHTCache true if content shall be kept in cache after indexing
* @param remoteIndexing true if part of the crawl job shall be distributed * @param remoteIndexing true if part of the crawl job shall be distributed
* @param snapshotsMaxDepth if the current crawl depth is equal or below that given depth, a snapshot is generated
* @param snapshotsLoadImage true if graphical (== pdf) snapshots shall be made
* @param snapshotsReplaceOld true if snapshots shall not be historized
* @param snapshotsMustnotmatch a regular expression; if it matches on the url, the snapshot is not generated
* @param xsstopw true if static stop words shall be ignored * @param xsstopw true if static stop words shall be ignored
* @param xdstopw true if dynamic stop words shall be ignored * @param xdstopw true if dynamic stop words shall be ignored
* @param xpstopw true if parent stop words shall be ignored * @param xpstopw true if parent stop words shall be ignored
@ -156,6 +162,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
final int snapshotsMaxDepth, final int snapshotsMaxDepth,
final boolean snapshotsLoadImage, final boolean snapshotsLoadImage,
final boolean snapshotsReplaceOld, final boolean snapshotsReplaceOld,
final String snapshotsMustnotmatch,
final CacheStrategy cacheStrategy, final CacheStrategy cacheStrategy,
final String collections, final String collections,
final String userAgentName, final String userAgentName,
@ -196,6 +203,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
put(SNAPSHOTS_MAXDEPTH, snapshotsMaxDepth); put(SNAPSHOTS_MAXDEPTH, snapshotsMaxDepth);
put(SNAPSHOTS_LOADIMAGE, snapshotsLoadImage); put(SNAPSHOTS_LOADIMAGE, snapshotsLoadImage);
put(SNAPSHOTS_REPLACEOLD, snapshotsReplaceOld); put(SNAPSHOTS_REPLACEOLD, snapshotsReplaceOld);
put(SNAPSHOTS_MUSTNOTMATCH, snapshotsMustnotmatch);
put(CACHE_STRAGEGY, cacheStrategy.toString()); put(CACHE_STRAGEGY, cacheStrategy.toString());
put(COLLECTIONS, CommonPattern.SPACE.matcher(collections.trim()).replaceAll("")); put(COLLECTIONS, CommonPattern.SPACE.matcher(collections.trim()).replaceAll(""));
// we transform the scraper information into a JSON Array // we transform the scraper information into a JSON Array
@ -629,6 +637,16 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
return (r.equals(Boolean.TRUE.toString())); return (r.equals(Boolean.TRUE.toString()));
} }
/**
 * Gets the must-not-match filter for snapshot generation, compiling it from the
 * profile value stored under {@link #SNAPSHOTS_MUSTNOTMATCH} on first use and
 * caching the compiled pattern in the instance field afterwards.
 *
 * <p>Callers skip snapshot generation for a URL when the returned pattern matches it,
 * so "no filter" must be represented by a pattern that never matches. A missing value
 * (for example a profile that was stored before this attribute existed) is therefore
 * treated like {@code MATCH_NEVER_STRING} instead of suppressing every snapshot.
 *
 * @return the compiled, case-insensitive filter pattern; {@code MATCH_NEVER_PATTERN}
 *         if the value is missing, equals {@code MATCH_NEVER_STRING}, or is not a
 *         valid regular expression; {@code MATCH_ALL_PATTERN} if the value equals
 *         {@code MATCH_ALL_STRING}
 */
public Pattern snapshotsMustnotmatch() {
    if (this.snapshotsMustnotmatch == null) {
        final String r = get(SNAPSHOTS_MUSTNOTMATCH);
        if (r == null || r.equals(CrawlProfile.MATCH_NEVER_STRING)) {
            // no filter configured: never exclude a URL from snapshot generation
            this.snapshotsMustnotmatch = CrawlProfile.MATCH_NEVER_PATTERN;
        } else if (r.equals(CrawlProfile.MATCH_ALL_STRING)) {
            // reuse the shared pre-compiled pattern instead of compiling it again
            this.snapshotsMustnotmatch = CrawlProfile.MATCH_ALL_PATTERN;
        } else {
            try {
                this.snapshotsMustnotmatch = Pattern.compile(r, Pattern.CASE_INSENSITIVE);
            } catch (final PatternSyntaxException e) {
                // an invalid expression must not block snapshots; fall back to "no filter"
                this.snapshotsMustnotmatch = CrawlProfile.MATCH_NEVER_PATTERN;
            }
        }
    }
    return this.snapshotsMustnotmatch;
}
public int timezoneOffset() { public int timezoneOffset() {
final String timezoneOffset = get(TIMEZONEOFFSET); final String timezoneOffset = get(TIMEZONEOFFSET);
if (timezoneOffset == null) return 0; if (timezoneOffset == null) return 0;

@ -666,6 +666,7 @@ public class CrawlQueues {
if (CrawlQueues.log.isFine()) { if (CrawlQueues.log.isFine()) {
CrawlQueues.log.fine("problem loading " + request.url().toString() + ": " + e.getMessage()); CrawlQueues.log.fine("problem loading " + request.url().toString() + ": " + e.getMessage());
} }
e.printStackTrace();
error = "load error - " + e.getMessage(); error = "load error - " + e.getMessage();
} }

@ -186,7 +186,7 @@ public class YMarkCrawlStart extends HashMap<String,String>{
crawlingQ, crawlingQ,
true, true, true, false, true, true, true, false,
true, true, false, true, true, false,
-1, false, true, -1, false, true, CrawlProfile.MATCH_NEVER_STRING,
CacheStrategy.IFFRESH, CacheStrategy.IFFRESH,
"robot_" + CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, "robot_" + CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA,
ClientIdentification.yacyIntranetCrawlerAgentName, ClientIdentification.yacyIntranetCrawlerAgentName,

@ -575,7 +575,8 @@ public class Segment {
// CREATE SNAPSHOT // CREATE SNAPSHOT
if ((url.getProtocol().equals("http") || url.getProtocol().equals("https")) && if ((url.getProtocol().equals("http") || url.getProtocol().equals("https")) &&
crawlProfile != null && document.getDepth() <= crawlProfile.snapshotMaxdepth()) { crawlProfile != null && document.getDepth() <= crawlProfile.snapshotMaxdepth() &&
!crawlProfile.snapshotsMustnotmatch().matcher(urlNormalform).matches()) {
// load pdf in case that is wanted. This can later be used to compute a web page preview in the search results // load pdf in case that is wanted. This can later be used to compute a web page preview in the search results
String ext = MultiProtocolURL.getFileExtension(url.getFile()).toLowerCase(); String ext = MultiProtocolURL.getFileExtension(url.getFile()).toLowerCase();
if (ext.length() == 0 || url.getFile().length() <= 1 || htmlParser.htmlExtensionsSet.contains(ext)) { if (ext.length() == 0 || url.getFile().length() <= 1 || htmlParser.htmlExtensionsSet.contains(ext)) {

Loading…
Cancel
Save