added must-not-match filter to snapshot generation.

also: fixed some bugs (snapshot image option handling in CrawlStartExpert, error logging in CrawlQueues)
pull/8/head
Michael Peter Christen 10 years ago
parent 9d8f426890
commit 97930a6aad

@@ -503,6 +503,8 @@
             <input type="radio" name="snapshotsReplaceOld" value="on" checked="checked"/> replace old snapshots with new one&nbsp;&nbsp;&nbsp;
             <input type="radio" name="snapshotsReplaceOld" value="off" /> add new versions for each crawl
         </dd>
+        <dt><label for="snapshotsMustnotmatch">must-not-match filter for snapshot generation</label></dt>
+        <dd><input name="snapshotsMustnotmatch" id="snapshotsMustnotmatch" type="text" size="55" maxlength="100000" value="#[snapshotsMustnotmatch]#" /></dd>
         #(snapshotEnableImages)#
         <input type="hidden" name="snapshotsLoadImage" id="snapshotsLoadImage" value="false"/>::
         <dt><label for="snapshotImage">Image Creation</label></dt>

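A note on the #[snapshotsMustnotmatch]# placeholder above: YaCy's servlet templates substitute #[key]# markers with values the corresponding servlet sets via prop.put(key, value), and #(key)#...::...#(/key)# blocks switch between alternatives. A minimal stand-in sketch of the substitution step (simplified demo, not the actual template engine):

    import java.util.HashMap;
    import java.util.Map;

    public class TemplateSketch {
        // replace every "#[key]#" placeholder with the value the servlet put under that key
        static String render(String template, Map<String, String> prop) {
            for (Map.Entry<String, String> e : prop.entrySet()) {
                template = template.replace("#[" + e.getKey() + "]#", e.getValue());
            }
            return template;
        }

        public static void main(String[] args) {
            Map<String, String> prop = new HashMap<>();
            prop.put("snapshotsMustnotmatch", ".*\\.php.*");
            String html = "<input name=\"snapshotsMustnotmatch\" value=\"#[snapshotsMustnotmatch]#\" />";
            System.out.println(render(html, prop));
            // prints: <input name="snapshotsMustnotmatch" value=".*\.php.*" />
        }
    }
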
@@ -531,13 +531,12 @@ public class CrawlStartExpert {
         // ---------- Snapshot generation
         boolean wkhtmltopdfAvailable = Html2Image.wkhtmltopdfAvailable();
-        boolean convertAvailable = Html2Image.convertAvailable();
+        //boolean convertAvailable = Html2Image.convertAvailable();
         prop.put("snapshotsMaxDepth", post == null ? "-1" : post.get("snapshotsMaxDepth", "-1"));
-        if (sb.getConfigBool("isTransparentProxy", false) &&
-            sb.getConfigBool("proxyAlwaysFresh", false) &&
-            wkhtmltopdfAvailable && convertAvailable) {
+        prop.put("snapshotsMustnotmatch", post == null ? "" : post.get("snapshotsMustnotmatch", ""));
+        if (wkhtmltopdfAvailable) {
             prop.put("snapshotEnableImages", 1);
-            prop.put("snapshotEnableImages_snapshotsLoadImageChecked", post == null ? 0 : post.getBoolean("snapshotsLoadImage") ? 1 : 0);
+            prop.put("snapshotEnableImages_snapshotsLoadImageChecked", post == null ? 1 : post.getBoolean("snapshotsLoadImage") ? 1 : 0);
         } else {
             prop.put("snapshotEnableImages", 0);
         }

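Two behavior changes ride along in the hunk above: image snapshots now only require wkhtmltopdf (the transparent-proxy configuration checks and the convert-tool availability check were dropped), and the "load image" option now defaults to checked when the form is first rendered (post == null now yields 1 instead of 0). The nested conditional used there is right-associative, which this minimal sketch illustrates:

    public class TernarySketch {
        public static void main(String[] args) {
            boolean postIsNull = true;  // fresh form: no request parameters arrived yet
            boolean loadImage = false;  // value that would otherwise come from the form
            // parses as: postIsNull ? 1 : (loadImage ? 1 : 0)
            int checked = postIsNull ? 1 : loadImage ? 1 : 0;
            System.out.println(checked); // 1 -> the checkbox is rendered as checked
        }
    }
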
@@ -448,6 +448,7 @@ public class Crawler_p {
         int snapshotsMaxDepth = Integer.parseInt(snapshotsMaxDepthString);
         boolean snapshotsLoadImage = post.getBoolean("snapshotsLoadImage");
         boolean snapshotsReplaceOld = post.getBoolean("snapshotsReplaceOld");
+        String snapshotsMustnotmatch = post.get("snapshotsMustnotmatch", "");
 
         // get vocabulary scraper info
         JSONObject vocabulary_scraper = new JSONObject(); // key = vocabulary_name, value = properties with key = type (i.e. 'class') and value = keyword in context
@@ -501,6 +502,7 @@ public class Crawler_p {
                     snapshotsMaxDepth,
                     snapshotsLoadImage,
                     snapshotsReplaceOld,
+                    snapshotsMustnotmatch,
                     cachePolicy,
                     collection,
                     agentName,

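Crawler_p reads the new field with an empty-string default and hands it to the CrawlProfile constructor. The empty default acts as "no filter" because an empty regular expression matches only the empty string, and a normalized URL is never empty; a quick sketch:

    import java.util.regex.Pattern;

    public class EmptyFilterSketch {
        public static void main(String[] args) {
            Pattern p = Pattern.compile("", Pattern.CASE_INSENSITIVE);
            System.out.println(p.matcher("").matches());                    // true
            System.out.println(p.matcher("http://example.com/").matches()); // false
        }
    }
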
@@ -153,7 +153,7 @@ public class QuickCrawlLink_p {
                     obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
                     indexText, indexMedia,
                     storeHTCache, remoteIndexing,
-                    -1, false, true,
+                    -1, false, true, CrawlProfile.MATCH_NEVER_STRING,
                     CacheStrategy.IFFRESH,
                     collection,
                     ClientIdentification.yacyIntranetCrawlerAgentName,

@@ -292,7 +292,7 @@ public final class CrawlSwitchboard {
                 sb.getConfigBool(SwitchboardConstants.PROXY_INDEXING_LOCAL_MEDIA, true),
                 true,
                 sb.getConfigBool(SwitchboardConstants.PROXY_INDEXING_REMOTE, false),
-                -1, false, true,
+                -1, false, true, CrawlProfile.MATCH_NEVER_STRING,
                 CacheStrategy.IFFRESH,
                 "robot_" + CRAWL_PROFILE_PROXY,
                 ClientIdentification.yacyProxyAgentName,
@@ -324,7 +324,7 @@
                 true,
                 false,
                 false,
-                -1, false, true,
+                -1, false, true, CrawlProfile.MATCH_NEVER_STRING,
                 CacheStrategy.IFFRESH,
                 "robot_" + CRAWL_PROFILE_REMOTE,
                 ClientIdentification.yacyInternetCrawlerAgentName,
@@ -356,7 +356,7 @@
                 false,
                 true,
                 false,
-                -1, false, true,
+                -1, false, true, CrawlProfile.MATCH_NEVER_STRING,
                 CacheStrategy.IFEXIST,
                 "robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_TEXT,
                 ClientIdentification.yacyIntranetCrawlerAgentName,
@@ -388,7 +388,7 @@
                 true,
                 true,
                 false,
-                -1, false, true,
+                -1, false, true, CrawlProfile.MATCH_NEVER_STRING,
                 CacheStrategy.IFEXIST,
                 "robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT,
                 ClientIdentification.yacyIntranetCrawlerAgentName,
@@ -421,7 +421,7 @@
                 false,
                 true,
                 false,
-                -1, false, true,
+                -1, false, true, CrawlProfile.MATCH_NEVER_STRING,
                 CacheStrategy.IFEXIST,
                 "robot_" + CRAWL_PROFILE_GREEDY_LEARNING_TEXT,
                 ClientIdentification.browserAgentName,
@@ -453,7 +453,7 @@
                 false,
                 true,
                 false,
-                -1, false, true,
+                -1, false, true, CrawlProfile.MATCH_NEVER_STRING,
                 CacheStrategy.IFEXIST,
                 "robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA,
                 ClientIdentification.yacyIntranetCrawlerAgentName,
@@ -485,7 +485,7 @@
                 true,
                 true,
                 false,
-                -1, false, true,
+                -1, false, true, CrawlProfile.MATCH_NEVER_STRING,
                 CacheStrategy.IFEXIST,
                 "robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA,
                 ClientIdentification.yacyIntranetCrawlerAgentName,
@@ -517,7 +517,7 @@
                 false,
                 false,
                 false,
-                -1, false, true,
+                -1, false, true, CrawlProfile.MATCH_NEVER_STRING,
                 CacheStrategy.NOCACHE,
                 "robot_" + CRAWL_PROFILE_SURROGATE,
                 ClientIdentification.yacyIntranetCrawlerAgentName,
@@ -552,7 +552,7 @@
                 true,
                 false,
                 false,
-                -1, false, true,
+                -1, false, true, CrawlProfile.MATCH_NEVER_STRING,
                 CacheStrategy.NOCACHE,
                 collection,
                 ClientIdentification.yacyIntranetCrawlerAgentName,

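All built-in robot profiles above pass CrawlProfile.MATCH_NEVER_STRING for the new parameter, an expression meant to match no URL, so the filter never suppresses a snapshot for them (and with snapshotsMaxDepth fixed at -1, these profiles do not generate snapshots in the first place). A minimal sketch of never-match semantics, using "(?!)" as an illustrative stand-in since the concrete value of MATCH_NEVER_STRING is not shown in this diff:

    import java.util.regex.Pattern;

    public class NeverMatchSketch {
        public static void main(String[] args) {
            Pattern never = Pattern.compile("(?!)"); // a lookahead that always fails
            System.out.println(never.matcher("http://example.com/").matches()); // false
            System.out.println(never.matcher("").matches());                    // false
        }
    }
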
@@ -94,12 +94,14 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
     public static final String SNAPSHOTS_MAXDEPTH = "snapshotsMaxDepth"; // if previews shall be loaded, this is positive and denotes the maximum depth; if not this is -1
     public static final String SNAPSHOTS_REPLACEOLD = "snapshotsReplaceOld"; // if this is set to true, only one version of a snapshot per day is stored, otherwise we store also different versions per day
     public static final String SNAPSHOTS_LOADIMAGE = "snapshotsLoadImage"; // if true, an image is loaded
+    public static final String SNAPSHOTS_MUSTNOTMATCH = "snapshotsMustnotmatch";
 
     private Pattern crawlerurlmustmatch = null, crawlerurlmustnotmatch = null;
     private Pattern crawleripmustmatch = null, crawleripmustnotmatch = null;
     private Pattern crawlernodepthlimitmatch = null;
     private Pattern indexurlmustmatch = null, indexurlmustnotmatch = null;
     private Pattern indexcontentmustmatch = null, indexcontentmustnotmatch = null;
+    private Pattern snapshotsMustnotmatch = null;
 
     private final Map<String, AtomicInteger> doms;
     private final VocabularyScraper scraper;
@@ -127,6 +129,10 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
      * @param indexMedia true if media content of URL shall be indexed
      * @param storeHTCache true if content shall be kept in cache after indexing
      * @param remoteIndexing true if part of the crawl job shall be distributed
+     * @param snapshotsMaxDepth if the current crawl depth is equal to or below this depth, a snapshot is generated
+     * @param snapshotsLoadImage true if graphical (== pdf) snapshots shall be made
+     * @param snapshotsReplaceOld true if snapshots shall not be historized
+     * @param snapshotsMustnotmatch a regular expression; if it matches the url, the snapshot is not generated
      * @param xsstopw true if static stop words shall be ignored
      * @param xdstopw true if dynamic stop words shall be ignored
      * @param xpstopw true if parent stop words shall be ignored
@@ -156,6 +162,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
                  final int snapshotsMaxDepth,
                  final boolean snapshotsLoadImage,
                  final boolean snapshotsReplaceOld,
+                 final String snapshotsMustnotmatch,
                  final CacheStrategy cacheStrategy,
                  final String collections,
                  final String userAgentName,
@@ -196,6 +203,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
         put(SNAPSHOTS_MAXDEPTH, snapshotsMaxDepth);
         put(SNAPSHOTS_LOADIMAGE, snapshotsLoadImage);
         put(SNAPSHOTS_REPLACEOLD, snapshotsReplaceOld);
+        put(SNAPSHOTS_MUSTNOTMATCH, snapshotsMustnotmatch);
         put(CACHE_STRAGEGY, cacheStrategy.toString());
         put(COLLECTIONS, CommonPattern.SPACE.matcher(collections.trim()).replaceAll(""));
         // we transform the scraper information into a JSON Array
@@ -629,6 +637,16 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
         return (r.equals(Boolean.TRUE.toString()));
     }
 
+    public Pattern snapshotsMustnotmatch() {
+        if (this.snapshotsMustnotmatch == null) {
+            final String r = get(SNAPSHOTS_MUSTNOTMATCH);
+            try {
+                this.snapshotsMustnotmatch = (r == null || r.equals(CrawlProfile.MATCH_ALL_STRING)) ? CrawlProfile.MATCH_ALL_PATTERN : Pattern.compile(r, Pattern.CASE_INSENSITIVE);
+            } catch (final PatternSyntaxException e) { this.snapshotsMustnotmatch = CrawlProfile.MATCH_NEVER_PATTERN; }
+        }
+        return this.snapshotsMustnotmatch;
+    }
+
     public int timezoneOffset() {
         final String timezoneOffset = get(TIMEZONEOFFSET);
         if (timezoneOffset == null) return 0;

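The new getter compiles the stored expression lazily, caches the compiled Pattern in the snapshotsMustnotmatch field, and falls back to CrawlProfile.MATCH_NEVER_PATTERN when the expression does not compile, so a malformed filter suppresses nothing instead of breaking the crawl. Note also that a missing key (r == null) or an expression equal to MATCH_ALL_STRING yields MATCH_ALL_PATTERN, which suppresses every snapshot; the servlets in this commit always store at least an empty string, so that branch applies only when the filter explicitly matches everything. A self-contained sketch of the caching-with-fallback idiom (illustrative names, not the YaCy API):

    import java.util.regex.Pattern;
    import java.util.regex.PatternSyntaxException;

    public class FilterCacheSketch {
        private final String expression;
        private Pattern compiled = null;

        FilterCacheSketch(String expression) { this.expression = expression; }

        // compile once, cache, and degrade to a never-matching pattern on bad input
        Pattern pattern() {
            if (this.compiled == null) {
                try {
                    this.compiled = Pattern.compile(this.expression, Pattern.CASE_INSENSITIVE);
                } catch (PatternSyntaxException e) {
                    this.compiled = Pattern.compile("(?!)"); // stand-in for MATCH_NEVER_PATTERN
                }
            }
            return this.compiled;
        }

        boolean suppresses(String url) {
            return pattern().matcher(url).matches();
        }

        public static void main(String[] args) {
            System.out.println(new FilterCacheSketch(".*\\.php.*").suppresses("http://example.com/index.PHP")); // true
            System.out.println(new FilterCacheSketch("[broken").suppresses("http://example.com/"));             // false (fallback)
        }
    }
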
@@ -666,6 +666,7 @@ public class CrawlQueues {
                 if (CrawlQueues.log.isFine()) {
                     CrawlQueues.log.fine("problem loading " + request.url().toString() + ": " + e.getMessage());
                 }
+                e.printStackTrace();
                 error = "load error - " + e.getMessage();
             }

@@ -186,7 +186,7 @@ public class YMarkCrawlStart extends HashMap<String,String>{
                 crawlingQ,
                 true, true, true, false,
                 true, true, false,
-                -1, false, true,
+                -1, false, true, CrawlProfile.MATCH_NEVER_STRING,
                 CacheStrategy.IFFRESH,
                 "robot_" + CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA,
                 ClientIdentification.yacyIntranetCrawlerAgentName,

@@ -575,7 +575,8 @@ public class Segment {
             // CREATE SNAPSHOT
             if ((url.getProtocol().equals("http") || url.getProtocol().equals("https")) &&
-                crawlProfile != null && document.getDepth() <= crawlProfile.snapshotMaxdepth()) {
+                crawlProfile != null && document.getDepth() <= crawlProfile.snapshotMaxdepth() &&
+                !crawlProfile.snapshotsMustnotmatch().matcher(urlNormalform).matches()) {
                 // load pdf in case that is wanted. This can later be used to compute a web page preview in the search results
                 String ext = MultiProtocolURL.getFileExtension(url.getFile()).toLowerCase();
                 if (ext.length() == 0 || url.getFile().length() <= 1 || htmlParser.htmlExtensionsSet.contains(ext)) {

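Putting the pieces together, the snapshot gate in Segment now requires three things: an http(s) URL, a crawl depth within the profile's snapshot depth, and a URL that the must-not-match filter does not match. A condensed sketch of that decision (illustrative names, not the YaCy method):

    import java.util.regex.Pattern;

    public class SnapshotGateSketch {
        // true if a snapshot should be generated for this URL
        static boolean wantSnapshot(String protocol, int depth, int snapshotsMaxDepth,
                                    Pattern mustNotMatch, String urlNormalform) {
            return ("http".equals(protocol) || "https".equals(protocol))
                    && depth <= snapshotsMaxDepth
                    && !mustNotMatch.matcher(urlNormalform).matches();
        }

        public static void main(String[] args) {
            Pattern filter = Pattern.compile(".*\\.php.*", Pattern.CASE_INSENSITIVE);
            System.out.println(wantSnapshot("https", 1, 2, filter, "https://example.com/a.html")); // true
            System.out.println(wantSnapshot("https", 1, 2, filter, "https://example.com/a.php"));  // false (filter)
            System.out.println(wantSnapshot("https", 3, 2, filter, "https://example.com/a.html")); // false (too deep)
        }
    }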