diff --git a/htroot/CrawlProfileEditor_p.xml b/htroot/CrawlProfileEditor_p.xml
index 9ee5cf28c..8f1be2591 100644
--- a/htroot/CrawlProfileEditor_p.xml
+++ b/htroot/CrawlProfileEditor_p.xml
@@ -30,6 +30,8 @@
     #[indexURLMustNotMatch]#
     #[indexContentMustMatch]#
     #[indexContentMustNotMatch]#
+    #[indexMediaTypeMustMatch]#
+    #[indexMediaTypeMustNotMatch]#
     #(status)#terminated::active::system#(/status)#
     #{crawlingDomFilterContent}#
diff --git a/htroot/CrawlStartExpert.html b/htroot/CrawlStartExpert.html
index 0681684ba..56ab1bba7 100644
--- a/htroot/CrawlStartExpert.html
+++ b/htroot/CrawlStartExpert.html
@@ -170,12 +170,14 @@
 			disableIf('ipMustnotmatch', defaultMatchNone);
 			disableIf('indexmustnotmatch', defaultMatchNone);
 			disableIf('indexcontentmustnotmatch', defaultMatchNone);
+			disableIf('indexMediaTypeMustNotMatch', defaultMatchNone);
 
 			// remove if MATCH_ALL_STRING
 			disableIf('mustmatch', defaultMatchAll);
 			disableIf('ipMustmatch', defaultMatchAll);
 			disableIf('indexmustmatch', defaultMatchAll);
 			disableIf('indexcontentmustmatch', defaultMatchAll);
+			disableIf('indexMediaTypeMustMatch', defaultMatchAll);
 
 			// remove default collection name
 			disableIf('collection', '#[defaultCollection]#');
@@ -364,6 +366,28 @@
 					must-not-match
 				</td>
 			</tr>
 		</table>
 	</fieldset>
+	<fieldset>
+		<legend>Filter on Document Media Type (aka MIME type)</legend>
+		<table border="0">
+			<tr valign="top">
+				<td colspan="2">
+					<img src="env/grafics/i16.gif" width="16" height="16" alt="Media Type filter info"/>
+					The filter is a <a href="https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html" target="_blank">regular expression</a>
+					that must match the document's Media Type (also known as MIME Type) for the URL to be indexed.
+					Standard Media Types are described at the <a href="https://www.iana.org/assignments/media-types/media-types.xhtml" target="_blank">IANA registry</a>.
+					Attention: you can test your regular expressions with the <a href="RegexTest.html">Regular Expression Tester</a> within YaCy.
+				</td>
+			</tr>
+			<tr valign="top">
+				<td>must-match</td>
+				<td><input name="indexMediaTypeMustMatch" id="indexMediaTypeMustMatch" type="text" size="55" value="#[indexMediaTypeMustMatch]#" /></td>
+			</tr>
+			<tr valign="top">
+				<td>must-not-match</td>
+				<td><input name="indexMediaTypeMustNotMatch" id="indexMediaTypeMustNotMatch" type="text" size="55" value="#[indexMediaTypeMustNotMatch]#" /></td>
+			</tr>
+		</table>
+	</fieldset>
 	<fieldset>
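
The two fields above take Java regular expressions that are evaluated against the document's Media Type. As a minimal, self-contained sketch of how the must-match and must-not-match expressions combine (the patterns here are examples a user might enter, not defaults):

    import java.util.regex.Pattern;

    public class MediaTypeFilterDemo {
        public static void main(final String[] args) {
            // Example user-entered filters: accept HTML and PDF in must-match,
            // but explicitly reject PDF in must-not-match (must-not-match wins)
            final Pattern mustMatch = Pattern.compile("text/html|application/pdf", Pattern.CASE_INSENSITIVE);
            final Pattern mustNotMatch = Pattern.compile("application/pdf", Pattern.CASE_INSENSITIVE);
            for (final String mediaType : new String[] { "text/html", "application/pdf", "image/png" }) {
                final boolean indexed = mustMatch.matcher(mediaType).matches()
                        && !mustNotMatch.matcher(mediaType).matches();
                System.out.println(mediaType + " -> indexed: " + indexed);
            }
        }
    }

Only text/html survives both filters; application/pdf is rejected by must-not-match even though must-match accepts it, and image/png never passes must-match.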
diff --git a/htroot/CrawlStartExpert.java b/htroot/CrawlStartExpert.java
index 062da6b35..fc5e47af0 100644
--- a/htroot/CrawlStartExpert.java
+++ b/htroot/CrawlStartExpert.java
@@ -33,6 +33,7 @@ import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.util.Html2Image;
 import net.yacy.crawler.data.CrawlProfile;
+import net.yacy.crawler.data.CrawlProfile.CrawlAttribute;
 import net.yacy.document.LibraryProvider;
 import net.yacy.search.Switchboard;
 import net.yacy.search.SwitchboardConstants;
@@ -300,6 +301,22 @@ public class CrawlStartExpert {
         } else {
             prop.put("indexcontentmustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
         }
+
+        // Filter on Media Type of Document: must-match
+        if (post != null && post.containsKey(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key)) {
+            prop.put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key,
+                    post.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING));
+        } else {
+            prop.put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING);
+        }
+
+        // Filter on Media Type of Document: must-not-match
+        if (post != null && post.containsKey(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key)) {
+            prop.put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key,
+                    post.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING));
+        } else {
+            prop.put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING);
+        }
 
         // ---------- Clean-Up before Crawl Start
diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java
index 143add3dc..197f0b0d1 100644
--- a/htroot/Crawler_p.java
+++ b/htroot/Crawler_p.java
@@ -52,6 +52,7 @@ import net.yacy.crawler.CrawlSwitchboard;
 import net.yacy.crawler.FileCrawlStarterTask;
 import net.yacy.crawler.data.Cache;
 import net.yacy.crawler.data.CrawlProfile;
+import net.yacy.crawler.data.CrawlProfile.CrawlAttribute;
 import net.yacy.crawler.data.NoticedURL.StackType;
 import net.yacy.crawler.retrieval.SitemapImporter;
 import net.yacy.crawler.robots.RobotsTxt;
@@ -568,6 +569,12 @@ public class Crawler_p {
                     ignoreclassname,
                     new VocabularyScraper(vocabulary_scraper),
                     timezoneOffset);
+
+            profile.put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key,
+                    post.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING));
+            profile.put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key,
+                    post.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING));
+
             handle = ASCII.getBytes(profile.handle());
 
             // before we fire up a new crawl, we make sure that another crawl with the same name is not running
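
Both servlets rely on get-with-default when reading the form: a missing must-match field falls back to the catch-all expression and a missing must-not-match field to the match-never expression, so crawls started from older forms keep their previous behavior. A standalone sketch of that defaulting (a plain Map stands in for YaCy's serverObjects; the literal defaults ".*" and "" are assumed stand-ins for CrawlProfile.MATCH_ALL_STRING and CrawlProfile.MATCH_NEVER_STRING):

    import java.util.HashMap;
    import java.util.Map;

    public class DefaultingDemo {
        // Simplified stand-in for serverObjects.get(key, dflt)
        static String get(final Map<String, String> post, final String key, final String dflt) {
            final String value = post.get(key);
            return value == null ? dflt : value;
        }

        public static void main(final String[] args) {
            final Map<String, String> post = new HashMap<>();
            post.put("indexMediaTypeMustMatch", "application/pdf");
            // field present -> the submitted expression wins
            System.out.println(get(post, "indexMediaTypeMustMatch", ".*"));
            // field absent -> the harmless match-never default applies
            System.out.println(get(post, "indexMediaTypeMustNotMatch", ""));
        }
    }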
diff --git a/source/net/yacy/crawler/data/CrawlProfile.java b/source/net/yacy/crawler/data/CrawlProfile.java
index 392418202..a3ee1bdca 100644
--- a/source/net/yacy/crawler/data/CrawlProfile.java
+++ b/source/net/yacy/crawler/data/CrawlProfile.java
@@ -53,6 +53,7 @@ import net.yacy.crawler.CrawlSwitchboard;
 import net.yacy.document.VocabularyScraper;
 import net.yacy.kelondro.data.word.Word;
 import net.yacy.search.query.QueryParams;
+import net.yacy.search.schema.CollectionSchema;
 import net.yacy.server.serverObjects;
 
 /**
@@ -89,6 +90,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String> {
         INDEXING_URL_MUSTNOTMATCH    ("indexURLMustNotMatch", false, CrawlAttribute.STRING, "Indexing URL Must-Not-Match Filter"),
         INDEXING_CONTENT_MUSTMATCH   ("indexContentMustMatch", false, CrawlAttribute.STRING, "Indexing Content Must-Match Filter"),
         INDEXING_CONTENT_MUSTNOTMATCH("indexContentMustNotMatch", false, CrawlAttribute.STRING, "Indexing Content Must-Not-Match Filter"),
+        INDEXING_MEDIA_TYPE_MUSTMATCH("indexMediaTypeMustMatch", false, CrawlAttribute.STRING, "Indexing Media Type (MIME) Must-Match Filter"),
+        INDEXING_MEDIA_TYPE_MUSTNOTMATCH("indexMediaTypeMustNotMatch", false, CrawlAttribute.STRING, "Indexing Media Type (MIME) Must-Not-Match Filter"),
         RECRAWL_IF_OLDER             ("recrawlIfOlder", false, CrawlAttribute.INTEGER, "Recrawl If Older"),
         STORE_HTCACHE                ("storeHTCache", false, CrawlAttribute.BOOLEAN, "Store in HTCache"),
         CACHE_STRAGEGY               ("cacheStrategy", false, CrawlAttribute.STRING, "Cache Strategy (NOCACHE,IFFRESH,IFEXIST,CACHEONLY)"),
@@ -112,7 +115,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String> {
         public final String key, label;
         public final boolean readonly;
         public final int type;
-        private CrawlAttribute(String key, final boolean readonly, final int type, final String label) {
+        private CrawlAttribute(final String key, final boolean readonly, final int type, final String label) {
             this.key = key;
             this.readonly = readonly;
             this.type = type;
@@ -131,6 +134,15 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String> {
     private Pattern crawlernodepthlimitmatch = null;
     private Pattern indexurlmustmatch = null, indexurlmustnotmatch = null;
    private Pattern indexcontentmustmatch = null, indexcontentmustnotmatch = null;
+
+    /** Pattern on the media type that documents must match in order to be indexed
+     * @see CollectionSchema#content_type */
+    private Pattern indexMediaTypeMustMatch = null;
+
+    /** Pattern on the media type that documents must not match in order to be indexed
+     * @see CollectionSchema#content_type */
+    private Pattern indexMediaTypeMustNotMatch = null;
+
     private Pattern snapshotsMustnotmatch = null;
 
     private final Map<String, AtomicInteger> doms;
@@ -247,6 +259,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String> {
         assert jsonString != null && jsonString.length() > 0 && jsonString.charAt(0) == '{' : "jsonString = " + jsonString;
         put(CrawlAttribute.SCRAPER.key, jsonString);
         put(CrawlAttribute.TIMEZONEOFFSET.key, timezoneOffset);
+        put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING);
+        put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING);
     }
 
     /**
@@ -539,6 +553,52 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String> {
         return this.indexcontentmustnotmatch;
     }
 
+    /**
+     * Get the Pattern on media type that documents must match in order to be indexed
+     *
+     * @return a {@link Pattern} instance, defaulting to
+     *         {@link CrawlProfile#MATCH_ALL_PATTERN} when the regular expression
+     *         string is not set or its syntax is incorrect
+     */
+    public Pattern getIndexMediaTypeMustMatchPattern() {
+        if (this.indexMediaTypeMustMatch == null) {
+            /* Cache the compiled pattern for faster subsequent calls */
+            final String patternStr = get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key);
+            try {
+                this.indexMediaTypeMustMatch = (patternStr == null
+                        || patternStr.equals(CrawlProfile.MATCH_ALL_STRING)) ? CrawlProfile.MATCH_ALL_PATTERN
+                                : Pattern.compile(patternStr, Pattern.CASE_INSENSITIVE);
+            } catch (final PatternSyntaxException e) {
+                this.indexMediaTypeMustMatch = CrawlProfile.MATCH_ALL_PATTERN;
+            }
+        }
+        return this.indexMediaTypeMustMatch;
+    }
+
+    /**
+     * Get the Pattern on media type that documents must not match in order to be indexed
+     *
+     * @return a {@link Pattern} instance, defaulting to
+     *         {@link CrawlProfile#MATCH_NEVER_PATTERN} when the regular expression
+     *         string is not set or its syntax is incorrect
+     */
+    public Pattern getIndexMediaTypeMustNotMatchPattern() {
+        if (this.indexMediaTypeMustNotMatch == null) {
+            /* Cache the compiled pattern for faster subsequent calls */
+            final String patternStr = get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key);
+            try {
+                this.indexMediaTypeMustNotMatch = (patternStr == null
+                        || patternStr.equals(CrawlProfile.MATCH_NEVER_STRING)) ? CrawlProfile.MATCH_NEVER_PATTERN
+                                : Pattern.compile(patternStr, Pattern.CASE_INSENSITIVE);
+            } catch (final PatternSyntaxException e) {
+                this.indexMediaTypeMustNotMatch = CrawlProfile.MATCH_NEVER_PATTERN;
+            }
+        }
+        return this.indexMediaTypeMustNotMatch;
+    }
+
     /**
      * Gets depth of crawl job (or height of the tree which will be
      * created by the crawler).
@@ -575,7 +635,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String> {
     public void setCacheStrategy(final CacheStrategy newStrategy) {
         put(CrawlAttribute.CACHE_STRAGEGY.key, newStrategy.toString());
     }
-    
+
     /**
      * Gets the minimum date that an entry must have to be re-crawled.
      * @return time in ms representing a date
@@ -795,6 +855,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String> {
         prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexURLMustNotMatch", this.get(CrawlAttribute.INDEXING_URL_MUSTNOTMATCH.key));
         prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexContentMustMatch", this.get(CrawlAttribute.INDEXING_CONTENT_MUSTMATCH.key));
         prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexContentMustNotMatch", this.get(CrawlAttribute.INDEXING_CONTENT_MUSTNOTMATCH.key));
+        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key, this.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key));
+        prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, this.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key));
         //prop.putXML(CRAWL_PROFILE_PREFIX + count + "_mustmatch", this.urlMustMatchPattern().toString()); // TODO: remove, replace with crawlerURLMustMatch
         //prop.putXML(CRAWL_PROFILE_PREFIX + count + "_mustnotmatch", this.urlMustNotMatchPattern().toString()); // TODO: remove, replace with crawlerURLMustNotMatch
         //prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingIfOlder", (this.recrawlIfOlder() == 0L) ? "no re-crawl" : DateFormat.getDateTimeInstance().format(this.recrawlIfOlder())); // TODO: remove, replace with recrawlIfOlder
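
The getters follow the same lazy compile-and-cache idiom as the existing URL and content filters: the expression is compiled at most once per profile instance, and a broken expression degrades to the harmless default instead of throwing. A reduced standalone illustration of the idiom (names simplified; this is not the YaCy API):

    import java.util.regex.Pattern;
    import java.util.regex.PatternSyntaxException;

    class LazyPatternHolder {
        private static final Pattern MATCH_ALL = Pattern.compile(".*");
        private final String expression; // as stored in the profile map
        private Pattern compiled;        // cached after the first call

        LazyPatternHolder(final String expression) {
            this.expression = expression;
        }

        Pattern pattern() {
            if (this.compiled == null) {
                try {
                    this.compiled = (this.expression == null || ".*".equals(this.expression))
                            ? MATCH_ALL
                            : Pattern.compile(this.expression, Pattern.CASE_INSENSITIVE);
                } catch (final PatternSyntaxException e) {
                    this.compiled = MATCH_ALL; // an invalid filter must not block indexing
                }
            }
            return this.compiled;
        }
    }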
diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java
index 899b74e59..ba3365d5c 100644
--- a/source/net/yacy/document/Document.java
+++ b/source/net/yacy/document/Document.java
@@ -378,6 +378,9 @@ dc_rights
         return this.publisher == null ? "" : this.publisher;
     }
 
+    /**
+     * @return the Media Type (aka MIME Type) of the document
+     */
     public String dc_format() {
         return this.mimeType;
     }
"" : this.publisher; } + /** + * @return the Media Type (aka MIME Type) of the document + */ public String dc_format() { return this.mimeType; } diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 57914e2e3..e0b2df460 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -138,6 +138,7 @@ import net.yacy.crawler.CrawlSwitchboard; import net.yacy.crawler.HarvestProcess; import net.yacy.crawler.data.Cache; import net.yacy.crawler.data.CrawlProfile; +import net.yacy.crawler.data.CrawlProfile.CrawlAttribute; import net.yacy.crawler.data.CrawlQueues; import net.yacy.crawler.data.NoticedURL; import net.yacy.crawler.data.NoticedURL.StackType; @@ -3151,6 +3152,28 @@ public final class Switchboard extends serverSwitch { this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "indexing prevented by regular expression on content; indexContentMustMatchPattern = " + profile.indexContentMustMatchPattern().pattern() + ", indexContentMustNotMatchPattern = " + profile.indexContentMustNotMatchPattern().pattern(), -1); continue docloop; } + + /* Check document media type (aka MIME type)*/ + final Pattern mustMatchMediaType = profile.getIndexMediaTypeMustMatchPattern(); + final Pattern mustNotMatchMediaType = profile.getIndexMediaTypeMustNotMatchPattern(); + if (!(mustMatchMediaType == CrawlProfile.MATCH_ALL_PATTERN + || mustMatchMediaType.matcher(document.dc_format()).matches()) + || (mustNotMatchMediaType != CrawlProfile.MATCH_NEVER_PATTERN + && mustNotMatchMediaType.matcher(document.dc_format()).matches())) { + final String failReason = new StringBuilder( + "indexing prevented by regular expression on media type; indexContentMustMatchPattern = ") + .append(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH).append(" = ") + .append(mustMatchMediaType.pattern()).append(", ") + .append(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH).append(" = ") + .append(mustNotMatchMediaType.pattern()).toString(); + if (this.log.isInfo()) { + this.log.info("Not Condensed Resource '" + urls + " : " + failReason); + } + // create a new errorURL DB entry + this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, + FailCategory.FINAL_PROCESS_CONTEXT, failReason, -1); + continue docloop; + } doclist.add(document); } diff --git a/source/net/yacy/search/index/ErrorCache.java b/source/net/yacy/search/index/ErrorCache.java index fedd72f3e..0dbfe0146 100644 --- a/source/net/yacy/search/index/ErrorCache.java +++ b/source/net/yacy/search/index/ErrorCache.java @@ -88,14 +88,14 @@ public class ErrorCache { } /** - * Adds a error document to the Solr index (marked as failed by httpstatus_i <> 200) - * and caches recently added failed docs (up to maxStackSize = 1000) + * Adds an error to the cache of recently added failed docs (up to maxStackSize = 1000) + * and eventually (depending on the failCategory) stores an error document to the Solr index (marked as failed by httpstatus_i <> 200) * - * @param url failed url - * @param crawldepth info crawldepth + * @param url failed url. Must not be null. + * @param crawldepth crawl depth at the time the error occurred. * @param profile info of collection - * @param failCategory .store to index otherwise cache only - * @param anycause info cause-string + * @param failCategory .store to index otherwise cache only. Must not be null. + * @param anycause info cause-string. Defaults to "unknown" when null. 
diff --git a/source/net/yacy/search/index/ErrorCache.java b/source/net/yacy/search/index/ErrorCache.java
index fedd72f3e..0dbfe0146 100644
--- a/source/net/yacy/search/index/ErrorCache.java
+++ b/source/net/yacy/search/index/ErrorCache.java
@@ -88,14 +88,14 @@ public class ErrorCache {
     }
 
     /**
-     * Adds a error document to the Solr index (marked as failed by httpstatus_i <> 200)
-     * and caches recently added failed docs (up to maxStackSize = 1000)
+     * Adds an error to the cache of recently added failed docs (up to maxStackSize = 1000)
+     * and possibly (depending on the failCategory) stores an error document in the Solr index (marked as failed by httpstatus_i <> 200)
      *
-     * @param url failed url
-     * @param crawldepth info crawldepth
+     * @param url failed url. Must not be null.
+     * @param crawldepth crawl depth at the time the error occurred
      * @param profile info of collection
-     * @param failCategory .store to index otherwise cache only
-     * @param anycause info cause-string
+     * @param failCategory failure category; when its store flag is set the error document is written to the index, otherwise it is cached only. Must not be null.
+     * @param anycause info cause-string. Defaults to "unknown" when null.
      * @param httpcode http response code
      */
     public void push(final DigestURL url, final int crawldepth, final CrawlProfile profile, final FailCategory failCategory, String anycause, final int httpcode) {
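
For reference, a self-contained sketch of the push() contract documented above: every failure lands in a bounded in-memory cache, and only fail categories whose store flag is set would additionally produce a Solr error document. The class and field names here are illustrative assumptions; only the contract (bounded cache, conditional Solr write, "unknown" default) comes from the javadoc above.

    import java.util.LinkedHashMap;
    import java.util.Map;

    class ErrorCacheSketch {
        private static final int MAX_STACK_SIZE = 1000; // mirrors the documented bound

        // keeps only the most recently pushed failures
        private final Map<String, String> stack = new LinkedHashMap<String, String>() {
            @Override
            protected boolean removeEldestEntry(final Map.Entry<String, String> eldest) {
                return size() > MAX_STACK_SIZE;
            }
        };

        void push(final String urlHash, final boolean storeToIndex, String cause) {
            if (cause == null) {
                cause = "unknown"; // documented default for anycause
            }
            this.stack.put(urlHash, cause); // every failure is cached
            if (storeToIndex) {
                // here the real implementation would also write an error document
                // to Solr, marked as failed by httpstatus_i <> 200
            }
        }
    }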