diff --git a/htroot/CrawlProfileEditor_p.xml b/htroot/CrawlProfileEditor_p.xml
index 9ee5cf28c..8f1be2591 100644
--- a/htroot/CrawlProfileEditor_p.xml
+++ b/htroot/CrawlProfileEditor_p.xml
@@ -30,6 +30,8 @@
#[indexURLMustNotMatch]#
#[indexContentMustMatch]#
#[indexContentMustNotMatch]#
+ #[indexMediaTypeMustMatch]#
+ #[indexMediaTypeMustNotMatch]#
#(status)#terminated::active::system#(/status)#
#{crawlingDomFilterContent}#
diff --git a/htroot/CrawlStartExpert.html b/htroot/CrawlStartExpert.html
index 0681684ba..56ab1bba7 100644
--- a/htroot/CrawlStartExpert.html
+++ b/htroot/CrawlStartExpert.html
@@ -170,12 +170,14 @@
disableIf('ipMustnotmatch', defaultMatchNone);
disableIf('indexmustnotmatch', defaultMatchNone);
disableIf('indexcontentmustnotmatch', defaultMatchNone);
+ disableIf('indexMediaTypeMustNotMatch', defaultMatchNone);
// remove if MATCH_ALL_STRING
disableIf('mustmatch', defaultMatchAll);
disableIf('ipMustmatch', defaultMatchAll);
disableIf('indexmustmatch', defaultMatchAll);
disableIf('indexcontentmustmatch', defaultMatchAll);
+ disableIf('indexMediaTypeMustMatch', defaultMatchAll);
// remove default collection name
disableIf('collection', '#[defaultCollection]#');
@@ -364,6 +366,28 @@
 					<dt>must-not-match</dt>
+				<fieldset>
+					<legend>Filter on Document Media Type (aka MIME type)</legend>
+					<dl>
+						<dt>must-match</dt>
+						<dd><input name="indexMediaTypeMustMatch" id="indexMediaTypeMustMatch" type="text" value="#[indexMediaTypeMustMatch]#" /></dd>
+						<dt>must-not-match</dt>
+						<dd><input name="indexMediaTypeMustNotMatch" id="indexMediaTypeMustNotMatch" type="text" value="#[indexMediaTypeMustNotMatch]#" /></dd>
+						<dt>info</dt>
+						<dd>The filter is a <a href="https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html" target="_blank">regular expression</a>
+							that must match the document Media Type (also known as MIME Type) to allow the URL to be indexed.
+							Standard Media Types are described at the <a href="https://www.iana.org/assignments/media-types/media-types.xhtml" target="_blank">IANA registry</a>.
+							Attention: you can test the functionality of your regular expressions using the <a href="RegexTest.html">Regular Expression Tester</a> within YaCy.
+						</dd>
+					</dl>
+				</fieldset>
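
Note (illustration, not part of the patch): because the check added in Switchboard.java below applies the pattern to the full Media Type string via matches(), a hypothetical configuration that indexes only HTML and PDF documents and never images could be:

    indexMediaTypeMustMatch    = (text/html|application/pdf)
    indexMediaTypeMustNotMatch = image/.*

Both patterns are compiled with Pattern.CASE_INSENSITIVE (see getIndexMediaTypeMustMatchPattern in CrawlProfile.java below), so "TEXT/HTML" would match as well.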
diff --git a/htroot/CrawlStartExpert.java b/htroot/CrawlStartExpert.java
index 062da6b35..fc5e47af0 100644
--- a/htroot/CrawlStartExpert.java
+++ b/htroot/CrawlStartExpert.java
@@ -33,6 +33,7 @@ import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.Html2Image;
import net.yacy.crawler.data.CrawlProfile;
+import net.yacy.crawler.data.CrawlProfile.CrawlAttribute;
import net.yacy.document.LibraryProvider;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
@@ -300,6 +301,22 @@ public class CrawlStartExpert {
} else {
prop.put("indexcontentmustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
}
+
+ // Filter on Media Type of Document: must match
+ if (post != null && post.containsKey(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key)) {
+ prop.put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key,
+ post.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING));
+ } else {
+ prop.put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING);
+ }
+
+ // Filter on Media Type of Document: must-not-match
+ if (post != null && post.containsKey(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key)) {
+ prop.put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key,
+ post.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING));
+ } else {
+ prop.put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING);
+ }
// ---------- Clean-Up before Crawl Start
diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java
index 143add3dc..197f0b0d1 100644
--- a/htroot/Crawler_p.java
+++ b/htroot/Crawler_p.java
@@ -52,6 +52,7 @@ import net.yacy.crawler.CrawlSwitchboard;
import net.yacy.crawler.FileCrawlStarterTask;
import net.yacy.crawler.data.Cache;
import net.yacy.crawler.data.CrawlProfile;
+import net.yacy.crawler.data.CrawlProfile.CrawlAttribute;
import net.yacy.crawler.data.NoticedURL.StackType;
import net.yacy.crawler.retrieval.SitemapImporter;
import net.yacy.crawler.robots.RobotsTxt;
@@ -568,6 +569,12 @@ public class Crawler_p {
ignoreclassname,
new VocabularyScraper(vocabulary_scraper),
timezoneOffset);
+
+ profile.put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key,
+ post.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING));
+ profile.put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, post
+ .get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING));
+
handle = ASCII.getBytes(profile.handle());
// before we fire up a new crawl, we make sure that another crawl with the same name is not running
diff --git a/source/net/yacy/crawler/data/CrawlProfile.java b/source/net/yacy/crawler/data/CrawlProfile.java
index 392418202..a3ee1bdca 100644
--- a/source/net/yacy/crawler/data/CrawlProfile.java
+++ b/source/net/yacy/crawler/data/CrawlProfile.java
@@ -53,6 +53,7 @@ import net.yacy.crawler.CrawlSwitchboard;
import net.yacy.document.VocabularyScraper;
import net.yacy.kelondro.data.word.Word;
import net.yacy.search.query.QueryParams;
+import net.yacy.search.schema.CollectionSchema;
import net.yacy.server.serverObjects;
/**
@@ -89,6 +90,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String>
INDEXING_URL_MUSTNOTMATCH ("indexURLMustNotMatch", false, CrawlAttribute.STRING, "Indexing URL Must-Not-Match Filter"),
INDEXING_CONTENT_MUSTMATCH ("indexContentMustMatch", false, CrawlAttribute.STRING, "Indexing Content Must-Match Filter"),
INDEXING_CONTENT_MUSTNOTMATCH("indexContentMustNotMatch", false, CrawlAttribute.STRING, "Indexing Content Must-Not-Match Filter"),
+ INDEXING_MEDIA_TYPE_MUSTMATCH("indexMediaTypeMustMatch", false, CrawlAttribute.STRING, "Indexing Media Type (MIME) Must-Match Filter"),
+ INDEXING_MEDIA_TYPE_MUSTNOTMATCH("indexMediaTypeMustNotMatch", false, CrawlAttribute.STRING, "Indexing Media Type (MIME) Must-Not-Match Filter"),
RECRAWL_IF_OLDER ("recrawlIfOlder", false, CrawlAttribute.INTEGER, "Recrawl If Older"),
STORE_HTCACHE ("storeHTCache", false, CrawlAttribute.BOOLEAN, "Store in HTCache"),
CACHE_STRAGEGY ("cacheStrategy", false, CrawlAttribute.STRING, "Cache Strategy (NOCACHE,IFFRESH,IFEXIST,CACHEONLY)"),
@@ -112,7 +115,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String>
public final String key, label;
public final boolean readonly;
public final int type;
- private CrawlAttribute(String key, final boolean readonly, final int type, final String label) {
+ private CrawlAttribute(final String key, final boolean readonly, final int type, final String label) {
this.key = key;
this.readonly = readonly;
this.type = type;
@@ -131,6 +134,15 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String>
private Pattern crawlernodepthlimitmatch = null;
private Pattern indexurlmustmatch = null, indexurlmustnotmatch = null;
private Pattern indexcontentmustmatch = null, indexcontentmustnotmatch = null;
+
+ /** Pattern on the media type documents must match before being indexed
+ * @see CollectionSchema#content_type */
+ private Pattern indexMediaTypeMustMatch = null;
+
+ /** Pattern on the media type documents must not match before being indexed
+ * @see CollectionSchema#content_type */
+ private Pattern indexMediaTypeMustNotMatch = null;
+
private Pattern snapshotsMustnotmatch = null;
private final Map<String, DomProfile> doms;
@@ -247,6 +259,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String>
assert jsonString != null && jsonString.length() > 0 && jsonString.charAt(0) == '{' : "jsonString = " + jsonString;
put(CrawlAttribute.SCRAPER.key, jsonString);
put(CrawlAttribute.TIMEZONEOFFSET.key, timezoneOffset);
+ put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING);
+ put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING);
}
/**
@@ -539,6 +553,52 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String>
return this.indexcontentmustnotmatch;
}
+ /**
+ * Get the Pattern on media type that documents must match in order to be indexed
+ *
+ * @return a {@link Pattern} instance, defaulting to
+ * {@link CrawlProfile#MATCH_ALL_PATTERN} when the regular expression
+ * string is not set or its syntax is incorrect
+ */
+ public Pattern getIndexMediaTypeMustMatchPattern() {
+ if (this.indexMediaTypeMustMatch == null) {
+ /* Cache the compiled pattern to speed up subsequent calls */
+ final String patternStr = get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key);
+ try {
+ this.indexMediaTypeMustMatch = (patternStr == null
+ || patternStr.equals(CrawlProfile.MATCH_ALL_STRING)) ? CrawlProfile.MATCH_ALL_PATTERN
+ : Pattern.compile(patternStr, Pattern.CASE_INSENSITIVE);
+ } catch (final PatternSyntaxException e) {
+ this.indexMediaTypeMustMatch = CrawlProfile.MATCH_ALL_PATTERN;
+ }
+ }
+ return this.indexMediaTypeMustMatch;
+ }
+
+ /**
+ * Get the Pattern on media type that documents must not match in order to be indexed
+ *
+ * @return a {@link Pattern} instance, defaulting to
+ * {@link CrawlProfile#MATCH_NEVER_PATTERN} when the regular expression
+ * string is not set or its syntax is incorrect
+ */
+ public Pattern getIndexMediaTypeMustNotMatchPattern() {
+ if (this.indexMediaTypeMustNotMatch == null) {
+ /* Cache the compiled pattern to speed up subsequent calls */
+ final String patternStr = get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key);
+ try {
+ this.indexMediaTypeMustNotMatch = (patternStr == null
+ || patternStr.equals(CrawlProfile.MATCH_NEVER_STRING)) ? CrawlProfile.MATCH_NEVER_PATTERN
+ : Pattern.compile(patternStr, Pattern.CASE_INSENSITIVE);
+ } catch (final PatternSyntaxException e) {
+ this.indexMediaTypeMustNotMatch = CrawlProfile.MATCH_NEVER_PATTERN;
+ }
+ }
+ return this.indexMediaTypeMustNotMatch;
+ }
+
/**
* Gets depth of crawl job (or height of the tree which will be
* created by the crawler).
@@ -575,7 +635,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String>
public void setCacheStrategy(final CacheStrategy newStrategy) {
put(CrawlAttribute.CACHE_STRAGEGY.key, newStrategy.toString());
}
-
+
/**
* Gets the minimum date that an entry must have to be re-crawled.
* @return time in ms representing a date
@@ -795,6 +855,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String>
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexURLMustNotMatch", this.get(CrawlAttribute.INDEXING_URL_MUSTNOTMATCH.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexContentMustMatch", this.get(CrawlAttribute.INDEXING_CONTENT_MUSTMATCH.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexContentMustNotMatch", this.get(CrawlAttribute.INDEXING_CONTENT_MUSTNOTMATCH.key));
+ prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key, this.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key));
+ prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, this.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key));
//prop.putXML(CRAWL_PROFILE_PREFIX + count + "_mustmatch", this.urlMustMatchPattern().toString()); // TODO: remove, replace with crawlerURLMustMatch
//prop.putXML(CRAWL_PROFILE_PREFIX + count + "_mustnotmatch", this.urlMustNotMatchPattern().toString()); // TODO: remove, replace with crawlerURLMustNotMatch
//prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingIfOlder", (this.recrawlIfOlder() == 0L) ? "no re-crawl" : DateFormat.getDateTimeInstance().format(this.recrawlIfOlder())); // TODO: remove, replace with recrawlIfOlder
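
A minimal usage sketch for the two new getters (illustrative; "profile" stands for any existing CrawlProfile instance and the helper name is hypothetical):

    /** True when the given Media Type passes both of the new profile filters. */
    static boolean mediaTypeAllowsIndexing(final CrawlProfile profile, final String mime) {
        // Both getters compile lazily, cache the compiled Pattern, and fall back to
        // MATCH_ALL_PATTERN / MATCH_NEVER_PATTERN when the regex is unset or invalid.
        return profile.getIndexMediaTypeMustMatchPattern().matcher(mime).matches()
                && !profile.getIndexMediaTypeMustNotMatchPattern().matcher(mime).matches();
    }

This is the same combined condition that Switchboard.java below evaluates before adding a document to doclist.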
diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java
index 899b74e59..ba3365d5c 100644
--- a/source/net/yacy/document/Document.java
+++ b/source/net/yacy/document/Document.java
@@ -378,6 +378,9 @@ dc_rights
return this.publisher == null ? "" : this.publisher;
}
+ /**
+ * @return the Media Type (aka MIME Type) of the document
+ */
public String dc_format() {
return this.mimeType;
}
diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java
index 57914e2e3..e0b2df460 100644
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -138,6 +138,7 @@ import net.yacy.crawler.CrawlSwitchboard;
import net.yacy.crawler.HarvestProcess;
import net.yacy.crawler.data.Cache;
import net.yacy.crawler.data.CrawlProfile;
+import net.yacy.crawler.data.CrawlProfile.CrawlAttribute;
import net.yacy.crawler.data.CrawlQueues;
import net.yacy.crawler.data.NoticedURL;
import net.yacy.crawler.data.NoticedURL.StackType;
@@ -3151,6 +3152,28 @@ public final class Switchboard extends serverSwitch {
this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "indexing prevented by regular expression on content; indexContentMustMatchPattern = " + profile.indexContentMustMatchPattern().pattern() + ", indexContentMustNotMatchPattern = " + profile.indexContentMustNotMatchPattern().pattern(), -1);
continue docloop;
}
+
+ /* Check document media type (aka MIME type) */
+ final Pattern mustMatchMediaType = profile.getIndexMediaTypeMustMatchPattern();
+ final Pattern mustNotMatchMediaType = profile.getIndexMediaTypeMustNotMatchPattern();
+ if (!(mustMatchMediaType == CrawlProfile.MATCH_ALL_PATTERN
+ || mustMatchMediaType.matcher(document.dc_format()).matches())
+ || (mustNotMatchMediaType != CrawlProfile.MATCH_NEVER_PATTERN
+ && mustNotMatchMediaType.matcher(document.dc_format()).matches())) {
+ final String failReason = new StringBuilder(
+ "indexing prevented by regular expression on media type; ")
+ .append(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key).append(" = ")
+ .append(mustMatchMediaType.pattern()).append(", ")
+ .append(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key).append(" = ")
+ .append(mustNotMatchMediaType.pattern()).toString();
+ if (this.log.isInfo()) {
+ this.log.info("Not Condensed Resource '" + urls + "': " + failReason);
+ }
+ // create a new errorURL DB entry
+ this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile,
+ FailCategory.FINAL_PROCESS_CONTEXT, failReason, -1);
+ continue docloop;
+ }
doclist.add(document);
}
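
For clarity, a self-contained sketch of the rejection logic above with concrete values (example patterns assumed, must-match left at its MATCH_ALL default):

    import java.util.regex.Pattern;

    public class MediaTypeFilterDemo {
        public static void main(final String[] args) {
            final Pattern mustNotMatch = Pattern.compile("image/.*", Pattern.CASE_INSENSITIVE);
            for (final String mime : new String[] { "text/html", "image/png" }) {
                // matches() tests the whole Media Type string, as in the check above
                final boolean rejected = mustNotMatch.matcher(mime).matches();
                System.out.println(mime + " -> " + (rejected ? "errorURL" : "doclist"));
            }
        }
    }

Running it prints "text/html -> doclist" and "image/png -> errorURL", matching the behavior of the new check.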
diff --git a/source/net/yacy/search/index/ErrorCache.java b/source/net/yacy/search/index/ErrorCache.java
index fedd72f3e..0dbfe0146 100644
--- a/source/net/yacy/search/index/ErrorCache.java
+++ b/source/net/yacy/search/index/ErrorCache.java
@@ -88,14 +88,14 @@ public class ErrorCache {
}
/**
- * Adds a error document to the Solr index (marked as failed by httpstatus_i <> 200)
- * and caches recently added failed docs (up to maxStackSize = 1000)
+ * Adds an error to the cache of recently added failed docs (up to maxStackSize = 1000)
+ * and, depending on the failCategory, also stores an error document in the Solr index (marked as failed by httpstatus_i <> 200)
*
- * @param url failed url
- * @param crawldepth info crawldepth
+ * @param url failed url. Must not be null.
+ * @param crawldepth crawl depth at the time the error occurred.
* @param profile info of collection
- * @param failCategory .store to index otherwise cache only
- * @param anycause info cause-string
+ * @param failCategory the fail category: when failCategory.store is true, the error document is also stored to the Solr index; otherwise it is cached only. Must not be null.
+ * @param anycause info cause-string. Defaults to "unknown" when null.
* @param httpcode http response code
*/
public void push(final DigestURL url, final int crawldepth, final CrawlProfile profile, final FailCategory failCategory, String anycause, final int httpcode) {
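
An illustrative call matching the new Switchboard code path above (identifiers as used there, not an addition to this patch):

    this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile,
            FailCategory.FINAL_PROCESS_CONTEXT,
            "indexing prevented by regular expression on media type", -1);

Assuming FINAL_PROCESS_CONTEXT is one of the categories with store == true (as the failCategory.store note in the javadoc suggests), the rejected document is written to the Solr index as an error document and not only kept on the in-memory stack.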