Added a crawl filtering possibility on documents Media Type (MIME)

pull/167/merge
luccioman 7 years ago
parent 90d4802082
commit fb3032c530

@ -30,6 +30,8 @@
<indexURLMustNotMatch>#[indexURLMustNotMatch]#</indexURLMustNotMatch>
<indexContentMustMatch>#[indexContentMustMatch]#</indexContentMustMatch>
<indexContentMustNotMatch>#[indexContentMustNotMatch]#</indexContentMustNotMatch>
<indexMediaTypeMustMatch>#[indexMediaTypeMustMatch]#</indexMediaTypeMustMatch>
<indexMediaTypeMustNotMatch>#[indexMediaTypeMustNotMatch]#</indexMediaTypeMustNotMatch>
<status>#(status)#terminated::active::system#(/status)#</status>
<crawlingDomFilterContent>
#{crawlingDomFilterContent}#

@ -170,12 +170,14 @@
disableIf('ipMustnotmatch', defaultMatchNone);
disableIf('indexmustnotmatch', defaultMatchNone);
disableIf('indexcontentmustnotmatch', defaultMatchNone);
disableIf('indexMediaTypeMustNotMatch', defaultMatchNone);
// remove if MATCH_ALL_STRING
disableIf('mustmatch', defaultMatchAll);
disableIf('ipMustmatch', defaultMatchAll);
disableIf('indexmustmatch', defaultMatchAll);
disableIf('indexcontentmustmatch', defaultMatchAll);
disableIf('indexMediaTypeMustMatch', defaultMatchAll);
// remove default collection name
disableIf('collection', '#[defaultCollection]#');
@ -364,6 +366,28 @@
<tr><td><img src="env/grafics/minus.gif" alt=""> must-not-match</td><td><input name="indexcontentmustnotmatch" id="indexcontentmustnotmatch" type="text" size="55" maxlength="100000" value="#[indexcontentmustnotmatch]#" /></td></tr>
</table>
</dd>
<dt>Filter on Document Media Type (aka MIME type)</dt>
<dd>
<div class="info" style="float:right">
<img src="env/grafics/i16.gif" width="16" height="16" alt="Media Type filter info"/>
<span style="right:0px;" id="mediaTypeMustMatchInfo">
The filter is a <b><a href="https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html" target="_blank">regular expression</a></b>
that <b>must match</b> with the document Media Type (also known as MIME Type) to allow the URL to be indexed.
Standard Media Types are described at the <a href="https://www.iana.org/assignments/media-types/media-types.xhtml" target="_blank">IANA registry</a>.
Attention: you can test the functionality of your regular expressions using the <a href="RegexTest.html">Regular Expression Tester</a> within YaCy.
</span>
</div>
<table style="border-width: 0px">
<tr>
<td style="width:110px"><img src="env/grafics/plus.gif" alt=""> must-match</td>
<td><input name="indexMediaTypeMustMatch" id="indexMediaTypeMustMatch" type="text" size="55" maxlength="100000" value="#[indexMediaTypeMustMatch]#" aria-describedby="mediaTypeMustMatchInfo" /></td>
</tr>
<tr>
<td><img src="env/grafics/minus.gif" alt=""> must-not-match</td>
<td><input name="indexMediaTypeMustNotMatch" id="indexMediaTypeMustNotMatch" type="text" size="55" maxlength="100000" value="#[indexMediaTypeMustNotMatch]#" /></td>
</tr>
</table>
</dd>
</dl>
</fieldset>
<fieldset>

@ -33,6 +33,7 @@ import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.Html2Image;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.CrawlProfile.CrawlAttribute;
import net.yacy.document.LibraryProvider;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
@ -300,6 +301,22 @@ public class CrawlStartExpert {
} else {
prop.put("indexcontentmustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
}
// Filter on Media Type of Document: must match
if (post != null && post.containsKey(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key)) {
prop.put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key,
post.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING));
} else {
prop.put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING);
}
// Filter on Media Type of Document: must-not-match
if (post != null && post.containsKey(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key)) {
prop.put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key,
post.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING));
} else {
prop.put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING);
}
// ---------- Clean-Up before Crawl Start

@ -52,6 +52,7 @@ import net.yacy.crawler.CrawlSwitchboard;
import net.yacy.crawler.FileCrawlStarterTask;
import net.yacy.crawler.data.Cache;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.CrawlProfile.CrawlAttribute;
import net.yacy.crawler.data.NoticedURL.StackType;
import net.yacy.crawler.retrieval.SitemapImporter;
import net.yacy.crawler.robots.RobotsTxt;
@ -568,6 +569,12 @@ public class Crawler_p {
ignoreclassname,
new VocabularyScraper(vocabulary_scraper),
timezoneOffset);
profile.put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key,
post.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING));
profile.put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, post
.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING));
handle = ASCII.getBytes(profile.handle());
// before we fire up a new crawl, we make sure that another crawl with the same name is not running

@ -53,6 +53,7 @@ import net.yacy.crawler.CrawlSwitchboard;
import net.yacy.document.VocabularyScraper;
import net.yacy.kelondro.data.word.Word;
import net.yacy.search.query.QueryParams;
import net.yacy.search.schema.CollectionSchema;
import net.yacy.server.serverObjects;
/**
@ -89,6 +90,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
INDEXING_URL_MUSTNOTMATCH ("indexURLMustNotMatch", false, CrawlAttribute.STRING, "Indexing URL Must-Not-Match Filter"),
INDEXING_CONTENT_MUSTMATCH ("indexContentMustMatch", false, CrawlAttribute.STRING, "Indexing Content Must-Match Filter"),
INDEXING_CONTENT_MUSTNOTMATCH("indexContentMustNotMatch", false, CrawlAttribute.STRING, "Indexing Content Must-Not-Match Filter"),
INDEXING_MEDIA_TYPE_MUSTMATCH("indexMediaTypeMustMatch", false, CrawlAttribute.STRING, "Indexing Media Type (MIME) Must-Match Filter"),
INDEXING_MEDIA_TYPE_MUSTNOTMATCH("indexMediaTypeMustNotMatch", false, CrawlAttribute.STRING, "Indexing Media Type (MIME) Must-Not-Match Filter"),
RECRAWL_IF_OLDER ("recrawlIfOlder", false, CrawlAttribute.INTEGER, "Recrawl If Older"),
STORE_HTCACHE ("storeHTCache", false, CrawlAttribute.BOOLEAN, "Store in HTCache"),
CACHE_STRAGEGY ("cacheStrategy", false, CrawlAttribute.STRING, "Cache Strategy (NOCACHE,IFFRESH,IFEXIST,CACHEONLY)"),
@ -112,7 +115,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
public final String key, label;
public final boolean readonly;
public final int type;
private CrawlAttribute(String key, final boolean readonly, final int type, final String label) {
private CrawlAttribute(final String key, final boolean readonly, final int type, final String label) {
this.key = key;
this.readonly = readonly;
this.type = type;
@ -131,6 +134,15 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
private Pattern crawlernodepthlimitmatch = null;
private Pattern indexurlmustmatch = null, indexurlmustnotmatch = null;
private Pattern indexcontentmustmatch = null, indexcontentmustnotmatch = null;
/** Pattern on the media type documents must match before being indexed
* @see CollectionSchema#content_type */
private Pattern indexMediaTypeMustMatch = null;
/** Pattern on the media type documents must not match before being indexed
* @see CollectionSchema#content_type */
private Pattern indexMediaTypeMustNotMatch = null;
private Pattern snapshotsMustnotmatch = null;
private final Map<String, AtomicInteger> doms;
@ -247,6 +259,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
assert jsonString != null && jsonString.length() > 0 && jsonString.charAt(0) == '{' : "jsonString = " + jsonString;
put(CrawlAttribute.SCRAPER.key, jsonString);
put(CrawlAttribute.TIMEZONEOFFSET.key, timezoneOffset);
put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING);
put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING);
}
/**
@ -539,6 +553,52 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
return this.indexcontentmustnotmatch;
}
/**
 * Returns the compiled pattern that a document's media type (MIME type) has
 * to match for the document to be indexed.
 *
 * @return a {@link Pattern} instance, falling back to
 *         {@link CrawlProfile#MATCH_ALL_PATTERN} when the configured regular
 *         expression is absent or syntactically invalid
 */
public Pattern getIndexMediaTypeMustMatchPattern() {
    Pattern compiled = this.indexMediaTypeMustMatch;
    if (compiled == null) {
        // lazily compile once and memoize for subsequent calls
        final String expression = get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key);
        if (expression == null || expression.equals(CrawlProfile.MATCH_ALL_STRING)) {
            compiled = CrawlProfile.MATCH_ALL_PATTERN;
        } else {
            try {
                compiled = Pattern.compile(expression, Pattern.CASE_INSENSITIVE);
            } catch (final PatternSyntaxException e) {
                // invalid user-supplied regex: degrade to the permissive default
                compiled = CrawlProfile.MATCH_ALL_PATTERN;
            }
        }
        this.indexMediaTypeMustMatch = compiled;
    }
    return compiled;
}
/**
 * Returns the compiled pattern that a document's media type (MIME type) must
 * NOT match for the document to be indexed.
 *
 * @return a {@link Pattern} instance, falling back to
 *         {@link CrawlProfile#MATCH_NEVER_PATTERN} when the configured regular
 *         expression is absent or syntactically invalid
 */
public Pattern getIndexMediaTypeMustNotMatchPattern() {
    Pattern compiled = this.indexMediaTypeMustNotMatch;
    if (compiled == null) {
        // lazily compile once and memoize for subsequent calls
        final String expression = get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key);
        if (expression == null || expression.equals(CrawlProfile.MATCH_NEVER_STRING)) {
            compiled = CrawlProfile.MATCH_NEVER_PATTERN;
        } else {
            try {
                compiled = Pattern.compile(expression, Pattern.CASE_INSENSITIVE);
            } catch (final PatternSyntaxException e) {
                // invalid user-supplied regex: degrade to the restrictive default
                compiled = CrawlProfile.MATCH_NEVER_PATTERN;
            }
        }
        this.indexMediaTypeMustNotMatch = compiled;
    }
    return compiled;
}
/**
* Gets depth of crawl job (or height of the tree which will be
* created by the crawler).
@ -575,7 +635,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
/**
 * Sets the cache strategy for this crawl profile (one of
 * NOCACHE, IFFRESH, IFEXIST, CACHEONLY — see the CACHE_STRAGEGY attribute).
 *
 * @param newStrategy the strategy to store; must not be null
 *                    (a null value raises a NullPointerException on toString())
 */
public void setCacheStrategy(final CacheStrategy newStrategy) {
    // NOTE: the attribute constant keeps the historical "STRAGEGY" spelling;
    // renaming it would break existing stored profiles
    put(CrawlAttribute.CACHE_STRAGEGY.key, newStrategy.toString());
}
/**
* Gets the minimum date that an entry must have to be re-crawled.
* @return time in ms representing a date
@ -795,6 +855,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexURLMustNotMatch", this.get(CrawlAttribute.INDEXING_URL_MUSTNOTMATCH.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexContentMustMatch", this.get(CrawlAttribute.INDEXING_CONTENT_MUSTMATCH.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexContentMustNotMatch", this.get(CrawlAttribute.INDEXING_CONTENT_MUSTNOTMATCH.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key, this.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, this.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key));
//prop.putXML(CRAWL_PROFILE_PREFIX + count + "_mustmatch", this.urlMustMatchPattern().toString()); // TODO: remove, replace with crawlerURLMustMatch
//prop.putXML(CRAWL_PROFILE_PREFIX + count + "_mustnotmatch", this.urlMustNotMatchPattern().toString()); // TODO: remove, replace with crawlerURLMustNotMatch
//prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingIfOlder", (this.recrawlIfOlder() == 0L) ? "no re-crawl" : DateFormat.getDateTimeInstance().format(this.recrawlIfOlder())); // TODO: remove, replace with recrawlIfOlder

@ -378,6 +378,9 @@ dc_rights
return this.publisher == null ? "" : this.publisher;
}
/**
 * @return the Media Type (aka MIME Type) of the document, or an empty String
 *         when unknown — never null, consistent with the other dc_* accessors
 *         such as dc_publisher()
 */
public String dc_format() {
    // null-guard matches the visible convention of sibling accessors (e.g. dc_publisher)
    return this.mimeType == null ? "" : this.mimeType;
}

@ -138,6 +138,7 @@ import net.yacy.crawler.CrawlSwitchboard;
import net.yacy.crawler.HarvestProcess;
import net.yacy.crawler.data.Cache;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.CrawlProfile.CrawlAttribute;
import net.yacy.crawler.data.CrawlQueues;
import net.yacy.crawler.data.NoticedURL;
import net.yacy.crawler.data.NoticedURL.StackType;
@ -3151,6 +3152,28 @@ public final class Switchboard extends serverSwitch {
this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "indexing prevented by regular expression on content; indexContentMustMatchPattern = " + profile.indexContentMustMatchPattern().pattern() + ", indexContentMustNotMatchPattern = " + profile.indexContentMustNotMatchPattern().pattern(), -1);
continue docloop;
}
/* Check document media type (aka MIME type)*/
final Pattern mustMatchMediaType = profile.getIndexMediaTypeMustMatchPattern();
final Pattern mustNotMatchMediaType = profile.getIndexMediaTypeMustNotMatchPattern();
if (!(mustMatchMediaType == CrawlProfile.MATCH_ALL_PATTERN
|| mustMatchMediaType.matcher(document.dc_format()).matches())
|| (mustNotMatchMediaType != CrawlProfile.MATCH_NEVER_PATTERN
&& mustNotMatchMediaType.matcher(document.dc_format()).matches())) {
final String failReason = new StringBuilder(
"indexing prevented by regular expression on media type; indexContentMustMatchPattern = ")
.append(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH).append(" = ")
.append(mustMatchMediaType.pattern()).append(", ")
.append(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH).append(" = ")
.append(mustNotMatchMediaType.pattern()).toString();
if (this.log.isInfo()) {
this.log.info("Not Condensed Resource '" + urls + " : " + failReason);
}
// create a new errorURL DB entry
this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile,
FailCategory.FINAL_PROCESS_CONTEXT, failReason, -1);
continue docloop;
}
doclist.add(document);
}

@ -88,14 +88,14 @@ public class ErrorCache {
}
/**
* Adds a error document to the Solr index (marked as failed by httpstatus_i <> 200)
* and caches recently added failed docs (up to maxStackSize = 1000)
* Adds an error to the cache of recently added failed docs (up to maxStackSize = 1000)
* and possibly (depending on the failCategory) stores an error document to the Solr index (marked as failed by httpstatus_i <> 200)
*
* @param url failed url
* @param crawldepth info crawldepth
* @param url failed url. Must not be null.
* @param crawldepth crawl depth at the time the error occurred.
* @param profile info of collection
* @param failCategory .store to index otherwise cache only
* @param anycause info cause-string
* @param failCategory .store to index otherwise cache only. Must not be null.
* @param anycause info cause-string. Defaults to "unknown" when null.
* @param httpcode http response code
*/
public void push(final DigestURL url, final int crawldepth, final CrawlProfile profile, final FailCategory failCategory, String anycause, final int httpcode) {

Loading…
Cancel
Save