diff --git a/htroot/CrawlProfileEditor_p.xml b/htroot/CrawlProfileEditor_p.xml
index 01271eb45..4ab2b2534 100644
--- a/htroot/CrawlProfileEditor_p.xml
+++ b/htroot/CrawlProfileEditor_p.xml
@@ -20,6 +20,7 @@
   #(storeHTCache)#false::true#(/storeHTCache)#
   #(remoteIndexing)#false::true#(/remoteIndexing)#
   #[cacheStrategy]#
+  #(crawlerAlwaysCheckMediaType)#false::true#(/crawlerAlwaysCheckMediaType)#
   #[crawlerURLMustMatch]#
   #[crawlerURLMustNotMatch]#
   #[crawlerIPMustMatch]#
diff --git a/htroot/CrawlStartExpert.html b/htroot/CrawlStartExpert.html
index 37cf61224..909b45680 100644
--- a/htroot/CrawlStartExpert.html
+++ b/htroot/CrawlStartExpert.html
@@ -317,6 +317,27 @@
             Obey html-robots-nofollow:
+            Media Type detection
+                Media Type checking info
+                Not loading URLs with an unsupported file extension is faster but less accurate.
+                Indeed, for some web resources the actual Media Type is not consistent with the URL file extension. Here are some examples:
         Load Filter on URLs
             info The filter is a regular expression.
diff --git a/htroot/CrawlStartExpert.java b/htroot/CrawlStartExpert.java
index 6f463218c..54cc6b234 100644
--- a/htroot/CrawlStartExpert.java
+++ b/htroot/CrawlStartExpert.java
@@ -213,6 +213,13 @@ public class CrawlStartExpert {
             prop.put("obeyHtmlRobotsNoindexChecked", post.getBoolean("obeyHtmlRobotsNoindex") ? 1 : 0);
             prop.put("obeyHtmlRobotsNofollowChecked", post.getBoolean("obeyHtmlRobotsNofollow") ? 1 : 0);
         }
+
+        // always cross-check URL file extension against actual Media Type ?
+        if (post == null) {
+            prop.put("crawlerAlwaysCheckMediaType", true);
+        } else {
+            prop.put("crawlerAlwaysCheckMediaType", post.getBoolean("crawlerAlwaysCheckMediaType"));
+        }
 
         // Load Filter on URLs (range)
         if (post != null && post.containsKey("range")) {
diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java
index 21c3c883b..343ecfed5 100644
--- a/htroot/Crawler_p.java
+++ b/htroot/Crawler_p.java
@@ -332,7 +332,7 @@ public class Crawler_p {
         env.setConfig("crawlingDepth", Integer.toString(newcrawlingdepth));
         if ((crawlOrder) && (newcrawlingdepth > 8)) newcrawlingdepth = 8;
 
-        boolean directDocByURL = "on".equals(post.get("directDocByURL", "off")); // catch also all linked media documents without loading them
+        boolean directDocByURL = "on".equals(post.get("directDocByURL", "off")); // catch also all linked media documents even when no parser is available
         env.setConfig("crawlingDirectDocByURL", directDocByURL);
 
         final String collection = post.get("collection", "user");
@@ -633,6 +633,8 @@ public class Crawler_p {
                 .get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING));
         profile.put(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, solrQueryMustMatch);
         profile.put(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key, solrQueryMustNotMatch);
+        profile.put(CrawlAttribute.CRAWLER_ALWAYS_CHECK_MEDIA_TYPE.key,
+                post.getBoolean("crawlerAlwaysCheckMediaType"));
 
         handle = ASCII.getBytes(profile.handle());
diff --git a/source/net/yacy/crawler/CrawlStacker.java b/source/net/yacy/crawler/CrawlStacker.java
index 97d35e005..9f44933c6 100644
--- a/source/net/yacy/crawler/CrawlStacker.java
+++ b/source/net/yacy/crawler/CrawlStacker.java
@@ -374,13 +374,20 @@ public final class CrawlStacker implements WorkflowTask<Request> {
             return error;
         }
 
-        // check availability of parser and maxfilesize
         String warning = null;
-        //ContentDomain contentDomain = entry.url().getContentDomainFromExt();
-        if (TextParser.supportsExtension(entry.url()) != null) {
-            warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.NOLOAD, entry, profile, this.robots);
-            //if (warning != null && this.log.isFine()) this.log.logFine("CrawlStacker.stackCrawl of URL " + entry.url().toNormalform(true, false) + " - not pushed: " + warning);
-            return null;
+        if (!profile.isCrawlerAlwaysCheckMediaType() && TextParser.supportsExtension(entry.url()) != null) {
+            if(profile.isIndexNonParseableUrls()) {
+                /* Unsupported file extension and no cross-checking of Media Type : add immediately to the noload stack to index only URL metadata */
+                warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.NOLOAD, entry, profile, this.robots);
+                if (warning != null && CrawlStacker.log.isFine()) {
+                    CrawlStacker.log.fine("CrawlStacker.stackCrawl of URL " + entry.url().toNormalform(true) + " - not pushed to " + NoticedURL.StackType.NOLOAD + " stack : " + warning);
+                }
+                return null;
+            }
+
+            error = "URL '" + entry.url().toString() + "' file extension is not supported and indexing of linked non-parsable documents is disabled.";
+            CrawlStacker.log.info(error);
+            return error;
         }
 
         if (global) {
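Taken together, the rewritten stackCrawl branch above makes a three-way decision for a URL whose file extension has no parser. The following standalone sketch is an illustration only, not part of the patch (class and method names are invented):

    // UnsupportedExtensionDecision.java - standalone illustration of the CrawlStacker branch above
    public class UnsupportedExtensionDecision {

        /** Decision for a URL whose file extension has no parser (mirrors CrawlStacker.stackCrawl). */
        static String decide(boolean crawlerAlwaysCheckMediaType, boolean indexNonParseableUrls) {
            if (crawlerAlwaysCheckMediaType) {
                return "LOAD";   // stack normally; the actual Media Type is checked after loading
            }
            if (indexNonParseableUrls) {
                return "NOLOAD"; // push to the NOLOAD stack; only URL metadata gets indexed
            }
            return "REJECT";     // refuse the URL with an error message
        }

        public static void main(String[] args) {
            System.out.println(decide(true, false));  // LOAD
            System.out.println(decide(false, true));  // NOLOAD
            System.out.println(decide(false, false)); // REJECT
        }
    }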
diff --git a/source/net/yacy/crawler/data/CrawlProfile.java b/source/net/yacy/crawler/data/CrawlProfile.java
index 488c24de4..ded1b764b 100644
--- a/source/net/yacy/crawler/data/CrawlProfile.java
+++ b/source/net/yacy/crawler/data/CrawlProfile.java
@@ -96,6 +96,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String> {
         FOLLOW_FRAMES             ("followFrames",             false, CrawlAttribute.BOOLEAN, "Flag if frames shall be followed (no by default)"),
         OBEY_HTML_ROBOTS_NOINDEX  ("obeyHtmlRobotsNoindex",    false, CrawlAttribute.BOOLEAN, "Obey html-robots-noindex"),
         OBEY_HTML_ROBOTS_NOFOLLOW ("obeyHtmlRobotsNofollow",   false, CrawlAttribute.BOOLEAN, "Obey html-robots-nofollow"),
+        CRAWLER_ALWAYS_CHECK_MEDIA_TYPE("crawlerAlwaysCheckMediaType", false, CrawlAttribute.BOOLEAN, "Always cross check file extension against actual Media Type"),
         CRAWLER_URL_MUSTMATCH     ("crawlerURLMustMatch",      false, CrawlAttribute.STRING, "URL Must-Match Filter"),
         CRAWLER_URL_MUSTNOTMATCH  ("crawlerURLMustNotMatch",   false, CrawlAttribute.STRING, "URL Must-Not-Match Filter"),
         CRAWLER_IP_MUSTMATCH      ("crawlerIPMustMatch",       false, CrawlAttribute.STRING, "IP Must-Match Filter"),
@@ -239,6 +240,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String> {
         put(CrawlAttribute.HANDLE.key,     handle);
         put(CrawlAttribute.NAME.key,       name);
         put(CrawlAttribute.AGENT_NAME.key, userAgentName);
+        put(CrawlAttribute.CRAWLER_ALWAYS_CHECK_MEDIA_TYPE.key, true);
         put(CrawlAttribute.CRAWLER_URL_MUSTMATCH.key,    (crawlerUrlMustMatch == null) ? CrawlProfile.MATCH_ALL_STRING : crawlerUrlMustMatch);
         put(CrawlAttribute.CRAWLER_URL_MUSTNOTMATCH.key, (crawlerUrlMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerUrlMustNotMatch);
         put(CrawlAttribute.CRAWLER_IP_MUSTMATCH.key,     (crawlerIpMustMatch == null) ? CrawlProfile.MATCH_ALL_STRING : crawlerIpMustMatch);
@@ -673,11 +675,29 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String> {
         }
     }
 
-    public boolean directDocByURL() {
+    /**
+     * @return true when URLs of unsupported resources (no parser available or denied format) should
+     *         be indexed as links (with metadata on the URL only, not on the content).
+     */
+    public boolean isIndexNonParseableUrls() {
         final String r = get(CrawlAttribute.DIRECT_DOC_BY_URL.key);
         if (r == null) return false;
         return (r.equals(Boolean.TRUE.toString()));
     }
+
+    /**
+     * @return true when the crawler must always cross-check the URL file extension
+     *         (when present) against the actual Media Type, even when the file extension
+     *         is unknown or unsupported. False when the crawler should not load URLs
+     *         with an unknown or unsupported file extension.
+     */
+    public boolean isCrawlerAlwaysCheckMediaType() {
+        final String r = get(CrawlAttribute.CRAWLER_ALWAYS_CHECK_MEDIA_TYPE.key);
+        if (r == null) {
+            return false;
+        }
+        return (r.equals(Boolean.TRUE.toString()));
+    }
 
     public CacheStrategy cacheStrategy() {
         final String r = get(CrawlAttribute.CACHE_STRAGEGY.key);
@@ -889,7 +909,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String> {
         prop.putXML(CRAWL_PROFILE_PREFIX + count + "_agentName", this.get(CrawlAttribute.AGENT_NAME.key));
         prop.putXML(CRAWL_PROFILE_PREFIX + count + "_userAgent", this.getAgent().userAgent);
         prop.put(CRAWL_PROFILE_PREFIX + count + "_depth", this.depth());
-        prop.put(CRAWL_PROFILE_PREFIX + count + "_directDocByURL", this.directDocByURL() ? 1 : 0);
+        prop.put(CRAWL_PROFILE_PREFIX + count + "_directDocByURL", this.isIndexNonParseableUrls() ? 1 : 0);
         prop.putXML(CRAWL_PROFILE_PREFIX + count + "_recrawlIfOlder", this.recrawlIfOlder() == Long.MAX_VALUE ? "eternity" : (new Date(this.recrawlIfOlder()).toString()));
         prop.put(CRAWL_PROFILE_PREFIX + count + "_domMaxPages", this.domMaxPages());
         //prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingDomMaxPages", (this.domMaxPages() == Integer.MAX_VALUE) ? "unlimited" : Integer.toString(this.domMaxPages())); // TODO: remove, replace with 'domMaxPages'
@@ -903,6 +923,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String> {
         prop.put(CRAWL_PROFILE_PREFIX + count + "_storeHTCache", this.storeHTCache() ? 1 : 0);
         prop.put(CRAWL_PROFILE_PREFIX + count + "_remoteIndexing", this.remoteIndexing() ? 1 : 0);
         prop.putXML(CRAWL_PROFILE_PREFIX + count + "_cacheStrategy", this.get(CrawlAttribute.CACHE_STRAGEGY.key));
+        prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlerAlwaysCheckMediaType", this.isCrawlerAlwaysCheckMediaType());
         prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerURLMustMatch", this.get(CrawlAttribute.CRAWLER_URL_MUSTMATCH.key));
         prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerURLMustNotMatch", this.get(CrawlAttribute.CRAWLER_URL_MUSTNOTMATCH.key));
         prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerIPMustMatch", this.get(CrawlAttribute.CRAWLER_IP_MUSTMATCH.key));
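CrawlProfile keeps every attribute as a String in the map it extends, so the new boolean flag is written and read back through string conversion. A minimal standalone sketch of that round-trip using only the JDK (illustration only, not part of the patch):

    // ProfileFlagSketch.java - standalone illustration of the string-backed flag storage
    import java.util.concurrent.ConcurrentHashMap;

    public class ProfileFlagSketch {

        // same key string as CrawlAttribute.CRAWLER_ALWAYS_CHECK_MEDIA_TYPE.key in the patch
        static final String KEY = "crawlerAlwaysCheckMediaType";

        public static void main(String[] args) {
            ConcurrentHashMap<String, String> profile = new ConcurrentHashMap<>();

            // the constructor default put(CrawlAttribute.CRAWLER_ALWAYS_CHECK_MEDIA_TYPE.key, true)
            // ends up stored as a String; written explicitly here
            profile.put(KEY, Boolean.toString(true));

            // isCrawlerAlwaysCheckMediaType() reads it back the same way
            String r = profile.get(KEY);
            boolean alwaysCheck = r != null && r.equals(Boolean.TRUE.toString());
            System.out.println("crawlerAlwaysCheckMediaType=" + alwaysCheck); // true
        }
    }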
diff --git a/source/net/yacy/crawler/data/NoticedURL.java b/source/net/yacy/crawler/data/NoticedURL.java
index 35a1b5c15..32248b748 100644
--- a/source/net/yacy/crawler/data/NoticedURL.java
+++ b/source/net/yacy/crawler/data/NoticedURL.java
@@ -55,10 +55,18 @@ public class NoticedURL {
         LOCAL, GLOBAL, REMOTE, NOLOAD;
     }
 
-    private Balancer coreStack;      // links found by crawling to depth-1
-    private Balancer limitStack;     // links found by crawling at target depth
-    private Balancer remoteStack;    // links from remote crawl orders (init on demand)
-    private Balancer noloadStack;    // links that are not passed to a loader; the index will be generated from the Request entry
+    /** links found by crawling to depth-1 */
+    private Balancer coreStack;
+
+    /** links found by crawling at target depth */
+    private Balancer limitStack;
+
+    /** links from remote crawl orders (init on demand) */
+    private Balancer remoteStack;
+
+    /** links that are not passed to a loader; the index will be generated from the Request entry */
+    private Balancer noloadStack;
+
     private final File cachePath;
 
     protected NoticedURL(
diff --git a/source/net/yacy/crawler/retrieval/Response.java b/source/net/yacy/crawler/retrieval/Response.java
index 5a11aa24b..078b2f0f4 100644
--- a/source/net/yacy/crawler/retrieval/Response.java
+++ b/source/net/yacy/crawler/retrieval/Response.java
@@ -742,8 +742,12 @@ public class Response {
         // -ranges in request
         // we checked that in shallStoreCache
 
-        // check if document can be indexed
-        if (this.responseHeader != null) {
+        /*
+         * If necessary, check whether a parser supports the media type. Depending on the crawl
+         * profile, the indexingDocumentProcessor may index only URL metadata
+         * using the generic parser for unsupported media types.
+         */
+        if (this.responseHeader != null && !profile().isIndexNonParseableUrls()) {
             final String mimeType = this.responseHeader.getContentType();
             final String parserError = TextParser.supportsMime(mimeType);
             if (parserError != null && TextParser.supportsExtension(url()) != null) return "no parser available: " + parserError;
diff --git a/source/net/yacy/document/TextParser.java b/source/net/yacy/document/TextParser.java
index 3ee564da2..de1486392 100644
--- a/source/net/yacy/document/TextParser.java
+++ b/source/net/yacy/document/TextParser.java
@@ -241,6 +241,29 @@ public final class TextParser {
         return docs;
     }
 
+    /**
+     * Apply only the generic parser to the given content from location.
+     */
+    public static Document[] genericParseSource(
+            final DigestURL location,
+            String mimeType,
+            final String charset,
+            final Set<String> ignoreClassNames,
+            final VocabularyScraper scraper,
+            final int timezoneOffset,
+            final int depth,
+            final byte[] content
+    ) throws Parser.Failure {
+        if (AbstractParser.log.isFine()) {
+            AbstractParser.log.fine("Parsing '" + location + "' from byte-array, applying only the generic parser");
+        }
+        mimeType = normalizeMimeType(mimeType);
+        Set<Parser> idioms = new HashSet<>();
+        idioms.add(TextParser.genericIdiom);
+
+        return parseSource(location, mimeType, idioms, charset, ignoreClassNames, scraper, timezoneOffset, depth, content, Integer.MAX_VALUE, Long.MAX_VALUE);
+    }
+
     private static Document[] parseSource(
             final DigestURL location,
             String mimeType,
@@ -644,7 +667,7 @@
      * @param url the given url
      * @param mimeType the given mime type
      * @return a list of Idiom parsers that may be appropriate for the given criteria
-     * @throws Parser.Failure
+     * @throws Parser.Failure when the file extension or the MIME type is denied
      */
     private static Set<Parser> parsers(final MultiProtocolURL url, String mimeType1) throws Parser.Failure {
         final Set<Parser> idioms = new LinkedHashSet<Parser>(2); // LinkedSet to maintain order (genericParser should be last)
@@ -661,7 +684,12 @@
         // check extension and add as backup (in case no, wrong or unknown/unsupported mime was supplied)
         String ext = MultiProtocolURL.getFileExtension(url.getFileName());
         if (ext != null && ext.length() > 0) {
-            if (denyExtensionx.containsKey(ext)) throw new Parser.Failure("file extension '" + ext + "' is denied (1)", url);
+            /* We do not throw an exception here when the media type is provided and inconsistent with the extension (if the media type is not supported, an exception has already been thrown).
+             * Otherwise we would reject URLs with an apparently unsupported extension but whose actual Media Type is supported (for example text/html).
+             * Notable example : wikimedia commons pages, such as https://commons.wikimedia.org/wiki/File:YaCy_logo.png */
+            if (denyExtensionx.containsKey(ext) && (mimeType1 == null || mimeType1.equals(mimeOf(ext)))) {
+                throw new Parser.Failure("file extension '" + ext + "' is denied (1)", url);
+            }
             idiom = ext2parser.get(ext);
             if (idiom != null && !idioms.containsAll(idiom)) { // use containsAll -> idiom is a Set of parser
                 idioms.addAll(idiom);
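The relaxed check above rejects a denied extension only when no Media Type was supplied or when the supplied Media Type merely confirms that extension. A standalone sketch of the predicate (illustration only; the map contents and the mimeOf stand-in are invented):

    // DenyExtensionSketch.java - standalone illustration of the relaxed deny-extension test
    import java.util.Map;

    public class DenyExtensionSketch {

        // invented stand-in for TextParser.denyExtensionx; only containsKey() matters here
        static final Map<String, String> denyExtension = Map.of("png", "denied");

        // invented stand-in for TextParser.mimeOf(ext): the Media Type implied by the extension
        static String mimeOf(String ext) {
            return "png".equals(ext) ? "image/png" : "application/octet-stream";
        }

        /** Reject only when the supplied Media Type adds no information beyond the extension. */
        static boolean rejected(String ext, String mimeType) {
            return denyExtension.containsKey(ext) && (mimeType == null || mimeType.equals(mimeOf(ext)));
        }

        public static void main(String[] args) {
            // e.g. https://commons.wikimedia.org/wiki/File:YaCy_logo.png is actually served as text/html
            System.out.println(rejected("png", "text/html")); // false: the actual Media Type wins
            System.out.println(rejected("png", "image/png")); // true: the Media Type only confirms the denied extension
            System.out.println(rejected("png", null));        // true: nothing to cross-check against
        }
    }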
diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java
index 19b6223cf..bbe96ee7c 100644
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -2071,8 +2071,12 @@
             noIndexReason = response.shallIndexCacheForCrawler();
         }
 
-        // check if the parser supports the mime type
-        if ( noIndexReason == null ) {
+        /*
+         * If necessary, check whether a parser supports the media type. Depending on the crawl
+         * profile, the indexingDocumentProcessor may index only URL metadata
+         * using the generic parser for unsupported media types.
+         */
+        if ( noIndexReason == null && !response.profile().isIndexNonParseableUrls()) {
             noIndexReason = TextParser.supports(response.url(), response.getMimeType());
         }
 
@@ -3009,18 +3013,40 @@
                 }
             }
             assert response.getContent() != null;
+
             try {
-                // parse the document
-                documents =
-                    TextParser.parseSource(
-                        new AnchorURL(response.url()),
-                        response.getMimeType(),
-                        response.getCharacterEncoding(),
-                        response.profile().ignoreDivClassName(),
-                        response.profile().scraper(),
-                        response.profile().timezoneOffset(),
-                        response.depth(),
-                        response.getContent());
+                final String supportError = TextParser.supports(response.url(), response.getMimeType());
+                if (supportError != null) {
+                    /* No parser available or the format is denied */
+                    if(response.profile().isIndexNonParseableUrls()) {
+                        /* Apply the generic parser to add the URL as a simple link (no content metadata) to the index */
+                        documents = TextParser.genericParseSource(new AnchorURL(response.url()),
+                                response.getMimeType(),
+                                response.getCharacterEncoding(),
+                                response.profile().ignoreDivClassName(),
+                                response.profile().scraper(),
+                                response.profile().timezoneOffset(),
+                                response.depth(),
+                                response.getContent());
+                    } else {
+                        this.log.warn("Resource '" + response.url().toNormalform(true) + "' is not supported. " + supportError);
" + supportError); + // create a new errorURL DB entry + this.crawlQueues.errorURL.push(response.url(), response.depth(), response.profile(), FailCategory.FINAL_PROCESS_CONTEXT, supportError, -1); + return null; + } + } else { + // parse the document + documents = + TextParser.parseSource( + new AnchorURL(response.url()), + response.getMimeType(), + response.getCharacterEncoding(), + response.profile().ignoreDivClassName(), + response.profile().scraper(), + response.profile().timezoneOffset(), + response.depth(), + response.getContent()); + } if ( documents == null ) { throw new Parser.Failure("Parser returned null.", response.url()); } @@ -3070,22 +3096,39 @@ public final class Switchboard extends serverSwitch { // get the hyperlinks final Map hl = Document.getHyperlinks(documents, !response.profile().obeyHtmlRobotsNofollow()); - if (response.profile().indexMedia()) { - for (Map.Entry entry: Document.getImagelinks(documents).entrySet()) { - if (TextParser.supportsExtension(entry.getKey()) == null) hl.put(new AnchorURL(entry.getKey()), entry.getValue()); - } - } + final boolean addAllLinksToCrawlStack = response.profile().isIndexNonParseableUrls() /* unsupported resources have to be indexed as pure links if no parser support them */ + || response.profile().isCrawlerAlwaysCheckMediaType() /* the crawler must always load resources to double-check the actual Media Type even on unsupported file extensions */; + + /* Handle media links */ + + for (Map.Entry entry : Document.getImagelinks(documents).entrySet()) { + if (addAllLinksToCrawlStack + || (response.profile().indexMedia() && TextParser.supportsExtension(entry.getKey()) == null)) { + hl.put(new AnchorURL(entry.getKey()), entry.getValue()); + } + } + + for (Map.Entry entry : Document.getApplinks(documents).entrySet()) { + if (addAllLinksToCrawlStack + || (response.profile().indexMedia() && TextParser.supportsExtension(entry.getKey()) == null)) { + hl.put(new AnchorURL(entry.getKey()), entry.getValue()); + } + } + + for (Map.Entry entry : Document.getVideolinks(documents).entrySet()) { + if (addAllLinksToCrawlStack + || (response.profile().indexMedia() && TextParser.supportsExtension(entry.getKey()) == null)) { + hl.put(new AnchorURL(entry.getKey()), entry.getValue()); + } + } + + for (Map.Entry entry : Document.getAudiolinks(documents).entrySet()) { + if (addAllLinksToCrawlStack + || (response.profile().indexMedia() && TextParser.supportsExtension(entry.getKey()) == null)) { + hl.put(new AnchorURL(entry.getKey()), entry.getValue()); + } + } - // add all media links also to the crawl stack. They will be re-sorted to the NOLOAD queue and indexed afterwards as pure links - if (response.profile().directDocByURL()) { - for (Map.Entry entry: Document.getImagelinks(documents).entrySet()) { - if (TextParser.supportsExtension(entry.getKey()) != null) hl.put(new AnchorURL(entry.getKey()), entry.getValue()); - } - for (Map.Entry d: Document.getApplinks(documents).entrySet()) hl.put(new AnchorURL(d.getKey()), d.getValue()); - for (Map.Entry d: Document.getVideolinks(documents).entrySet()) hl.put(new AnchorURL(d.getKey()), d.getValue()); - for (Map.Entry d: Document.getAudiolinks(documents).entrySet()) hl.put(new AnchorURL(d.getKey()), d.getValue()); - } - // insert those hyperlinks to the crawler MultiProtocolURL nextUrl; for ( final Map.Entry nextEntry : hl.entrySet() ) {