This change adds a new crawl profile attribute, "crawlerAlwaysCheckMediaType": when enabled, the crawler also loads URLs with an unknown or unsupported file extension, so that the actual Media Type returned by the server can be cross-checked against that extension.

Not loading URLs with an unsupported file extension is faster but less accurate. Indeed, for some web resources the actual Media Type is not consistent with the URL file extension. Here are some examples:
- https://en.wikipedia.org/wiki/Ask.com : the .com extension is not supported (executable file format), but the actual Media Type of this page is text/html
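To make the inconsistency concrete, here is a small standalone sketch (plain Java, not YaCy code; the class and helper names are invented for illustration) that compares the MIME type guessed from a URL file extension with the Content-Type actually returned by the server:

```java
import java.net.HttpURLConnection;
import java.net.URL;

/** Standalone illustration only: extension-based guess vs. actual Content-Type. */
public class MediaTypeCheckDemo {

    /** Tiny extension-to-MIME table, for illustration only. */
    static String guessFromExtension(final String url) {
        if (url.endsWith(".html") || url.endsWith(".htm")) return "text/html";
        if (url.endsWith(".png")) return "image/png";
        if (url.endsWith(".com")) return "application/x-msdownload"; // executable format, unsupported by text parsers
        return null; // unknown extension
    }

    public static void main(final String[] args) throws Exception {
        final String location = "https://en.wikipedia.org/wiki/Ask.com";

        // A HEAD request is enough to read the actual Media Type from the server
        final HttpURLConnection connection = (HttpURLConnection) new URL(location).openConnection();
        connection.setRequestMethod("HEAD");

        System.out.println("Guessed from extension : " + guessFromExtension(location));
        System.out.println("Actual Content-Type    : " + connection.getContentType()); // text/html; charset=UTF-8
        // With the new crawlerAlwaysCheckMediaType attribute enabled, the crawler loads such URLs
        // and relies on the actual Media Type instead of rejecting them from the extension alone.
    }
}
```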
diff --git a/htroot/CrawlStartExpert.java b/htroot/CrawlStartExpert.java
index 6f463218c..54cc6b234 100644
--- a/htroot/CrawlStartExpert.java
+++ b/htroot/CrawlStartExpert.java
@@ -213,6 +213,13 @@ public class CrawlStartExpert {
prop.put("obeyHtmlRobotsNoindexChecked", post.getBoolean("obeyHtmlRobotsNoindex") ? 1 : 0);
prop.put("obeyHtmlRobotsNofollowChecked", post.getBoolean("obeyHtmlRobotsNofollow") ? 1 : 0);
}
+
+ // always cross-check the URL file extension against the actual Media Type?
+ if (post == null) {
+ prop.put("crawlerAlwaysCheckMediaType", true);
+ } else {
+ prop.put("crawlerAlwaysCheckMediaType", post.getBoolean("crawlerAlwaysCheckMediaType"));
+ }
// Load Filter on URLs (range)
if (post != null && post.containsKey("range")) {
diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java
index 21c3c883b..343ecfed5 100644
--- a/htroot/Crawler_p.java
+++ b/htroot/Crawler_p.java
@@ -332,7 +332,7 @@ public class Crawler_p {
env.setConfig("crawlingDepth", Integer.toString(newcrawlingdepth));
if ((crawlOrder) && (newcrawlingdepth > 8)) newcrawlingdepth = 8;
- boolean directDocByURL = "on".equals(post.get("directDocByURL", "off")); // catch also all linked media documents without loading them
+ boolean directDocByURL = "on".equals(post.get("directDocByURL", "off")); // also catch all linked media documents, even when no parser is available
env.setConfig("crawlingDirectDocByURL", directDocByURL);
final String collection = post.get("collection", "user");
@@ -633,6 +633,8 @@ public class Crawler_p {
.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING));
profile.put(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, solrQueryMustMatch);
profile.put(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key, solrQueryMustNotMatch);
+ profile.put(CrawlAttribute.CRAWLER_ALWAYS_CHECK_MEDIA_TYPE.key,
+ post.getBoolean("crawlerAlwaysCheckMediaType"));
handle = ASCII.getBytes(profile.handle());
diff --git a/source/net/yacy/crawler/CrawlStacker.java b/source/net/yacy/crawler/CrawlStacker.java
index 97d35e005..9f44933c6 100644
--- a/source/net/yacy/crawler/CrawlStacker.java
+++ b/source/net/yacy/crawler/CrawlStacker.java
@@ -374,13 +374,20 @@ public final class CrawlStacker implements WorkflowTask{
return error;
}
- // check availability of parser and maxfilesize
String warning = null;
- //ContentDomain contentDomain = entry.url().getContentDomainFromExt();
- if (TextParser.supportsExtension(entry.url()) != null) {
- warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.NOLOAD, entry, profile, this.robots);
- //if (warning != null && this.log.isFine()) this.log.logFine("CrawlStacker.stackCrawl of URL " + entry.url().toNormalform(true, false) + " - not pushed: " + warning);
- return null;
+ if (!profile.isCrawlerAlwaysCheckMediaType() && TextParser.supportsExtension(entry.url()) != null) {
+ if(profile.isIndexNonParseableUrls()) {
+ /* Unsupported file extension and no cross-checking of the Media Type: add immediately to the noload stack to index only URL metadata */
+ warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.NOLOAD, entry, profile, this.robots);
+ if (warning != null && CrawlStacker.log.isFine()) {
+ CrawlStacker.log.fine("CrawlStacker.stackCrawl of URL " + entry.url().toNormalform(true) + " - not pushed to " + NoticedURL.StackType.NOLOAD + " stack : " + warning);
+ }
+ return null;
+ }
+
+ error = "URL '" + entry.url().toString() + "' file extension is not supported and indexing of linked non-parsable documents is disabled.";
+ CrawlStacker.log.info(error);
+ return error;
}
if (global) {
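Condensed, the stacking decision implemented above reads as follows (a sketch with hypothetical names; the real code path is CrawlStacker.stackCrawl and the NoticedURL stacks):

```java
/** Decision sketch only; hypothetical names, not the actual CrawlStacker code. */
public class ExtensionDecisionSketch {

    enum Outcome { STACK_FOR_LOADING, PUSH_TO_NOLOAD, REJECT }

    static Outcome decide(final boolean extensionSupported,
                          final boolean alwaysCheckMediaType,
                          final boolean indexNonParseableUrls) {
        if (!extensionSupported && !alwaysCheckMediaType) {
            // Unsupported extension and no cross-check: either index URL metadata only, or reject the URL
            return indexNonParseableUrls ? Outcome.PUSH_TO_NOLOAD : Outcome.REJECT;
        }
        // Supported extension, or cross-check enabled: stack the URL for loading;
        // the actual Media Type is evaluated once the response is available
        return Outcome.STACK_FOR_LOADING;
    }

    public static void main(final String[] args) {
        System.out.println(decide(false, false, true));  // PUSH_TO_NOLOAD
        System.out.println(decide(false, false, false)); // REJECT
        System.out.println(decide(false, true, false));  // STACK_FOR_LOADING
    }
}
```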
diff --git a/source/net/yacy/crawler/data/CrawlProfile.java b/source/net/yacy/crawler/data/CrawlProfile.java
index 488c24de4..ded1b764b 100644
--- a/source/net/yacy/crawler/data/CrawlProfile.java
+++ b/source/net/yacy/crawler/data/CrawlProfile.java
@@ -96,6 +96,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M
FOLLOW_FRAMES ("followFrames", false, CrawlAttribute.BOOLEAN, "Flag if frames shall be followed (no by default)"),
OBEY_HTML_ROBOTS_NOINDEX ("obeyHtmlRobotsNoindex", false, CrawlAttribute.BOOLEAN, "Obey html-robots-noindex"),
OBEY_HTML_ROBOTS_NOFOLLOW ("obeyHtmlRobotsNofollow", false, CrawlAttribute.BOOLEAN, "Obey html-robots-nofollow"),
+ CRAWLER_ALWAYS_CHECK_MEDIA_TYPE("crawlerAlwaysCheckMediaType", false, CrawlAttribute.BOOLEAN, "Always cross check file extension against actual Media Type"),
CRAWLER_URL_MUSTMATCH ("crawlerURLMustMatch", false, CrawlAttribute.STRING, "URL Must-Match Filter"),
CRAWLER_URL_MUSTNOTMATCH ("crawlerURLMustNotMatch", false, CrawlAttribute.STRING, "URL Must-Not-Match Filter"),
CRAWLER_IP_MUSTMATCH ("crawlerIPMustMatch", false, CrawlAttribute.STRING, "IP Must-Match Filter"),
@@ -239,6 +240,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M
put(CrawlAttribute.HANDLE.key, handle);
put(CrawlAttribute.NAME.key, name);
put(CrawlAttribute.AGENT_NAME.key, userAgentName);
+ put(CrawlAttribute.CRAWLER_ALWAYS_CHECK_MEDIA_TYPE.key, true);
put(CrawlAttribute.CRAWLER_URL_MUSTMATCH.key, (crawlerUrlMustMatch == null) ? CrawlProfile.MATCH_ALL_STRING : crawlerUrlMustMatch);
put(CrawlAttribute.CRAWLER_URL_MUSTNOTMATCH.key, (crawlerUrlMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerUrlMustNotMatch);
put(CrawlAttribute.CRAWLER_IP_MUSTMATCH.key, (crawlerIpMustMatch == null) ? CrawlProfile.MATCH_ALL_STRING : crawlerIpMustMatch);
@@ -673,11 +675,29 @@ public class CrawlProfile extends ConcurrentHashMap implements M
}
}
- public boolean directDocByURL() {
+ /**
+ * @return true when URLs of unsupported resources (no parser available or denied format) should
+ * be indexed as links (with metadata on the URL only, not on the content).
+ */
+ public boolean isIndexNonParseableUrls() {
final String r = get(CrawlAttribute.DIRECT_DOC_BY_URL.key);
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
+
+ /**
+ * @return true when the crawler must always cross-check the URL file extension
+ * (when there is one) against the actual Media Type, even when the file extension is
+ * unknown or unsupported. False when the crawler should not load URLs
+ * with an unknown or unsupported file extension.
+ */
+ public boolean isCrawlerAlwaysCheckMediaType() {
+ final String r = get(CrawlAttribute.CRAWLER_ALWAYS_CHECK_MEDIA_TYPE.key);
+ if (r == null) {
+ return false;
+ }
+ return (r.equals(Boolean.TRUE.toString()));
+ }
public CacheStrategy cacheStrategy() {
final String r = get(CrawlAttribute.CACHE_STRAGEGY.key);
@@ -889,7 +909,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_agentName", this.get(CrawlAttribute.AGENT_NAME.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_userAgent", this.getAgent().userAgent);
prop.put(CRAWL_PROFILE_PREFIX + count + "_depth", this.depth());
- prop.put(CRAWL_PROFILE_PREFIX + count + "_directDocByURL", this.directDocByURL() ? 1 : 0);
+ prop.put(CRAWL_PROFILE_PREFIX + count + "_directDocByURL", this.isIndexNonParseableUrls() ? 1 : 0);
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_recrawlIfOlder", this.recrawlIfOlder() == Long.MAX_VALUE ? "eternity" : (new Date(this.recrawlIfOlder()).toString()));
prop.put(CRAWL_PROFILE_PREFIX + count + "_domMaxPages", this.domMaxPages());
//prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingDomMaxPages", (this.domMaxPages() == Integer.MAX_VALUE) ? "unlimited" : Integer.toString(this.domMaxPages())); // TODO: remove, replace with 'domMaxPages'
@@ -903,6 +923,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M
prop.put(CRAWL_PROFILE_PREFIX + count + "_storeHTCache", this.storeHTCache() ? 1 : 0);
prop.put(CRAWL_PROFILE_PREFIX + count + "_remoteIndexing", this.remoteIndexing() ? 1 : 0);
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_cacheStrategy", this.get(CrawlAttribute.CACHE_STRAGEGY.key));
+ prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlerAlwaysCheckMediaType", this.isCrawlerAlwaysCheckMediaType());
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerURLMustMatch", this.get(CrawlAttribute.CRAWLER_URL_MUSTMATCH.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerURLMustNotMatch", this.get(CrawlAttribute.CRAWLER_URL_MUSTNOTMATCH.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerIPMustMatch", this.get(CrawlAttribute.CRAWLER_IP_MUSTMATCH.key));
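Note the two defaults introduced here: the constructor stores the attribute as true for newly created profiles, while the accessor falls back to false when the key is absent, so crawl profiles saved before this change keep the previous behavior. A minimal sketch of that fallback pattern (plain Java, not the CrawlProfile class itself):

```java
import java.util.concurrent.ConcurrentHashMap;

/** Fallback sketch: an absent key means "legacy profile", so the old behavior is kept. */
public class BooleanAttributeDefaultSketch {

    static boolean isCrawlerAlwaysCheckMediaType(final ConcurrentHashMap<String, String> profile) {
        final String r = profile.get("crawlerAlwaysCheckMediaType");
        return r != null && r.equals(Boolean.TRUE.toString());
    }

    public static void main(final String[] args) {
        final ConcurrentHashMap<String, String> profile = new ConcurrentHashMap<>();

        // Legacy profile: key absent -> false, URLs with unsupported extensions are still skipped
        System.out.println(isCrawlerAlwaysCheckMediaType(profile)); // false

        // Newly created profile: the constructor stores true -> cross-checking becomes the default
        profile.put("crawlerAlwaysCheckMediaType", Boolean.TRUE.toString());
        System.out.println(isCrawlerAlwaysCheckMediaType(profile)); // true
    }
}
```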
diff --git a/source/net/yacy/crawler/data/NoticedURL.java b/source/net/yacy/crawler/data/NoticedURL.java
index 35a1b5c15..32248b748 100644
--- a/source/net/yacy/crawler/data/NoticedURL.java
+++ b/source/net/yacy/crawler/data/NoticedURL.java
@@ -55,10 +55,18 @@ public class NoticedURL {
LOCAL, GLOBAL, REMOTE, NOLOAD;
}
- private Balancer coreStack; // links found by crawling to depth-1
- private Balancer limitStack; // links found by crawling at target depth
- private Balancer remoteStack; // links from remote crawl orders (init on demand)
- private Balancer noloadStack; // links that are not passed to a loader; the index will be generated from the Request entry
+ /** links found by crawling to depth-1 */
+ private Balancer coreStack;
+
+ /** links found by crawling at target depth */
+ private Balancer limitStack;
+
+ /** links from remote crawl orders (init on demand) */
+ private Balancer remoteStack;
+
+ /** links that are not passed to a loader; the index will be generated from the Request entry */
+ private Balancer noloadStack;
+
private final File cachePath;
protected NoticedURL(
diff --git a/source/net/yacy/crawler/retrieval/Response.java b/source/net/yacy/crawler/retrieval/Response.java
index 5a11aa24b..078b2f0f4 100644
--- a/source/net/yacy/crawler/retrieval/Response.java
+++ b/source/net/yacy/crawler/retrieval/Response.java
@@ -742,8 +742,12 @@ public class Response {
// -ranges in request
// we checked that in shallStoreCache
- // check if document can be indexed
- if (this.responseHeader != null) {
+ /*
+ * Check here whether a parser supports the Media Type, unless the crawl profile
+ * allows indexing of non-parseable URLs: in that case the indexingDocumentProcessor
+ * may index only URL metadata, using the generic parser, for unsupported Media Types
+ */
+ if (this.responseHeader != null && !profile().isIndexNonParseableUrls()) {
final String mimeType = this.responseHeader.getContentType();
final String parserError = TextParser.supportsMime(mimeType);
if (parserError != null && TextParser.supportsExtension(url()) != null) return "no parser available: " + parserError;
diff --git a/source/net/yacy/document/TextParser.java b/source/net/yacy/document/TextParser.java
index 3ee564da2..de1486392 100644
--- a/source/net/yacy/document/TextParser.java
+++ b/source/net/yacy/document/TextParser.java
@@ -241,6 +241,29 @@ public final class TextParser {
return docs;
}
+ /**
+ * Apply only the generic parser to the given content from location.
+ */
+ public static Document[] genericParseSource(
+ final DigestURL location,
+ String mimeType,
+ final String charset,
+ final Set<String> ignoreClassNames,
+ final VocabularyScraper scraper,
+ final int timezoneOffset,
+ final int depth,
+ final byte[] content
+ ) throws Parser.Failure {
+ if (AbstractParser.log.isFine()) {
+ AbstractParser.log.fine("Parsing '" + location + "' from byte-array, applying only the generic parser");
+ }
+ mimeType = normalizeMimeType(mimeType);
+ Set<Parser> idioms = new HashSet<>();
+ idioms.add(TextParser.genericIdiom);
+
+ return parseSource(location, mimeType, idioms, charset, ignoreClassNames, scraper, timezoneOffset, depth, content, Integer.MAX_VALUE, Long.MAX_VALUE);
+ }
+
private static Document[] parseSource(
final DigestURL location,
String mimeType,
@@ -644,7 +667,7 @@ public final class TextParser {
* @param url the given url
* @param mimeType the given mime type
* @return a list of Idiom parsers that may be appropriate for the given criteria
- * @throws Parser.Failure
+ * @throws Parser.Failure when the file extension or the MIME type is denied
*/
private static Set<Parser> parsers(final MultiProtocolURL url, String mimeType1) throws Parser.Failure {
final Set<Parser> idioms = new LinkedHashSet<Parser>(2); // LinkedSet to maintain order (genericParser should be last)
@@ -661,7 +684,12 @@ public final class TextParser {
// check extension and add as backup (in case no, wrong or unknown/unsupported mime was supplied)
String ext = MultiProtocolURL.getFileExtension(url.getFileName());
if (ext != null && ext.length() > 0) {
- if (denyExtensionx.containsKey(ext)) throw new Parser.Failure("file extension '" + ext + "' is denied (1)", url);
+ /* We do not throw an exception here when the media type is provided and inconsistent with the extension (if that media type is unsupported, an exception has already been thrown).
+ * Otherwise we would reject URLs with an apparently unsupported extension but whose actual Media Type is supported (for example text/html).
+ * Notable example: Wikimedia Commons pages, such as https://commons.wikimedia.org/wiki/File:YaCy_logo.png */
+ if (denyExtensionx.containsKey(ext) && (mimeType1 == null || mimeType1.equals(mimeOf(ext)))) {
+ throw new Parser.Failure("file extension '" + ext + "' is denied (1)", url);
+ }
idiom = ext2parser.get(ext);
if (idiom != null && !idioms.containsAll(idiom)) { // use containsAll -> idiom is a Set of parser
idioms.addAll(idiom);
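In other words, a denied extension only rejects the URL when no Media Type was supplied, or when the supplied Media Type agrees with the one derived from that extension; a supported Media Type that contradicts the extension wins. A small sketch of that rule (hypothetical names, not the TextParser internals):

```java
/** Sketch of the relaxed deny rule; hypothetical names, not the TextParser internals. */
public class DenyRuleSketch {

    /** Toy extension-to-MIME mapping standing in for TextParser.mimeOf(ext). */
    static String mimeOf(final String ext) {
        if ("png".equals(ext)) return "image/png";
        if ("com".equals(ext)) return "application/x-msdownload";
        return "application/octet-stream";
    }

    /**
     * Deny only when no Media Type was supplied, or when the supplied Media Type matches
     * the one derived from the denied extension. A Media Type that contradicts the
     * extension (e.g. text/html on a .png page) lets the URL through to the parsers.
     */
    static boolean rejectDeniedExtension(final boolean extensionIsDenied,
                                         final String ext, final String suppliedMime) {
        return extensionIsDenied && (suppliedMime == null || suppliedMime.equals(mimeOf(ext)));
    }

    public static void main(final String[] args) {
        // e.g. https://commons.wikimedia.org/wiki/File:YaCy_logo.png : .png extension, text/html content
        System.out.println(rejectDeniedExtension(true, "png", "text/html")); // false -> keep parsing
        System.out.println(rejectDeniedExtension(true, "png", "image/png")); // true  -> denied
        System.out.println(rejectDeniedExtension(true, "png", null));        // true  -> denied
    }
}
```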
diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java
index 19b6223cf..bbe96ee7c 100644
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -2071,8 +2071,12 @@ public final class Switchboard extends serverSwitch {
noIndexReason = response.shallIndexCacheForCrawler();
}
- // check if the parser supports the mime type
- if ( noIndexReason == null ) {
+ /*
+ * Check here whether a parser supports the Media Type, unless the crawl profile
+ * allows indexing of non-parseable URLs: in that case the indexingDocumentProcessor
+ * may index only URL metadata, using the generic parser, for unsupported Media Types
+ */
+ if ( noIndexReason == null && !response.profile().isIndexNonParseableUrls()) {
noIndexReason = TextParser.supports(response.url(), response.getMimeType());
}
@@ -3009,18 +3013,40 @@ public final class Switchboard extends serverSwitch {
}
}
assert response.getContent() != null;
+
try {
- // parse the document
- documents =
- TextParser.parseSource(
- new AnchorURL(response.url()),
- response.getMimeType(),
- response.getCharacterEncoding(),
- response.profile().ignoreDivClassName(),
- response.profile().scraper(),
- response.profile().timezoneOffset(),
- response.depth(),
- response.getContent());
+ final String supportError = TextParser.supports(response.url(), response.getMimeType());
+ if (supportError != null) {
+ /* No parser available or format is denied */
+ if(response.profile().isIndexNonParseableUrls()) {
+ /* Apply the generic parser to add the URL as a simple link (no content metadata) to the index */
+ documents = TextParser.genericParseSource(new AnchorURL(response.url()),
+ response.getMimeType(),
+ response.getCharacterEncoding(),
+ response.profile().ignoreDivClassName(),
+ response.profile().scraper(),
+ response.profile().timezoneOffset(),
+ response.depth(),
+ response.getContent());
+ } else {
+ this.log.warn("Resource '" + response.url().toNormalform(true) + "' is not supported. " + supportError);
+ // create a new errorURL DB entry
+ this.crawlQueues.errorURL.push(response.url(), response.depth(), response.profile(), FailCategory.FINAL_PROCESS_CONTEXT, supportError, -1);
+ return null;
+ }
+ } else {
+ // parse the document
+ documents =
+ TextParser.parseSource(
+ new AnchorURL(response.url()),
+ response.getMimeType(),
+ response.getCharacterEncoding(),
+ response.profile().ignoreDivClassName(),
+ response.profile().scraper(),
+ response.profile().timezoneOffset(),
+ response.depth(),
+ response.getContent());
+ }
if ( documents == null ) {
throw new Parser.Failure("Parser returned null.", response.url());
}
@@ -3070,22 +3096,39 @@ public final class Switchboard extends serverSwitch {
// get the hyperlinks
final Map<AnchorURL, String> hl = Document.getHyperlinks(documents, !response.profile().obeyHtmlRobotsNofollow());
- if (response.profile().indexMedia()) {
- for (Map.Entry<DigestURL, String> entry: Document.getImagelinks(documents).entrySet()) {
- if (TextParser.supportsExtension(entry.getKey()) == null) hl.put(new AnchorURL(entry.getKey()), entry.getValue());
- }
- }
+ final boolean addAllLinksToCrawlStack = response.profile().isIndexNonParseableUrls() /* unsupported resources have to be indexed as pure links if no parser supports them */
+ || response.profile().isCrawlerAlwaysCheckMediaType() /* the crawler must always load resources to double-check the actual Media Type even on unsupported file extensions */;
+
+ /* Handle media links */
+
+ for (Map.Entry<DigestURL, String> entry : Document.getImagelinks(documents).entrySet()) {
+ if (addAllLinksToCrawlStack
+ || (response.profile().indexMedia() && TextParser.supportsExtension(entry.getKey()) == null)) {
+ hl.put(new AnchorURL(entry.getKey()), entry.getValue());
+ }
+ }
+
+ for (Map.Entry<DigestURL, String> entry : Document.getApplinks(documents).entrySet()) {
+ if (addAllLinksToCrawlStack
+ || (response.profile().indexMedia() && TextParser.supportsExtension(entry.getKey()) == null)) {
+ hl.put(new AnchorURL(entry.getKey()), entry.getValue());
+ }
+ }
+
+ for (Map.Entry<DigestURL, String> entry : Document.getVideolinks(documents).entrySet()) {
+ if (addAllLinksToCrawlStack
+ || (response.profile().indexMedia() && TextParser.supportsExtension(entry.getKey()) == null)) {
+ hl.put(new AnchorURL(entry.getKey()), entry.getValue());
+ }
+ }
+
+ for (Map.Entry<DigestURL, String> entry : Document.getAudiolinks(documents).entrySet()) {
+ if (addAllLinksToCrawlStack
+ || (response.profile().indexMedia() && TextParser.supportsExtension(entry.getKey()) == null)) {
+ hl.put(new AnchorURL(entry.getKey()), entry.getValue());
+ }
+ }
- // add all media links also to the crawl stack. They will be re-sorted to the NOLOAD queue and indexed afterwards as pure links
- if (response.profile().directDocByURL()) {
- for (Map.Entry<DigestURL, String> entry: Document.getImagelinks(documents).entrySet()) {
- if (TextParser.supportsExtension(entry.getKey()) != null) hl.put(new AnchorURL(entry.getKey()), entry.getValue());
- }
- for (Map.Entry<DigestURL, String> d: Document.getApplinks(documents).entrySet()) hl.put(new AnchorURL(d.getKey()), d.getValue());
- for (Map.Entry<DigestURL, String> d: Document.getVideolinks(documents).entrySet()) hl.put(new AnchorURL(d.getKey()), d.getValue());
- for (Map.Entry<DigestURL, String> d: Document.getAudiolinks(documents).entrySet()) hl.put(new AnchorURL(d.getKey()), d.getValue());
- }
-
// insert those hyperlinks to the crawler
MultiProtocolURL nextUrl;
for ( final Map.Entry<AnchorURL, String> nextEntry : hl.entrySet() ) {
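Finally, the four loops above apply one and the same predicate to image, app, video and audio links; condensed into a single standalone sketch (hypothetical names, not the Switchboard code):

```java
/** Condensed form of the media-link condition used in the loops above; hypothetical names. */
public class MediaLinkConditionSketch {

    static boolean addMediaLink(final boolean indexNonParseableUrls, final boolean alwaysCheckMediaType,
            final boolean indexMedia, final boolean extensionSupported) {
        // Add every media link when unsupported resources are indexed as pure links,
        // or when the crawler must load everything to cross-check the actual Media Type;
        // otherwise keep the previous rule: supported media only, and only when media indexing is enabled.
        final boolean addAllLinksToCrawlStack = indexNonParseableUrls || alwaysCheckMediaType;
        return addAllLinksToCrawlStack || (indexMedia && extensionSupported);
    }

    public static void main(final String[] args) {
        System.out.println(addMediaLink(false, true, false, false)); // true: cross-check enabled, load it anyway
        System.out.println(addMediaLink(false, false, true, true));  // true: previous media-indexing rule
        System.out.println(addMediaLink(false, false, true, false)); // false: unsupported extension, no cross-check
    }
}
```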