Added new crawler attribute for finer control over Media Type detection

New "Media Type detection" section in the advanced crawl start page
allow to choose between :
- not loading URLs with unknown or unsupported file extension without
checking the actual Media Type (relying Content-Type header for now).
This was the old default behavior, faster, but not really accurate.
- always cross check URL file extension against the actual Media Type.
This lets properly parse URLs ending with an apparently odd file
extension, but which have actually a supported Media Type such as
text/html.

Sample URLs with misleading file extensions were added as documentation to
the crawl start page.

fixes issue #244
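
As a minimal illustration of the choice described above (class, method and helper names below are hypothetical stand-ins, not the exact YaCy API), the loading decision for a URL with an unknown or unsupported file extension looks roughly like this:

public class MediaTypeDetectionSketch {

    /** Stand-in for TextParser.supportsExtension(url): returns null when the extension is supported. */
    static String supportsExtension(final String url) {
        return (url.endsWith(".html") || url.endsWith(".pdf")) ? null : "no parser found for extension";
    }

    /**
     * @return true when the URL should be downloaded so the actual Media Type
     *         (Content-Type header) can be checked, false when it is skipped
     *         without loading.
     */
    static boolean shouldLoad(final String url, final boolean crawlerAlwaysCheckMediaType) {
        if (supportsExtension(url) == null) {
            return true; // supported extension: loaded in both modes
        }
        // Old default: skip immediately (fast, but misses pages such as
        // https://en.wikipedia.org/wiki/Ask.com whose actual Media Type is text/html).
        // New option: load anyway and cross-check against the actual Media Type.
        return crawlerAlwaysCheckMediaType;
    }

    public static void main(final String[] args) {
        System.out.println(shouldLoad("https://en.wikipedia.org/wiki/Ask.com", false)); // false
        System.out.println(shouldLoad("https://en.wikipedia.org/wiki/Ask.com", true));  // true
    }
}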
pull/250/head
luccioman 6 years ago
parent 88d0ed676c
commit fcf6b16db4

@@ -20,6 +20,7 @@
<storeHTCache>#(storeHTCache)#false::true#(/storeHTCache)#</storeHTCache>
<remoteIndexing>#(remoteIndexing)#false::true#(/remoteIndexing)#</remoteIndexing>
<cacheStrategy>#[cacheStrategy]#</cacheStrategy>
<crawlerAlwaysCheckMediaType>#(crawlerAlwaysCheckMediaType)#false::true#(/crawlerAlwaysCheckMediaType)#</crawlerAlwaysCheckMediaType>
<crawlerURLMustMatch>#[crawlerURLMustMatch]#</crawlerURLMustMatch>
<crawlerURLMustNotMatch>#[crawlerURLMustNotMatch]#</crawlerURLMustNotMatch>
<crawlerIPMustMatch>#[crawlerIPMustMatch]#</crawlerIPMustMatch>

@@ -317,6 +317,27 @@
Obey html-robots-nofollow: <input type="checkbox" name="obeyHtmlRobotsNofollow" id="obeyHtmlRobotsNofollow" #(obeyHtmlRobotsNofollowChecked)#::checked="checked"#(/obeyHtmlRobotsNofollowChecked)# /><!--<br/>
Follow Frames: <input type="checkbox" name="followFrames" id="followFrames" #(followFramesChecked)#::checked="checked"#(/followFramesChecked)# />-->
</dd>
<dt>Media Type detection</dt>
<dd>
<div class="info" style="float:right">
<img src="env/grafics/i16.gif" width="16" height="16" alt="Media Type checking info"/>
<span style="right:0px; width:30em;" id="mediaTypeCheckingInfo">
Not loading URLs with an unsupported file extension is faster but less accurate.
Indeed, for some web resources the actual Media Type is not consistent with the URL file extension. Here are some examples:
<ul>
<li><a href="https://en.wikipedia.org/wiki/.de" target="_blank">https://en.wikipedia.org/wiki/.de</a> : the .de extension is unknown, but the actual Media Type of this page is text/html</li>
<li><a href="https://en.wikipedia.org/wiki/Ask.com" target="_blank">https://en.wikipedia.org/wiki/Ask.com</a> : the .com extension is not supported (executable file format), but the actual Media Type of this page is text/html</li>
<li><a href="https://commons.wikimedia.org/wiki/File:YaCy_logo.png" target="_blank">https://commons.wikimedia.org/wiki/File:YaCy_logo.png</a> : the .png extension is a supported image format, but the actual Media Type of this page is text/html</li>
</ul>
</span>
</div>
<label>
<input type="radio" aria-describedby="mediaTypeCheckingInfo" name="crawlerAlwaysCheckMediaType" value="false" #(crawlerAlwaysCheckMediaType)#checked="checked"::#(/crawlerAlwaysCheckMediaType)# /> Do not load URLs with an unsupported file extension
</label>
<label>
<input type="radio" name="crawlerAlwaysCheckMediaType" value="true" #(crawlerAlwaysCheckMediaType)#::checked="checked"#(/crawlerAlwaysCheckMediaType)# /> Always cross check file extension against Content-Type header
</label>
</dd>
<dt>Load Filter on URLs</dt>
<dd><span class="info" style="float:right"><img src="env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
The filter is a <b><a href="https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html" target="_blank">regular expression</a></b>.

@@ -213,6 +213,13 @@ public class CrawlStartExpert {
prop.put("obeyHtmlRobotsNoindexChecked", post.getBoolean("obeyHtmlRobotsNoindex") ? 1 : 0);
prop.put("obeyHtmlRobotsNofollowChecked", post.getBoolean("obeyHtmlRobotsNofollow") ? 1 : 0);
}
// always cross-check URL file extension against actual Media Type?
if (post == null) {
prop.put("crawlerAlwaysCheckMediaType", true);
} else {
prop.put("crawlerAlwaysCheckMediaType", post.getBoolean("crawlerAlwaysCheckMediaType"));
}
// Load Filter on URLs (range)
if (post != null && post.containsKey("range")) {

@@ -332,7 +332,7 @@ public class Crawler_p {
env.setConfig("crawlingDepth", Integer.toString(newcrawlingdepth));
if ((crawlOrder) && (newcrawlingdepth > 8)) newcrawlingdepth = 8;
boolean directDocByURL = "on".equals(post.get("directDocByURL", "off")); // catch also all linked media documents without loading them
boolean directDocByURL = "on".equals(post.get("directDocByURL", "off")); // catch also all linked media documents even when no parser is available
env.setConfig("crawlingDirectDocByURL", directDocByURL);
final String collection = post.get("collection", "user");
@@ -633,6 +633,8 @@ public class Crawler_p {
.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING));
profile.put(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, solrQueryMustMatch);
profile.put(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key, solrQueryMustNotMatch);
profile.put(CrawlAttribute.CRAWLER_ALWAYS_CHECK_MEDIA_TYPE.key,
post.getBoolean("crawlerAlwaysCheckMediaType"));
handle = ASCII.getBytes(profile.handle());

@@ -374,13 +374,20 @@ public final class CrawlStacker implements WorkflowTask<Request>{
return error;
}
// check availability of parser and maxfilesize
String warning = null;
//ContentDomain contentDomain = entry.url().getContentDomainFromExt();
if (TextParser.supportsExtension(entry.url()) != null) {
warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.NOLOAD, entry, profile, this.robots);
//if (warning != null && this.log.isFine()) this.log.logFine("CrawlStacker.stackCrawl of URL " + entry.url().toNormalform(true, false) + " - not pushed: " + warning);
return null;
if (!profile.isCrawlerAlwaysCheckMediaType() && TextParser.supportsExtension(entry.url()) != null) {
if(profile.isIndexNonParseableUrls()) {
/* Unsupported file extension and no cross-checking of Media Type: add immediately to the noload stack to index only URL metadata */
warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.NOLOAD, entry, profile, this.robots);
if (warning != null && CrawlStacker.log.isFine()) {
CrawlStacker.log.fine("CrawlStacker.stackCrawl of URL " + entry.url().toNormalform(true) + " - not pushed to " + NoticedURL.StackType.NOLOAD + " stack : " + warning);
}
return null;
}
error = "URL '" + entry.url().toString() + "' file extension is not supported and indexing of linked non-parsable documents is disabled.";
CrawlStacker.log.info(error);
return error;
}
if (global) {

@@ -96,6 +96,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
FOLLOW_FRAMES ("followFrames", false, CrawlAttribute.BOOLEAN, "Flag if frames shall be followed (no by default)"),
OBEY_HTML_ROBOTS_NOINDEX ("obeyHtmlRobotsNoindex", false, CrawlAttribute.BOOLEAN, "Obey html-robots-noindex"),
OBEY_HTML_ROBOTS_NOFOLLOW ("obeyHtmlRobotsNofollow", false, CrawlAttribute.BOOLEAN, "Obey html-robots-nofollow"),
CRAWLER_ALWAYS_CHECK_MEDIA_TYPE("crawlerAlwaysCheckMediaType", false, CrawlAttribute.BOOLEAN, "Always cross check file extension against actual Media Type"),
CRAWLER_URL_MUSTMATCH ("crawlerURLMustMatch", false, CrawlAttribute.STRING, "URL Must-Match Filter"),
CRAWLER_URL_MUSTNOTMATCH ("crawlerURLMustNotMatch", false, CrawlAttribute.STRING, "URL Must-Not-Match Filter"),
CRAWLER_IP_MUSTMATCH ("crawlerIPMustMatch", false, CrawlAttribute.STRING, "IP Must-Match Filter"),
@@ -239,6 +240,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
put(CrawlAttribute.HANDLE.key, handle);
put(CrawlAttribute.NAME.key, name);
put(CrawlAttribute.AGENT_NAME.key, userAgentName);
put(CrawlAttribute.CRAWLER_ALWAYS_CHECK_MEDIA_TYPE.key, true);
put(CrawlAttribute.CRAWLER_URL_MUSTMATCH.key, (crawlerUrlMustMatch == null) ? CrawlProfile.MATCH_ALL_STRING : crawlerUrlMustMatch);
put(CrawlAttribute.CRAWLER_URL_MUSTNOTMATCH.key, (crawlerUrlMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerUrlMustNotMatch);
put(CrawlAttribute.CRAWLER_IP_MUSTMATCH.key, (crawlerIpMustMatch == null) ? CrawlProfile.MATCH_ALL_STRING : crawlerIpMustMatch);
@@ -673,11 +675,29 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
}
}
public boolean directDocByURL() {
/**
* @return true when URLs of unsupported resources (no parser available or denied format) should
* be indexed as links (with metadata only on URL and not on content).
*/
public boolean isIndexNonParseableUrls() {
final String r = get(CrawlAttribute.DIRECT_DOC_BY_URL.key);
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
/**
* @return true when the crawler must always cross-check the URL file
* extension, if any, against the actual Media Type, even when the file
* extension is unknown or unsupported. False when the crawler should not
* load URLs with an unknown or unsupported file extension.
*/
public boolean isCrawlerAlwaysCheckMediaType() {
final String r = get(CrawlAttribute.CRAWLER_ALWAYS_CHECK_MEDIA_TYPE.key);
if (r == null) {
return false;
}
return (r.equals(Boolean.TRUE.toString()));
}
public CacheStrategy cacheStrategy() {
final String r = get(CrawlAttribute.CACHE_STRAGEGY.key);
@@ -889,7 +909,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_agentName", this.get(CrawlAttribute.AGENT_NAME.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_userAgent", this.getAgent().userAgent);
prop.put(CRAWL_PROFILE_PREFIX + count + "_depth", this.depth());
prop.put(CRAWL_PROFILE_PREFIX + count + "_directDocByURL", this.directDocByURL() ? 1 : 0);
prop.put(CRAWL_PROFILE_PREFIX + count + "_directDocByURL", this.isIndexNonParseableUrls() ? 1 : 0);
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_recrawlIfOlder", this.recrawlIfOlder() == Long.MAX_VALUE ? "eternity" : (new Date(this.recrawlIfOlder()).toString()));
prop.put(CRAWL_PROFILE_PREFIX + count + "_domMaxPages", this.domMaxPages());
//prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingDomMaxPages", (this.domMaxPages() == Integer.MAX_VALUE) ? "unlimited" : Integer.toString(this.domMaxPages())); // TODO: remove, replace with 'domMaxPages'
@@ -903,6 +923,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
prop.put(CRAWL_PROFILE_PREFIX + count + "_storeHTCache", this.storeHTCache() ? 1 : 0);
prop.put(CRAWL_PROFILE_PREFIX + count + "_remoteIndexing", this.remoteIndexing() ? 1 : 0);
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_cacheStrategy", this.get(CrawlAttribute.CACHE_STRAGEGY.key));
prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlerAlwaysCheckMediaType", this.isCrawlerAlwaysCheckMediaType());
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerURLMustMatch", this.get(CrawlAttribute.CRAWLER_URL_MUSTMATCH.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerURLMustNotMatch", this.get(CrawlAttribute.CRAWLER_URL_MUSTNOTMATCH.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_crawlerIPMustMatch", this.get(CrawlAttribute.CRAWLER_IP_MUSTMATCH.key));

@@ -55,10 +55,18 @@ public class NoticedURL {
LOCAL, GLOBAL, REMOTE, NOLOAD;
}
private Balancer coreStack; // links found by crawling to depth-1
private Balancer limitStack; // links found by crawling at target depth
private Balancer remoteStack; // links from remote crawl orders (init on demand)
private Balancer noloadStack; // links that are not passed to a loader; the index will be generated from the Request entry
/** links found by crawling to depth-1 */
private Balancer coreStack;
/** links found by crawling at target depth */
private Balancer limitStack;
/** links from remote crawl orders (init on demand) */
private Balancer remoteStack;
/** links that are not passed to a loader; the index will be generated from the Request entry */
private Balancer noloadStack;
private final File cachePath;
protected NoticedURL(

@@ -742,8 +742,12 @@ public class Response {
// -ranges in request
// we checked that in shallStoreCache
// check if document can be indexed
if (this.responseHeader != null) {
/*
* Check, if applicable, whether a parser supports the media type. Depending on
* the crawl profile, the indexingDocumentProcessor may index only URL metadata
* using the generic parser for unsupported media types
*/
if (this.responseHeader != null && !profile().isIndexNonParseableUrls()) {
final String mimeType = this.responseHeader.getContentType();
final String parserError = TextParser.supportsMime(mimeType);
if (parserError != null && TextParser.supportsExtension(url()) != null) return "no parser available: " + parserError;

@@ -241,6 +241,29 @@ public final class TextParser {
return docs;
}
/**
* Apply only the generic parser to the given content from location.
*/
public static Document[] genericParseSource(
final DigestURL location,
String mimeType,
final String charset,
final Set<String> ignoreClassNames,
final VocabularyScraper scraper,
final int timezoneOffset,
final int depth,
final byte[] content
) throws Parser.Failure {
if (AbstractParser.log.isFine()) {
AbstractParser.log.fine("Parsing '" + location + "' from byte-array, applying only the generic parser");
}
mimeType = normalizeMimeType(mimeType);
Set<Parser> idioms = new HashSet<>();
idioms.add(TextParser.genericIdiom);
return parseSource(location, mimeType, idioms, charset, ignoreClassNames, scraper, timezoneOffset, depth, content, Integer.MAX_VALUE, Long.MAX_VALUE);
}
private static Document[] parseSource(
final DigestURL location,
String mimeType,
@@ -644,7 +667,7 @@ public final class TextParser {
* @param url the given url
* @param mimeType the given mime type
* @return a list of Idiom parsers that may be appropriate for the given criteria
* @throws Parser.Failure
* @throws Parser.Failure when the file extension or the MIME type is denied
*/
private static Set<Parser> parsers(final MultiProtocolURL url, String mimeType1) throws Parser.Failure {
final Set<Parser> idioms = new LinkedHashSet<Parser>(2); // LinkedSet to maintain order (genericParser should be last)
@@ -661,7 +684,12 @@ public final class TextParser {
// check extension and add as backup (in case no, wrong or unknown/unsupported mime was supplied)
String ext = MultiProtocolURL.getFileExtension(url.getFileName());
if (ext != null && ext.length() > 0) {
if (denyExtensionx.containsKey(ext)) throw new Parser.Failure("file extension '" + ext + "' is denied (1)", url);
/* We do not throw an exception here when the media type is provided and inconsistent with the extension (if the media type is not supported, an exception has already been thrown).
* Otherwise we would reject URLs with an apparently unsupported extension but whose actual Media Type is supported (for example text/html).
* Notable example: Wikimedia Commons pages, such as https://commons.wikimedia.org/wiki/File:YaCy_logo.png */
if (denyExtensionx.containsKey(ext) && (mimeType1 == null || mimeType1.equals(mimeOf(ext)))) {
throw new Parser.Failure("file extension '" + ext + "' is denied (1)", url);
}
idiom = ext2parser.get(ext);
if (idiom != null && !idioms.containsAll(idiom)) { // use containsAll -> idiom is a Set of parser
idioms.addAll(idiom);

@@ -2071,8 +2071,12 @@ public final class Switchboard extends serverSwitch {
noIndexReason = response.shallIndexCacheForCrawler();
}
// check if the parser supports the mime type
if ( noIndexReason == null ) {
/*
* Check, if applicable, whether a parser supports the media type. Depending on
* the crawl profile, the indexingDocumentProcessor may index only URL metadata
* using the generic parser for unsupported media types
*/
if ( noIndexReason == null && !response.profile().isIndexNonParseableUrls()) {
noIndexReason = TextParser.supports(response.url(), response.getMimeType());
}
@@ -3009,18 +3013,40 @@ public final class Switchboard extends serverSwitch {
}
}
assert response.getContent() != null;
try {
// parse the document
documents =
TextParser.parseSource(
new AnchorURL(response.url()),
response.getMimeType(),
response.getCharacterEncoding(),
response.profile().ignoreDivClassName(),
response.profile().scraper(),
response.profile().timezoneOffset(),
response.depth(),
response.getContent());
final String supportError = TextParser.supports(response.url(), response.getMimeType());
if (supportError != null) {
/* No parser available or format is denied */
if(response.profile().isIndexNonParseableUrls()) {
/* Apply the generic parser and add the URL as a simple link (no content metadata) to the index */
documents = TextParser.genericParseSource(new AnchorURL(response.url()),
response.getMimeType(),
response.getCharacterEncoding(),
response.profile().ignoreDivClassName(),
response.profile().scraper(),
response.profile().timezoneOffset(),
response.depth(),
response.getContent());
} else {
this.log.warn("Resource '" + response.url().toNormalform(true) + "' is not supported. " + supportError);
// create a new errorURL DB entry
this.crawlQueues.errorURL.push(response.url(), response.depth(), response.profile(), FailCategory.FINAL_PROCESS_CONTEXT, supportError, -1);
return null;
}
} else {
// parse the document
documents =
TextParser.parseSource(
new AnchorURL(response.url()),
response.getMimeType(),
response.getCharacterEncoding(),
response.profile().ignoreDivClassName(),
response.profile().scraper(),
response.profile().timezoneOffset(),
response.depth(),
response.getContent());
}
if ( documents == null ) {
throw new Parser.Failure("Parser returned null.", response.url());
}
@@ -3070,22 +3096,39 @@ public final class Switchboard extends serverSwitch {
// get the hyperlinks
final Map<AnchorURL, String> hl = Document.getHyperlinks(documents, !response.profile().obeyHtmlRobotsNofollow());
if (response.profile().indexMedia()) {
for (Map.Entry<DigestURL, String> entry: Document.getImagelinks(documents).entrySet()) {
if (TextParser.supportsExtension(entry.getKey()) == null) hl.put(new AnchorURL(entry.getKey()), entry.getValue());
}
}
final boolean addAllLinksToCrawlStack = response.profile().isIndexNonParseableUrls() /* unsupported resources have to be indexed as pure links if no parser supports them */
|| response.profile().isCrawlerAlwaysCheckMediaType() /* the crawler must always load resources to double-check the actual Media Type even on unsupported file extensions */;
/* Handle media links */
for (Map.Entry<DigestURL, String> entry : Document.getImagelinks(documents).entrySet()) {
if (addAllLinksToCrawlStack
|| (response.profile().indexMedia() && TextParser.supportsExtension(entry.getKey()) == null)) {
hl.put(new AnchorURL(entry.getKey()), entry.getValue());
}
}
for (Map.Entry<DigestURL, String> entry : Document.getApplinks(documents).entrySet()) {
if (addAllLinksToCrawlStack
|| (response.profile().indexMedia() && TextParser.supportsExtension(entry.getKey()) == null)) {
hl.put(new AnchorURL(entry.getKey()), entry.getValue());
}
}
for (Map.Entry<DigestURL, String> entry : Document.getVideolinks(documents).entrySet()) {
if (addAllLinksToCrawlStack
|| (response.profile().indexMedia() && TextParser.supportsExtension(entry.getKey()) == null)) {
hl.put(new AnchorURL(entry.getKey()), entry.getValue());
}
}
for (Map.Entry<DigestURL, String> entry : Document.getAudiolinks(documents).entrySet()) {
if (addAllLinksToCrawlStack
|| (response.profile().indexMedia() && TextParser.supportsExtension(entry.getKey()) == null)) {
hl.put(new AnchorURL(entry.getKey()), entry.getValue());
}
}
// add all media links also to the crawl stack. They will be re-sorted to the NOLOAD queue and indexed afterwards as pure links
if (response.profile().directDocByURL()) {
for (Map.Entry<DigestURL, String> entry: Document.getImagelinks(documents).entrySet()) {
if (TextParser.supportsExtension(entry.getKey()) != null) hl.put(new AnchorURL(entry.getKey()), entry.getValue());
}
for (Map.Entry<DigestURL, String> d: Document.getApplinks(documents).entrySet()) hl.put(new AnchorURL(d.getKey()), d.getValue());
for (Map.Entry<DigestURL, String> d: Document.getVideolinks(documents).entrySet()) hl.put(new AnchorURL(d.getKey()), d.getValue());
for (Map.Entry<DigestURL, String> d: Document.getAudiolinks(documents).entrySet()) hl.put(new AnchorURL(d.getKey()), d.getValue());
}
// insert those hyperlinks to the crawler
MultiProtocolURL nextUrl;
for ( final Map.Entry<AnchorURL, String> nextEntry : hl.entrySet() ) {

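
For reference, the media-link stacking rule applied in the last Switchboard hunk above can be summarized as follows. This is a condensed sketch with boolean parameters standing in for the corresponding profile getters and parser check, not the exact YaCy code:

public class MediaLinkStackingSketch {

    /**
     * @param indexNonParseableUrls stand-in for profile.isIndexNonParseableUrls()
     * @param alwaysCheckMediaType  stand-in for profile.isCrawlerAlwaysCheckMediaType()
     * @param indexMedia            stand-in for profile.indexMedia()
     * @param extensionSupported    stand-in for TextParser.supportsExtension(url) == null
     * @return true when the media link should be added to the crawl stack
     */
    static boolean stackMediaLink(final boolean indexNonParseableUrls,
                                  final boolean alwaysCheckMediaType,
                                  final boolean indexMedia,
                                  final boolean extensionSupported) {
        // All media links are stacked when they must be indexed as pure links
        // or re-checked against their actual Media Type.
        final boolean addAllLinksToCrawlStack = indexNonParseableUrls || alwaysCheckMediaType;
        return addAllLinksToCrawlStack || (indexMedia && extensionSupported);
    }

    public static void main(final String[] args) {
        // With the old defaults (no metadata-only indexing, no Media Type cross-check),
        // a media link is stacked only when media indexing is on and a parser supports its extension.
        System.out.println(stackMediaLink(false, false, true, true));  // true
        System.out.println(stackMediaLink(false, true, false, false)); // true: always cross-check
    }
}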