Not loading URLs with unsupported file extension is faster but less accurate.
Indeed, for some web resources the actual Media Type is not consistent with the URL file extension. Here are some examples:
<ul>
<li><a href="https://en.wikipedia.org/wiki/.de" target="_blank">https://en.wikipedia.org/wiki/.de</a> : the .de extension is unknown, but the actual Media Type of this page is text/html</li>
<li><a href="https://en.wikipedia.org/wiki/Ask.com" target="_blank">https://en.wikipedia.org/wiki/Ask.com</a> : the .com extension is not supported (executable file format), but the actual Media Type of this page is text/html</li>
<li><a href="https://commons.wikimedia.org/wiki/File:YaCy_logo.png" target="_blank">https://commons.wikimedia.org/wiki/File:YaCy_logo.png</a> : the .png extension is a supported image format, but the actual Media Type of this page is text/html</li>
</ul>
</span>
</div>
<label>
<input type="radio" aria-describedby="mediaTypeCheckingInfo" name="crawlerAlwaysCheckMediaType" value="false" #(crawlerAlwaysCheckMediaType)#checked="checked"::#(/crawlerAlwaysCheckMediaType)# /> Do not load URLs with an unsupported file extension
</label>
<label>
<input type="radio" name="crawlerAlwaysCheckMediaType" value="true" #(crawlerAlwaysCheckMediaType)#::checked="checked"#(/crawlerAlwaysCheckMediaType)# /> Always cross check file extension against Content-Type header
.text(function(d){/* Limit the length of nodes visible text to improve readability */returnd.name.substring(0,Math.min(d.name.length,maxTextLength));});
/* The end of long texts is wrapped in a tspan, made visible on mouse hover. */
text.append("tspan")
	.attr("class", "truncated")
	.text(function(d) {
		/* Only names longer than maxTextLength have a hidden remainder; others get an empty tspan. */
		return d.name.length > maxTextLength ? d.name.substring(maxTextLength) : "";
	});
/* Add an ellipsis to mark long texts that are truncated. */
text.append("tspan")
	.attr("class", "ellipsis")
	.text(function(d) {
		/* Names that fit within maxTextLength get an empty tspan instead of an ellipsis. */
		return d.name.length > maxTextLength ? "..." : "";
	});
"; "+"java "+System.getProperty("java.version","no-java-version")+"; "+generateLocation();// keep this before the following static initialization block as this constant is used by generateYaCyBot()
static{
generateYaCyBot("new");
@ -87,13 +94,6 @@ public class ClientIdentification {
CrawlStacker.log.fine("CrawlStacker.stackCrawl of URL "+entry.url().toNormalform(true)+" - not pushed to "+NoticedURL.StackType.NOLOAD+" stack : "+warning);
}
returnnull;
}
error="URL '"+entry.url().toString()+"' file extension is not supported and indexing of linked non-parsable documents is disabled.";
CRAWLER_ALWAYS_CHECK_MEDIA_TYPE("crawlerAlwaysCheckMediaType",false,CrawlAttribute.BOOLEAN,"Always cross check file extension against actual Media Type"),
finalbooleanaddAllLinksToCrawlStack=response.profile().isIndexNonParseableUrls()/* unsupported resources have to be indexed as pure links if no parser support them */
||response.profile().isCrawlerAlwaysCheckMediaType()/* the crawler must always load resources to double-check the actual Media Type even on unsupported file extensions */;
q.append(CollectionSchema.host_s.getSolrFieldName()).append(':').append(hostname).append(" OR ").append(CollectionSchema.host_s.getSolrFieldName()).append(':').append("www.").append(hostname);