For finer control over which parsed documents can trigger an addition of
their links to the crawl stack, complementary to the existing crawl
depth parameter.
<td><input name="crawlerOriginURLMustMatch" id="crawlerOriginURLMustMatch" type="text" size="55" maxlength="100000" value="#[crawlerOriginURLMustMatch]#" onblur="if (this.value=='') this.value='.*';"/> (must not be empty)</td>
@@ -99,6 +99,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
// Crawl attribute declared with the key "crawlerAlwaysCheckMediaType", type CrawlAttribute.BOOLEAN
// and the label shown below. NOTE(review): the meaning of the `false` argument is fixed by the
// enum constructor, which is not visible in this excerpt - confirm there.
CRAWLER_ALWAYS_CHECK_MEDIA_TYPE("crawlerAlwaysCheckMediaType",false,CrawlAttribute.BOOLEAN,"Always cross check file extension against actual Media Type"),
finalbooleanaddAllLinksToCrawlStack=response.profile().isIndexNonParseableUrls()/* unsupported resources have to be indexed as pure links if no parser support them */
||response.profile().isCrawlerAlwaysCheckMediaType()/* the crawler must always load resources to double-check the actual Media Type even on unsupported file extensions */;
// enqueue the hyperlink into the pre-notice-url db
intnextdepth=nextEntry.getValue()!=null&&nextEntry.getValue().equals(Document.CANONICAL_MARKER)?response.depth():response.depth()+1;// canonical documents are on the same depth
try {
    // Build the crawl request for the link u and hand it to the crawl stacker.
    this.crawlStacker.enqueueEntry(new Request(
            response.initiator(),
            new DigestURL(u),
            response.url().hash(),
            nextEntry.getValue(),
            new Date(),
            response.profile().handle(),
            nextdepth,
            response.profile().timezoneOffset()));
} catch (final MalformedURLException e) {
    // presumably raised by new DigestURL(u) for an unparsable link; the failure is only logged
    ConcurrentLog.logException(e);
finalbooleanaddAllLinksToCrawlStack=response.profile().isIndexNonParseableUrls()/* unsupported resources have to be indexed as pure links if no parser support them */
||response.profile().isCrawlerAlwaysCheckMediaType()/* the crawler must always load resources to double-check the actual Media Type even on unsupported file extensions */;
log.info("REWRITE of url = \""+u+"\" to \""+u0+"\"");
u=u0;
}
//Matcher m = rewritePattern.matcher(u);
//if (m.matches()) u = m.replaceAll("");
// enqueue the hyperlink into the pre-notice-url db
intnextdepth=nextEntry.getValue()!=null&&nextEntry.getValue().equals(Document.CANONICAL_MARKER)?response.depth():response.depth()+1;// canonical documents are on the same depth