added canonical filter (no indexing when a canonical URL is present and differs from the document URL)

attention: this is on by default!
(it should do the right thing)
pull/554/head
Michael Peter Christen 2 years ago
parent 5a52b01c09
commit 9fcd8f1bda

@ -1,48 +1,49 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?> <?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<crawlProfiles> <crawlProfiles>
#{crawlProfiles}# #{crawlProfiles}#
<crawlProfile> <crawlProfile>
<handle>#[handle]#</handle> <handle>#[handle]#</handle>
<name>#[name]#</name> <name>#[name]#</name>
<collections>#[collections]#</collections> <collections>#[collections]#</collections>
<agentName>#[agentName]#</agentName> <agentName>#[agentName]#</agentName>
<userAgent>#[userAgent]#</userAgent> <userAgent>#[userAgent]#</userAgent>
<depth>#[depth]#</depth> <depth>#[depth]#</depth>
<directDocByURL>#(directDocByURL)#false::true#(/directDocByURL)#</directDocByURL> <directDocByURL>#(directDocByURL)#false::true#(/directDocByURL)#</directDocByURL>
<recrawlIfOlder>#[recrawlIfOlder]#</recrawlIfOlder> <recrawlIfOlder>#[recrawlIfOlder]#</recrawlIfOlder>
<domMaxPages>#[domMaxPages]#</domMaxPages> <domMaxPages>#[domMaxPages]#</domMaxPages>
<crawlingQ>#(crawlingQ)#false::true#(/crawlingQ)#</crawlingQ> <crawlingQ>#(crawlingQ)#false::true#(/crawlingQ)#</crawlingQ>
<followFrames>#(followFrames)#false::true#(/followFrames)#</followFrames> <followFrames>#(followFrames)#false::true#(/followFrames)#</followFrames>
<obeyHtmlRobotsNoindex>#(obeyHtmlRobotsNoindex)#false::true#(/obeyHtmlRobotsNoindex)#</obeyHtmlRobotsNoindex> <obeyHtmlRobotsNoindex>#(obeyHtmlRobotsNoindex)#false::true#(/obeyHtmlRobotsNoindex)#</obeyHtmlRobotsNoindex>
<obeyHtmlRobotsNofollow>#(obeyHtmlRobotsNofollow)#false::true#(/obeyHtmlRobotsNofollow)#</obeyHtmlRobotsNofollow> <obeyHtmlRobotsNofollow>#(obeyHtmlRobotsNofollow)#false::true#(/obeyHtmlRobotsNofollow)#</obeyHtmlRobotsNofollow>
<indexText>#(indexText)#false::true#(/indexText)#</indexText> <indexText>#(indexText)#false::true#(/indexText)#</indexText>
<indexMedia>#(indexMedia)#false::true#(/indexMedia)#</indexMedia> <indexMedia>#(indexMedia)#false::true#(/indexMedia)#</indexMedia>
<storeHTCache>#(storeHTCache)#false::true#(/storeHTCache)#</storeHTCache> <storeHTCache>#(storeHTCache)#false::true#(/storeHTCache)#</storeHTCache>
<remoteIndexing>#(remoteIndexing)#false::true#(/remoteIndexing)#</remoteIndexing> <remoteIndexing>#(remoteIndexing)#false::true#(/remoteIndexing)#</remoteIndexing>
<cacheStrategy>#[cacheStrategy]#</cacheStrategy> <cacheStrategy>#[cacheStrategy]#</cacheStrategy>
<crawlerAlwaysCheckMediaType>#(crawlerAlwaysCheckMediaType)#false::true#(/crawlerAlwaysCheckMediaType)#</crawlerAlwaysCheckMediaType> <crawlerAlwaysCheckMediaType>#(crawlerAlwaysCheckMediaType)#false::true#(/crawlerAlwaysCheckMediaType)#</crawlerAlwaysCheckMediaType>
<crawlerURLMustMatch>#[crawlerURLMustMatch]#</crawlerURLMustMatch> <crawlerURLMustMatch>#[crawlerURLMustMatch]#</crawlerURLMustMatch>
<crawlerURLMustNotMatch>#[crawlerURLMustNotMatch]#</crawlerURLMustNotMatch> <crawlerURLMustNotMatch>#[crawlerURLMustNotMatch]#</crawlerURLMustNotMatch>
<crawlerOriginURLMustMatch>#[crawlerOriginURLMustMatch]#</crawlerOriginURLMustMatch> <crawlerOriginURLMustMatch>#[crawlerOriginURLMustMatch]#</crawlerOriginURLMustMatch>
<crawlerOriginURLMustNotMatch>#[crawlerOriginURLMustNotMatch]#</crawlerOriginURLMustNotMatch> <crawlerOriginURLMustNotMatch>#[crawlerOriginURLMustNotMatch]#</crawlerOriginURLMustNotMatch>
<crawlerIPMustMatch>#[crawlerIPMustMatch]#</crawlerIPMustMatch> <crawlerIPMustMatch>#[crawlerIPMustMatch]#</crawlerIPMustMatch>
<crawlerIPMustNotMatch>#[crawlerIPMustNotMatch]#</crawlerIPMustNotMatch> <crawlerIPMustNotMatch>#[crawlerIPMustNotMatch]#</crawlerIPMustNotMatch>
<crawlerCountryMustMatch>#[crawlerCountryMustMatch]#</crawlerCountryMustMatch> <crawlerCountryMustMatch>#[crawlerCountryMustMatch]#</crawlerCountryMustMatch>
<crawlerNoLimitURLMustMatch>#[crawlerNoLimitURLMustMatch]#</crawlerNoLimitURLMustMatch> <crawlerNoLimitURLMustMatch>#[crawlerNoLimitURLMustMatch]#</crawlerNoLimitURLMustMatch>
<indexURLMustMatch>#[indexURLMustMatch]#</indexURLMustMatch> <indexURLMustMatch>#[indexURLMustMatch]#</indexURLMustMatch>
<indexURLMustNotMatch>#[indexURLMustNotMatch]#</indexURLMustNotMatch> <indexURLMustNotMatch>#[indexURLMustNotMatch]#</indexURLMustNotMatch>
<indexContentMustMatch>#[indexContentMustMatch]#</indexContentMustMatch> <indexContentMustMatch>#[indexContentMustMatch]#</indexContentMustMatch>
<indexContentMustNotMatch>#[indexContentMustNotMatch]#</indexContentMustNotMatch> <indexContentMustNotMatch>#[indexContentMustNotMatch]#</indexContentMustNotMatch>
<indexMediaTypeMustMatch>#[indexMediaTypeMustMatch]#</indexMediaTypeMustMatch> <indexMediaTypeMustMatch>#[indexMediaTypeMustMatch]#</indexMediaTypeMustMatch>
<indexMediaTypeMustNotMatch>#[indexMediaTypeMustNotMatch]#</indexMediaTypeMustNotMatch> <indexMediaTypeMustNotMatch>#[indexMediaTypeMustNotMatch]#</indexMediaTypeMustNotMatch>
<indexSolrQueryMustMatch>#[indexSolrQueryMustMatch]#</indexSolrQueryMustMatch> <indexSolrQueryMustMatch>#[indexSolrQueryMustMatch]#</indexSolrQueryMustMatch>
<indexSolrQueryMustNotMatch>#[indexSolrQueryMustNotMatch]#</indexSolrQueryMustNotMatch> <indexSolrQueryMustNotMatch>#[indexSolrQueryMustNotMatch]#</indexSolrQueryMustNotMatch>
<status>#(status)#terminated::active::system#(/status)#</status> <noindexWhenCanonicalUnequalURL>#(noindexWhenCanonicalUnequalURL)#false::true#(/noindexWhenCanonicalUnequalURL)#</noindexWhenCanonicalUnequalURL>
<crawlingDomFilterContent> <status>#(status)#terminated::active::system#(/status)#</status>
#{crawlingDomFilterContent}# <crawlingDomFilterContent>
<item>#[item]#</item> #{crawlingDomFilterContent}#
#{/crawlingDomFilterContent}# <item>#[item]#</item>
</crawlingDomFilterContent> #{/crawlingDomFilterContent}#
</crawlProfile> </crawlingDomFilterContent>
</crawlProfile>
#{/crawlProfiles}# #{/crawlProfiles}#
</crawlProfiles> </crawlProfiles>

@ -412,6 +412,9 @@
<table style="border-width: 0px"> <table style="border-width: 0px">
<tr><td style="width:110px"><img src="env/grafics/plus.gif" alt=""> must-match</td><td><input name="indexmustmatch" id="indexmustmatch" type="text" size="55" maxlength="100000" value="#[indexmustmatch]#" onblur="if (this.value=='') this.value='.*';"/> (must not be empty)</td></tr> <tr><td style="width:110px"><img src="env/grafics/plus.gif" alt=""> must-match</td><td><input name="indexmustmatch" id="indexmustmatch" type="text" size="55" maxlength="100000" value="#[indexmustmatch]#" onblur="if (this.value=='') this.value='.*';"/> (must not be empty)</td></tr>
<tr><td><img src="env/grafics/minus.gif" alt=""> must-not-match</td><td><input name="indexmustnotmatch" id="indexmustnotmatch" type="text" size="55" maxlength="100000" value="#[indexmustnotmatch]#" /></td></tr> <tr><td><img src="env/grafics/minus.gif" alt=""> must-not-match</td><td><input name="indexmustnotmatch" id="indexmustnotmatch" type="text" size="55" maxlength="100000" value="#[indexmustnotmatch]#" /></td></tr>
<tr>
<td colspan="2"><input type="checkbox" name="noindexWhenCanonicalUnequalURL" id="noindexWhenCanonicalUnequalURL" #(noindexWhenCanonicalUnequalURLChecked)#::checked="checked"#(/noindexWhenCanonicalUnequalURLChecked)#/> No Indexing when Canonical present and Canonical != URL</td>
</tr>
</table> </table>
</dd> </dd>
<dt>Filter on Content of Document<br/>(all visible text, including camel-case-tokenized url and title)</dt> <dt>Filter on Content of Document<br/>(all visible text, including camel-case-tokenized url and title)</dt>
@ -470,7 +473,7 @@
<tr> <tr>
<td style="width:110px"><img src="env/grafics/minus.gif" alt=""> must-not-match</td> <td style="width:110px"><img src="env/grafics/minus.gif" alt=""> must-not-match</td>
<td> <td>
<input name="indexSolrQueryMustNotMatch" id="indexSolrQueryMustNotMatch" type="text" size="55" maxlength="100000" value="#[indexSolrQueryMustNotMatch]#" aria-describedby="indexSolrQueryInfo" /> <input name="indexSolrQueryMustNotMatch" id="indexSolrQueryMustNotMatch" type="text" size="55" maxlength="100000" value="#[indexSolrQueryMustNotMatch]#" aria-describedby="indexSolrQueryInfo" enabled="false"/>
</td> </td>
</tr> </tr>
#(/embeddedSolrConnected)# #(/embeddedSolrConnected)#

@ -488,7 +488,7 @@ public final class CrawlStacker implements WorkflowTask<Request>{
// check if ip is local ip address // check if ip is local ip address
final String urlRejectReason = this.urlInAcceptedDomain(url); final String urlRejectReason = this.urlInAcceptedDomain(url);
if (urlRejectReason != null) { if (urlRejectReason != null) {
if (CrawlStacker.log.isFine()) CrawlStacker.log.fine("denied_(" + urlRejectReason + ")"); if (CrawlStacker.log.isFine()) CrawlStacker.log.fine("URL not in accepted Domain (" + urlRejectReason + ")");
return "denied_(" + urlRejectReason + ")"; return "denied_(" + urlRejectReason + ")";
} }

@ -294,6 +294,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_DEEP_DEPTH, "3")), Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_DEEP_DEPTH, "3")),
true, true,
CrawlProfile.getRecrawlDate(Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_DAYS, "1"))*1440), CrawlProfile.getRecrawlDate(Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_DAYS, "1"))*1440),
@ -328,6 +329,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_SHALLOW_DEPTH, "1")), Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_SHALLOW_DEPTH, "1")),
true, true,
CrawlProfile.getRecrawlDate(Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_DAYS, "1"))*1440), CrawlProfile.getRecrawlDate(Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_DAYS, "1"))*1440),
@ -362,6 +364,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
Integer.parseInt(sb.getConfig(SwitchboardConstants.PROXY_PREFETCH_DEPTH, "0")), Integer.parseInt(sb.getConfig(SwitchboardConstants.PROXY_PREFETCH_DEPTH, "0")),
true, true,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE),
@ -395,6 +398,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
0, 0,
false, false,
null, null,
@ -428,6 +432,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
0, 0,
false, false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE),
@ -461,6 +466,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
0, 0,
false, false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE),
@ -502,6 +508,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
0, 0,
false, false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_GREEDY_LEARNING_TEXT_RECRAWL_CYCLE), CrawlProfile.getRecrawlDate(CRAWL_PROFILE_GREEDY_LEARNING_TEXT_RECRAWL_CYCLE),
@ -535,6 +542,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
0, 0,
false, false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE),
@ -568,6 +576,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
0, 0,
false, false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE),
@ -601,6 +610,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
0, 0,
false, false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE),
@ -637,6 +647,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
0, 0,
false, false,
null, null,

@ -352,6 +352,7 @@ public class RecrawlBusyThread extends AbstractBusyThread {
CrawlProfile.MATCH_NEVER_STRING, // indexUrlMustNotMatch CrawlProfile.MATCH_NEVER_STRING, // indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, // indexContentMustMatch CrawlProfile.MATCH_ALL_STRING, // indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, // indexContentMustNotMatch CrawlProfile.MATCH_NEVER_STRING, // indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
0, false, CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_RECRAWL_JOB_RECRAWL_CYCLE), -1, 0, false, CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_RECRAWL_JOB_RECRAWL_CYCLE), -1,
true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow, true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
true, true, true, false, -1, false, true, CrawlProfile.MATCH_NEVER_STRING, CacheStrategy.IFFRESH, true, true, true, false, -1, false, true, CrawlProfile.MATCH_NEVER_STRING, CacheStrategy.IFFRESH,

@ -115,6 +115,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
INDEXING_MEDIA_TYPE_MUSTNOTMATCH("indexMediaTypeMustNotMatch", false, CrawlAttribute.STRING, "Indexing Media Type (MIME) Must-Not-Match Filter"), INDEXING_MEDIA_TYPE_MUSTNOTMATCH("indexMediaTypeMustNotMatch", false, CrawlAttribute.STRING, "Indexing Media Type (MIME) Must-Not-Match Filter"),
INDEXING_SOLR_QUERY_MUSTMATCH("indexSolrQueryMustMatch", false, CrawlAttribute.STRING, "Indexing Solr Query Must-Match Filter"), INDEXING_SOLR_QUERY_MUSTMATCH("indexSolrQueryMustMatch", false, CrawlAttribute.STRING, "Indexing Solr Query Must-Match Filter"),
INDEXING_SOLR_QUERY_MUSTNOTMATCH("indexSolrQueryMustNotMatch", false, CrawlAttribute.STRING, "Indexing Solr Query Must-Not-Match Filter"), INDEXING_SOLR_QUERY_MUSTNOTMATCH("indexSolrQueryMustNotMatch", false, CrawlAttribute.STRING, "Indexing Solr Query Must-Not-Match Filter"),
// boolean flag (set from a boolean constructor argument and read back by comparing with Boolean.TRUE.toString()), so it is typed BOOLEAN like STORE_HTCACHE
NOINDEX_WHEN_CANONICAL_UNEQUAL_URL("noindexWhenCanonicalUnequalURL", false, CrawlAttribute.BOOLEAN, "No Indexing for Documents with Canonical != URL"),
RECRAWL_IF_OLDER ("recrawlIfOlder", false, CrawlAttribute.INTEGER, "Recrawl If Older"), RECRAWL_IF_OLDER ("recrawlIfOlder", false, CrawlAttribute.INTEGER, "Recrawl If Older"),
STORE_HTCACHE ("storeHTCache", false, CrawlAttribute.BOOLEAN, "Store in HTCache"), STORE_HTCACHE ("storeHTCache", false, CrawlAttribute.BOOLEAN, "Store in HTCache"),
CACHE_STRAGEGY ("cacheStrategy", false, CrawlAttribute.STRING, "Cache Strategy (NOCACHE,IFFRESH,IFEXIST,CACHEONLY)"), CACHE_STRAGEGY ("cacheStrategy", false, CrawlAttribute.STRING, "Cache Strategy (NOCACHE,IFFRESH,IFEXIST,CACHEONLY)"),
@ -223,6 +224,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
final String crawlerCountryMustMatch, final String crawlerNoDepthLimitMatch, final String crawlerCountryMustMatch, final String crawlerNoDepthLimitMatch,
final String indexUrlMustMatch, final String indexUrlMustNotMatch, final String indexUrlMustMatch, final String indexUrlMustNotMatch,
final String indexContentMustMatch, final String indexContentMustNotMatch, final String indexContentMustMatch, final String indexContentMustNotMatch,
final boolean noindexWhenCanonicalUnequalURL,
final int depth, final int depth,
final boolean directDocByURL, final boolean directDocByURL,
final Date recrawlIfOlder /*date*/, final Date recrawlIfOlder /*date*/,
@ -300,6 +302,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING); put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING);
put(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, CrawlProfile.SOLR_MATCH_ALL_QUERY); put(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, CrawlProfile.SOLR_MATCH_ALL_QUERY);
put(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key, CrawlProfile.SOLR_EMPTY_QUERY); put(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key, CrawlProfile.SOLR_EMPTY_QUERY);
put(CrawlAttribute.NOINDEX_WHEN_CANONICAL_UNEQUAL_URL.key, noindexWhenCanonicalUnequalURL);
} }
/** /**
@ -851,6 +854,12 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
return (r.equals(Boolean.TRUE.toString())); return (r.equals(Boolean.TRUE.toString()));
} }
/**
 * Reads the "noindexWhenCanonicalUnequalURL" attribute of this profile.
 *
 * @return true when the attribute is not set at all (the filter is on by default)
 *         or when its stored value equals "true"; false for any other stored value
 */
public boolean noindexWhenCanonicalUnequalURL() {
final String flag = get(CrawlAttribute.NOINDEX_WHEN_CANONICAL_UNEQUAL_URL.key);
// a missing attribute counts as "enabled"
return flag == null || flag.equals(Boolean.TRUE.toString());
}
public boolean storeHTCache() { public boolean storeHTCache() {
final String r = get(CrawlAttribute.STORE_HTCACHE.key); final String r = get(CrawlAttribute.STORE_HTCACHE.key);
if (r == null) return false; if (r == null) return false;
@ -997,6 +1006,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexURLMustNotMatch", this.get(CrawlAttribute.INDEXING_URL_MUSTNOTMATCH.key)); prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexURLMustNotMatch", this.get(CrawlAttribute.INDEXING_URL_MUSTNOTMATCH.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexContentMustMatch", this.get(CrawlAttribute.INDEXING_CONTENT_MUSTMATCH.key)); prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexContentMustMatch", this.get(CrawlAttribute.INDEXING_CONTENT_MUSTMATCH.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexContentMustNotMatch", this.get(CrawlAttribute.INDEXING_CONTENT_MUSTNOTMATCH.key)); prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexContentMustNotMatch", this.get(CrawlAttribute.INDEXING_CONTENT_MUSTNOTMATCH.key));
// build the property name from the attribute's key string (as the neighbouring entries do) so it matches the #(noindexWhenCanonicalUnequalURL)# template placeholder
prop.put(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.NOINDEX_WHEN_CANONICAL_UNEQUAL_URL.key, noindexWhenCanonicalUnequalURL() ? 1 : 0);
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key, this.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key)); prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key, this.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, this.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key)); prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, this.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, this.get(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key)); prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, this.get(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key));

@ -175,10 +175,10 @@ public class Response {
int p = mime.indexOf('/'); int p = mime.indexOf('/');
if (p < 0) return new String[]{mime}; if (p < 0) return new String[]{mime};
if (doctype == DT_TEXT) return new String[]{"text" + mime.substring(p)}; if (doctype == DT_TEXT) return new String[]{"text" + mime.substring(p)};
if (doctype == DT_IMAGE) return new String[]{"image" + mime.substring(p)}; if (doctype == DT_IMAGE) return new String[]{"image" + mime.substring(p)};
if (doctype == DT_AUDIO) return new String[]{"audio" + mime.substring(p)}; if (doctype == DT_AUDIO) return new String[]{"audio" + mime.substring(p)};
if (doctype == DT_MOVIE) return new String[]{"video" + mime.substring(p)}; if (doctype == DT_MOVIE) return new String[]{"video" + mime.substring(p)};
return new String[]{mime}; return new String[]{mime};
} }
public static final int QUEUE_STATE_FRESH = 0; public static final int QUEUE_STATE_FRESH = 0;
@ -235,16 +235,16 @@ public class Response {
* @return the original request that produced this response * @return the original request that produced this response
*/ */
public Request getRequest() { public Request getRequest() {
return request; return request;
} }
public ResponseHeader getResponseHeader() { public ResponseHeader getResponseHeader() {
return this.responseHeader; return this.responseHeader;
} }
public RequestHeader getRequestHeader() { public RequestHeader getRequestHeader() {
return this.requestHeader; return this.requestHeader;
} }
public boolean fromCache() { public boolean fromCache() {
return this.fromCache; return this.fromCache;
@ -260,11 +260,11 @@ public class Response {
return this.request.name(); return this.request.name();
} }
/** /**
* @return the requested URL that produced this response. When redirection(s) * @return the requested URL that produced this response. When redirection(s)
* occurred, this is not the initial URL, but the last redirection * occurred, this is not the initial URL, but the last redirection
* target. * target.
*/ */
public DigestURL url() { public DigestURL url() {
return this.request.url(); return this.request.url();
} }
@ -745,11 +745,11 @@ public class Response {
// -ranges in request // -ranges in request
// we checked that in shallStoreCache // we checked that in shallStoreCache
/* /*
* Eventually check if a parser supports the media type. Depending on the crawl * Eventually check if a parser supports the media type. Depending on the crawl
* profile, the indexingDocumentProcessor can eventually index only URL metadata * profile, the indexingDocumentProcessor can eventually index only URL metadata
* using the generic parser for unsupported media types * using the generic parser for unsupported media types
*/ */
if (this.responseHeader != null && !profile().isIndexNonParseableUrls()) { if (this.responseHeader != null && !profile().isIndexNonParseableUrls()) {
final String mimeType = this.responseHeader.getContentType(); final String mimeType = this.responseHeader.getContentType();
final String parserError = TextParser.supportsMime(mimeType); final String parserError = TextParser.supportsMime(mimeType);

@ -91,12 +91,12 @@ public class Document {
/** links to icons that belongs to the document (mapped by absolute URL) */ /** links to icons that belongs to the document (mapped by absolute URL) */
private Map<DigestURL, IconEntry> icons; private Map<DigestURL, IconEntry> icons;
/** /**
* URLs of linked data item types/classes referenced by the document (for example in * URLs of linked data item types/classes referenced by the document (for example in
* HTML with standard annotations such as RDFa, microdata, microformats or * HTML with standard annotations such as RDFa, microdata, microformats or
* JSON-LD) * JSON-LD)
*/ */
private Set<DigestURL> linkedDataTypes; private Set<DigestURL> linkedDataTypes;
private boolean resorted; private boolean resorted;
private final Set<String> languages; private final Set<String> languages;
private boolean indexingDenied; private boolean indexingDenied;
@ -131,13 +131,13 @@ public class Document {
this.parserObject = parserObject; this.parserObject = parserObject;
this.keywords = new LinkedHashSet<String>(); this.keywords = new LinkedHashSet<String>();
if (keywords != null) { if (keywords != null) {
Collections.addAll(this.keywords, keywords); Collections.addAll(this.keywords, keywords);
} }
this.titles = (titles == null) ? new ArrayList<String>(1) : titles; this.titles = (titles == null) ? new ArrayList<String>(1) : titles;
this.creator = (author == null) ? new StringBuilder(0) : new StringBuilder(author); this.creator = (author == null) ? new StringBuilder(0) : new StringBuilder(author);
this.sections = new LinkedList<String>() ; this.sections = new LinkedList<String>() ;
if (sections != null) { if (sections != null) {
Collections.addAll(this.sections, sections); Collections.addAll(this.sections, sections);
} }
this.descriptions = (abstrcts == null) ? new ArrayList<String>() : abstrcts; this.descriptions = (abstrcts == null) ? new ArrayList<String>() : abstrcts;
if (lat >= -90.0d && lat <= 90.0d && lon >= -180.0d && lon <= 180.0d) { if (lat >= -90.0d && lat <= 90.0d && lon >= -180.0d && lon <= 180.0d) {
@ -216,13 +216,21 @@ public class Document {
} }
this.scraperObject = scraper; this.scraperObject = scraper;
} }
/**
 * Looks up the canonical URL of this document via its scraper object.
 *
 * @return the value of {@link ContentScraper#getCanonical()} when the scraper
 *         object is a ContentScraper; null when it is not (including when no
 *         scraper object is set)
 */
public AnchorURL getCanonical() {
final Object scraperCandidate = this.getScraperObject();
if (scraperCandidate instanceof ContentScraper) {
return ((ContentScraper) scraperCandidate).getCanonical();
}
// only HTML content scrapers can provide a canonical link
return null;
}
public Set<String> getContentLanguages() { public Set<String> getContentLanguages() {
return this.languages; return this.languages;
} }
public String getFileName() { public String getFileName() {
return this.source.getFileName(); return this.source.getFileName();
} }
public Map<String, Set<String>> getGenericFacets() { public Map<String, Set<String>> getGenericFacets() {
@ -233,15 +241,15 @@ public class Document {
* @return true when this document is the result of a partially parsed resource, for example due to resource content size exceeding a given limit * @return true when this document is the result of a partially parsed resource, for example due to resource content size exceeding a given limit
*/ */
public boolean isPartiallyParsed() { public boolean isPartiallyParsed() {
return this.partiallyParsed; return this.partiallyParsed;
} }
/** /**
* @param partiallyParsed set to true to indicate this document is the result of a partially parsed resource, for example due to resource content size exceeding a given limit * @param partiallyParsed set to true to indicate this document is the result of a partially parsed resource, for example due to resource content size exceeding a given limit
*/ */
public void setPartiallyParsed(final boolean partiallyParsed) { public void setPartiallyParsed(final boolean partiallyParsed) {
this.partiallyParsed = partiallyParsed; this.partiallyParsed = partiallyParsed;
} }
/** /**
* compute a set of languages that this document contains * compute a set of languages that this document contains
@ -637,13 +645,13 @@ dc_rights
// we add artificial hyperlinks to the hyperlink set // we add artificial hyperlinks to the hyperlink set
// that can be calculated from given hyperlinks and imagelinks // that can be calculated from given hyperlinks and imagelinks
/* /*
* Should we also include icons ? with * Should we also include icons ? with
* this.hyperlinks.putAll(allReflinks(this.icons.keySet())); It is * this.hyperlinks.putAll(allReflinks(this.icons.keySet())); It is
* problematic as allReflinks will modify the icons set, removing those whose URL is * problematic as allReflinks will modify the icons set, removing those whose URL is
* starting with "/www" but it is not desired for icons such as * starting with "/www" but it is not desired for icons such as
* www.wikipedia.org/static/favicon/wikipedia.ico * www.wikipedia.org/static/favicon/wikipedia.ico
*/ */
this.hyperlinks.putAll(allReflinks(this.images.values())); this.hyperlinks.putAll(allReflinks(this.images.values()));
this.hyperlinks.putAll(allReflinks(this.audiolinks.keySet())); this.hyperlinks.putAll(allReflinks(this.audiolinks.keySet()));
@ -804,16 +812,16 @@ dc_rights
} }
InputStream textStream = doc.getTextStream(); InputStream textStream = doc.getTextStream();
try { try {
FileUtils.copy(textStream, (ByteArrayOutputStream) this.text); FileUtils.copy(textStream, (ByteArrayOutputStream) this.text);
} finally { } finally {
try { try {
if(textStream != null) { if(textStream != null) {
/* textStream can be a FileInputStream : we must close it to ensure releasing system resource */ /* textStream can be a FileInputStream : we must close it to ensure releasing system resource */
textStream.close(); textStream.close();
} }
} catch(IOException e) { } catch(IOException e) {
ConcurrentLog.warn("DOCUMENT", "Could not close text input stream"); ConcurrentLog.warn("DOCUMENT", "Could not close text input stream");
} }
} }
this.anchors.addAll(doc.getAnchors()); this.anchors.addAll(doc.getAnchors());
@ -826,41 +834,41 @@ dc_rights
* @return links to icons that belongs to the document (mapped by absolute URL) * @return links to icons that belongs to the document (mapped by absolute URL)
*/ */
public Map<DigestURL, IconEntry> getIcons() { public Map<DigestURL, IconEntry> getIcons() {
return icons; return icons;
} }
/** /**
* Set links to icons that belong to the document (mapped by absolute URL) * Set links to icons that belong to the document (mapped by absolute URL)
* @param icons * @param icons
*/ */
public void setIcons(final Map<DigestURL, IconEntry> icons) { public void setIcons(final Map<DigestURL, IconEntry> icons) {
/* Better to ensure now icons property will not be null */ /* Better to ensure now icons property will not be null */
if(icons != null) { if(icons != null) {
this.icons = icons; this.icons = icons;
} else { } else {
this.icons = new HashMap<>(); this.icons = new HashMap<>();
} }
} }
/** /**
* @return URLs of linked data item types/classes referenced by the document (for example in * @return URLs of linked data item types/classes referenced by the document (for example in
* HTML with standard annotations such as RDFa, microdata, microformats or * HTML with standard annotations such as RDFa, microdata, microformats or
* JSON-LD) * JSON-LD)
*/ */
public Set<DigestURL> getLinkedDataTypes() { public Set<DigestURL> getLinkedDataTypes() {
return this.linkedDataTypes; return this.linkedDataTypes;
} }
/** /**
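* @param linkedDataTypes URLs of linked data item types/classes referenced by the document; when null, the current set is cleared * @param linkedDataTypes URLs of linked data item types/classes referenced by the document; when null, the current set is cleared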
*/ */
public void setLinkedDataTypes(final Set<DigestURL> linkedDataTypes) { public void setLinkedDataTypes(final Set<DigestURL> linkedDataTypes) {
if(linkedDataTypes != null) { if(linkedDataTypes != null) {
/* Ensure non null property */ /* Ensure non null property */
this.linkedDataTypes = linkedDataTypes; this.linkedDataTypes = linkedDataTypes;
} else { } else {
this.linkedDataTypes.clear(); this.linkedDataTypes.clear();
} }
} }
@ -1034,14 +1042,14 @@ dc_rights
} catch (final IOException e) { } catch (final IOException e) {
ConcurrentLog.logException(e); ConcurrentLog.logException(e);
} finally { } finally {
try { try {
if(textStream != null) { if(textStream != null) {
/* textStream can be a FileInputStream : we must close it to ensure releasing system resource */ /* textStream can be a FileInputStream : we must close it to ensure releasing system resource */
textStream.close(); textStream.close();
} }
} catch (IOException e) { } catch (IOException e) {
ConcurrentLog.warn("DOCUMENT", "Could not close text input stream"); ConcurrentLog.warn("DOCUMENT", "Could not close text input stream");
} }
} }
} }
anchors.addAll(doc.getAnchors()); anchors.addAll(doc.getAnchors());
@ -1098,7 +1106,7 @@ dc_rights
public final static String IFRAME_MARKER = "iframe"; public final static String IFRAME_MARKER = "iframe";
public final static String FRAME_MARKER = "frame"; public final static String FRAME_MARKER = "frame";
public final static String EMBED_MARKER = "embed"; public final static String EMBED_MARKER = "embed";
public static Map<AnchorURL, String> getHyperlinks(final Document[] documents, boolean includeNofollow) { public static Map<AnchorURL, String> getHyperlinks(final Document[] documents, boolean includeNofollow) {
final Map<AnchorURL, String> result = new HashMap<>(); final Map<AnchorURL, String> result = new HashMap<>();
for (final Document d: documents) { for (final Document d: documents) {

@ -369,6 +369,13 @@ public class CrawlStartExpert {
} }
} }
// Check Canonical?
if (post == null) {
prop.put("noindexWhenCanonicalUnequalURLChecked", 1);
} else {
prop.put("noindexWhenCanonicalUnequalURLChecked",
post.getBoolean("noindexWhenCanonicalUnequalURL") ? 1 : 0);
}
// ---------- Clean-Up before Crawl Start // ---------- Clean-Up before Crawl Start
// delete if older settings: number value // delete if older settings: number value

@ -316,6 +316,7 @@ public class Crawler_p {
final String indexUrlMustNotMatch = post.get("indexmustnotmatch", CrawlProfile.MATCH_NEVER_STRING); final String indexUrlMustNotMatch = post.get("indexmustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
final String indexContentMustMatch = post.get("indexcontentmustmatch", CrawlProfile.MATCH_ALL_STRING); final String indexContentMustMatch = post.get("indexcontentmustmatch", CrawlProfile.MATCH_ALL_STRING);
final String indexContentMustNotMatch = post.get("indexcontentmustnotmatch", CrawlProfile.MATCH_NEVER_STRING); final String indexContentMustNotMatch = post.get("indexcontentmustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
final boolean noindexWhenCanonicalUnequalURL = "on".equals(post.get("noindexWhenCanonicalUnequalURL", "off"));
final boolean crawlOrder = post.get("crawlOrder", "off").equals("on"); final boolean crawlOrder = post.get("crawlOrder", "off").equals("on");
env.setConfig("crawlOrder", crawlOrder); env.setConfig("crawlOrder", crawlOrder);
@ -614,6 +615,7 @@ public class Crawler_p {
indexUrlMustNotMatch, indexUrlMustNotMatch,
indexContentMustMatch, indexContentMustMatch,
indexContentMustNotMatch, indexContentMustNotMatch,
noindexWhenCanonicalUnequalURL,
newcrawlingdepth, newcrawlingdepth,
directDocByURL, directDocByURL,
crawlingIfOlder, crawlingIfOlder,

@ -150,6 +150,7 @@ public class QuickCrawlLink_p {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
false,
CrawlingDepth, CrawlingDepth,
true, true,
CrawlProfile.getRecrawlDate(60 * 24 * 30), // recrawlIfOlder (minutes); here: one month CrawlProfile.getRecrawlDate(60 * 24 * 30), // recrawlIfOlder (minutes); here: one month

@ -3152,28 +3152,73 @@ public final class Switchboard extends serverSwitch {
return new IndexingQueueEntry(in.queueEntry, in.documents, null); return new IndexingQueueEntry(in.queueEntry, in.documents, null);
} }
} }
if (!(profile.indexUrlMustMatchPattern() == CrawlProfile.MATCH_ALL_PATTERN || profile.indexUrlMustMatchPattern().matcher(urls).matches()) ||
(profile.indexUrlMustNotMatchPattern() != CrawlProfile.MATCH_NEVER_PATTERN && profile.indexUrlMustNotMatchPattern().matcher(urls).matches())) { // check mustmatch pattern
if (this.log.isInfo()) this.log.info("Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + profile.indexUrlMustMatchPattern().pattern() + ", indexUrlMustNotMatchPattern = " + profile.indexUrlMustNotMatchPattern().pattern()); Pattern mustmatchurl = profile.indexUrlMustMatchPattern();
if (mustmatchurl != CrawlProfile.MATCH_ALL_PATTERN && !mustmatchurl.matcher(urls).matches()) {
String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + mustmatchurl.pattern();
if (this.log.isInfo()) this.log.info(info);
// create a new errorURL DB entry
this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
return new IndexingQueueEntry(in.queueEntry, in.documents, null);
}
// check mustnotmatch
Pattern mustnotmatchurl = profile.indexUrlMustNotMatchPattern();
if (mustnotmatchurl != CrawlProfile.MATCH_NEVER_PATTERN && mustnotmatchurl.matcher(urls).matches()) {
String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustNotMatchPattern = " + mustnotmatchurl;
if (this.log.isInfo()) this.log.info(info);
// create a new errorURL DB entry // create a new errorURL DB entry
this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + profile.indexUrlMustMatchPattern().pattern() + ", indexUrlMustNotMatchPattern = " + profile.indexUrlMustNotMatchPattern().pattern(), -1); this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
return new IndexingQueueEntry(in.queueEntry, in.documents, null); return new IndexingQueueEntry(in.queueEntry, in.documents, null);
} }
// check which files may take part in the indexing process // check which files may take part in the indexing process
final List<Document> doclist = new ArrayList<>(); final List<Document> doclist = new ArrayList<>();
docloop: for (final Document document : in.documents) { docloop: for (final Document document : in.documents) {
// check canonical
if (profile.noindexWhenCanonicalUnequalURL()) {
AnchorURL canonical = document.getCanonical();
DigestURL source = document.dc_source();
if (canonical != null && source != null) {
String canonical_norm = canonical.toNormalform(true);
String source_norm = source.toNormalform(true);
if (!canonical_norm.equals(source_norm)) {
String info = "Not Condensed Resource '" + urls + "': denied, canonical != source; canonical = " +canonical_norm + "; source = " + source_norm;
if (this.log.isInfo()) this.log.info(info);
// create a new errorURL DB entry
this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
continue docloop;
}
}
}
// check indexing denied flags
if (document.indexingDenied() && profile.obeyHtmlRobotsNoindex() && !this.isIntranetMode()) { if (document.indexingDenied() && profile.obeyHtmlRobotsNoindex() && !this.isIntranetMode()) {
if (this.log.isInfo()) this.log.info("Not Condensed Resource '" + urls + "': denied by document-attached noindexing rule"); if (this.log.isInfo()) this.log.info("Not Condensed Resource '" + urls + "': denied by document-attached noindexing rule");
// create a new errorURL DB entry // create a new errorURL DB entry
this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "denied by document-attached noindexing rule", -1); this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "denied by document-attached noindexing rule", -1);
continue docloop; continue docloop;
} }
if (!(profile.indexContentMustMatchPattern() == CrawlProfile.MATCH_ALL_PATTERN || profile.indexContentMustMatchPattern().matcher(document.getTextString()).matches()) ||
(profile.indexContentMustNotMatchPattern() != CrawlProfile.MATCH_NEVER_PATTERN && profile.indexContentMustNotMatchPattern().matcher(document.getTextString()).matches())) { // check content pattern must-match
if (this.log.isInfo()) this.log.info("Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustMatchPattern = " + profile.indexContentMustMatchPattern().pattern() + ", indexContentMustNotMatchPattern = " + profile.indexContentMustNotMatchPattern().pattern()); Pattern mustmatchcontent = profile.indexContentMustMatchPattern();
if (mustmatchcontent != CrawlProfile.MATCH_ALL_PATTERN && !mustmatchcontent.matcher(document.getTextString()).matches()) {
String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustMatchPattern = " + mustmatchcontent.pattern() ;
if (this.log.isInfo()) this.log.info(info);
// create a new errorURL DB entry
this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
continue docloop;
}
// check content pattern must-not-match
Pattern mustnotmatchcontent = profile.indexContentMustNotMatchPattern();
if (mustnotmatchcontent != CrawlProfile.MATCH_NEVER_PATTERN && mustnotmatchcontent.matcher(document.getTextString()).matches()) {
String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustNotMatchPattern = " + mustnotmatchcontent.pattern();
if (this.log.isInfo()) this.log.info(info);
// create a new errorURL DB entry // create a new errorURL DB entry
this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "indexing prevented by regular expression on content; indexContentMustMatchPattern = " + profile.indexContentMustMatchPattern().pattern() + ", indexContentMustNotMatchPattern = " + profile.indexContentMustNotMatchPattern().pattern(), -1); this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
continue docloop; continue docloop;
} }

Loading…
Cancel
Save