added canonical filter

Attention: this filter is on by default!
(It should do the right thing: a document that declares a canonical URL different from its own URL is not indexed.)
pull/554/head
Michael Peter Christen 2 years ago
parent 5a52b01c09
commit 9fcd8f1bda

@ -37,6 +37,7 @@
<indexMediaTypeMustNotMatch>#[indexMediaTypeMustNotMatch]#</indexMediaTypeMustNotMatch> <indexMediaTypeMustNotMatch>#[indexMediaTypeMustNotMatch]#</indexMediaTypeMustNotMatch>
<indexSolrQueryMustMatch>#[indexSolrQueryMustMatch]#</indexSolrQueryMustMatch> <indexSolrQueryMustMatch>#[indexSolrQueryMustMatch]#</indexSolrQueryMustMatch>
<indexSolrQueryMustNotMatch>#[indexSolrQueryMustNotMatch]#</indexSolrQueryMustNotMatch> <indexSolrQueryMustNotMatch>#[indexSolrQueryMustNotMatch]#</indexSolrQueryMustNotMatch>
<noindexWhenCanonicalUnequalURL>#(noindexWhenCanonicalUnequalURL)#false::true#(/noindexWhenCanonicalUnequalURL)#</noindexWhenCanonicalUnequalURL>
<status>#(status)#terminated::active::system#(/status)#</status> <status>#(status)#terminated::active::system#(/status)#</status>
<crawlingDomFilterContent> <crawlingDomFilterContent>
#{crawlingDomFilterContent}# #{crawlingDomFilterContent}#

@ -412,6 +412,9 @@
<table style="border-width: 0px"> <table style="border-width: 0px">
<tr><td style="width:110px"><img src="env/grafics/plus.gif" alt=""> must-match</td><td><input name="indexmustmatch" id="indexmustmatch" type="text" size="55" maxlength="100000" value="#[indexmustmatch]#" onblur="if (this.value=='') this.value='.*';"/> (must not be empty)</td></tr> <tr><td style="width:110px"><img src="env/grafics/plus.gif" alt=""> must-match</td><td><input name="indexmustmatch" id="indexmustmatch" type="text" size="55" maxlength="100000" value="#[indexmustmatch]#" onblur="if (this.value=='') this.value='.*';"/> (must not be empty)</td></tr>
<tr><td><img src="env/grafics/minus.gif" alt=""> must-not-match</td><td><input name="indexmustnotmatch" id="indexmustnotmatch" type="text" size="55" maxlength="100000" value="#[indexmustnotmatch]#" /></td></tr> <tr><td><img src="env/grafics/minus.gif" alt=""> must-not-match</td><td><input name="indexmustnotmatch" id="indexmustnotmatch" type="text" size="55" maxlength="100000" value="#[indexmustnotmatch]#" /></td></tr>
<tr>
<td colspan="2"><input type="checkbox" name="noindexWhenCanonicalUnequalURL" id="noindexWhenCanonicalUnequalURL" #(noindexWhenCanonicalUnequalURLChecked)#::checked="checked"#(/noindexWhenCanonicalUnequalURLChecked)#/> No Indexing when Canonical present and Canonical != URL</td>
</tr>
</table> </table>
</dd> </dd>
<dt>Filter on Content of Document<br/>(all visible text, including camel-case-tokenized url and title)</dt> <dt>Filter on Content of Document<br/>(all visible text, including camel-case-tokenized url and title)</dt>
@ -470,7 +473,7 @@
<tr> <tr>
<td style="width:110px"><img src="env/grafics/minus.gif" alt=""> must-not-match</td> <td style="width:110px"><img src="env/grafics/minus.gif" alt=""> must-not-match</td>
<td> <td>
<input name="indexSolrQueryMustNotMatch" id="indexSolrQueryMustNotMatch" type="text" size="55" maxlength="100000" value="#[indexSolrQueryMustNotMatch]#" aria-describedby="indexSolrQueryInfo" /> <input name="indexSolrQueryMustNotMatch" id="indexSolrQueryMustNotMatch" type="text" size="55" maxlength="100000" value="#[indexSolrQueryMustNotMatch]#" aria-describedby="indexSolrQueryInfo" enabled="false"/>
</td> </td>
</tr> </tr>
#(/embeddedSolrConnected)# #(/embeddedSolrConnected)#

@ -488,7 +488,7 @@ public final class CrawlStacker implements WorkflowTask<Request>{
// check if ip is local ip address // check if ip is local ip address
final String urlRejectReason = this.urlInAcceptedDomain(url); final String urlRejectReason = this.urlInAcceptedDomain(url);
if (urlRejectReason != null) { if (urlRejectReason != null) {
if (CrawlStacker.log.isFine()) CrawlStacker.log.fine("denied_(" + urlRejectReason + ")"); if (CrawlStacker.log.isFine()) CrawlStacker.log.fine("URL not in accepted Domain (" + urlRejectReason + ")");
return "denied_(" + urlRejectReason + ")"; return "denied_(" + urlRejectReason + ")";
} }

@ -294,6 +294,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_DEEP_DEPTH, "3")), Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_DEEP_DEPTH, "3")),
true, true,
CrawlProfile.getRecrawlDate(Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_DAYS, "1"))*1440), CrawlProfile.getRecrawlDate(Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_DAYS, "1"))*1440),
@ -328,6 +329,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_SHALLOW_DEPTH, "1")), Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_SHALLOW_DEPTH, "1")),
true, true,
CrawlProfile.getRecrawlDate(Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_DAYS, "1"))*1440), CrawlProfile.getRecrawlDate(Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_DAYS, "1"))*1440),
@ -362,6 +364,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
Integer.parseInt(sb.getConfig(SwitchboardConstants.PROXY_PREFETCH_DEPTH, "0")), Integer.parseInt(sb.getConfig(SwitchboardConstants.PROXY_PREFETCH_DEPTH, "0")),
true, true,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE),
@ -395,6 +398,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
0, 0,
false, false,
null, null,
@ -428,6 +432,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
0, 0,
false, false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE),
@ -461,6 +466,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
0, 0,
false, false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE),
@ -502,6 +508,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
0, 0,
false, false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_GREEDY_LEARNING_TEXT_RECRAWL_CYCLE), CrawlProfile.getRecrawlDate(CRAWL_PROFILE_GREEDY_LEARNING_TEXT_RECRAWL_CYCLE),
@ -535,6 +542,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
0, 0,
false, false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE),
@ -568,6 +576,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
0, 0,
false, false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE),
@ -601,6 +610,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
0, 0,
false, false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE),
@ -637,6 +647,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
0, 0,
false, false,
null, null,

@ -352,6 +352,7 @@ public class RecrawlBusyThread extends AbstractBusyThread {
CrawlProfile.MATCH_NEVER_STRING, // indexUrlMustNotMatch CrawlProfile.MATCH_NEVER_STRING, // indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, // indexContentMustMatch CrawlProfile.MATCH_ALL_STRING, // indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, // indexContentMustNotMatch CrawlProfile.MATCH_NEVER_STRING, // indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
0, false, CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_RECRAWL_JOB_RECRAWL_CYCLE), -1, 0, false, CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_RECRAWL_JOB_RECRAWL_CYCLE), -1,
true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow, true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
true, true, true, false, -1, false, true, CrawlProfile.MATCH_NEVER_STRING, CacheStrategy.IFFRESH, true, true, true, false, -1, false, true, CrawlProfile.MATCH_NEVER_STRING, CacheStrategy.IFFRESH,

@ -115,6 +115,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
INDEXING_MEDIA_TYPE_MUSTNOTMATCH("indexMediaTypeMustNotMatch", false, CrawlAttribute.STRING, "Indexing Media Type (MIME) Must-Not-Match Filter"), INDEXING_MEDIA_TYPE_MUSTNOTMATCH("indexMediaTypeMustNotMatch", false, CrawlAttribute.STRING, "Indexing Media Type (MIME) Must-Not-Match Filter"),
INDEXING_SOLR_QUERY_MUSTMATCH("indexSolrQueryMustMatch", false, CrawlAttribute.STRING, "Indexing Solr Query Must-Match Filter"), INDEXING_SOLR_QUERY_MUSTMATCH("indexSolrQueryMustMatch", false, CrawlAttribute.STRING, "Indexing Solr Query Must-Match Filter"),
INDEXING_SOLR_QUERY_MUSTNOTMATCH("indexSolrQueryMustNotMatch", false, CrawlAttribute.STRING, "Indexing Solr Query Must-Not-Match Filter"), INDEXING_SOLR_QUERY_MUSTNOTMATCH("indexSolrQueryMustNotMatch", false, CrawlAttribute.STRING, "Indexing Solr Query Must-Not-Match Filter"),
NOINDEX_WHEN_CANONICAL_UNEQUAL_URL("noindexWhenCanonicalUnequalURL", false, CrawlAttribute.STRING, "No Indexing for Documents with Canonical != URL"),
RECRAWL_IF_OLDER ("recrawlIfOlder", false, CrawlAttribute.INTEGER, "Recrawl If Older"), RECRAWL_IF_OLDER ("recrawlIfOlder", false, CrawlAttribute.INTEGER, "Recrawl If Older"),
STORE_HTCACHE ("storeHTCache", false, CrawlAttribute.BOOLEAN, "Store in HTCache"), STORE_HTCACHE ("storeHTCache", false, CrawlAttribute.BOOLEAN, "Store in HTCache"),
CACHE_STRAGEGY ("cacheStrategy", false, CrawlAttribute.STRING, "Cache Strategy (NOCACHE,IFFRESH,IFEXIST,CACHEONLY)"), CACHE_STRAGEGY ("cacheStrategy", false, CrawlAttribute.STRING, "Cache Strategy (NOCACHE,IFFRESH,IFEXIST,CACHEONLY)"),
@ -223,6 +224,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
final String crawlerCountryMustMatch, final String crawlerNoDepthLimitMatch, final String crawlerCountryMustMatch, final String crawlerNoDepthLimitMatch,
final String indexUrlMustMatch, final String indexUrlMustNotMatch, final String indexUrlMustMatch, final String indexUrlMustNotMatch,
final String indexContentMustMatch, final String indexContentMustNotMatch, final String indexContentMustMatch, final String indexContentMustNotMatch,
final boolean noindexWhenCanonicalUnequalURL,
final int depth, final int depth,
final boolean directDocByURL, final boolean directDocByURL,
final Date recrawlIfOlder /*date*/, final Date recrawlIfOlder /*date*/,
@ -300,6 +302,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING); put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING);
put(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, CrawlProfile.SOLR_MATCH_ALL_QUERY); put(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, CrawlProfile.SOLR_MATCH_ALL_QUERY);
put(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key, CrawlProfile.SOLR_EMPTY_QUERY); put(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key, CrawlProfile.SOLR_EMPTY_QUERY);
put(CrawlAttribute.NOINDEX_WHEN_CANONICAL_UNEQUAL_URL.key, noindexWhenCanonicalUnequalURL);
} }
/** /**
@ -851,6 +854,12 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
return (r.equals(Boolean.TRUE.toString())); return (r.equals(Boolean.TRUE.toString()));
} }
/**
 * Tells whether documents whose canonical link differs from their own URL
 * are to be excluded from indexing for this profile.
 *
 * @return {@code true} when the attribute is not stored in this profile or
 *         when its stored value equals {@code Boolean.TRUE.toString()};
 *         {@code false} for any other stored value
 */
public boolean noindexWhenCanonicalUnequalURL() {
    final String flag = get(CrawlAttribute.NOINDEX_WHEN_CANONICAL_UNEQUAL_URL.key);
    // a missing entry counts as "enabled"
    return flag == null || flag.equals(Boolean.TRUE.toString());
}
public boolean storeHTCache() { public boolean storeHTCache() {
final String r = get(CrawlAttribute.STORE_HTCACHE.key); final String r = get(CrawlAttribute.STORE_HTCACHE.key);
if (r == null) return false; if (r == null) return false;
@ -997,6 +1006,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexURLMustNotMatch", this.get(CrawlAttribute.INDEXING_URL_MUSTNOTMATCH.key)); prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexURLMustNotMatch", this.get(CrawlAttribute.INDEXING_URL_MUSTNOTMATCH.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexContentMustMatch", this.get(CrawlAttribute.INDEXING_CONTENT_MUSTMATCH.key)); prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexContentMustMatch", this.get(CrawlAttribute.INDEXING_CONTENT_MUSTMATCH.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexContentMustNotMatch", this.get(CrawlAttribute.INDEXING_CONTENT_MUSTNOTMATCH.key)); prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexContentMustNotMatch", this.get(CrawlAttribute.INDEXING_CONTENT_MUSTNOTMATCH.key));
prop.put(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.NOINDEX_WHEN_CANONICAL_UNEQUAL_URL, noindexWhenCanonicalUnequalURL() ? 1 : 0);
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key, this.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key)); prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key, this.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, this.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key)); prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, this.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, this.get(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key)); prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, this.get(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key));

@ -217,6 +217,14 @@ public class Document {
this.scraperObject = scraper; this.scraperObject = scraper;
} }
/**
 * Returns the canonical URL reported by the scraper attached to this document.
 *
 * @return the value of {@code ContentScraper.getCanonical()} when the scraper
 *         object is a {@code ContentScraper}; {@code null} when no scraper
 *         object is set or it is of a different type
 */
public AnchorURL getCanonical() {
    final Object scraperObject = this.getScraperObject();
    if (scraperObject instanceof ContentScraper) {
        // only the HTML content scraper can supply a canonical link
        return ((ContentScraper) scraperObject).getCanonical();
    }
    return null;
}
public Set<String> getContentLanguages() { public Set<String> getContentLanguages() {
return this.languages; return this.languages;
} }

@ -369,6 +369,13 @@ public class CrawlStartExpert {
} }
} }
// Check Canonical?
if (post == null) {
prop.put("noindexWhenCanonicalUnequalURLChecked", 1);
} else {
prop.put("noindexWhenCanonicalUnequalURLChecked",
post.getBoolean("noindexWhenCanonicalUnequalURL") ? 1 : 0);
}
// ---------- Clean-Up before Crawl Start // ---------- Clean-Up before Crawl Start
// delete if older settings: number value // delete if older settings: number value

@ -316,6 +316,7 @@ public class Crawler_p {
final String indexUrlMustNotMatch = post.get("indexmustnotmatch", CrawlProfile.MATCH_NEVER_STRING); final String indexUrlMustNotMatch = post.get("indexmustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
final String indexContentMustMatch = post.get("indexcontentmustmatch", CrawlProfile.MATCH_ALL_STRING); final String indexContentMustMatch = post.get("indexcontentmustmatch", CrawlProfile.MATCH_ALL_STRING);
final String indexContentMustNotMatch = post.get("indexcontentmustnotmatch", CrawlProfile.MATCH_NEVER_STRING); final String indexContentMustNotMatch = post.get("indexcontentmustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
final boolean noindexWhenCanonicalUnequalURL = "on".equals(post.get("noindexWhenCanonicalUnequalURL", "off"));
final boolean crawlOrder = post.get("crawlOrder", "off").equals("on"); final boolean crawlOrder = post.get("crawlOrder", "off").equals("on");
env.setConfig("crawlOrder", crawlOrder); env.setConfig("crawlOrder", crawlOrder);
@ -614,6 +615,7 @@ public class Crawler_p {
indexUrlMustNotMatch, indexUrlMustNotMatch,
indexContentMustMatch, indexContentMustMatch,
indexContentMustNotMatch, indexContentMustNotMatch,
noindexWhenCanonicalUnequalURL,
newcrawlingdepth, newcrawlingdepth,
directDocByURL, directDocByURL,
crawlingIfOlder, crawlingIfOlder,

@ -150,6 +150,7 @@ public class QuickCrawlLink_p {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
false,
CrawlingDepth, CrawlingDepth,
true, true,
CrawlProfile.getRecrawlDate(60 * 24 * 30), // recrawlIfOlder (minutes); here: one month CrawlProfile.getRecrawlDate(60 * 24 * 30), // recrawlIfOlder (minutes); here: one month

@ -3152,28 +3152,73 @@ public final class Switchboard extends serverSwitch {
return new IndexingQueueEntry(in.queueEntry, in.documents, null); return new IndexingQueueEntry(in.queueEntry, in.documents, null);
} }
} }
if (!(profile.indexUrlMustMatchPattern() == CrawlProfile.MATCH_ALL_PATTERN || profile.indexUrlMustMatchPattern().matcher(urls).matches()) ||
(profile.indexUrlMustNotMatchPattern() != CrawlProfile.MATCH_NEVER_PATTERN && profile.indexUrlMustNotMatchPattern().matcher(urls).matches())) { // check mustmatch pattern
if (this.log.isInfo()) this.log.info("Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + profile.indexUrlMustMatchPattern().pattern() + ", indexUrlMustNotMatchPattern = " + profile.indexUrlMustNotMatchPattern().pattern()); Pattern mustmatchurl = profile.indexUrlMustMatchPattern();
if (mustmatchurl != CrawlProfile.MATCH_ALL_PATTERN && !mustmatchurl.matcher(urls).matches()) {
String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + mustmatchurl.pattern();
if (this.log.isInfo()) this.log.info(info);
// create a new errorURL DB entry
this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
return new IndexingQueueEntry(in.queueEntry, in.documents, null);
}
// check mustnotmatch
Pattern mustnotmatchurl = profile.indexUrlMustNotMatchPattern();
if (mustnotmatchurl != CrawlProfile.MATCH_NEVER_PATTERN && mustnotmatchurl.matcher(urls).matches()) {
String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustNotMatchPattern = " + mustnotmatchurl;
if (this.log.isInfo()) this.log.info(info);
// create a new errorURL DB entry // create a new errorURL DB entry
this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + profile.indexUrlMustMatchPattern().pattern() + ", indexUrlMustNotMatchPattern = " + profile.indexUrlMustNotMatchPattern().pattern(), -1); this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
return new IndexingQueueEntry(in.queueEntry, in.documents, null); return new IndexingQueueEntry(in.queueEntry, in.documents, null);
} }
// check which files may take part in the indexing process // check which files may take part in the indexing process
final List<Document> doclist = new ArrayList<>(); final List<Document> doclist = new ArrayList<>();
docloop: for (final Document document : in.documents) { docloop: for (final Document document : in.documents) {
// check canonical
if (profile.noindexWhenCanonicalUnequalURL()) {
AnchorURL canonical = document.getCanonical();
DigestURL source = document.dc_source();
if (canonical != null && source != null) {
String canonical_norm = canonical.toNormalform(true);
String source_norm = source.toNormalform(true);
if (!canonical_norm.equals(source_norm)) {
String info = "Not Condensed Resource '" + urls + "': denied, canonical != source; canonical = " +canonical_norm + "; source = " + source_norm;
if (this.log.isInfo()) this.log.info(info);
// create a new errorURL DB entry
this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
continue docloop;
}
}
}
// check indexing denied flags
if (document.indexingDenied() && profile.obeyHtmlRobotsNoindex() && !this.isIntranetMode()) { if (document.indexingDenied() && profile.obeyHtmlRobotsNoindex() && !this.isIntranetMode()) {
if (this.log.isInfo()) this.log.info("Not Condensed Resource '" + urls + "': denied by document-attached noindexing rule"); if (this.log.isInfo()) this.log.info("Not Condensed Resource '" + urls + "': denied by document-attached noindexing rule");
// create a new errorURL DB entry // create a new errorURL DB entry
this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "denied by document-attached noindexing rule", -1); this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "denied by document-attached noindexing rule", -1);
continue docloop; continue docloop;
} }
if (!(profile.indexContentMustMatchPattern() == CrawlProfile.MATCH_ALL_PATTERN || profile.indexContentMustMatchPattern().matcher(document.getTextString()).matches()) ||
(profile.indexContentMustNotMatchPattern() != CrawlProfile.MATCH_NEVER_PATTERN && profile.indexContentMustNotMatchPattern().matcher(document.getTextString()).matches())) { // check content pattern must-match
if (this.log.isInfo()) this.log.info("Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustMatchPattern = " + profile.indexContentMustMatchPattern().pattern() + ", indexContentMustNotMatchPattern = " + profile.indexContentMustNotMatchPattern().pattern()); Pattern mustmatchcontent = profile.indexContentMustMatchPattern();
if (mustmatchcontent != CrawlProfile.MATCH_ALL_PATTERN && !mustmatchcontent.matcher(document.getTextString()).matches()) {
String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustMatchPattern = " + mustmatchcontent.pattern() ;
if (this.log.isInfo()) this.log.info(info);
// create a new errorURL DB entry
this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
continue docloop;
}
// check content pattern must-not-match
Pattern mustnotmatchcontent = profile.indexContentMustNotMatchPattern();
if (mustnotmatchcontent != CrawlProfile.MATCH_NEVER_PATTERN && mustnotmatchcontent.matcher(document.getTextString()).matches()) {
String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustNotMatchPattern = " + mustnotmatchcontent.pattern();
if (this.log.isInfo()) this.log.info(info);
// create a new errorURL DB entry // create a new errorURL DB entry
this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "indexing prevented by regular expression on content; indexContentMustMatchPattern = " + profile.indexContentMustMatchPattern().pattern() + ", indexContentMustNotMatchPattern = " + profile.indexContentMustNotMatchPattern().pattern(), -1); this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
continue docloop; continue docloop;
} }

Loading…
Cancel
Save