added canonical filter

attention: this is on by default!
(it should do the right thing)
pull/554/head
Michael Peter Christen 2 years ago
parent 5a52b01c09
commit 9fcd8f1bda
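In short, the new filter drops a document from indexing whenever it declares a canonical link whose normalized form differs from the URL it was fetched under. Below is a minimal standalone sketch of that comparison, not the YaCy code from this commit: it uses plain java.net.URI and an assumed normalize() helper (lower-cased scheme and host, default ports and a trailing slash dropped) in place of YaCy's AnchorURL/DigestURL and toNormalform(true).

import java.net.URI;

/** Standalone sketch of the canonical filter; not the YaCy implementation. */
public class CanonicalFilterSketch {

    // Simplified normalization (an assumption for this sketch): lower-case scheme and host,
    // drop default ports, collapse an empty path to "/" and strip a trailing slash.
    static String normalize(String url) {
        final URI u = URI.create(url.trim());
        final String scheme = u.getScheme() == null ? "http" : u.getScheme().toLowerCase();
        final String host = u.getHost() == null ? "" : u.getHost().toLowerCase();
        int port = u.getPort();
        if (("http".equals(scheme) && port == 80) || ("https".equals(scheme) && port == 443)) port = -1;
        String path = (u.getPath() == null || u.getPath().isEmpty()) ? "/" : u.getPath();
        if (path.length() > 1 && path.endsWith("/")) path = path.substring(0, path.length() - 1);
        final String query = u.getQuery() == null ? "" : "?" + u.getQuery();
        return scheme + "://" + host + (port == -1 ? "" : ":" + port) + path + query;
    }

    // Returns true when the document should be skipped: the filter is enabled, a canonical
    // link is present, and its normalized form is not equal to the normalized source URL.
    static boolean dropBecauseOfCanonical(boolean filterEnabled, String sourceUrl, String canonicalUrl) {
        if (!filterEnabled || canonicalUrl == null) return false;
        return !normalize(canonicalUrl).equals(normalize(sourceUrl));
    }

    public static void main(String[] args) {
        // fetched with a tracking parameter, canonical points to the clean URL -> skipped (true)
        System.out.println(dropBecauseOfCanonical(true,
                "https://example.org/article?utm_source=feed", "https://example.org/article"));
        // canonical equals the fetched URL after normalization -> indexed as usual (false)
        System.out.println(dropBecauseOfCanonical(true,
                "https://example.org/article", "https://example.org/article/"));
    }
}

In the commit itself the same decision is made per document inside Switchboard's docloop (last hunk below), guarded by profile.noindexWhenCanonicalUnequalURL(), which falls back to true when the profile carries no value for the key.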

@@ -37,6 +37,7 @@
<indexMediaTypeMustNotMatch>#[indexMediaTypeMustNotMatch]#</indexMediaTypeMustNotMatch>
<indexSolrQueryMustMatch>#[indexSolrQueryMustMatch]#</indexSolrQueryMustMatch>
<indexSolrQueryMustNotMatch>#[indexSolrQueryMustNotMatch]#</indexSolrQueryMustNotMatch>
<noindexWhenCanonicalUnequalURL>#(noindexWhenCanonicalUnequalURL)#false::true#(/noindexWhenCanonicalUnequalURL)#</noindexWhenCanonicalUnequalURL>
<status>#(status)#terminated::active::system#(/status)#</status>
<crawlingDomFilterContent>
#{crawlingDomFilterContent}#

@@ -412,6 +412,9 @@
<table style="border-width: 0px">
<tr><td style="width:110px"><img src="env/grafics/plus.gif" alt=""> must-match</td><td><input name="indexmustmatch" id="indexmustmatch" type="text" size="55" maxlength="100000" value="#[indexmustmatch]#" onblur="if (this.value=='') this.value='.*';"/> (must not be empty)</td></tr>
<tr><td><img src="env/grafics/minus.gif" alt=""> must-not-match</td><td><input name="indexmustnotmatch" id="indexmustnotmatch" type="text" size="55" maxlength="100000" value="#[indexmustnotmatch]#" /></td></tr>
<tr>
<td colspan="2"><input type="checkbox" name="noindexWhenCanonicalUnequalURL" id="noindexWhenCanonicalUnequalURL" #(noindexWhenCanonicalUnequalURLChecked)#::checked="checked"#(/noindexWhenCanonicalUnequalURLChecked)#/> No Indexing when Canonical present and Canonical != URL</td>
</tr>
</table>
</dd>
<dt>Filter on Content of Document<br/>(all visible text, including camel-case-tokenized url and title)</dt>
@@ -470,7 +473,7 @@
<tr>
<td style="width:110px"><img src="env/grafics/minus.gif" alt=""> must-not-match</td>
<td>
<input name="indexSolrQueryMustNotMatch" id="indexSolrQueryMustNotMatch" type="text" size="55" maxlength="100000" value="#[indexSolrQueryMustNotMatch]#" aria-describedby="indexSolrQueryInfo" />
<input name="indexSolrQueryMustNotMatch" id="indexSolrQueryMustNotMatch" type="text" size="55" maxlength="100000" value="#[indexSolrQueryMustNotMatch]#" aria-describedby="indexSolrQueryInfo" enabled="false"/>
</td>
</tr>
#(/embeddedSolrConnected)#

@@ -488,7 +488,7 @@ public final class CrawlStacker implements WorkflowTask<Request>{
// check if ip is local ip address
final String urlRejectReason = this.urlInAcceptedDomain(url);
if (urlRejectReason != null) {
if (CrawlStacker.log.isFine()) CrawlStacker.log.fine("denied_(" + urlRejectReason + ")");
if (CrawlStacker.log.isFine()) CrawlStacker.log.fine("URL not in accepted Domain (" + urlRejectReason + ")");
return "denied_(" + urlRejectReason + ")";
}

@@ -294,6 +294,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_DEEP_DEPTH, "3")),
true,
CrawlProfile.getRecrawlDate(Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_DAYS, "1"))*1440),
@@ -328,6 +329,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_SHALLOW_DEPTH, "1")),
true,
CrawlProfile.getRecrawlDate(Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_DAYS, "1"))*1440),
@@ -362,6 +364,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
Integer.parseInt(sb.getConfig(SwitchboardConstants.PROXY_PREFETCH_DEPTH, "0")),
true,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE),
@@ -395,6 +398,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
0,
false,
null,
@@ -428,6 +432,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
0,
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE),
@@ -461,6 +466,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
0,
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE),
@@ -502,6 +508,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
0,
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_GREEDY_LEARNING_TEXT_RECRAWL_CYCLE),
@@ -535,6 +542,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
0,
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE),
@@ -568,6 +576,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
0,
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE),
@@ -601,6 +610,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
0,
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE),
@@ -637,6 +647,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
0,
false,
null,

@@ -352,6 +352,7 @@ public class RecrawlBusyThread extends AbstractBusyThread {
CrawlProfile.MATCH_NEVER_STRING, // indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, // indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, // indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
0, false, CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_RECRAWL_JOB_RECRAWL_CYCLE), -1,
true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
true, true, true, false, -1, false, true, CrawlProfile.MATCH_NEVER_STRING, CacheStrategy.IFFRESH,

@@ -115,6 +115,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
INDEXING_MEDIA_TYPE_MUSTNOTMATCH("indexMediaTypeMustNotMatch", false, CrawlAttribute.STRING, "Indexing Media Type (MIME) Must-Not-Match Filter"),
INDEXING_SOLR_QUERY_MUSTMATCH("indexSolrQueryMustMatch", false, CrawlAttribute.STRING, "Indexing Solr Query Must-Match Filter"),
INDEXING_SOLR_QUERY_MUSTNOTMATCH("indexSolrQueryMustNotMatch", false, CrawlAttribute.STRING, "Indexing Solr Query Must-Not-Match Filter"),
NOINDEX_WHEN_CANONICAL_UNEQUAL_URL("noindexWhenCanonicalUnequalURL", false, CrawlAttribute.BOOLEAN, "No Indexing for Documents with Canonical != URL"),
RECRAWL_IF_OLDER ("recrawlIfOlder", false, CrawlAttribute.INTEGER, "Recrawl If Older"),
STORE_HTCACHE ("storeHTCache", false, CrawlAttribute.BOOLEAN, "Store in HTCache"),
CACHE_STRAGEGY ("cacheStrategy", false, CrawlAttribute.STRING, "Cache Strategy (NOCACHE,IFFRESH,IFEXIST,CACHEONLY)"),
@@ -223,6 +224,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
final String crawlerCountryMustMatch, final String crawlerNoDepthLimitMatch,
final String indexUrlMustMatch, final String indexUrlMustNotMatch,
final String indexContentMustMatch, final String indexContentMustNotMatch,
final boolean noindexWhenCanonicalUnequalURL,
final int depth,
final boolean directDocByURL,
final Date recrawlIfOlder /*date*/,
@@ -300,6 +302,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING);
put(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, CrawlProfile.SOLR_MATCH_ALL_QUERY);
put(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key, CrawlProfile.SOLR_EMPTY_QUERY);
put(CrawlAttribute.NOINDEX_WHEN_CANONICAL_UNEQUAL_URL.key, noindexWhenCanonicalUnequalURL);
}
/**
@@ -851,6 +854,12 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
return (r.equals(Boolean.TRUE.toString()));
}
public boolean noindexWhenCanonicalUnequalURL() {
final String r = get(CrawlAttribute.NOINDEX_WHEN_CANONICAL_UNEQUAL_URL.key);
if (r == null) return true;
return (r.equals(Boolean.TRUE.toString()));
}
public boolean storeHTCache() {
final String r = get(CrawlAttribute.STORE_HTCACHE.key);
if (r == null) return false;
@@ -997,6 +1006,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexURLMustNotMatch", this.get(CrawlAttribute.INDEXING_URL_MUSTNOTMATCH.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexContentMustMatch", this.get(CrawlAttribute.INDEXING_CONTENT_MUSTMATCH.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexContentMustNotMatch", this.get(CrawlAttribute.INDEXING_CONTENT_MUSTNOTMATCH.key));
prop.put(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.NOINDEX_WHEN_CANONICAL_UNEQUAL_URL.key, noindexWhenCanonicalUnequalURL() ? 1 : 0);
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key, this.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, this.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, this.get(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key));

@@ -217,6 +217,14 @@ public class Document {
this.scraperObject = scraper;
}
public AnchorURL getCanonical() {
final Object scraper = this.getScraperObject();
if (!(scraper instanceof ContentScraper)) return null;
final ContentScraper html = (ContentScraper) scraper;
AnchorURL canonical = html.getCanonical();
return canonical;
}
public Set<String> getContentLanguages() {
return this.languages;
}

@@ -369,6 +369,13 @@ public class CrawlStartExpert {
}
}
// Check Canonical?
if (post == null) {
prop.put("noindexWhenCanonicalUnequalURLChecked", 1);
} else {
prop.put("noindexWhenCanonicalUnequalURLChecked",
post.getBoolean("noindexWhenCanonicalUnequalURL") ? 1 : 0);
}
// ---------- Clean-Up before Crawl Start
// delete if older settings: number value

@@ -316,6 +316,7 @@ public class Crawler_p {
final String indexUrlMustNotMatch = post.get("indexmustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
final String indexContentMustMatch = post.get("indexcontentmustmatch", CrawlProfile.MATCH_ALL_STRING);
final String indexContentMustNotMatch = post.get("indexcontentmustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
final boolean noindexWhenCanonicalUnequalURL = "on".equals(post.get("noindexWhenCanonicalUnequalURL", "off"));
final boolean crawlOrder = post.get("crawlOrder", "off").equals("on");
env.setConfig("crawlOrder", crawlOrder);
@@ -614,6 +615,7 @@ public class Crawler_p {
indexUrlMustNotMatch,
indexContentMustMatch,
indexContentMustNotMatch,
noindexWhenCanonicalUnequalURL,
newcrawlingdepth,
directDocByURL,
crawlingIfOlder,

@@ -150,6 +150,7 @@ public class QuickCrawlLink_p {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
false, //noindexWhenCanonicalUnequalURL
CrawlingDepth,
true,
CrawlProfile.getRecrawlDate(60 * 24 * 30), // recrawlIfOlder (minutes); here: one month

@@ -3152,28 +3152,73 @@ public final class Switchboard extends serverSwitch {
return new IndexingQueueEntry(in.queueEntry, in.documents, null);
}
}
if (!(profile.indexUrlMustMatchPattern() == CrawlProfile.MATCH_ALL_PATTERN || profile.indexUrlMustMatchPattern().matcher(urls).matches()) ||
(profile.indexUrlMustNotMatchPattern() != CrawlProfile.MATCH_NEVER_PATTERN && profile.indexUrlMustNotMatchPattern().matcher(urls).matches())) {
if (this.log.isInfo()) this.log.info("Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + profile.indexUrlMustMatchPattern().pattern() + ", indexUrlMustNotMatchPattern = " + profile.indexUrlMustNotMatchPattern().pattern());
// check mustmatch pattern
Pattern mustmatchurl = profile.indexUrlMustMatchPattern();
if (mustmatchurl != CrawlProfile.MATCH_ALL_PATTERN && !mustmatchurl.matcher(urls).matches()) {
String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + mustmatchurl.pattern();
if (this.log.isInfo()) this.log.info(info);
// create a new errorURL DB entry
this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
return new IndexingQueueEntry(in.queueEntry, in.documents, null);
}
// check mustnotmatch
Pattern mustnotmatchurl = profile.indexUrlMustNotMatchPattern();
if (mustnotmatchurl != CrawlProfile.MATCH_NEVER_PATTERN && mustnotmatchurl.matcher(urls).matches()) {
String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustNotMatchPattern = " + mustnotmatchurl;
if (this.log.isInfo()) this.log.info(info);
// create a new errorURL DB entry
this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + profile.indexUrlMustMatchPattern().pattern() + ", indexUrlMustNotMatchPattern = " + profile.indexUrlMustNotMatchPattern().pattern(), -1);
this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
return new IndexingQueueEntry(in.queueEntry, in.documents, null);
}
// check which files may take part in the indexing process
final List<Document> doclist = new ArrayList<>();
docloop: for (final Document document : in.documents) {
// check canonical
if (profile.noindexWhenCanonicalUnequalURL()) {
AnchorURL canonical = document.getCanonical();
DigestURL source = document.dc_source();
if (canonical != null && source != null) {
String canonical_norm = canonical.toNormalform(true);
String source_norm = source.toNormalform(true);
if (!canonical_norm.equals(source_norm)) {
String info = "Not Condensed Resource '" + urls + "': denied, canonical != source; canonical = " +canonical_norm + "; source = " + source_norm;
if (this.log.isInfo()) this.log.info(info);
// create a new errorURL DB entry
this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
continue docloop;
}
}
}
// check indexing denied flags
if (document.indexingDenied() && profile.obeyHtmlRobotsNoindex() && !this.isIntranetMode()) {
if (this.log.isInfo()) this.log.info("Not Condensed Resource '" + urls + "': denied by document-attached noindexing rule");
// create a new errorURL DB entry
this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "denied by document-attached noindexing rule", -1);
continue docloop;
}
if (!(profile.indexContentMustMatchPattern() == CrawlProfile.MATCH_ALL_PATTERN || profile.indexContentMustMatchPattern().matcher(document.getTextString()).matches()) ||
(profile.indexContentMustNotMatchPattern() != CrawlProfile.MATCH_NEVER_PATTERN && profile.indexContentMustNotMatchPattern().matcher(document.getTextString()).matches())) {
if (this.log.isInfo()) this.log.info("Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustMatchPattern = " + profile.indexContentMustMatchPattern().pattern() + ", indexContentMustNotMatchPattern = " + profile.indexContentMustNotMatchPattern().pattern());
// check content pattern must-match
Pattern mustmatchcontent = profile.indexContentMustMatchPattern();
if (mustmatchcontent != CrawlProfile.MATCH_ALL_PATTERN && !mustmatchcontent.matcher(document.getTextString()).matches()) {
String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustMatchPattern = " + mustmatchcontent.pattern() ;
if (this.log.isInfo()) this.log.info(info);
// create a new errorURL DB entry
this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
continue docloop;
}
// check content pattern must-not-match
Pattern mustnotmatchcontent = profile.indexContentMustNotMatchPattern();
if (mustnotmatchcontent != CrawlProfile.MATCH_NEVER_PATTERN && mustnotmatchcontent.matcher(document.getTextString()).matches()) {
String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustNotMatchPattern = " + mustnotmatchcontent.pattern();
if (this.log.isInfo()) this.log.info(info);
// create a new errorURL DB entry
this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "indexing prevented by regular expression on content; indexContentMustMatchPattern = " + profile.indexContentMustMatchPattern().pattern() + ", indexContentMustNotMatchPattern = " + profile.indexContentMustNotMatchPattern().pattern(), -1);
this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
continue docloop;
}
