added canonical filter

attention: this is on by default!
(it should do the right thing)
pull/554/head
Michael Peter Christen 2 years ago
parent 5a52b01c09
commit 9fcd8f1bda
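In short, the new filter drops a document from indexing whenever it declares a canonical link whose normalized form differs from the URL it was fetched under. Below is a minimal standalone sketch of that comparison, not the YaCy code from this commit: it uses plain java.net.URI and an assumed normalize() helper (lower-cased scheme and host, default ports and a trailing slash dropped) in place of YaCy's AnchorURL/DigestURL and toNormalform(true).

import java.net.URI;

/** Standalone sketch of the canonical filter; not the YaCy implementation. */
public class CanonicalFilterSketch {

    // Simplified normalization (an assumption for this sketch): lower-case scheme and host,
    // drop default ports, collapse an empty path to "/" and strip a trailing slash.
    static String normalize(String url) {
        final URI u = URI.create(url.trim());
        final String scheme = u.getScheme() == null ? "http" : u.getScheme().toLowerCase();
        final String host = u.getHost() == null ? "" : u.getHost().toLowerCase();
        int port = u.getPort();
        if (("http".equals(scheme) && port == 80) || ("https".equals(scheme) && port == 443)) port = -1;
        String path = (u.getPath() == null || u.getPath().isEmpty()) ? "/" : u.getPath();
        if (path.length() > 1 && path.endsWith("/")) path = path.substring(0, path.length() - 1);
        final String query = u.getQuery() == null ? "" : "?" + u.getQuery();
        return scheme + "://" + host + (port == -1 ? "" : ":" + port) + path + query;
    }

    // Returns true when the document should be skipped: the filter is enabled, a canonical
    // link is present, and its normalized form is not equal to the normalized source URL.
    static boolean dropBecauseOfCanonical(boolean filterEnabled, String sourceUrl, String canonicalUrl) {
        if (!filterEnabled || canonicalUrl == null) return false;
        return !normalize(canonicalUrl).equals(normalize(sourceUrl));
    }

    public static void main(String[] args) {
        // fetched with a tracking parameter, canonical points to the clean URL -> skipped (true)
        System.out.println(dropBecauseOfCanonical(true,
                "https://example.org/article?utm_source=feed", "https://example.org/article"));
        // canonical equals the fetched URL after normalization -> indexed as usual (false)
        System.out.println(dropBecauseOfCanonical(true,
                "https://example.org/article", "https://example.org/article/"));
    }
}

In the commit itself the same decision is made per document inside Switchboard's docloop (last hunk below), guarded by profile.noindexWhenCanonicalUnequalURL(), which falls back to true when the profile carries no value for the key.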

@@ -37,6 +37,7 @@
<indexMediaTypeMustNotMatch>#[indexMediaTypeMustNotMatch]#</indexMediaTypeMustNotMatch>
<indexSolrQueryMustMatch>#[indexSolrQueryMustMatch]#</indexSolrQueryMustMatch>
<indexSolrQueryMustNotMatch>#[indexSolrQueryMustNotMatch]#</indexSolrQueryMustNotMatch>
<noindexWhenCanonicalUnequalURL>#(noindexWhenCanonicalUnequalURL)#false::true#(/noindexWhenCanonicalUnequalURL)#</noindexWhenCanonicalUnequalURL>
<status>#(status)#terminated::active::system#(/status)#</status>
<crawlingDomFilterContent>
#{crawlingDomFilterContent}#

@@ -412,6 +412,9 @@
<table style="border-width: 0px">
<tr><td style="width:110px"><img src="env/grafics/plus.gif" alt=""> must-match</td><td><input name="indexmustmatch" id="indexmustmatch" type="text" size="55" maxlength="100000" value="#[indexmustmatch]#" onblur="if (this.value=='') this.value='.*';"/> (must not be empty)</td></tr>
<tr><td><img src="env/grafics/minus.gif" alt=""> must-not-match</td><td><input name="indexmustnotmatch" id="indexmustnotmatch" type="text" size="55" maxlength="100000" value="#[indexmustnotmatch]#" /></td></tr>
<tr>
<td colspan="2"><input type="checkbox" name="noindexWhenCanonicalUnequalURL" id="noindexWhenCanonicalUnequalURL" #(noindexWhenCanonicalUnequalURLChecked)#::checked="checked"#(/noindexWhenCanonicalUnequalURLChecked)#/> No Indexing when Canonical present and Canonical != URL</td>
</tr>
</table>
</dd>
<dt>Filter on Content of Document<br/>(all visible text, including camel-case-tokenized url and title)</dt>
@@ -470,7 +473,7 @@
<tr>
<td style="width:110px"><img src="env/grafics/minus.gif" alt=""> must-not-match</td>
<td>
<input name="indexSolrQueryMustNotMatch" id="indexSolrQueryMustNotMatch" type="text" size="55" maxlength="100000" value="#[indexSolrQueryMustNotMatch]#" aria-describedby="indexSolrQueryInfo" />
<input name="indexSolrQueryMustNotMatch" id="indexSolrQueryMustNotMatch" type="text" size="55" maxlength="100000" value="#[indexSolrQueryMustNotMatch]#" aria-describedby="indexSolrQueryInfo" enabled="false"/>
</td>
</tr>
#(/embeddedSolrConnected)#

@@ -488,7 +488,7 @@ public final class CrawlStacker implements WorkflowTask<Request>{
// check if ip is local ip address
final String urlRejectReason = this.urlInAcceptedDomain(url);
if (urlRejectReason != null) {
if (CrawlStacker.log.isFine()) CrawlStacker.log.fine("denied_(" + urlRejectReason + ")");
if (CrawlStacker.log.isFine()) CrawlStacker.log.fine("URL not in accepted Domain (" + urlRejectReason + ")");
return "denied_(" + urlRejectReason + ")";
}

@@ -294,6 +294,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_DEEP_DEPTH, "3")),
true,
CrawlProfile.getRecrawlDate(Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_DAYS, "1"))*1440),
@@ -328,6 +329,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_SHALLOW_DEPTH, "1")),
true,
CrawlProfile.getRecrawlDate(Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_DAYS, "1"))*1440),
@@ -362,6 +364,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
Integer.parseInt(sb.getConfig(SwitchboardConstants.PROXY_PREFETCH_DEPTH, "0")),
true,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE),
@@ -395,6 +398,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
0,
false,
null,
@@ -428,6 +432,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
0,
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE),
@@ -461,6 +466,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
0,
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE),
@@ -502,6 +508,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
0,
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_GREEDY_LEARNING_TEXT_RECRAWL_CYCLE),
@@ -535,6 +542,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
0,
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE),
@@ -568,6 +576,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
0,
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE),
@@ -601,6 +610,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
0,
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE),
@@ -637,6 +647,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
0,
false,
null,

@@ -352,6 +352,7 @@ public class RecrawlBusyThread extends AbstractBusyThread {
CrawlProfile.MATCH_NEVER_STRING, // indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, // indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, // indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
0, false, CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_RECRAWL_JOB_RECRAWL_CYCLE), -1,
true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
true, true, true, false, -1, false, true, CrawlProfile.MATCH_NEVER_STRING, CacheStrategy.IFFRESH,

@@ -115,6 +115,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
INDEXING_MEDIA_TYPE_MUSTNOTMATCH("indexMediaTypeMustNotMatch", false, CrawlAttribute.STRING, "Indexing Media Type (MIME) Must-Not-Match Filter"),
INDEXING_SOLR_QUERY_MUSTMATCH("indexSolrQueryMustMatch", false, CrawlAttribute.STRING, "Indexing Solr Query Must-Match Filter"),
INDEXING_SOLR_QUERY_MUSTNOTMATCH("indexSolrQueryMustNotMatch", false, CrawlAttribute.STRING, "Indexing Solr Query Must-Not-Match Filter"),
NOINDEX_WHEN_CANONICAL_UNEQUAL_URL("noindexWhenCanonicalUnequalURL", false, CrawlAttribute.BOOLEAN, "No Indexing for Documents with Canonical != URL"),
RECRAWL_IF_OLDER ("recrawlIfOlder", false, CrawlAttribute.INTEGER, "Recrawl If Older"),
STORE_HTCACHE ("storeHTCache", false, CrawlAttribute.BOOLEAN, "Store in HTCache"),
CACHE_STRAGEGY ("cacheStrategy", false, CrawlAttribute.STRING, "Cache Strategy (NOCACHE,IFFRESH,IFEXIST,CACHEONLY)"),
@@ -223,6 +224,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
final String crawlerCountryMustMatch, final String crawlerNoDepthLimitMatch,
final String indexUrlMustMatch, final String indexUrlMustNotMatch,
final String indexContentMustMatch, final String indexContentMustNotMatch,
final boolean noindexWhenCanonicalUnequalURL,
final int depth,
final boolean directDocByURL,
final Date recrawlIfOlder /*date*/,
@@ -300,6 +302,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING);
put(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, CrawlProfile.SOLR_MATCH_ALL_QUERY);
put(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key, CrawlProfile.SOLR_EMPTY_QUERY);
put(CrawlAttribute.NOINDEX_WHEN_CANONICAL_UNEQUAL_URL.key, noindexWhenCanonicalUnequalURL);
}
/**
@@ -851,6 +854,12 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
return (r.equals(Boolean.TRUE.toString()));
}
public boolean noindexWhenCanonicalUnequalURL() {
final String r = get(CrawlAttribute.NOINDEX_WHEN_CANONICAL_UNEQUAL_URL.key);
if (r == null) return true;
return (r.equals(Boolean.TRUE.toString()));
}
public boolean storeHTCache() {
final String r = get(CrawlAttribute.STORE_HTCACHE.key);
if (r == null) return false;
@@ -997,6 +1006,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexURLMustNotMatch", this.get(CrawlAttribute.INDEXING_URL_MUSTNOTMATCH.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexContentMustMatch", this.get(CrawlAttribute.INDEXING_CONTENT_MUSTMATCH.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexContentMustNotMatch", this.get(CrawlAttribute.INDEXING_CONTENT_MUSTNOTMATCH.key));
prop.put(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.NOINDEX_WHEN_CANONICAL_UNEQUAL_URL.key, noindexWhenCanonicalUnequalURL() ? 1 : 0);
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key, this.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, this.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, this.get(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key));

@@ -217,6 +217,14 @@ public class Document {
this.scraperObject = scraper;
}
public AnchorURL getCanonical() {
final Object scraper = this.getScraperObject();
if (!(scraper instanceof ContentScraper)) return null;
final ContentScraper html = (ContentScraper) scraper;
AnchorURL canonical = html.getCanonical();
return canonical;
}
public Set<String> getContentLanguages() {
return this.languages;
}

@@ -369,6 +369,13 @@ public class CrawlStartExpert {
}
}
// Check Canonical?
if (post == null) {
prop.put("noindexWhenCanonicalUnequalURLChecked", 1);
} else {
prop.put("noindexWhenCanonicalUnequalURLChecked",
post.getBoolean("noindexWhenCanonicalUnequalURL") ? 1 : 0);
}
// ---------- Clean-Up before Crawl Start
// delete if older settings: number value

@@ -316,6 +316,7 @@ public class Crawler_p {
final String indexUrlMustNotMatch = post.get("indexmustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
final String indexContentMustMatch = post.get("indexcontentmustmatch", CrawlProfile.MATCH_ALL_STRING);
final String indexContentMustNotMatch = post.get("indexcontentmustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
final boolean noindexWhenCanonicalUnequalURL = "on".equals(post.get("noindexWhenCanonicalUnequalURL", "off"));
final boolean crawlOrder = post.get("crawlOrder", "off").equals("on");
env.setConfig("crawlOrder", crawlOrder);
@@ -614,6 +615,7 @@ public class Crawler_p {
indexUrlMustNotMatch,
indexContentMustMatch,
indexContentMustNotMatch,
noindexWhenCanonicalUnequalURL,
newcrawlingdepth,
directDocByURL,
crawlingIfOlder,

@@ -150,6 +150,7 @@ public class QuickCrawlLink_p {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
false, //noindexWhenCanonicalUnequalURL
CrawlingDepth,
true,
CrawlProfile.getRecrawlDate(60 * 24 * 30), // recrawlIfOlder (minutes); here: one month

@@ -3152,28 +3152,73 @@ public final class Switchboard extends serverSwitch {
return new IndexingQueueEntry(in.queueEntry, in.documents, null);
}
}
if (!(profile.indexUrlMustMatchPattern() == CrawlProfile.MATCH_ALL_PATTERN || profile.indexUrlMustMatchPattern().matcher(urls).matches()) ||
(profile.indexUrlMustNotMatchPattern() != CrawlProfile.MATCH_NEVER_PATTERN && profile.indexUrlMustNotMatchPattern().matcher(urls).matches())) {
if (this.log.isInfo()) this.log.info("Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + profile.indexUrlMustMatchPattern().pattern() + ", indexUrlMustNotMatchPattern = " + profile.indexUrlMustNotMatchPattern().pattern());
// check mustmatch pattern
Pattern mustmatchurl = profile.indexUrlMustMatchPattern();
if (mustmatchurl != CrawlProfile.MATCH_ALL_PATTERN && !mustmatchurl.matcher(urls).matches()) {
String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + mustmatchurl.pattern();
if (this.log.isInfo()) this.log.info(info);
// create a new errorURL DB entry
this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
return new IndexingQueueEntry(in.queueEntry, in.documents, null);
}
// check mustnotmatch
Pattern mustnotmatchurl = profile.indexUrlMustNotMatchPattern();
if (mustnotmatchurl != CrawlProfile.MATCH_NEVER_PATTERN && mustnotmatchurl.matcher(urls).matches()) {
String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustNotMatchPattern = " + mustnotmatchurl;
if (this.log.isInfo()) this.log.info(info);
// create a new errorURL DB entry
this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + profile.indexUrlMustMatchPattern().pattern() + ", indexUrlMustNotMatchPattern = " + profile.indexUrlMustNotMatchPattern().pattern(), -1);
this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
return new IndexingQueueEntry(in.queueEntry, in.documents, null);
}
// check which files may take part in the indexing process
final List<Document> doclist = new ArrayList<>();
docloop: for (final Document document : in.documents) {
// check canonical
if (profile.noindexWhenCanonicalUnequalURL()) {
AnchorURL canonical = document.getCanonical();
DigestURL source = document.dc_source();
if (canonical != null && source != null) {
String canonical_norm = canonical.toNormalform(true);
String source_norm = source.toNormalform(true);
if (!canonical_norm.equals(source_norm)) {
String info = "Not Condensed Resource '" + urls + "': denied, canonical != source; canonical = " +canonical_norm + "; source = " + source_norm;
if (this.log.isInfo()) this.log.info(info);
// create a new errorURL DB entry
this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
continue docloop;
}
}
}
// check indexing denied flags
if (document.indexingDenied() && profile.obeyHtmlRobotsNoindex() && !this.isIntranetMode()) {
if (this.log.isInfo()) this.log.info("Not Condensed Resource '" + urls + "': denied by document-attached noindexing rule");
// create a new errorURL DB entry
this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "denied by document-attached noindexing rule", -1);
continue docloop;
}
if (!(profile.indexContentMustMatchPattern() == CrawlProfile.MATCH_ALL_PATTERN || profile.indexContentMustMatchPattern().matcher(document.getTextString()).matches()) ||
(profile.indexContentMustNotMatchPattern() != CrawlProfile.MATCH_NEVER_PATTERN && profile.indexContentMustNotMatchPattern().matcher(document.getTextString()).matches())) {
if (this.log.isInfo()) this.log.info("Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustMatchPattern = " + profile.indexContentMustMatchPattern().pattern() + ", indexContentMustNotMatchPattern = " + profile.indexContentMustNotMatchPattern().pattern());
// check content pattern must-match
Pattern mustmatchcontent = profile.indexContentMustMatchPattern();
if (mustmatchcontent != CrawlProfile.MATCH_ALL_PATTERN && !mustmatchcontent.matcher(document.getTextString()).matches()) {
String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustMatchPattern = " + mustmatchcontent.pattern() ;
if (this.log.isInfo()) this.log.info(info);
// create a new errorURL DB entry
this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
continue docloop;
}
// check content pattern must-not-match
Pattern mustnotmatchcontent = profile.indexContentMustNotMatchPattern();
if (mustnotmatchcontent != CrawlProfile.MATCH_NEVER_PATTERN && mustnotmatchcontent.matcher(document.getTextString()).matches()) {
String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustNotMatchPattern = " + mustnotmatchcontent.pattern();
if (this.log.isInfo()) this.log.info(info);
// create a new errorURL DB entry
this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "indexing prevented by regular expression on content; indexContentMustMatchPattern = " + profile.indexContentMustMatchPattern().pattern() + ", indexContentMustNotMatchPattern = " + profile.indexContentMustNotMatchPattern().pattern(), -1);
this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
continue docloop;
}
