From 9fcd8f1bdac38beb18e00e7ed37c6532ea5bd416 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Mon, 16 Jan 2023 14:50:30 +0100 Subject: [PATCH] added canonical filter attention: this is on by default! (it should do the right thing) --- htroot/CrawlProfileEditor_p.xml | 87 ++++++------ htroot/CrawlStartExpert.html | 5 +- source/net/yacy/crawler/CrawlStacker.java | 2 +- source/net/yacy/crawler/CrawlSwitchboard.java | 11 ++ .../net/yacy/crawler/RecrawlBusyThread.java | 1 + .../net/yacy/crawler/data/CrawlProfile.java | 10 ++ .../net/yacy/crawler/retrieval/Response.java | 36 ++--- source/net/yacy/document/Document.java | 134 ++++++++++-------- source/net/yacy/htroot/CrawlStartExpert.java | 7 + source/net/yacy/htroot/Crawler_p.java | 2 + source/net/yacy/htroot/QuickCrawlLink_p.java | 1 + source/net/yacy/search/Switchboard.java | 61 ++++++-- 12 files changed, 223 insertions(+), 134 deletions(-) diff --git a/htroot/CrawlProfileEditor_p.xml b/htroot/CrawlProfileEditor_p.xml index 0b880ac3f..58036b48e 100644 --- a/htroot/CrawlProfileEditor_p.xml +++ b/htroot/CrawlProfileEditor_p.xml @@ -1,48 +1,49 @@ #{crawlProfiles}# - - #[handle]# - #[name]# - #[collections]# - #[agentName]# - #[userAgent]# - #[depth]# - #(directDocByURL)#false::true#(/directDocByURL)# - #[recrawlIfOlder]# - #[domMaxPages]# - #(crawlingQ)#false::true#(/crawlingQ)# - #(followFrames)#false::true#(/followFrames)# - #(obeyHtmlRobotsNoindex)#false::true#(/obeyHtmlRobotsNoindex)# - #(obeyHtmlRobotsNofollow)#false::true#(/obeyHtmlRobotsNofollow)# - #(indexText)#false::true#(/indexText)# - #(indexMedia)#false::true#(/indexMedia)# - #(storeHTCache)#false::true#(/storeHTCache)# - #(remoteIndexing)#false::true#(/remoteIndexing)# - #[cacheStrategy]# - #(crawlerAlwaysCheckMediaType)#false::true#(/crawlerAlwaysCheckMediaType)# - #[crawlerURLMustMatch]# - #[crawlerURLMustNotMatch]# - #[crawlerOriginURLMustMatch]# - #[crawlerOriginURLMustNotMatch]# - #[crawlerIPMustMatch]# - #[crawlerIPMustNotMatch]# - 
#[crawlerCountryMustMatch]# - #[crawlerNoLimitURLMustMatch]# - #[indexURLMustMatch]# - #[indexURLMustNotMatch]# - #[indexContentMustMatch]# - #[indexContentMustNotMatch]# - #[indexMediaTypeMustMatch]# - #[indexMediaTypeMustNotMatch]# - #[indexSolrQueryMustMatch]# - #[indexSolrQueryMustNotMatch]# - #(status)#terminated::active::system#(/status)# - - #{crawlingDomFilterContent}# - #[item]# - #{/crawlingDomFilterContent}# - - + + #[handle]# + #[name]# + #[collections]# + #[agentName]# + #[userAgent]# + #[depth]# + #(directDocByURL)#false::true#(/directDocByURL)# + #[recrawlIfOlder]# + #[domMaxPages]# + #(crawlingQ)#false::true#(/crawlingQ)# + #(followFrames)#false::true#(/followFrames)# + #(obeyHtmlRobotsNoindex)#false::true#(/obeyHtmlRobotsNoindex)# + #(obeyHtmlRobotsNofollow)#false::true#(/obeyHtmlRobotsNofollow)# + #(indexText)#false::true#(/indexText)# + #(indexMedia)#false::true#(/indexMedia)# + #(storeHTCache)#false::true#(/storeHTCache)# + #(remoteIndexing)#false::true#(/remoteIndexing)# + #[cacheStrategy]# + #(crawlerAlwaysCheckMediaType)#false::true#(/crawlerAlwaysCheckMediaType)# + #[crawlerURLMustMatch]# + #[crawlerURLMustNotMatch]# + #[crawlerOriginURLMustMatch]# + #[crawlerOriginURLMustNotMatch]# + #[crawlerIPMustMatch]# + #[crawlerIPMustNotMatch]# + #[crawlerCountryMustMatch]# + #[crawlerNoLimitURLMustMatch]# + #[indexURLMustMatch]# + #[indexURLMustNotMatch]# + #[indexContentMustMatch]# + #[indexContentMustNotMatch]# + #[indexMediaTypeMustMatch]# + #[indexMediaTypeMustNotMatch]# + #[indexSolrQueryMustMatch]# + #[indexSolrQueryMustNotMatch]# + #(noindexWhenCanonicalUnequalURL)#false::true#(/noindexWhenCanonicalUnequalURL)# + #(status)#terminated::active::system#(/status)# + + #{crawlingDomFilterContent}# + #[item]# + #{/crawlingDomFilterContent}# + + #{/crawlProfiles}# diff --git a/htroot/CrawlStartExpert.html b/htroot/CrawlStartExpert.html index c3e60c1b8..9917cd7e1 100644 --- a/htroot/CrawlStartExpert.html +++ b/htroot/CrawlStartExpert.html @@ -412,6 
+412,9 @@ + + +
must-match (must not be empty)
must-not-match
No Indexing when Canonical present and Canonical != URL
Filter on Content of Document
(all visible text, including camel-case-tokenized url and title)
@@ -470,7 +473,7 @@ must-not-match - + #(/embeddedSolrConnected)# diff --git a/source/net/yacy/crawler/CrawlStacker.java b/source/net/yacy/crawler/CrawlStacker.java index 16ffea78b..1b4874a96 100644 --- a/source/net/yacy/crawler/CrawlStacker.java +++ b/source/net/yacy/crawler/CrawlStacker.java @@ -488,7 +488,7 @@ public final class CrawlStacker implements WorkflowTask{ // check if ip is local ip address final String urlRejectReason = this.urlInAcceptedDomain(url); if (urlRejectReason != null) { - if (CrawlStacker.log.isFine()) CrawlStacker.log.fine("denied_(" + urlRejectReason + ")"); + if (CrawlStacker.log.isFine()) CrawlStacker.log.fine("URL not in accepted Domain (" + urlRejectReason + ")"); return "denied_(" + urlRejectReason + ")"; } diff --git a/source/net/yacy/crawler/CrawlSwitchboard.java b/source/net/yacy/crawler/CrawlSwitchboard.java index aca7b95cd..3c17c75cf 100644 --- a/source/net/yacy/crawler/CrawlSwitchboard.java +++ b/source/net/yacy/crawler/CrawlSwitchboard.java @@ -294,6 +294,7 @@ public final class CrawlSwitchboard { CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch + true, //noindexWhenCanonicalUnequalURL Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_DEEP_DEPTH, "3")), true, CrawlProfile.getRecrawlDate(Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_DAYS, "1"))*1440), @@ -328,6 +329,7 @@ public final class CrawlSwitchboard { CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch + true, //noindexWhenCanonicalUnequalURL Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_SHALLOW_DEPTH, "1")), true, CrawlProfile.getRecrawlDate(Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_DAYS, "1"))*1440), @@ -362,6 +364,7 @@ public final class CrawlSwitchboard { 
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch + true, //noindexWhenCanonicalUnequalURL Integer.parseInt(sb.getConfig(SwitchboardConstants.PROXY_PREFETCH_DEPTH, "0")), true, CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), @@ -395,6 +398,7 @@ public final class CrawlSwitchboard { CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch + true, //noindexWhenCanonicalUnequalURL 0, false, null, @@ -428,6 +432,7 @@ public final class CrawlSwitchboard { CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch + true, //noindexWhenCanonicalUnequalURL 0, false, CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), @@ -461,6 +466,7 @@ public final class CrawlSwitchboard { CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch + true, //noindexWhenCanonicalUnequalURL 0, false, CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), @@ -502,6 +508,7 @@ public final class CrawlSwitchboard { CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch + true, //noindexWhenCanonicalUnequalURL 0, false, CrawlProfile.getRecrawlDate(CRAWL_PROFILE_GREEDY_LEARNING_TEXT_RECRAWL_CYCLE), @@ -535,6 +542,7 @@ public final class CrawlSwitchboard { CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch + true, //noindexWhenCanonicalUnequalURL 0, 
false, CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), @@ -568,6 +576,7 @@ public final class CrawlSwitchboard { CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch + true, //noindexWhenCanonicalUnequalURL 0, false, CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), @@ -601,6 +610,7 @@ public final class CrawlSwitchboard { CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch + true, //noindexWhenCanonicalUnequalURL 0, false, CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), @@ -637,6 +647,7 @@ public final class CrawlSwitchboard { CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch + true, //noindexWhenCanonicalUnequalURL 0, false, null, diff --git a/source/net/yacy/crawler/RecrawlBusyThread.java b/source/net/yacy/crawler/RecrawlBusyThread.java index af32db0cf..11b3d3a98 100644 --- a/source/net/yacy/crawler/RecrawlBusyThread.java +++ b/source/net/yacy/crawler/RecrawlBusyThread.java @@ -352,6 +352,7 @@ public class RecrawlBusyThread extends AbstractBusyThread { CrawlProfile.MATCH_NEVER_STRING, // indexUrlMustNotMatch CrawlProfile.MATCH_ALL_STRING, // indexContentMustMatch CrawlProfile.MATCH_NEVER_STRING, // indexContentMustNotMatch + true, //noindexWhenCanonicalUnequalURL 0, false, CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_RECRAWL_JOB_RECRAWL_CYCLE), -1, true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow, true, true, true, false, -1, false, true, CrawlProfile.MATCH_NEVER_STRING, CacheStrategy.IFFRESH, diff --git a/source/net/yacy/crawler/data/CrawlProfile.java 
b/source/net/yacy/crawler/data/CrawlProfile.java index 19f36b613..64af0c489 100644 --- a/source/net/yacy/crawler/data/CrawlProfile.java +++ b/source/net/yacy/crawler/data/CrawlProfile.java @@ -115,6 +115,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M INDEXING_MEDIA_TYPE_MUSTNOTMATCH("indexMediaTypeMustNotMatch", false, CrawlAttribute.STRING, "Indexing Media Type (MIME) Must-Not-Match Filter"), INDEXING_SOLR_QUERY_MUSTMATCH("indexSolrQueryMustMatch", false, CrawlAttribute.STRING, "Indexing Solr Query Must-Match Filter"), INDEXING_SOLR_QUERY_MUSTNOTMATCH("indexSolrQueryMustNotMatch", false, CrawlAttribute.STRING, "Indexing Solr Query Must-Not-Match Filter"), + NOINDEX_WHEN_CANONICAL_UNEQUAL_URL("noindexWhenCanonicalUnequalURL", false, CrawlAttribute.BOOLEAN, "No Indexing for Documents with Canonical != URL"), RECRAWL_IF_OLDER ("recrawlIfOlder", false, CrawlAttribute.INTEGER, "Recrawl If Older"), STORE_HTCACHE ("storeHTCache", false, CrawlAttribute.BOOLEAN, "Store in HTCache"), CACHE_STRAGEGY ("cacheStrategy", false, CrawlAttribute.STRING, "Cache Strategy (NOCACHE,IFFRESH,IFEXIST,CACHEONLY)"), @@ -223,6 +224,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M final String crawlerCountryMustMatch, final String crawlerNoDepthLimitMatch, final String indexUrlMustMatch, final String indexUrlMustNotMatch, final String indexContentMustMatch, final String indexContentMustNotMatch, + final boolean noindexWhenCanonicalUnequalURL, final int depth, final boolean directDocByURL, final Date recrawlIfOlder /*date*/, @@ -300,6 +302,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING); put(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, CrawlProfile.SOLR_MATCH_ALL_QUERY); put(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key, CrawlProfile.SOLR_EMPTY_QUERY); + put(CrawlAttribute.NOINDEX_WHEN_CANONICAL_UNEQUAL_URL.key, 
noindexWhenCanonicalUnequalURL); } /** @@ -851,6 +854,12 @@ public class CrawlProfile extends ConcurrentHashMap implements M return (r.equals(Boolean.TRUE.toString())); } + public boolean noindexWhenCanonicalUnequalURL() { + final String r = get(CrawlAttribute.NOINDEX_WHEN_CANONICAL_UNEQUAL_URL.key); + if (r == null) return true; + return (r.equals(Boolean.TRUE.toString())); + } + public boolean storeHTCache() { final String r = get(CrawlAttribute.STORE_HTCACHE.key); if (r == null) return false; @@ -997,6 +1006,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexURLMustNotMatch", this.get(CrawlAttribute.INDEXING_URL_MUSTNOTMATCH.key)); prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexContentMustMatch", this.get(CrawlAttribute.INDEXING_CONTENT_MUSTMATCH.key)); prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexContentMustNotMatch", this.get(CrawlAttribute.INDEXING_CONTENT_MUSTNOTMATCH.key)); + prop.put(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.NOINDEX_WHEN_CANONICAL_UNEQUAL_URL.key, noindexWhenCanonicalUnequalURL() ? 
1 : 0); prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key, this.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key)); prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, this.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key)); prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, this.get(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key)); diff --git a/source/net/yacy/crawler/retrieval/Response.java b/source/net/yacy/crawler/retrieval/Response.java index acf7c6fd0..567685510 100644 --- a/source/net/yacy/crawler/retrieval/Response.java +++ b/source/net/yacy/crawler/retrieval/Response.java @@ -175,10 +175,10 @@ public class Response { int p = mime.indexOf('/'); if (p < 0) return new String[]{mime}; if (doctype == DT_TEXT) return new String[]{"text" + mime.substring(p)}; - if (doctype == DT_IMAGE) return new String[]{"image" + mime.substring(p)}; - if (doctype == DT_AUDIO) return new String[]{"audio" + mime.substring(p)}; - if (doctype == DT_MOVIE) return new String[]{"video" + mime.substring(p)}; - return new String[]{mime}; + if (doctype == DT_IMAGE) return new String[]{"image" + mime.substring(p)}; + if (doctype == DT_AUDIO) return new String[]{"audio" + mime.substring(p)}; + if (doctype == DT_MOVIE) return new String[]{"video" + mime.substring(p)}; + return new String[]{mime}; } public static final int QUEUE_STATE_FRESH = 0; @@ -235,16 +235,16 @@ public class Response { * @return the original request that produced this response */ public Request getRequest() { - return request; - } + return request; + } public ResponseHeader getResponseHeader() { return this.responseHeader; } public RequestHeader getRequestHeader() { - return this.requestHeader; - } + return this.requestHeader; + } public boolean fromCache() { return this.fromCache; @@ -260,11 +260,11 @@ public class Response { return this.request.name(); } - /** - 
* @return the requested URL that produced this response. When redirection(s) - * occurred, this is not the initial URL, but the last redirection - * target. - */ + /** + * @return the requested URL that produced this response. When redirection(s) + * occurred, this is not the initial URL, but the last redirection + * target. + */ public DigestURL url() { return this.request.url(); } @@ -745,11 +745,11 @@ public class Response { // -ranges in request // we checked that in shallStoreCache - /* - * Eventually check if a parser supports the media yype. Depending on the crawl - * profile, the indexingDocumentProcessor can eventually index only URL metadata - * using the generic parser for unsupported media types - */ + /* + * Eventually check if a parser supports the media yype. Depending on the crawl + * profile, the indexingDocumentProcessor can eventually index only URL metadata + * using the generic parser for unsupported media types + */ if (this.responseHeader != null && !profile().isIndexNonParseableUrls()) { final String mimeType = this.responseHeader.getContentType(); final String parserError = TextParser.supportsMime(mimeType); diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java index ba3365d5c..34676836a 100644 --- a/source/net/yacy/document/Document.java +++ b/source/net/yacy/document/Document.java @@ -91,12 +91,12 @@ public class Document { /** links to icons that belongs to the document (mapped by absolute URL) */ private Map icons; - /** - * URLs of linked data item types/classes referenced by the document (for example in - * HTML with standard annotations such as RDFa, microdata, microformats or - * JSON-LD) - */ - private Set linkedDataTypes; + /** + * URLs of linked data item types/classes referenced by the document (for example in + * HTML with standard annotations such as RDFa, microdata, microformats or + * JSON-LD) + */ + private Set linkedDataTypes; private boolean resorted; private final Set languages; 
private boolean indexingDenied; @@ -131,13 +131,13 @@ public class Document { this.parserObject = parserObject; this.keywords = new LinkedHashSet(); if (keywords != null) { - Collections.addAll(this.keywords, keywords); + Collections.addAll(this.keywords, keywords); } this.titles = (titles == null) ? new ArrayList(1) : titles; this.creator = (author == null) ? new StringBuilder(0) : new StringBuilder(author); this.sections = new LinkedList() ; if (sections != null) { - Collections.addAll(this.sections, sections); + Collections.addAll(this.sections, sections); } this.descriptions = (abstrcts == null) ? new ArrayList() : abstrcts; if (lat >= -90.0d && lat <= 90.0d && lon >= -180.0d && lon <= 180.0d) { @@ -216,13 +216,21 @@ public class Document { } this.scraperObject = scraper; } + + public AnchorURL getCanonical() { + final Object scraper = this.getScraperObject(); + if (!(scraper instanceof ContentScraper)) return null; + final ContentScraper html = (ContentScraper) scraper; + AnchorURL canonical = html.getCanonical(); + return canonical; + } public Set getContentLanguages() { return this.languages; } public String getFileName() { - return this.source.getFileName(); + return this.source.getFileName(); } public Map> getGenericFacets() { @@ -233,15 +241,15 @@ public class Document { * @return true when this document is the result of a partially parsed resource, for example due to resource content size exceeding a given limit */ public boolean isPartiallyParsed() { - return this.partiallyParsed; - } + return this.partiallyParsed; + } /** * @param partiallyParsed set to true to indicates this document is the result of a partially parsed resource, for example due to resource content size exceeding a given limit */ public void setPartiallyParsed(final boolean partiallyParsed) { - this.partiallyParsed = partiallyParsed; - } + this.partiallyParsed = partiallyParsed; + } /** * compute a set of languages that this document contains @@ -637,13 +645,13 @@ dc_rights // we add 
artificial hyperlinks to the hyperlink set // that can be calculated from given hyperlinks and imagelinks - /* - * Should we also include icons ? with - * this.hyperlinks.putAll(allReflinks(this.icons.keySet())); It is - * problematic as allReflinks will modify icons set set, removing those whose URL is - * starting with "/www" but it is not desired for icons such as - * www.wikipedia.org/static/favicon/wikipedia.ico - */ + /* + * Should we also include icons ? with + * this.hyperlinks.putAll(allReflinks(this.icons.keySet())); It is + * problematic as allReflinks will modify icons set set, removing those whose URL is + * starting with "/www" but it is not desired for icons such as + * www.wikipedia.org/static/favicon/wikipedia.ico + */ this.hyperlinks.putAll(allReflinks(this.images.values())); this.hyperlinks.putAll(allReflinks(this.audiolinks.keySet())); @@ -804,16 +812,16 @@ dc_rights } InputStream textStream = doc.getTextStream(); try { - FileUtils.copy(textStream, (ByteArrayOutputStream) this.text); + FileUtils.copy(textStream, (ByteArrayOutputStream) this.text); } finally { - try { - if(textStream != null) { - /* textStream can be a FileInputStream : we must close it to ensure releasing system resource */ - textStream.close(); - } - } catch(IOException e) { - ConcurrentLog.warn("DOCUMENT", "Could not close text input stream"); - } + try { + if(textStream != null) { + /* textStream can be a FileInputStream : we must close it to ensure releasing system resource */ + textStream.close(); + } + } catch(IOException e) { + ConcurrentLog.warn("DOCUMENT", "Could not close text input stream"); + } } this.anchors.addAll(doc.getAnchors()); @@ -826,41 +834,41 @@ dc_rights * @return links to icons that belongs to the document (mapped by absolute URL) */ public Map getIcons() { - return icons; - } + return icons; + } /** * Set links to icons that belongs to the document (mapped by absolute URL) * @param icons */ public void setIcons(final Map icons) { - /* Better to ensure 
now icons property will not be null */ - if(icons != null) { - this.icons = icons; - } else { - this.icons = new HashMap<>(); - } - } + /* Better to ensure now icons property will not be null */ + if(icons != null) { + this.icons = icons; + } else { + this.icons = new HashMap<>(); + } + } - /** - * @return URLs of linked data item types/classes referenced by the document (for example in - * HTML with standard annotations such as RDFa, microdata, microformats or - * JSON-LD) - */ + /** + * @return URLs of linked data item types/classes referenced by the document (for example in + * HTML with standard annotations such as RDFa, microdata, microformats or + * JSON-LD) + */ public Set getLinkedDataTypes() { - return this.linkedDataTypes; - } + return this.linkedDataTypes; + } - /** - * @return URLs of linked data item types/classes referenced by the document - */ + /** + * @return URLs of linked data item types/classes referenced by the document + */ public void setLinkedDataTypes(final Set linkedDataTypes) { - if(linkedDataTypes != null) { - /* Ensure non null property */ - this.linkedDataTypes = linkedDataTypes; - } else { - this.linkedDataTypes.clear(); - } + if(linkedDataTypes != null) { + /* Ensure non null property */ + this.linkedDataTypes = linkedDataTypes; + } else { + this.linkedDataTypes.clear(); + } } @@ -1034,14 +1042,14 @@ dc_rights } catch (final IOException e) { ConcurrentLog.logException(e); } finally { - try { - if(textStream != null) { - /* textStream can be a FileInputStream : we must close it to ensure releasing system resource */ - textStream.close(); - } - } catch (IOException e) { - ConcurrentLog.warn("DOCUMENT", "Could not close text input stream"); - } + try { + if(textStream != null) { + /* textStream can be a FileInputStream : we must close it to ensure releasing system resource */ + textStream.close(); + } + } catch (IOException e) { + ConcurrentLog.warn("DOCUMENT", "Could not close text input stream"); + } } } 
anchors.addAll(doc.getAnchors()); @@ -1098,7 +1106,7 @@ dc_rights public final static String IFRAME_MARKER = "iframe"; public final static String FRAME_MARKER = "frame"; public final static String EMBED_MARKER = "embed"; - + public static Map getHyperlinks(final Document[] documents, boolean includeNofollow) { final Map result = new HashMap<>(); for (final Document d: documents) { diff --git a/source/net/yacy/htroot/CrawlStartExpert.java b/source/net/yacy/htroot/CrawlStartExpert.java index 9c8d55149..56b17d767 100644 --- a/source/net/yacy/htroot/CrawlStartExpert.java +++ b/source/net/yacy/htroot/CrawlStartExpert.java @@ -369,6 +369,13 @@ public class CrawlStartExpert { } } + // Check Canonical? + if (post == null) { + prop.put("noindexWhenCanonicalUnequalURLChecked", 1); + } else { + prop.put("noindexWhenCanonicalUnequalURLChecked", + post.getBoolean("noindexWhenCanonicalUnequalURL") ? 1 : 0); + } // ---------- Clean-Up before Crawl Start // delete if older settings: number value diff --git a/source/net/yacy/htroot/Crawler_p.java b/source/net/yacy/htroot/Crawler_p.java index 98ca68dc1..d6b19e883 100644 --- a/source/net/yacy/htroot/Crawler_p.java +++ b/source/net/yacy/htroot/Crawler_p.java @@ -316,6 +316,7 @@ public class Crawler_p { final String indexUrlMustNotMatch = post.get("indexmustnotmatch", CrawlProfile.MATCH_NEVER_STRING); final String indexContentMustMatch = post.get("indexcontentmustmatch", CrawlProfile.MATCH_ALL_STRING); final String indexContentMustNotMatch = post.get("indexcontentmustnotmatch", CrawlProfile.MATCH_NEVER_STRING); + final boolean noindexWhenCanonicalUnequalURL = "on".equals(post.get("noindexWhenCanonicalUnequalURL", "off")); final boolean crawlOrder = post.get("crawlOrder", "off").equals("on"); env.setConfig("crawlOrder", crawlOrder); @@ -614,6 +615,7 @@ public class Crawler_p { indexUrlMustNotMatch, indexContentMustMatch, indexContentMustNotMatch, + noindexWhenCanonicalUnequalURL, newcrawlingdepth, directDocByURL, crawlingIfOlder, diff 
--git a/source/net/yacy/htroot/QuickCrawlLink_p.java b/source/net/yacy/htroot/QuickCrawlLink_p.java index 4230b69ea..6c2f432f4 100644 --- a/source/net/yacy/htroot/QuickCrawlLink_p.java +++ b/source/net/yacy/htroot/QuickCrawlLink_p.java @@ -150,6 +150,7 @@ public class QuickCrawlLink_p { CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch + false, CrawlingDepth, true, CrawlProfile.getRecrawlDate(60 * 24 * 30), // recrawlIfOlder (minutes); here: one month diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 34ebfcc94..50902296e 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -3152,28 +3152,73 @@ public final class Switchboard extends serverSwitch { return new IndexingQueueEntry(in.queueEntry, in.documents, null); } } - if (!(profile.indexUrlMustMatchPattern() == CrawlProfile.MATCH_ALL_PATTERN || profile.indexUrlMustMatchPattern().matcher(urls).matches()) || - (profile.indexUrlMustNotMatchPattern() != CrawlProfile.MATCH_NEVER_PATTERN && profile.indexUrlMustNotMatchPattern().matcher(urls).matches())) { - if (this.log.isInfo()) this.log.info("Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + profile.indexUrlMustMatchPattern().pattern() + ", indexUrlMustNotMatchPattern = " + profile.indexUrlMustNotMatchPattern().pattern()); + + // check mustmatch pattern + Pattern mustmatchurl = profile.indexUrlMustMatchPattern(); + if (mustmatchurl != CrawlProfile.MATCH_ALL_PATTERN && !mustmatchurl.matcher(urls).matches()) { + String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + mustmatchurl.pattern(); + if (this.log.isInfo()) this.log.info(info); + // create a new errorURL DB entry + 
this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1); + return new IndexingQueueEntry(in.queueEntry, in.documents, null); + } + + // check mustnotmatch + Pattern mustnotmatchurl = profile.indexUrlMustNotMatchPattern(); + if (mustnotmatchurl != CrawlProfile.MATCH_NEVER_PATTERN && mustnotmatchurl.matcher(urls).matches()) { + String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustNotMatchPattern = " + mustnotmatchurl; + if (this.log.isInfo()) this.log.info(info); // create a new errorURL DB entry - this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + profile.indexUrlMustMatchPattern().pattern() + ", indexUrlMustNotMatchPattern = " + profile.indexUrlMustNotMatchPattern().pattern(), -1); + this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1); return new IndexingQueueEntry(in.queueEntry, in.documents, null); } // check which files may take part in the indexing process final List doclist = new ArrayList<>(); docloop: for (final Document document : in.documents) { + + // check canonical + if (profile.noindexWhenCanonicalUnequalURL()) { + AnchorURL canonical = document.getCanonical(); + DigestURL source = document.dc_source(); + if (canonical != null && source != null) { + String canonical_norm = canonical.toNormalform(true); + String source_norm = source.toNormalform(true); + if (!canonical_norm.equals(source_norm)) { + String info = "Not Condensed Resource '" + urls + "': denied, canonical != source; canonical = " +canonical_norm + "; source = " + source_norm; + if (this.log.isInfo()) this.log.info(info); + // create a new errorURL DB entry + this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), 
profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1); + continue docloop; + } + } + } + + // check indexing denied flags if (document.indexingDenied() && profile.obeyHtmlRobotsNoindex() && !this.isIntranetMode()) { if (this.log.isInfo()) this.log.info("Not Condensed Resource '" + urls + "': denied by document-attached noindexing rule"); // create a new errorURL DB entry this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "denied by document-attached noindexing rule", -1); continue docloop; } - if (!(profile.indexContentMustMatchPattern() == CrawlProfile.MATCH_ALL_PATTERN || profile.indexContentMustMatchPattern().matcher(document.getTextString()).matches()) || - (profile.indexContentMustNotMatchPattern() != CrawlProfile.MATCH_NEVER_PATTERN && profile.indexContentMustNotMatchPattern().matcher(document.getTextString()).matches())) { - if (this.log.isInfo()) this.log.info("Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustMatchPattern = " + profile.indexContentMustMatchPattern().pattern() + ", indexContentMustNotMatchPattern = " + profile.indexContentMustNotMatchPattern().pattern()); + + // check content pattern must-match + Pattern mustmatchcontent = profile.indexContentMustMatchPattern(); + if (mustmatchcontent != CrawlProfile.MATCH_ALL_PATTERN && !mustmatchcontent.matcher(document.getTextString()).matches()) { + String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustMatchPattern = " + mustmatchcontent.pattern() ; + if (this.log.isInfo()) this.log.info(info); + // create a new errorURL DB entry + this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1); + continue docloop; + } + + // check content pattern must-not-match + Pattern mustnotmatchcontent = profile.indexContentMustNotMatchPattern(); + if 
(mustnotmatchcontent != CrawlProfile.MATCH_NEVER_PATTERN && mustnotmatchcontent.matcher(document.getTextString()).matches()) { + String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustNotMatchPattern = " + mustnotmatchcontent.pattern(); + if (this.log.isInfo()) this.log.info(info); // create a new errorURL DB entry - this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "indexing prevented by regular expression on content; indexContentMustMatchPattern = " + profile.indexContentMustMatchPattern().pattern() + ", indexContentMustNotMatchPattern = " + profile.indexContentMustNotMatchPattern().pattern(), -1); + this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1); continue docloop; }