From fb3dd56b02125f9293fa67eb47e04f64e82d4903 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Thu, 10 Jul 2014 17:13:35 +0200 Subject: [PATCH] fix for processing of noindex flag in http header --- .../net/yacy/cora/protocol/ResponseHeader.java | 3 ++- source/net/yacy/document/Document.java | 11 +++++++++-- source/net/yacy/repository/LoaderDispatcher.java | 16 ++++++++++++++-- .../search/schema/CollectionConfiguration.java | 9 +-------- 4 files changed, 26 insertions(+), 13 deletions(-) diff --git a/source/net/yacy/cora/protocol/ResponseHeader.java b/source/net/yacy/cora/protocol/ResponseHeader.java index 92f4fd84b..7c1b3924c 100644 --- a/source/net/yacy/cora/protocol/ResponseHeader.java +++ b/source/net/yacy/cora/protocol/ResponseHeader.java @@ -108,6 +108,7 @@ public class ResponseHeader extends HeaderFramework { if (x_robots_tag.isEmpty()) { x_robots_tag = this.get(HeaderFramework.X_ROBOTS, ""); } - return x_robots_tag; + return x_robots_tag.toLowerCase(); } + } diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java index 040227428..fe3a1b0c2 100644 --- a/source/net/yacy/document/Document.java +++ b/source/net/yacy/document/Document.java @@ -90,7 +90,7 @@ public class Document { private MultiProtocolURL favicon; private boolean resorted; private final Set languages; - private final boolean indexingDenied; + private boolean indexingDenied; private final double lon, lat; private final Object parserObject; // the source object that was used to create the Document private final Map> generic_facets; // a map from vocabulary names to the set of tags for that vocabulary which apply for this document @@ -733,6 +733,10 @@ dc_rights return this.indexingDenied; } + public void setIndexingDenied(boolean indexingDenied) { + this.indexingDenied = indexingDenied; + } + public void setDepth(int depth) { this.crawldepth = depth; } @@ -819,6 +823,7 @@ dc_rights final LinkedHashMap images = new LinkedHashMap(); final Set languages = new HashSet(); double lon = 0.0d, lat = 0.0d; + boolean indexingDenied = false; Date date = new Date(); String charset = null; @@ -867,6 +872,8 @@ dc_rights if (doc.getDepth() < mindepth) mindepth = doc.getDepth(); if (doc.dc_language() != null) languages.add(doc.dc_language()); + + indexingDenied |= doc.indexingDenied; } // clean up parser data @@ -898,7 +905,7 @@ dc_rights anchors, rss, images, - false, + indexingDenied, date); newDoc.setDepth(mindepth); return newDoc; diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java index ca15d13fb..c25cde6d6 100644 --- a/source/net/yacy/repository/LoaderDispatcher.java +++ b/source/net/yacy/repository/LoaderDispatcher.java @@ -355,7 +355,14 @@ public final class LoaderDispatcher { if (response.getContent() == null || response.getResponseHeader() == null) throw new IOException("no Content available for url " + url); // parse resource - return response.parse(); + Document[] documents = response.parse(); + + String x_robots_tag = response.getResponseHeader().getXRobotsTag(); + if (x_robots_tag.indexOf("noindex",0) >= 0) { + for (Document d: documents) d.setIndexingDenied(true); + } + + return documents; } public Document loadDocument(final DigestURL location, final CacheStrategy cachePolicy, BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException { @@ -371,7 +378,12 @@ public final class LoaderDispatcher { // parse resource try { Document[] documents = response.parse(); - return Document.mergeDocuments(location, response.getMimeType(), documents); + Document merged = Document.mergeDocuments(location, response.getMimeType(), documents); + + String x_robots_tag = response.getResponseHeader().getXRobotsTag(); + if (x_robots_tag.indexOf("noindex",0) >= 0) merged.setIndexingDenied(true); + + return merged; } catch(final Parser.Failure e) { throw new IOException(e.getMessage()); } diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index 5dba2e9d6..f000d20da 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -570,15 +570,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri if (robots_meta.indexOf("noindex",0) >= 0) b += 8; // set bit 3 if (robots_meta.indexOf("nofollow",0) >= 0) b += 16; // set bit 4 } - String x_robots_tag = ""; - if (responseHeader != null) { - x_robots_tag = responseHeader.get(HeaderFramework.X_ROBOTS_TAG, ""); - if (x_robots_tag.isEmpty()) { - x_robots_tag = responseHeader.get(HeaderFramework.X_ROBOTS, ""); - } - } + String x_robots_tag = responseHeader.getXRobotsTag(); if (!x_robots_tag.isEmpty()) { - x_robots_tag = x_robots_tag.toLowerCase(); // this tag may have values: all, noindex, nofollow, noarchive, nosnippet, noodp, notranslate, noimageindex, unavailable_after, none; see https://developers.google.com/webmasters/control-crawl-index/docs/robots_meta_tag?hl=de if (x_robots_tag.indexOf("all",0) >= 0) b += 1<<8; // set bit 8 if (x_robots_tag.indexOf("noindex",0) >= 0||x_robots_tag.indexOf("none",0) >= 0) b += 1<<9; // set bit 9