Fix processing of the noindex flag in the HTTP header

11 years ago · fb3dd56b02
parent b0d941626f
commit fb3dd56b02
4 changed files with 26 additions and 13 deletions
--- a/source/net/yacy/cora/protocol/ResponseHeader.java
+++ b/source/net/yacy/cora/protocol/ResponseHeader.java
@@ -108,6 +108,7 @@ public class ResponseHeader extends HeaderFramework {
        if (x_robots_tag.isEmpty()) {
            x_robots_tag = this.get(HeaderFramework.X_ROBOTS, "");
        }
-        return x_robots_tag;
+        return x_robots_tag.toLowerCase();
    }
+
 }
--- a/source/net/yacy/document/Document.java
+++ b/source/net/yacy/document/Document.java
@@ -90,7 +90,7 @@ public class Document {
    private MultiProtocolURL favicon;
    private boolean resorted;
    private final Set<String> languages;
-    private final boolean indexingDenied;
+    private boolean indexingDenied;
    private final double lon, lat;
    private final Object parserObject; // the source object that was used to create the Document
    private final Map<String, Set<String>> generic_facets; // a map from vocabulary names to the set of tags for that vocabulary which apply for this document
@@ -733,6 +733,10 @@ dc_rights
        return this.indexingDenied;
    }

+    public void setIndexingDenied(boolean indexingDenied) {
+        this.indexingDenied = indexingDenied;
+    }
+
    public void setDepth(int depth) {
        this.crawldepth = depth;
    }
@@ -819,6 +823,7 @@ dc_rights
        final LinkedHashMap<AnchorURL, ImageEntry> images = new LinkedHashMap<AnchorURL, ImageEntry>();
        final Set<String> languages = new HashSet<String>();
        double lon = 0.0d, lat = 0.0d;
+        boolean indexingDenied = false;
        Date date = new Date();
        String charset = null;

@@ -867,6 +872,8 @@ dc_rights
            
            if (doc.getDepth() < mindepth) mindepth = doc.getDepth();
            if (doc.dc_language() != null) languages.add(doc.dc_language());
+            
+            indexingDenied |= doc.indexingDenied;
        }

        // clean up parser data
@@ -898,7 +905,7 @@ dc_rights
                anchors,
                rss,
                images,
-                false,
+                indexingDenied,
                date);
        newDoc.setDepth(mindepth);
        return newDoc;
--- a/source/net/yacy/repository/LoaderDispatcher.java
+++ b/source/net/yacy/repository/LoaderDispatcher.java
@@ -355,7 +355,14 @@ public final class LoaderDispatcher {
        if (response.getContent() == null || response.getResponseHeader() == null) throw new IOException("no Content available for url " + url);

        // parse resource
-        return response.parse();
+        Document[] documents = response.parse();
+
+        String x_robots_tag = response.getResponseHeader().getXRobotsTag();
+        if (x_robots_tag.indexOf("noindex",0) >= 0) {
+            for (Document d: documents) d.setIndexingDenied(true);
+        }
+        
+        return documents;
    }

    public Document loadDocument(final DigestURL location, final CacheStrategy cachePolicy, BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
@@ -371,7 +378,12 @@ public final class LoaderDispatcher {
        // parse resource
        try {
            Document[] documents = response.parse();
-            return Document.mergeDocuments(location, response.getMimeType(), documents);
+            Document merged = Document.mergeDocuments(location, response.getMimeType(), documents);
+            
+            String x_robots_tag = response.getResponseHeader().getXRobotsTag();
+            if (x_robots_tag.indexOf("noindex",0) >= 0) merged.setIndexingDenied(true);
+            
+            return merged;
        } catch(final Parser.Failure e) {
            throw new IOException(e.getMessage());
        }
--- a/source/net/yacy/search/schema/CollectionConfiguration.java
+++ b/source/net/yacy/search/schema/CollectionConfiguration.java
@@ -570,15 +570,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
                if (robots_meta.indexOf("noindex",0) >= 0) b += 8;  // set bit 3
                if (robots_meta.indexOf("nofollow",0) >= 0) b += 16; // set bit 4
            }
-            String x_robots_tag = "";
-            if (responseHeader != null) {
-                x_robots_tag = responseHeader.get(HeaderFramework.X_ROBOTS_TAG, "");
-                if (x_robots_tag.isEmpty()) {
-                    x_robots_tag = responseHeader.get(HeaderFramework.X_ROBOTS, "");
-                }
-            }
+            String x_robots_tag = responseHeader.getXRobotsTag();
            if (!x_robots_tag.isEmpty()) {
-                x_robots_tag = x_robots_tag.toLowerCase();
                // this tag may have values: all, noindex, nofollow, noarchive, nosnippet, noodp, notranslate, noimageindex, unavailable_after, none; see https://developers.google.com/webmasters/control-crawl-index/docs/robots_meta_tag?hl=de
                if (x_robots_tag.indexOf("all",0) >= 0) b += 1<<8;                // set bit 8
                if (x_robots_tag.indexOf("noindex",0) >= 0||x_robots_tag.indexOf("none",0) >= 0) b += 1<<9;   // set bit 9