From 31920385f70d8b899d79fdf5b6b22a580020b366 Mon Sep 17 00:00:00 2001
From: Michael Peter Christen <mc@yacy.net>
Date: Mon, 16 Sep 2013 16:14:56 +0200
Subject: [PATCH] set anchor rel attribute of all links to "nofollow" if the
 html meta contains a robots:nofollow or if the http header contains a
 "X-Robots-Tag: nofollow"

---
 .../yacy/document/parser/html/ContentScraper.java  | 14 +++++++++++++-
 .../yacy/search/schema/WebgraphConfiguration.java  |  7 ++++++-
 2 files changed, 19 insertions(+), 2 deletions(-)
diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java
index 702fbd344..00f396bf8 100644
--- a/source/net/yacy/document/parser/html/ContentScraper.java
+++ b/source/net/yacy/document/parser/html/ContentScraper.java
@@ -390,7 +390,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
 
             if (newLink != null) {
                 tagopts.put("href", newLink.toNormalform(true));
-                final String rel = tagopts.getProperty("rel", EMPTY_STRING);
+                String rel = tagopts.getProperty("rel", EMPTY_STRING);
                 final String linktitle = tagopts.getProperty("title", EMPTY_STRING);
                 final String type = tagopts.getProperty("type", EMPTY_STRING);
                 final String hreflang = tagopts.getProperty("hreflang", EMPTY_STRING);
@@ -475,6 +475,11 @@ public class ContentScraper extends AbstractScraper implements Scraper {
                     final ImageEntry ie = new ImageEntry(url, recursiveParse(url, text), -1, -1, -1);
                     this.images.add(ie);
                 } else {
+                    if (followDenied()) {
+                        String rel = tagopts.getProperty("rel", EMPTY_STRING);
+                        if (rel.length() == 0) rel = "nofollow"; else if (rel.indexOf("nofollow") < 0) rel += ",nofollow"; 
+                        tagopts.put("rel", rel);
+                    }
                     tagopts.put("text", new String(text));
                     tagopts.put("href", url.toNormalform(true)); // we must assign this because the url may have resolved backpaths and may not be absolute
                     url.setAll(tagopts);
@@ -765,6 +770,13 @@ public class ContentScraper extends AbstractScraper implements Scraper {
         if (s.indexOf("noindex",0) >= 0) return true;
         return false;
     }
+    
+    /** @return true if the "robots" meta entry of this document contains "nofollow" */
+    public boolean followDenied() {
+        final String robotsMeta = this.metas.get("robots");
+        // a missing "robots" entry means that following links is not denied
+        return robotsMeta != null && robotsMeta.indexOf("nofollow") >= 0;
+    }
 
     public List<String> getDescriptions() {
         String s = this.metas.get("description");
diff --git a/source/net/yacy/search/schema/WebgraphConfiguration.java b/source/net/yacy/search/schema/WebgraphConfiguration.java
index 9b7ab6362..0faa2f780 100644
--- a/source/net/yacy/search/schema/WebgraphConfiguration.java
+++ b/source/net/yacy/search/schema/WebgraphConfiguration.java
@@ -120,14 +120,19 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
             final IndexCell<CitationReference> citations) {
         boolean allAttr = this.isEmpty();
         int target_order = 0;
+        boolean generalNofollow = responseHeader.get("X-Robots-Tag", "").indexOf("nofollow") >= 0;
         for (final AnchorURL target_url: links) {
 
             Set<ProcessType> processTypes = new LinkedHashSet<ProcessType>();
             
             final String name = target_url.getNameProperty(); // the name attribute
             final String text = target_url.getTextProperty(); // the text between the <a></a> tag
-            final String rel = target_url.getRelProperty();   // the rel-attribute
+            String rel = target_url.getRelProperty();         // the rel-attribute
             int ioidx = inbound ? 0 : 1;
+            if (generalNofollow) {
+                // patch the rel attribute since the header makes nofollow valid for all links
+                if (rel.length() == 0) rel = "nofollow"; else if (rel.indexOf("nofollow") < 0) rel += ",nofollow"; 
+            }
             
             // index organization
             StringBuilder idi = new StringBuilder(8);