Set the anchor rel attribute of all links to "nofollow" if the HTML meta

contains a robots "nofollow" directive or if the HTTP header contains an "X-Robots-Tag: nofollow"
11 years ago · 31920385f7
parent 57e00baf26
commit 31920385f7
2 changed files with 19 additions and 2 deletions
--- a/source/net/yacy/document/parser/html/ContentScraper.java
+++ b/source/net/yacy/document/parser/html/ContentScraper.java
@ -390,7 +390,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {

            if (newLink != null) {
                tagopts.put("href", newLink.toNormalform(true));
-                final String rel = tagopts.getProperty("rel", EMPTY_STRING);
+                String rel = tagopts.getProperty("rel", EMPTY_STRING);
                final String linktitle = tagopts.getProperty("title", EMPTY_STRING);
                final String type = tagopts.getProperty("type", EMPTY_STRING);
                final String hreflang = tagopts.getProperty("hreflang", EMPTY_STRING);
@ -475,6 +475,11 @@ public class ContentScraper extends AbstractScraper implements Scraper {
                    final ImageEntry ie = new ImageEntry(url, recursiveParse(url, text), -1, -1, -1);
                    this.images.add(ie);
                } else {
+                    if (followDenied()) {
+                        String rel = tagopts.getProperty("rel", EMPTY_STRING);
+                        if (rel.length() == 0) rel = "nofollow"; else if (rel.indexOf("nofollow") < 0) rel += ",nofollow"; 
+                        tagopts.put("rel", rel);
+                    }
                    tagopts.put("text", new String(text));
                    tagopts.put("href", url.toNormalform(true)); // we must assign this because the url may have resolved backpaths and may not be absolute
                    url.setAll(tagopts);
@ -765,6 +770,13 @@ public class ContentScraper extends AbstractScraper implements Scraper {
        if (s.indexOf("noindex",0) >= 0) return true;
        return false;
    }
+    
+    /** @return true if the "robots" entry of this.metas exists and contains "nofollow" */
+    public boolean followDenied() {
+        final String robotsMeta = this.metas.get("robots");
+        // a missing "robots" entry means nothing forbids following the links
+        return robotsMeta != null && robotsMeta.contains("nofollow");
+    }

    public List<String> getDescriptions() {
        String s = this.metas.get("description");
--- a/source/net/yacy/search/schema/WebgraphConfiguration.java
+++ b/source/net/yacy/search/schema/WebgraphConfiguration.java
@ -120,14 +120,19 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
            final IndexCell<CitationReference> citations) {
        boolean allAttr = this.isEmpty();
        int target_order = 0;
+        boolean generalNofollow = responseHeader.get("X-Robots-Tag", "").indexOf("nofollow") >= 0;
        for (final AnchorURL target_url: links) {

            Set<ProcessType> processTypes = new LinkedHashSet<ProcessType>();
            
            final String name = target_url.getNameProperty(); // the name attribute
            final String text = target_url.getTextProperty(); // the text between the <a></a> tag
-            final String rel = target_url.getRelProperty();   // the rel-attribute
+            String rel = target_url.getRelProperty();         // the rel-attribute
            int ioidx = inbound ? 0 : 1;
+            if (generalNofollow) {
+                // patch the rel attribute since the header makes nofollow valid for all links
+                if (rel.length() == 0) rel = "nofollow"; else if (rel.indexOf("nofollow") < 0) rel += ",nofollow"; 
+            }
            
            // index organization
            StringBuilder idi = new StringBuilder(8);