From 31920385f70d8b899d79fdf5b6b22a580020b366 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Mon, 16 Sep 2013 16:14:56 +0200 Subject: [PATCH] set anchor rel attribute of all links to "nofollow" if the html meta contains a robots:nofollow or if the http header contains a "X-Robots-Tag: nofollow" --- .../yacy/document/parser/html/ContentScraper.java | 14 +++++++++++++- .../yacy/search/schema/WebgraphConfiguration.java | 7 ++++++- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index 702fbd344..00f396bf8 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -390,7 +390,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { if (newLink != null) { tagopts.put("href", newLink.toNormalform(true)); - final String rel = tagopts.getProperty("rel", EMPTY_STRING); + String rel = tagopts.getProperty("rel", EMPTY_STRING); final String linktitle = tagopts.getProperty("title", EMPTY_STRING); final String type = tagopts.getProperty("type", EMPTY_STRING); final String hreflang = tagopts.getProperty("hreflang", EMPTY_STRING); @@ -475,6 +475,11 @@ public class ContentScraper extends AbstractScraper implements Scraper { final ImageEntry ie = new ImageEntry(url, recursiveParse(url, text), -1, -1, -1); this.images.add(ie); } else { + if (followDenied()) { + String rel = tagopts.getProperty("rel", EMPTY_STRING); + if (rel.length() == 0) rel = "nofollow"; else if (rel.indexOf("nofollow") < 0) rel += ",nofollow"; + tagopts.put("rel", rel); + } tagopts.put("text", new String(text)); tagopts.put("href", url.toNormalform(true)); // we must assign this because the url may have resolved backpaths and may not be absolute url.setAll(tagopts); @@ -765,6 +770,13 @@ public class ContentScraper extends AbstractScraper implements Scraper { if (s.indexOf("noindex",0) >= 0) return true; return false; } + + public boolean followDenied() { + final String s = this.metas.get("robots"); + if (s == null) return false; + if (s.indexOf("nofollow",0) >= 0) return true; + return false; + } public List getDescriptions() { String s = this.metas.get("description"); diff --git a/source/net/yacy/search/schema/WebgraphConfiguration.java b/source/net/yacy/search/schema/WebgraphConfiguration.java index 9b7ab6362..0faa2f780 100644 --- a/source/net/yacy/search/schema/WebgraphConfiguration.java +++ b/source/net/yacy/search/schema/WebgraphConfiguration.java @@ -120,14 +120,19 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial final IndexCell citations) { boolean allAttr = this.isEmpty(); int target_order = 0; + boolean generalNofollow = responseHeader.get("X-Robots-Tag", "").indexOf("nofollow") >= 0; for (final AnchorURL target_url: links) { Set processTypes = new LinkedHashSet(); final String name = target_url.getNameProperty(); // the name attribute final String text = target_url.getTextProperty(); // the text between the tag - final String rel = target_url.getRelProperty(); // the rel-attribute + String rel = target_url.getRelProperty(); // the rel-attribute int ioidx = inbound ? 0 : 1; + if (generalNofollow) { + // patch the rel attribute since the header makes nofollow valid for all links + if (rel.length() == 0) rel = "nofollow"; else if (rel.indexOf("nofollow") < 0) rel += ",nofollow"; + } // index organization StringBuilder idi = new StringBuilder(8);