Set the anchor rel attribute of all links to "nofollow" if the HTML meta
tags contain robots:nofollow or if the HTTP response header contains
"X-Robots-Tag: nofollow".
pull/1/head
Michael Peter Christen 11 years ago
parent 57e00baf26
commit 31920385f7

@ -390,7 +390,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if (newLink != null) { if (newLink != null) {
tagopts.put("href", newLink.toNormalform(true)); tagopts.put("href", newLink.toNormalform(true));
final String rel = tagopts.getProperty("rel", EMPTY_STRING); String rel = tagopts.getProperty("rel", EMPTY_STRING);
final String linktitle = tagopts.getProperty("title", EMPTY_STRING); final String linktitle = tagopts.getProperty("title", EMPTY_STRING);
final String type = tagopts.getProperty("type", EMPTY_STRING); final String type = tagopts.getProperty("type", EMPTY_STRING);
final String hreflang = tagopts.getProperty("hreflang", EMPTY_STRING); final String hreflang = tagopts.getProperty("hreflang", EMPTY_STRING);
@ -475,6 +475,11 @@ public class ContentScraper extends AbstractScraper implements Scraper {
final ImageEntry ie = new ImageEntry(url, recursiveParse(url, text), -1, -1, -1); final ImageEntry ie = new ImageEntry(url, recursiveParse(url, text), -1, -1, -1);
this.images.add(ie); this.images.add(ie);
} else { } else {
if (followDenied()) {
String rel = tagopts.getProperty("rel", EMPTY_STRING);
if (rel.length() == 0) rel = "nofollow"; else if (rel.indexOf("nofollow") < 0) rel += ",nofollow";
tagopts.put("rel", rel);
}
tagopts.put("text", new String(text)); tagopts.put("text", new String(text));
tagopts.put("href", url.toNormalform(true)); // we must assign this because the url may have resolved backpaths and may not be absolute tagopts.put("href", url.toNormalform(true)); // we must assign this because the url may have resolved backpaths and may not be absolute
url.setAll(tagopts); url.setAll(tagopts);
@ -766,6 +771,13 @@ public class ContentScraper extends AbstractScraper implements Scraper {
return false; return false;
} }
/**
 * Reports whether the "robots" meta entry collected for this document
 * forbids following its links.
 *
 * @return {@code true} if {@code metas} holds a "robots" entry whose value
 *         contains the substring "nofollow" (case-sensitive match);
 *         {@code false} if there is no such entry or the substring is absent
 */
public boolean followDenied() {
    final String robots = this.metas.get("robots");
    // a missing robots entry means nothing is denied
    return robots != null && robots.contains("nofollow");
}
public List<String> getDescriptions() { public List<String> getDescriptions() {
String s = this.metas.get("description"); String s = this.metas.get("description");
if (s == null) s = this.metas.get("dc.description"); if (s == null) s = this.metas.get("dc.description");

@ -120,14 +120,19 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
final IndexCell<CitationReference> citations) { final IndexCell<CitationReference> citations) {
boolean allAttr = this.isEmpty(); boolean allAttr = this.isEmpty();
int target_order = 0; int target_order = 0;
boolean generalNofollow = responseHeader.get("X-Robots-Tag", "").indexOf("nofollow") >= 0;
for (final AnchorURL target_url: links) { for (final AnchorURL target_url: links) {
Set<ProcessType> processTypes = new LinkedHashSet<ProcessType>(); Set<ProcessType> processTypes = new LinkedHashSet<ProcessType>();
final String name = target_url.getNameProperty(); // the name attribute final String name = target_url.getNameProperty(); // the name attribute
final String text = target_url.getTextProperty(); // the text between the <a></a> tag final String text = target_url.getTextProperty(); // the text between the <a></a> tag
final String rel = target_url.getRelProperty(); // the rel-attribute String rel = target_url.getRelProperty(); // the rel-attribute
int ioidx = inbound ? 0 : 1; int ioidx = inbound ? 0 : 1;
if (generalNofollow) {
// patch the rel attribute since the header makes nofollow valid for all links
if (rel.length() == 0) rel = "nofollow"; else if (rel.indexOf("nofollow") < 0) rel += ",nofollow";
}
// index organization // index organization
StringBuilder idi = new StringBuilder(8); StringBuilder idi = new StringBuilder(8);

Loading…
Cancel
Save