Fix processing of the noindex flag in the HTTP header

pull/1/head
Michael Peter Christen 11 years ago
parent b0d941626f
commit fb3dd56b02

@ -108,6 +108,7 @@ public class ResponseHeader extends HeaderFramework {
if (x_robots_tag.isEmpty()) {
x_robots_tag = this.get(HeaderFramework.X_ROBOTS, "");
}
return x_robots_tag;
return x_robots_tag.toLowerCase();
}
}

@ -90,7 +90,7 @@ public class Document {
private MultiProtocolURL favicon;
private boolean resorted;
private final Set<String> languages;
private final boolean indexingDenied;
private boolean indexingDenied;
private final double lon, lat;
private final Object parserObject; // the source object that was used to create the Document
private final Map<String, Set<String>> generic_facets; // a map from vocabulary names to the set of tags for that vocabulary which apply for this document
@ -733,6 +733,10 @@ dc_rights
return this.indexingDenied;
}
/**
 * Overrides the indexing-denied flag of this document after it has been created.
 *
 * @param indexingDenied the new value stored in this document's
 *        {@code indexingDenied} field; {@code true} marks the document as
 *        one that must not be indexed
 */
public void setIndexingDenied(final boolean indexingDenied) {
    this.indexingDenied = indexingDenied;
}
/**
 * Stores the given depth in this document's {@code crawldepth} field.
 *
 * @param depth the depth value to record for this document
 */
public void setDepth(final int depth) {
    this.crawldepth = depth;
}
@ -819,6 +823,7 @@ dc_rights
final LinkedHashMap<AnchorURL, ImageEntry> images = new LinkedHashMap<AnchorURL, ImageEntry>();
final Set<String> languages = new HashSet<String>();
double lon = 0.0d, lat = 0.0d;
boolean indexingDenied = false;
Date date = new Date();
String charset = null;
@ -867,6 +872,8 @@ dc_rights
if (doc.getDepth() < mindepth) mindepth = doc.getDepth();
if (doc.dc_language() != null) languages.add(doc.dc_language());
indexingDenied |= doc.indexingDenied;
}
// clean up parser data
@ -898,7 +905,7 @@ dc_rights
anchors,
rss,
images,
false,
indexingDenied,
date);
newDoc.setDepth(mindepth);
return newDoc;

@ -355,7 +355,14 @@ public final class LoaderDispatcher {
if (response.getContent() == null || response.getResponseHeader() == null) throw new IOException("no Content available for url " + url);
// parse resource
return response.parse();
Document[] documents = response.parse();
String x_robots_tag = response.getResponseHeader().getXRobotsTag();
if (x_robots_tag.indexOf("noindex",0) >= 0) {
for (Document d: documents) d.setIndexingDenied(true);
}
return documents;
}
public Document loadDocument(final DigestURL location, final CacheStrategy cachePolicy, BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
@ -371,7 +378,12 @@ public final class LoaderDispatcher {
// parse resource
try {
Document[] documents = response.parse();
return Document.mergeDocuments(location, response.getMimeType(), documents);
Document merged = Document.mergeDocuments(location, response.getMimeType(), documents);
String x_robots_tag = response.getResponseHeader().getXRobotsTag();
if (x_robots_tag.indexOf("noindex",0) >= 0) merged.setIndexingDenied(true);
return merged;
} catch(final Parser.Failure e) {
throw new IOException(e.getMessage());
}

@ -570,15 +570,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
if (robots_meta.indexOf("noindex",0) >= 0) b += 8; // set bit 3
if (robots_meta.indexOf("nofollow",0) >= 0) b += 16; // set bit 4
}
String x_robots_tag = "";
if (responseHeader != null) {
x_robots_tag = responseHeader.get(HeaderFramework.X_ROBOTS_TAG, "");
if (x_robots_tag.isEmpty()) {
x_robots_tag = responseHeader.get(HeaderFramework.X_ROBOTS, "");
}
}
String x_robots_tag = responseHeader.getXRobotsTag();
if (!x_robots_tag.isEmpty()) {
x_robots_tag = x_robots_tag.toLowerCase();
// this tag may have values: all, noindex, nofollow, noarchive, nosnippet, noodp, notranslate, noimageindex, unavailable_after, none; see https://developers.google.com/webmasters/control-crawl-index/docs/robots_meta_tag?hl=de
if (x_robots_tag.indexOf("all",0) >= 0) b += 1<<8; // set bit 8
if (x_robots_tag.indexOf("noindex",0) >= 0||x_robots_tag.indexOf("none",0) >= 0) b += 1<<9; // set bit 9

Loading…
Cancel
Save