added robots tag parser to solr scheme

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7986 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 14 years ago
parent cf4fd525ee
commit 1b45e33f04

@ -60,8 +60,18 @@ attr_scripts
## number of script entries, int
scriptscount_i
## content of <meta name="robots" content=#content#> tag, text
metarobots_t
## encoded as binary value into an integer:
## bit 0: "all" contained in html header meta
## bit 1: "index" contained in html header meta
## bit 2: "noindex" contained in html header meta
## bit 3: "nofollow" contained in html header meta
## bit 8: "noarchive" contained in http header properties
## bit 9: "nosnippet" contained in http header properties
## bit 10: "noindex" contained in http header properties
## bit 11: "nofollow" contained in http header properties
## bit 12: "unavailable_after" contained in http header properties
## flags derived from the <meta name="robots" content=#content#> tag and the "X-Robots-Tag" HTTP property (bit layout as listed above), int
robots_i
## HTTP status return code (e.g. "200" for ok), -1 if not loaded, int
httpstatus_i

@ -104,6 +104,9 @@ public class HeaderFramework extends TreeMap<String, String> implements Map<Stri
public static final String VIA = "Via";
public static final String X_FORWARDED_FOR = "X-Forwarded-For";
public static final String X_ROBOTS_TAG = "X-Robots-Tag"; // see http://googleblog.blogspot.com/2007/07/robots-exclusion-protocol-now-with-even.html
public static final String X_ROBOTS = "X-Robots";
public static final String X_YACY_INDEX_CONTROL = "X-YACY-Index-Control";
//public static final String X_YACY_PREVIOUS_REQUEST_LINE = "X-Previous-Request-Line";
public static final String X_YACY_KEEP_ALIVE_REQUEST_COUNT = "X-Keep-Alive-Request-Count";
@ -235,7 +238,7 @@ public class HeaderFramework extends TreeMap<String, String> implements Map<Stri
this.reverseMappingCache = reverseMappingCache;
// load with data
if (othermap != null) this.putAll(othermap);
if (othermap != null) putAll(othermap);
}
/** Date formatter/parser for standard compliant HTTP header dates (RFC 1123) */
@ -272,7 +275,7 @@ public class HeaderFramework extends TreeMap<String, String> implements Map<Stri
return lastRFC1123string;
}
synchronized (FORMAT_RFC1123) {
String s = FORMAT_RFC1123.format(date);
final String s = FORMAT_RFC1123.format(date);
lastRFC1123long = date.getTime();
lastRFC1123string = s;
return s;
@ -286,7 +289,7 @@ public class HeaderFramework extends TreeMap<String, String> implements Map<Stri
// the year value starting with 1970
CAL_GMT.setTimeInMillis(0);
for (SimpleDateFormat format: FORMATS_HTTP) {
for (final SimpleDateFormat format: FORMATS_HTTP) {
format.setTimeZone(TZ_GMT);
format.set2DigitYearStart(CAL_GMT.getTime());
}
@ -300,7 +303,7 @@ public class HeaderFramework extends TreeMap<String, String> implements Map<Stri
public static Date parseHTTPDate(String s) {
s = s.trim();
if (s == null || s.length() < 9) return null;
for (SimpleDateFormat format: FORMATS_HTTP) synchronized (format) {
for (final SimpleDateFormat format: FORMATS_HTTP) synchronized (format) {
try { return format.parse(s); } catch (final ParseException e) {}
}
return null;
@ -311,18 +314,18 @@ public class HeaderFramework extends TreeMap<String, String> implements Map<Stri
/**
 * Stores a header value, normalizing the spelling of the key through the
 * reverse mapping cache when one is present.
 *
 * <p>The cache maps the upper-cased key to the spelling that was first used
 * for it. If the cache is {@code null}, the pair is stored unchanged. If the
 * cache already knows the upper-cased key, the value is stored under the
 * cached spelling. Otherwise the pair is stored under the given key and that
 * spelling is recorded in the cache.
 *
 * <p>NOTE(review): in this listing each changed statement appears twice, first
 * in its unqualified form and then in its {@code this.}-qualified form; this
 * looks like a diff rendering without +/- markers - confirm against the real file.
 *
 * @param key   header name; must not be null because it is upper-cased immediately
 * @param value header value to store
 * @return whatever {@code super.put} returns (for a TreeMap, the value previously
 *         mapped to the effective key, or null)
 */
public String put(final String key, final String value) {
// case-insensitive lookup key for the reverse mapping cache
final String upperK = key.toUpperCase();
if (reverseMappingCache == null) {
if (this.reverseMappingCache == null) {
return super.put(key, value);
}
if (reverseMappingCache.containsKey(upperK)) {
if (this.reverseMappingCache.containsKey(upperK)) {
// we put in the value using the reverse mapping
return super.put(reverseMappingCache.get(upperK), value);
return super.put(this.reverseMappingCache.get(upperK), value);
}
// we put in without a cached key and store the key afterwards
final String r = super.put(key, value);
reverseMappingCache.put(upperK, key);
this.reverseMappingCache.put(upperK, key);
return r;
}
@ -336,7 +339,7 @@ public class HeaderFramework extends TreeMap<String, String> implements Map<Stri
/**
 * Counts the entries stored for a key.
 *
 * <p>Returns 0 if the key itself is absent. Otherwise counting starts at 1 and
 * continues for every consecutive key of the form {@code "*" + key + "-" + c}
 * (c = 1, 2, ...) that is present.
 *
 * <p>NOTE(review): the declaration of {@code h} appears twice in this listing
 * (without and with {@code final}); this looks like a diff rendering without
 * +/- markers - confirm against the real file.
 *
 * @param key the key to look up
 * @return 0 if the key is absent, otherwise 1 plus the number of consecutive numbered keys found
 */
public int keyCount(final String key) {
if (!(containsKey(key))) return 0;
int c = 1;
// prefix of the numbered keys that are probed below
String h = "*" + key + "-";
final String h = "*" + key + "-";
// advance until the first missing numbered key
while (containsKey(h + Integer.toString(c))) c++;
return c;
}
@ -366,7 +369,7 @@ public class HeaderFramework extends TreeMap<String, String> implements Map<Stri
FileOutputStream fos = null;
try {
fos = new FileOutputStream(f);
for (java.util.Map.Entry<String, String> entry: entrySet()) {
for (final java.util.Map.Entry<String, String> entry: entrySet()) {
fos.write(UTF8.getBytes(entry.getKey() + "=" + entry.getValue() + "\r\n"));
}
fos.flush();
@ -604,10 +607,10 @@ public class HeaderFramework extends TreeMap<String, String> implements Map<Stri
this.v = v;
}
/**
 * Returns the key stored in the field {@code k}.
 * NOTE(review): the return statement appears twice in this listing (old and new
 * form of the same line) - confirm against the real file.
 */
public String getKey() {
return k;
return this.k;
}
/**
 * Returns the value stored in the field {@code v}.
 * NOTE(review): the return statement appears twice in this listing (old and new
 * form of the same line) - confirm against the real file.
 */
public String getValue() {
return v;
return this.v;
}
public String setValue(final String v) {
final String r = this.v;
@ -639,7 +642,7 @@ public class HeaderFramework extends TreeMap<String, String> implements Map<Stri
if (path != null) cookieString += " path=" + path + ";";
if (domain != null) cookieString += " domain=" + domain + ";";
if (secure) cookieString += " secure;";
headerProps.add(new Entry("Set-Cookie", cookieString));
this.headerProps.add(new Entry("Set-Cookie", cookieString));
}
/**
* Sets Cookie on the client machine.
@ -700,7 +703,7 @@ public class HeaderFramework extends TreeMap<String, String> implements Map<Stri
setCookie( name, value, null, null, null, false);
}
public String getHeaderCookies(){
final Iterator<Map.Entry<String, String>> it = this.entrySet().iterator();
final Iterator<Map.Entry<String, String>> it = entrySet().iterator();
while(it.hasNext())
{
final Map.Entry<String, String> e = it.next();
@ -714,15 +717,15 @@ public class HeaderFramework extends TreeMap<String, String> implements Map<Stri
}
/**
 * Appends a new {@code Entry(key, value)} to {@code headerProps}.
 * NOTE(review): the statement appears twice in this listing (old and new form
 * of the same line) - confirm against the real file.
 *
 * @param key   name of the additional header
 * @param value value of the additional header
 */
public void addHeader(final String key, final String value) {
headerProps.add(new Entry(key, value));
this.headerProps.add(new Entry(key, value));
}
/**
 * Returns the {@code headerProps} vector itself (no copy is made here).
 * NOTE(review): the return statement appears twice in this listing (old and new
 * form of the same line) - confirm against the real file.
 *
 * @return the vector of additional header entries
 */
public Vector<Entry> getAdditionalHeaderProperties() {
return headerProps;
return this.headerProps;
}
/**
 * Replaces the {@code headerProps} reference with the given vector.
 * NOTE(review): the assignment appears twice in this listing (old and new form
 * of the same line) - confirm against the real file.
 *
 * @param mycookies the vector of entries to use as additional header properties
 */
public void setAdditionalHeaderProperties(final Vector<Entry> mycookies){
headerProps=mycookies;
this.headerProps=mycookies;
}
/*

@ -229,11 +229,42 @@ public class SolrScheme extends ConfigurationSet {
// canonical tag
if (html.getCanonical() != null) addSolr(solrdoc, "canonical_s", html.getCanonical().toNormalform(false, false));
// meta tags
final Map<String, String> metas = html.getMetas();
final String robots = metas.get("robots");
if (robots != null) addSolr(solrdoc, "metarobots_t", robots);
final String generator = metas.get("generator");
// noindex and nofollow attributes
// from HTML (meta-tag in HTML header: robots)
// and HTTP header (X-Robots-Tag property, falling back to X-Robots if that is empty)
// coded as binary value:
// bit 0: "all" contained in html header meta
// bit 1: "index" contained in html header meta
// bit 2: "noindex" contained in html header meta
// bit 3: "nofollow" contained in html header meta
// bit 8: "noarchive" contained in http header properties
// bit 9: "nosnippet" contained in http header properties
// bit 10: "noindex" contained in http header properties
// bit 11: "nofollow" contained in http header properties
// bit 12: "unavailable_after" contained in http header properties
int b = 0;
final String robots_meta = html.getMetas().get("robots");
// this tag may have values: all, index, noindex, nofollow
if (robots_meta != null) {
if (robots_meta.indexOf("all") >= 0) b += 1; // set bit 0
if (robots_meta.indexOf("index") == 0 || robots_meta.indexOf(" index") >= 0 || robots_meta.indexOf(",index") >= 0 ) b += 2; // set bit 1
if (robots_meta.indexOf("noindex") >= 0) b += 4; // set bit 2
if (robots_meta.indexOf("nofollow") >= 0) b += 8; // set bit 3
}
String x_robots_tag = header.get(HeaderFramework.X_ROBOTS_TAG, "");
if (x_robots_tag.length() == 0) x_robots_tag = header.get(HeaderFramework.X_ROBOTS, "");
// this tag may have values: noarchive, nosnippet, noindex, nofollow, unavailable_after
if (x_robots_tag.length() > 0) {
if (x_robots_tag.indexOf("noarchive") >= 0) b += 256; // set bit 8
if (x_robots_tag.indexOf("nosnippet") >= 0) b += 512; // set bit 9
if (x_robots_tag.indexOf("noindex") >= 0) b += 1024; // set bit 10
if (x_robots_tag.indexOf("nofollow") >= 0) b += 2048; // set bit 11
if (x_robots_tag.indexOf("unavailable_after") >=0) b += 4096; // set bit 12
}
addSolr(solrdoc, "robots_i", b);
// meta tags: generator
final String generator = html.getMetas().get("generator");
if (generator != null) addSolr(solrdoc, "metagenerator_t", generator);
// bold, italic
@ -353,6 +384,13 @@ public class SolrScheme extends ConfigurationSet {
return solrdoc;
}
/**
* encodes the attributes found in anchor rel properties as a binary value:
* bit 0: "me" contained in rel
* bit 1: "nofollow" contained in rel
* @param rel the rel property strings to evaluate
* @return binary encoded information about rel
*/
private int relEval(final String[] rel) {
int i = 0;
for (final String s: rel) {

Loading…
Cancel
Save