Merge branch 'master' of ssh://git@gitorious.org/yacy/rc1.git

pull/1/head
Michael Peter Christen 10 years ago
commit 6ee5b4352d

@@ -201,6 +201,7 @@ public final class Cache {
public static void store(final DigestURL url, final ResponseHeader responseHeader, final byte[] file) throws IOException { public static void store(final DigestURL url, final ResponseHeader responseHeader, final byte[] file) throws IOException {
if (maxCacheSize == 0) return; if (maxCacheSize == 0) return;
if (responseHeader.getXRobotsTag().contains("noarchive")) return; // don't cache, see http://noarchive.net/
if (responseHeader == null) throw new IOException("Cache.store of url " + url.toNormalform(false) + " not possible: responseHeader == null"); if (responseHeader == null) throw new IOException("Cache.store of url " + url.toNormalform(false) + " not possible: responseHeader == null");
if (file == null) throw new IOException("Cache.store of url " + url.toNormalform(false) + " not possible: file == null"); if (file == null) throw new IOException("Cache.store of url " + url.toNormalform(false) + " not possible: file == null");
log.info("storing content of url " + url.toNormalform(false) + ", " + file.length + " bytes"); log.info("storing content of url " + url.toNormalform(false) + ", " + file.length + " bytes");

@@ -557,6 +557,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
// bit 2: "follow" contained in html header meta // bit 2: "follow" contained in html header meta
// bit 3: "noindex" contained in html header meta // bit 3: "noindex" contained in html header meta
// bit 4: "nofollow" contained in html header meta // bit 4: "nofollow" contained in html header meta
// bit 5: "noarchive" contained in html header meta
// bit 8: "all" contained in http header X-Robots-Tag // bit 8: "all" contained in http header X-Robots-Tag
// bit 9: "noindex" contained in http header X-Robots-Tag // bit 9: "noindex" contained in http header X-Robots-Tag
// bit 10: "nofollow" contained in http header X-Robots-Tag // bit 10: "nofollow" contained in http header X-Robots-Tag
@@ -576,6 +577,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
if (robots_meta.indexOf("follow",0) == 0 || robots_meta.indexOf(" follow",0) >= 0 || robots_meta.indexOf(",follow",0) >= 0 ) b += 4; // set bit 2 if (robots_meta.indexOf("follow",0) == 0 || robots_meta.indexOf(" follow",0) >= 0 || robots_meta.indexOf(",follow",0) >= 0 ) b += 4; // set bit 2
if (robots_meta.indexOf("noindex",0) >= 0) b += 8; // set bit 3 if (robots_meta.indexOf("noindex",0) >= 0) b += 8; // set bit 3
if (robots_meta.indexOf("nofollow",0) >= 0) b += 16; // set bit 4 if (robots_meta.indexOf("nofollow",0) >= 0) b += 16; // set bit 4
if (robots_meta.indexOf("noarchive",0) >= 0) b += 32; // set bit 5
} }
String x_robots_tag = responseHeader == null ? "" : responseHeader.getXRobotsTag(); String x_robots_tag = responseHeader == null ? "" : responseHeader.getXRobotsTag();
if (!x_robots_tag.isEmpty()) { if (!x_robots_tag.isEmpty()) {
@@ -1494,10 +1496,10 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
try { try {
String doccountquery = String doccountquery =
CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + hostid + "\" AND " + CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + hostid + "\" AND " +
"-" + CollectionSchema.robots_i.getSolrFieldName() + ":8 AND " + // bit 3 "-" + CollectionSchema.robots_i.getSolrFieldName() + ":8 AND " + // bit 3 (noindex)
"-" + CollectionSchema.robots_i.getSolrFieldName() + ":24 AND " + // bit 3 + 4 "-" + CollectionSchema.robots_i.getSolrFieldName() + ":24 AND " + // bit 3 + 4 (noindex + nofollow)
"-" + CollectionSchema.robots_i.getSolrFieldName() + ":512 AND " + // bit 9 "-" + CollectionSchema.robots_i.getSolrFieldName() + ":512 AND " + // bit 9 (noindex)
"-" + CollectionSchema.robots_i.getSolrFieldName() + ":1536 AND " + // bit 9 + 10 "-" + CollectionSchema.robots_i.getSolrFieldName() + ":1536 AND " + // bit 9 + 10 (noindex + nofollow)
"((-" + CollectionSchema.canonical_equal_sku_b.getSolrFieldName() + ":" + AbstractSolrConnector.CATCHALL_TERM + ") OR (" + CollectionSchema.canonical_equal_sku_b.getSolrFieldName() + ":true)) AND " + "((-" + CollectionSchema.canonical_equal_sku_b.getSolrFieldName() + ":" + AbstractSolrConnector.CATCHALL_TERM + ") OR (" + CollectionSchema.canonical_equal_sku_b.getSolrFieldName() + ":true)) AND " +
CollectionSchema.httpstatus_i.getSolrFieldName() + ":200 AND " + CollectionSchema.httpstatus_i.getSolrFieldName() + ":200 AND " +
"-" + CollectionSchema.id.getSolrFieldName() + ":\"" + urlhash + "\" AND " + "-" + CollectionSchema.id.getSolrFieldName() + ":\"" + urlhash + "\" AND " +

@@ -111,15 +111,21 @@ public enum CollectionSchema implements SchemaDeclaration {
scripts_sxt(SolrType.string, true, true, true, false, false, "normalized urls within a scripts tag"), scripts_sxt(SolrType.string, true, true, true, false, false, "normalized urls within a scripts tag"),
scriptscount_i(SolrType.num_integer, true, true, false, false, false, "number of entries in scripts_sxt"), scriptscount_i(SolrType.num_integer, true, true, false, false, false, "number of entries in scripts_sxt"),
// encoded as binary value into an integer: // encoded as binary value into an integer:
// bit 0: "all" contained in html header meta // bit 0: "all" contained in html header meta
// bit 1: "index" contained in html header meta // bit 1: "index" contained in html header meta
// bit 2: "noindex" contained in html header meta // bit 2: "follow" contained in html header meta
// bit 3: "nofollow" contained in html header meta // bit 3: "noindex" contained in html header meta
// bit 8: "noarchive" contained in http header properties // bit 4: "nofollow" contained in html header meta
// bit 9: "nosnippet" contained in http header properties // bit 5: "noarchive" contained in html header meta
// bit 10: "noindex" contained in http header properties // bit 8: "all" contained in http header X-Robots-Tag
// bit 11: "nofollow" contained in http header properties // bit 9: "noindex" contained in http header X-Robots-Tag
// bit 12: "unavailable_after" contained in http header properties // bit 10: "nofollow" contained in http header X-Robots-Tag
// bit 11: "noarchive" contained in http header X-Robots-Tag
// bit 12: "nosnippet" contained in http header X-Robots-Tag
// bit 13: "noodp" contained in http header X-Robots-Tag
// bit 14: "notranslate" contained in http header X-Robots-Tag
// bit 15: "noimageindex" contained in http header X-Robots-Tag
// bit 16: "unavailable_after" contained in http header X-Robots-Tag
robots_i(SolrType.num_integer, true, true, false, false, false, "content of <meta name=\"robots\" content=#content#> tag and the \"X-Robots-Tag\" HTTP property"), robots_i(SolrType.num_integer, true, true, false, false, false, "content of <meta name=\"robots\" content=#content#> tag and the \"X-Robots-Tag\" HTTP property"),
metagenerator_t(SolrType.text_general, true, true, false, false, false, "content of <meta name=\"generator\" content=#content#> tag"), metagenerator_t(SolrType.text_general, true, true, false, false, false, "content of <meta name=\"generator\" content=#content#> tag"),
inboundlinks_protocol_sxt(SolrType.string, true, true, true, false, false, "internal links, only the protocol"), inboundlinks_protocol_sxt(SolrType.string, true, true, true, false, false, "internal links, only the protocol"),

Loading…
Cancel
Save