From b0d941626fb15caadbf20abc70fe46d31ee5d5a1 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Thu, 10 Jul 2014 15:40:38 +0200 Subject: [PATCH] fixed bugs in canonical, robots and title/description unique calculation --- .../yacy/cora/federate/solr/SchemaConfiguration.java | 12 +++--------- .../yacy/search/schema/CollectionConfiguration.java | 10 +++++----- 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/source/net/yacy/cora/federate/solr/SchemaConfiguration.java b/source/net/yacy/cora/federate/solr/SchemaConfiguration.java index 5537b767b..c61955004 100644 --- a/source/net/yacy/cora/federate/solr/SchemaConfiguration.java +++ b/source/net/yacy/cora/federate/solr/SchemaConfiguration.java @@ -173,7 +173,7 @@ public class SchemaConfiguration extends Configuration implements Serializable { String canonical_s = this.contains(CollectionSchema.canonical_s) ? (String) sid.getFieldValue(CollectionSchema.canonical_s.getSolrFieldName()) : null; Boolean canonical_equal_sku_b = this.contains(CollectionSchema.canonical_equal_sku_b) ? (Boolean) sid.getFieldValue(CollectionSchema.canonical_equal_sku_b.getSolrFieldName()) : null; if (segment.fulltext().getDefaultConfiguration().contains(CollectionSchema.host_id_s) && - (robots_i == null || (robots_i.intValue() & (1 << 9)) == 0) && + (robots_i == null || (robots_i.intValue() & (1 << 9)) == 0 /*noindex in http X-ROBOTS*/ && (robots_i.intValue() & (1 << 3)) == 0 /*noindex in html metas*/ ) && (canonical_s == null || canonical_s.length() == 0 || (canonical_equal_sku_b != null && canonical_equal_sku_b.booleanValue()) || url.toNormalform(true).equals(canonical_s)) && (httpstatus_i == null || httpstatus_i.intValue() == 200)) { uniquecheck: for (CollectionSchema[] checkfields: new CollectionSchema[][] { @@ -190,14 +190,8 @@ public class SchemaConfiguration extends Configuration implements Serializable { continue uniquecheck; } try { - SolrDocumentList docs = segment.fulltext().getDefaultConnector().getDocumentListByQuery("-" + CollectionSchema.id.getSolrFieldName() + ":\"" + urlhash + "\" AND " + CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + hostid + "\" AND " + signaturefield.getSolrFieldName() + ":\"" + signature.toString() + "\"", null, 0, 100, CollectionSchema.id.getSolrFieldName()); - if (docs.getNumFound() == 0) { - sid.setField(uniquefield.getSolrFieldName(), true); - } else { - boolean firstappearance = true; - for (SolrDocument d: docs) {if (uniqueURLs.contains(d.getFieldValue(CollectionSchema.id.getSolrFieldName()))) firstappearance = false; break;} - sid.setField(uniquefield.getSolrFieldName(), firstappearance); - } + long doccount = segment.fulltext().getDefaultConnector().getCountByQuery("-" + CollectionSchema.id.getSolrFieldName() + ":\"" + urlhash + "\" AND " + CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + hostid + "\" AND " + signaturefield.getSolrFieldName() + ":\"" + signature.toString() + "\""); + sid.setField(uniquefield.getSolrFieldName(), doccount == 0); } catch (final IOException e) {} } } diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index a81320e30..5dba2e9d6 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -397,13 +397,11 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri // we use the SolrCell design as index schema SolrVector doc = new SolrVector(); final DigestURL digestURL = document.dc_source(); - final String id = ASCII.String(digestURL.hash()); boolean allAttr = this.isEmpty(); String url = addURIAttributes(doc, allAttr, digestURL, Response.docType(digestURL)); Set processTypes = new LinkedHashSet(); String host = digestURL.getHost(); - String us = digestURL.toNormalform(true); int crawldepth = document.getDepth(); if ((allAttr || contains(CollectionSchema.crawldepth_i))) { @@ -562,9 +560,10 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri // bit 15: "noimageindex" contained in http header X-Robots-Tag // bit 16: "unavailable_after" contained in http header X-Robots-Tag int b = 0; - final String robots_meta = html.getMetas().get("robots"); + String robots_meta = html.getMetas().get("robots"); // this tag may have values: all, index, noindex, nofollow; see http://www.robotstxt.org/meta.html if (robots_meta != null) { + robots_meta = robots_meta.toLowerCase(); if (robots_meta.indexOf("all",0) >= 0) b += 1; // set bit 0 if (robots_meta.indexOf("index",0) == 0 || robots_meta.indexOf(" index",0) >= 0 || robots_meta.indexOf(",index",0) >= 0 ) b += 2; // set bit 1 if (robots_meta.indexOf("follow",0) == 0 || robots_meta.indexOf(" follow",0) >= 0 || robots_meta.indexOf(",follow",0) >= 0 ) b += 4; // set bit 2 @@ -579,6 +578,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri } } if (!x_robots_tag.isEmpty()) { + x_robots_tag = x_robots_tag.toLowerCase(); // this tag may have values: all, noindex, nofollow, noarchive, nosnippet, noodp, notranslate, noimageindex, unavailable_after, none; see https://developers.google.com/webmasters/control-crawl-index/docs/robots_meta_tag?hl=de if (x_robots_tag.indexOf("all",0) >= 0) b += 1<<8; // set bit 8 if (x_robots_tag.indexOf("noindex",0) >= 0||x_robots_tag.indexOf("none",0) >= 0) b += 1<<9; // set bit 9 @@ -754,14 +754,14 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri } } } - if (canonical != null && !ASCII.String(canonical.hash()).equals(id)) { + if (canonical != null) { containsCanonical = true; inboundLinks.remove(canonical); outboundLinks.remove(canonical); add(doc, CollectionSchema.canonical_s, canonical.toNormalform(false)); // set a flag if this is equal to sku if (contains(CollectionSchema.canonical_equal_sku_b)) { - add(doc, CollectionSchema.canonical_equal_sku_b, canonical.equals(us)); + add(doc, CollectionSchema.canonical_equal_sku_b, canonical.equals(digestURL)); } } }