Fixed bugs in the canonical, robots, and title/description uniqueness calculations

pull/1/head
Michael Peter Christen 11 years ago
parent d9472d043a
commit b0d941626f

@ -173,7 +173,7 @@ public class SchemaConfiguration extends Configuration implements Serializable {
String canonical_s = this.contains(CollectionSchema.canonical_s) ? (String) sid.getFieldValue(CollectionSchema.canonical_s.getSolrFieldName()) : null; String canonical_s = this.contains(CollectionSchema.canonical_s) ? (String) sid.getFieldValue(CollectionSchema.canonical_s.getSolrFieldName()) : null;
Boolean canonical_equal_sku_b = this.contains(CollectionSchema.canonical_equal_sku_b) ? (Boolean) sid.getFieldValue(CollectionSchema.canonical_equal_sku_b.getSolrFieldName()) : null; Boolean canonical_equal_sku_b = this.contains(CollectionSchema.canonical_equal_sku_b) ? (Boolean) sid.getFieldValue(CollectionSchema.canonical_equal_sku_b.getSolrFieldName()) : null;
if (segment.fulltext().getDefaultConfiguration().contains(CollectionSchema.host_id_s) && if (segment.fulltext().getDefaultConfiguration().contains(CollectionSchema.host_id_s) &&
(robots_i == null || (robots_i.intValue() & (1 << 9)) == 0) && (robots_i == null || (robots_i.intValue() & (1 << 9)) == 0 /*noindex in http X-ROBOTS*/ && (robots_i.intValue() & (1 << 3)) == 0 /*noindex in html metas*/ ) &&
(canonical_s == null || canonical_s.length() == 0 || (canonical_equal_sku_b != null && canonical_equal_sku_b.booleanValue()) || url.toNormalform(true).equals(canonical_s)) && (canonical_s == null || canonical_s.length() == 0 || (canonical_equal_sku_b != null && canonical_equal_sku_b.booleanValue()) || url.toNormalform(true).equals(canonical_s)) &&
(httpstatus_i == null || httpstatus_i.intValue() == 200)) { (httpstatus_i == null || httpstatus_i.intValue() == 200)) {
uniquecheck: for (CollectionSchema[] checkfields: new CollectionSchema[][] { uniquecheck: for (CollectionSchema[] checkfields: new CollectionSchema[][] {
@ -190,14 +190,8 @@ public class SchemaConfiguration extends Configuration implements Serializable {
continue uniquecheck; continue uniquecheck;
} }
try { try {
SolrDocumentList docs = segment.fulltext().getDefaultConnector().getDocumentListByQuery("-" + CollectionSchema.id.getSolrFieldName() + ":\"" + urlhash + "\" AND " + CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + hostid + "\" AND " + signaturefield.getSolrFieldName() + ":\"" + signature.toString() + "\"", null, 0, 100, CollectionSchema.id.getSolrFieldName()); long doccount = segment.fulltext().getDefaultConnector().getCountByQuery("-" + CollectionSchema.id.getSolrFieldName() + ":\"" + urlhash + "\" AND " + CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + hostid + "\" AND " + signaturefield.getSolrFieldName() + ":\"" + signature.toString() + "\"");
if (docs.getNumFound() == 0) { sid.setField(uniquefield.getSolrFieldName(), doccount == 0);
sid.setField(uniquefield.getSolrFieldName(), true);
} else {
boolean firstappearance = true;
for (SolrDocument d: docs) {if (uniqueURLs.contains(d.getFieldValue(CollectionSchema.id.getSolrFieldName()))) firstappearance = false; break;}
sid.setField(uniquefield.getSolrFieldName(), firstappearance);
}
} catch (final IOException e) {} } catch (final IOException e) {}
} }
} }

@ -397,13 +397,11 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
// we use the SolrCell design as index schema // we use the SolrCell design as index schema
SolrVector doc = new SolrVector(); SolrVector doc = new SolrVector();
final DigestURL digestURL = document.dc_source(); final DigestURL digestURL = document.dc_source();
final String id = ASCII.String(digestURL.hash());
boolean allAttr = this.isEmpty(); boolean allAttr = this.isEmpty();
String url = addURIAttributes(doc, allAttr, digestURL, Response.docType(digestURL)); String url = addURIAttributes(doc, allAttr, digestURL, Response.docType(digestURL));
Set<ProcessType> processTypes = new LinkedHashSet<ProcessType>(); Set<ProcessType> processTypes = new LinkedHashSet<ProcessType>();
String host = digestURL.getHost(); String host = digestURL.getHost();
String us = digestURL.toNormalform(true);
int crawldepth = document.getDepth(); int crawldepth = document.getDepth();
if ((allAttr || contains(CollectionSchema.crawldepth_i))) { if ((allAttr || contains(CollectionSchema.crawldepth_i))) {
@ -562,9 +560,10 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
// bit 15: "noimageindex" contained in http header X-Robots-Tag // bit 15: "noimageindex" contained in http header X-Robots-Tag
// bit 16: "unavailable_after" contained in http header X-Robots-Tag // bit 16: "unavailable_after" contained in http header X-Robots-Tag
int b = 0; int b = 0;
final String robots_meta = html.getMetas().get("robots"); String robots_meta = html.getMetas().get("robots");
// this tag may have values: all, index, noindex, nofollow; see http://www.robotstxt.org/meta.html // this tag may have values: all, index, noindex, nofollow; see http://www.robotstxt.org/meta.html
if (robots_meta != null) { if (robots_meta != null) {
robots_meta = robots_meta.toLowerCase();
if (robots_meta.indexOf("all",0) >= 0) b += 1; // set bit 0 if (robots_meta.indexOf("all",0) >= 0) b += 1; // set bit 0
if (robots_meta.indexOf("index",0) == 0 || robots_meta.indexOf(" index",0) >= 0 || robots_meta.indexOf(",index",0) >= 0 ) b += 2; // set bit 1 if (robots_meta.indexOf("index",0) == 0 || robots_meta.indexOf(" index",0) >= 0 || robots_meta.indexOf(",index",0) >= 0 ) b += 2; // set bit 1
if (robots_meta.indexOf("follow",0) == 0 || robots_meta.indexOf(" follow",0) >= 0 || robots_meta.indexOf(",follow",0) >= 0 ) b += 4; // set bit 2 if (robots_meta.indexOf("follow",0) == 0 || robots_meta.indexOf(" follow",0) >= 0 || robots_meta.indexOf(",follow",0) >= 0 ) b += 4; // set bit 2
@ -579,6 +578,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
} }
} }
if (!x_robots_tag.isEmpty()) { if (!x_robots_tag.isEmpty()) {
x_robots_tag = x_robots_tag.toLowerCase();
// this tag may have values: all, noindex, nofollow, noarchive, nosnippet, noodp, notranslate, noimageindex, unavailable_after, none; see https://developers.google.com/webmasters/control-crawl-index/docs/robots_meta_tag?hl=de // this tag may have values: all, noindex, nofollow, noarchive, nosnippet, noodp, notranslate, noimageindex, unavailable_after, none; see https://developers.google.com/webmasters/control-crawl-index/docs/robots_meta_tag?hl=de
if (x_robots_tag.indexOf("all",0) >= 0) b += 1<<8; // set bit 8 if (x_robots_tag.indexOf("all",0) >= 0) b += 1<<8; // set bit 8
if (x_robots_tag.indexOf("noindex",0) >= 0||x_robots_tag.indexOf("none",0) >= 0) b += 1<<9; // set bit 9 if (x_robots_tag.indexOf("noindex",0) >= 0||x_robots_tag.indexOf("none",0) >= 0) b += 1<<9; // set bit 9
@ -754,14 +754,14 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
} }
} }
} }
if (canonical != null && !ASCII.String(canonical.hash()).equals(id)) { if (canonical != null) {
containsCanonical = true; containsCanonical = true;
inboundLinks.remove(canonical); inboundLinks.remove(canonical);
outboundLinks.remove(canonical); outboundLinks.remove(canonical);
add(doc, CollectionSchema.canonical_s, canonical.toNormalform(false)); add(doc, CollectionSchema.canonical_s, canonical.toNormalform(false));
// set a flag if this is equal to sku // set a flag if this is equal to sku
if (contains(CollectionSchema.canonical_equal_sku_b)) { if (contains(CollectionSchema.canonical_equal_sku_b)) {
add(doc, CollectionSchema.canonical_equal_sku_b, canonical.equals(us)); add(doc, CollectionSchema.canonical_equal_sku_b, canonical.equals(digestURL));
} }
} }
} }

Loading…
Cancel
Save