diff --git a/defaults/solr.collection.schema b/defaults/solr.collection.schema index db8c14e6c..5439ddee2 100644 --- a/defaults/solr.collection.schema +++ b/defaults/solr.collection.schema @@ -33,7 +33,7 @@ title ## the 64 bit hash of the org.apache.solr.update.processor.Lookup3Signature of title, used to compute title_unique_b #title_exact_signature_l -## flag shows if title is unique in the whole index; if yes and another document appears with same title, the unique-flag is set to false, boolean +## flag shows if title is unique within all indexable documents of the same host with status code 200; if yes and another document appears with same title, the unique-flag is set to false, boolean #title_unique_b ## id of the host, a 6-byte hash that is part of the document id (mandatory field) @@ -144,7 +144,7 @@ description_txt ## the 64 bit hash of the org.apache.solr.update.processor.Lookup3Signature of description, used to compute description_unique_b #description_exact_signature_l -## flag shows if description is unique in the whole index; if yes and another document appears with same description, the unique-flag is set to false, boolean +## flag shows if description is unique within all indexable documents of the same host with status code 200; if yes and another document appears with same description, the unique-flag is set to false, boolean #description_unique_b ## content of keywords tag; words are separated by space diff --git a/source/net/yacy/cora/federate/solr/SchemaConfiguration.java b/source/net/yacy/cora/federate/solr/SchemaConfiguration.java index b98b0e738..5537b767b 100644 --- a/source/net/yacy/cora/federate/solr/SchemaConfiguration.java +++ b/source/net/yacy/cora/federate/solr/SchemaConfiguration.java @@ -107,74 +107,76 @@ public class SchemaConfiguration extends Configuration implements Serializable { return sd; } - public boolean postprocessing_http_unique(Segment segment, SolrInputDocument sid, DigestURL url) { - if (!this.contains(CollectionSchema.http_unique_b)) return false; - if (!url.isHTTPS() && !url.isHTTP()) return false; + public void postprocessing_http_unique(Segment segment, SolrInputDocument sid, DigestURL url) { + if (!this.contains(CollectionSchema.http_unique_b)) return; + if (!url.isHTTPS() && !url.isHTTP()) return; try { DigestURL u = new DigestURL((url.isHTTP() ? "https://" : "http://") + url.urlstub(true, true)); SolrDocument d = segment.fulltext().getDefaultConnector().getDocumentById(ASCII.String(u.hash()), CollectionSchema.http_unique_b.getSolrFieldName()); - return set_unique_flag(CollectionSchema.http_unique_b, sid, d); + set_unique_flag(CollectionSchema.http_unique_b, sid, d); } catch (final IOException e) {} - return false; } - public boolean postprocessing_www_unique(Segment segment, SolrInputDocument sid, DigestURL url) { - if (!this.contains(CollectionSchema.www_unique_b)) return false; + public void postprocessing_www_unique(Segment segment, SolrInputDocument sid, DigestURL url) { + if (!this.contains(CollectionSchema.www_unique_b)) return; final String us = url.urlstub(true, true); try { DigestURL u = new DigestURL(url.getProtocol() + (us.startsWith("www.") ? "://" + us.substring(4) : "://www." + us)); SolrDocument d = segment.fulltext().getDefaultConnector().getDocumentById(ASCII.String(u.hash()), CollectionSchema.www_unique_b.getSolrFieldName()); - return set_unique_flag(CollectionSchema.www_unique_b, sid, d); + set_unique_flag(CollectionSchema.www_unique_b, sid, d); } catch (final IOException e) {} - return false; } - private boolean set_unique_flag(CollectionSchema field, SolrInputDocument sid, SolrDocument d) { + private void set_unique_flag(CollectionSchema field, SolrInputDocument sid, SolrDocument d) { Object sb = sid.getFieldValue(field.getSolrFieldName()); boolean sbb = sb != null && ((Boolean) sb).booleanValue(); Object ob = d == null ? null : d.getFieldValue(field.getSolrFieldName()); boolean obb = ob != null && ((Boolean) ob).booleanValue(); - if (sbb == obb) { - sid.setField(field.getSolrFieldName(), !sbb); - return true; - } - return false; + if (sbb == obb) sid.setField(field.getSolrFieldName(), !sbb); } - public boolean postprocessing_doublecontent(Segment segment, Set uniqueURLs, SolrInputDocument sid, DigestURL url) { - boolean changed = false; + public void postprocessing_doublecontent(Segment segment, Set uniqueURLs, SolrInputDocument sid, DigestURL url) { // FIND OUT IF THIS IS A DOUBLE DOCUMENT + String urlhash = ASCII.String(url.hash()); String hostid = url.hosthash(); - for (CollectionSchema[] checkfields: new CollectionSchema[][]{ + uniquecheck: for (CollectionSchema[] checkfields: new CollectionSchema[][]{ {CollectionSchema.exact_signature_l, CollectionSchema.exact_signature_unique_b, CollectionSchema.exact_signature_copycount_i}, {CollectionSchema.fuzzy_signature_l, CollectionSchema.fuzzy_signature_unique_b, CollectionSchema.fuzzy_signature_copycount_i}}) { - CollectionSchema checkfield = checkfields[0]; + CollectionSchema signaturefield = checkfields[0]; CollectionSchema uniquefield = checkfields[1]; CollectionSchema countfield = checkfields[2]; - if (this.contains(checkfield) && this.contains(uniquefield)) { + if (this.contains(signaturefield) && this.contains(uniquefield) && this.contains(countfield)) { // lookup the document with the same signature - long signature = ((Long) sid.getField(checkfield.getSolrFieldName()).getValue()).longValue(); + Long signature = (Long) sid.getField(signaturefield.getSolrFieldName()).getValue(); + if (signature == null) continue uniquecheck; try { - long count = segment.fulltext().getDefaultConnector().getCountByQuery(CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + hostid + "\" AND " + checkfield.getSolrFieldName() + ":\"" + Long.toString(signature) + "\""); - if (count > 1) { - String urlhash = ASCII.String(url.hash()); - if (uniqueURLs.contains(urlhash)) { - // this is not the first appearance, therefore this is a non-unique document - sid.setField(uniquefield.getSolrFieldName(), false); - } else { - // this is the first appearance, therefore this shall be treated as unique document - sid.setField(uniquefield.getSolrFieldName(), true); - uniqueURLs.add(urlhash); - } - sid.setField(countfield.getSolrFieldName(), count); - changed = true; + SolrDocumentList docs = segment.fulltext().getDefaultConnector().getDocumentListByQuery("-" + CollectionSchema.id.getSolrFieldName() + ":\"" + urlhash + "\" AND " + CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + hostid + "\" AND " + signaturefield.getSolrFieldName() + ":\"" + signature.toString() + "\"", null, 0, 100, CollectionSchema.id.getSolrFieldName()); + if (docs.getNumFound() == 0) { + sid.setField(uniquefield.getSolrFieldName(), true); + sid.setField(countfield.getSolrFieldName(), 1); + } else { + boolean firstappearance = true; + for (SolrDocument d: docs) {if (uniqueURLs.contains(d.getFieldValue(CollectionSchema.id.getSolrFieldName()))) firstappearance = false; break;} + sid.setField(uniquefield.getSolrFieldName(), firstappearance); + sid.setField(countfield.getSolrFieldName(), docs.getNumFound() + 1); // the current url was excluded from search but is included in count } } catch (final IOException e) {} } } + // CHECK IF TITLE AND DESCRIPTION IS UNIQUE (this is by default not switched on) - if (segment.fulltext().getDefaultConfiguration().contains(CollectionSchema.host_id_s)) { - uniquecheck: for (CollectionSchema[] checkfields: new CollectionSchema[][]{ + // in case that the document has no status code 200, has a noindex attribute + // or a canonical tag which does not point to the document itself, + // then the unique-field is not written at all! + Integer robots_i = this.contains(CollectionSchema.robots_i) ? (Integer) sid.getFieldValue(CollectionSchema.robots_i.getSolrFieldName()) : null; + Integer httpstatus_i = this.contains(CollectionSchema.httpstatus_i) ? (Integer) sid.getFieldValue(CollectionSchema.httpstatus_i.getSolrFieldName()) : null; + String canonical_s = this.contains(CollectionSchema.canonical_s) ? (String) sid.getFieldValue(CollectionSchema.canonical_s.getSolrFieldName()) : null; + Boolean canonical_equal_sku_b = this.contains(CollectionSchema.canonical_equal_sku_b) ? (Boolean) sid.getFieldValue(CollectionSchema.canonical_equal_sku_b.getSolrFieldName()) : null; + if (segment.fulltext().getDefaultConfiguration().contains(CollectionSchema.host_id_s) && + (robots_i == null || (robots_i.intValue() & (1 << 9)) == 0) && + (canonical_s == null || canonical_s.length() == 0 || (canonical_equal_sku_b != null && canonical_equal_sku_b.booleanValue()) || url.toNormalform(true).equals(canonical_s)) && + (httpstatus_i == null || httpstatus_i.intValue() == 200)) { + uniquecheck: for (CollectionSchema[] checkfields: new CollectionSchema[][] { {CollectionSchema.title, CollectionSchema.title_exact_signature_l, CollectionSchema.title_unique_b}, {CollectionSchema.description_txt, CollectionSchema.description_exact_signature_l, CollectionSchema.description_unique_b}}) { CollectionSchema checkfield = checkfields[0]; @@ -183,26 +185,24 @@ public class SchemaConfiguration extends Configuration implements Serializable { if (this.contains(checkfield) && this.contains(signaturefield) && this.contains(uniquefield)) { // lookup in the index within the same hosts for the same title or description //String checkstring = checkfield == CollectionSchema.title ? document.dc_title() : document.dc_description(); - Long checkhash = (Long) sid.getFieldValue(signaturefield.getSolrFieldName()); - if (checkhash == null) { - sid.setField(uniquefield.getSolrFieldName(), false); - changed = true; + Long signature = (Long) sid.getFieldValue(signaturefield.getSolrFieldName()); + if (signature == null) { continue uniquecheck; } try { - final SolrDocumentList docs = segment.fulltext().getDefaultConnector().getDocumentListByQuery(CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + hostid + "\" AND " + signaturefield.getSolrFieldName() + ":\"" + checkhash.toString() + "\"", null, 0, 1); - if (docs != null && !docs.isEmpty()) { - // switch unique attribute in new document - sid.setField(uniquefield.getSolrFieldName(), false); - changed = true; - } else { + SolrDocumentList docs = segment.fulltext().getDefaultConnector().getDocumentListByQuery("-" + CollectionSchema.id.getSolrFieldName() + ":\"" + urlhash + "\" AND " + CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + hostid + "\" AND " + signaturefield.getSolrFieldName() + ":\"" + signature.toString() + "\"", null, 0, 100, CollectionSchema.id.getSolrFieldName()); + if (docs.getNumFound() == 0) { sid.setField(uniquefield.getSolrFieldName(), true); + } else { + boolean firstappearance = true; + for (SolrDocument d: docs) {if (uniqueURLs.contains(d.getFieldValue(CollectionSchema.id.getSolrFieldName()))) firstappearance = false; break;} + sid.setField(uniquefield.getSolrFieldName(), firstappearance); } } catch (final IOException e) {} } } } - return changed; + uniqueURLs.add(urlhash); } public boolean postprocessing_references(final ReferenceReportCache rrCache, final SolrInputDocument sid, final DigestURL url, final Map hostExtentCount) { diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index b24b31d12..a81320e30 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -1163,7 +1163,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri // set cr values if (tagtype == ProcessType.CITATION) { - if (webgraph.contains(WebgraphSchema.source_id_s) && webgraph.contains(WebgraphSchema.source_cr_host_norm_i)) { + if (segment.fulltext().useWebgraph() && webgraph.contains(WebgraphSchema.source_id_s) && webgraph.contains(WebgraphSchema.source_cr_host_norm_i)) { id = (String) doc.getFieldValue(WebgraphSchema.source_id_s.getSolrFieldName()); CRV crv = rankings.get(id); if (crv != null) { @@ -1221,7 +1221,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri Set omitFields = new HashSet(); omitFields.add(CollectionSchema.process_sxt.getSolrFieldName()); omitFields.add(CollectionSchema.harvestkey_s.getSolrFieldName()); - int proccount = 0, proccount_referencechange = 0, proccount_citationchange = 0, proccount_uniquechange = 0; + int proccount = 0, proccount_referencechange = 0, proccount_citationchange = 0; long count = collectionConnector.getCountByQuery(collection1query); long start = System.currentTimeMillis(); ConcurrentLog.info("CollectionConfiguration", "collecting " + count + " documents from the collection for harvestkey " + harvestkey); @@ -1263,11 +1263,9 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri } if (tagtype == ProcessType.UNIQUE) { - boolean uniquechange = false; - uniquechange |= postprocessing_http_unique(segment, sid, url); - uniquechange |= postprocessing_www_unique(segment, sid, url); - uniquechange |= postprocessing_doublecontent(segment, uniqueURLs, sid, url); - if (uniquechange) proccount_uniquechange++; + postprocessing_http_unique(segment, sid, url); + postprocessing_www_unique(segment, sid, url); + postprocessing_doublecontent(segment, uniqueURLs, sid, url); } } catch (IllegalArgumentException e) {} @@ -1308,7 +1306,6 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri if (count != countcheck) ConcurrentLog.warn("CollectionConfiguration", "ambiguous collection document count for harvestkey " + harvestkey + ": expected=" + count + ", counted=" + countcheck); // big gap for harvestkey = null ConcurrentLog.info("CollectionConfiguration", "cleanup_processing: re-calculated " + proccount+ " new documents, " + proccount_referencechange + " reference-count changes, " + - proccount_uniquechange + " unique field changes, " + proccount_citationchange + " citation ranking changes."); } catch (final InterruptedException e2) { ConcurrentLog.warn("CollectionConfiguration", e2.getMessage(), e2); diff --git a/source/net/yacy/search/schema/CollectionSchema.java b/source/net/yacy/search/schema/CollectionSchema.java index 9339cc09a..0db00542d 100644 --- a/source/net/yacy/search/schema/CollectionSchema.java +++ b/source/net/yacy/search/schema/CollectionSchema.java @@ -40,7 +40,7 @@ public enum CollectionSchema implements SchemaDeclaration { www_unique_b(SolrType.bool, true, true, false, false, false, "unique-field which is true when an url appears the first time. If the same url within the subdomain www then appears without that subdomain (or vice versa) then the field is false"), title(SolrType.text_general, true, true, true, false, true, "content of title tag"), title_exact_signature_l(SolrType.num_long, true, true, false, false, false, "the 64 bit hash of the org.apache.solr.update.processor.Lookup3Signature of title, used to compute title_unique_b"), - title_unique_b(SolrType.bool, true, true, false, false, false, "flag shows if title is unique in the whole index; if yes and another document appears with same title, the unique-flag is set to false"), + title_unique_b(SolrType.bool, true, true, false, false, false, "flag shows if title is unique within all indexable documents of the same host with status code 200; if yes and another document appears with same title, the unique-flag is set to false"), host_id_s(SolrType.string, true, true, false, false, false, "id of the host, a 6-byte hash that is part of the document id"),// String hosthash(); md5_s(SolrType.string, true, true, false, false, false, "the md5 of the raw source"),// String md5(); exact_signature_l(SolrType.num_long, true, true, false, false, false, "the 64 bit hash of the org.apache.solr.update.processor.Lookup3Signature of text_t"), @@ -82,7 +82,7 @@ public enum CollectionSchema implements SchemaDeclaration { author_sxt(SolrType.string, true, true, true, false, false, "content of author-tag as copy-field from author. This is used for facet generation"), description_txt(SolrType.text_general, true, true, true, false, true, "content of description-tag(s)"), description_exact_signature_l(SolrType.num_long, true, true, false, false, false, "the 64 bit hash of the org.apache.solr.update.processor.Lookup3Signature of description, used to compute description_unique_b"), - description_unique_b(SolrType.bool, true, true, false, false, false, "flag shows if description is unique in the whole index; if yes and another document appears with same description, the unique-flag is set to false"), + description_unique_b(SolrType.bool, true, true, false, false, false, "flag shows if description is unique within all indexable documents of the same host with status code 200; if yes and another document appears with same description, the unique-flag is set to false"), keywords(SolrType.text_general, true, true, false, false, true, "content of keywords tag; words are separated by space"), charset_s(SolrType.string, true, true, false, false, false, "character encoding"), wordcount_i(SolrType.num_integer, true, true, false, false, false, "number of words in visible area"),