diff --git a/defaults/solr.collection.schema b/defaults/solr.collection.schema
index be104511b..b32ae0aa0 100644
--- a/defaults/solr.collection.schema
+++ b/defaults/solr.collection.schema
@@ -24,6 +24,9 @@ content_type
 ## content of title tag, text (mandatory field)
 title
 
+## the 64 bit hash of the org.apache.solr.update.processor.Lookup3Signature of title, used to compute title_unique_b
+#title_exact_signature_l
+
 ## flag shows if title is unique in the whole index; if yes and another document appears with same title, the unique-flag is set to false, boolean
 #title_unique_b
 
@@ -123,6 +126,9 @@ author
 ## content of description-tag, text
 description
 
+## the 64 bit hash of the org.apache.solr.update.processor.Lookup3Signature of description, used to compute description_unique_b
+#description_exact_signature_l
+
 ## flag shows if description is unique in the whole index; if yes and another document appears with same description, the unique-flag is set to false, boolean
 #description_unique_b
 
diff --git a/source/net/yacy/cora/document/analysis/EnhancedTextProfileSignature.java b/source/net/yacy/cora/document/analysis/EnhancedTextProfileSignature.java
index af991e4e0..0c5b7a0c7 100644
--- a/source/net/yacy/cora/document/analysis/EnhancedTextProfileSignature.java
+++ b/source/net/yacy/cora/document/analysis/EnhancedTextProfileSignature.java
@@ -169,5 +169,28 @@ public class EnhancedTextProfileSignature extends Lookup3Signature {
             return t2.cnt - t1.cnt;
         }
     }
-    
+
+    /**
+     * Hashes the given text with a fresh Lookup3Signature and returns the result as a 64 bit number.
+     * @param text the text that is added to the signature
+     * @return the first eight signature bytes packed into a long, byte 0 in the most significant position
+     */
+    public static long getSignatureLong(String text) {
+        Lookup3Signature sig = new Lookup3Signature();
+        sig.add(text);
+        return getSignatureLong(sig);
+    }
+
+    /**
+     * Packs the first eight bytes of a signature into a long; the byte at index 0 ends up in the most significant position.
+     * @param sig a signature whose content was already added by the caller; getSignature() must deliver at least eight bytes
+     * @return the signature as 64 bit number
+     */
+    public static long getSignatureLong(Lookup3Signature sig) {
+        byte[] hash = sig.getSignature();
+        long l = 0;
+        for (int i = 0; i < 8; i++) l = (l << 8) + (hash[i] & 0xff);
+        return l;
+    }
+
 }
diff --git a/source/net/yacy/document/Condenser.java b/source/net/yacy/document/Condenser.java
index d217f40d7..30ecfa8fb 100644
--- a/source/net/yacy/document/Condenser.java
+++ b/source/net/yacy/document/Condenser.java
@@ -38,7 +38,6 @@ import java.util.SortedSet;
 import java.util.TreeMap;
 
 import org.apache.solr.common.params.MapSolrParams;
-import org.apache.solr.update.processor.Lookup3Signature;
 
 import net.yacy.cora.document.ASCII;
 import net.yacy.cora.document.WordCache;
@@ -242,15 +241,9 @@ public final class Condenser {
         sp.put("minTokenLen", Integer.toString(Ranking.getMinTokenLen()));
         fuzzySignatureFactory.init(new MapSolrParams(sp));
         fuzzySignatureFactory.add(text);
-        byte[] fuzzy_signature_hash = fuzzySignatureFactory.getSignature();
-        long l = 0; for (int i = 0; i < 8; i++) l = (l << 8) + (fuzzy_signature_hash[i] & 0xff);
-        this.fuzzy_signature = l;
+        this.fuzzy_signature = EnhancedTextProfileSignature.getSignatureLong(fuzzySignatureFactory);
         this.fuzzy_signature_text = fuzzySignatureFactory.getSignatureText().toString();
-        Lookup3Signature exactSignatureFactory = new Lookup3Signature();
-        exactSignatureFactory.add(text);
-        byte[] exact_signature_hash = exactSignatureFactory.getSignature();
-        l = 0; for (int i = 0; i < 8; i++) l = (l << 8) + (exact_signature_hash[i] & 0xff);
-        this.exact_signature = l;
+        this.exact_signature = EnhancedTextProfileSignature.getSignatureLong(text);
     }
 
     private Condenser(final String text, final WordCache meaningLib, boolean doAutotagging) {
diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java
index f8c02eb50..70087656a 100644
--- a/source/net/yacy/search/index/Segment.java
+++ b/source/net/yacy/search/index/Segment.java
@@ -36,7 +36,6 @@ import java.util.Properties;
 import java.util.Set;
 import java.util.concurrent.BlockingQueue;
 
-import org.apache.solr.client.solrj.util.ClientUtils;
 import org.apache.solr.common.SolrDocument;
 import org.apache.solr.common.SolrDocumentList;
 import org.apache.solr.common.SolrInputDocument;
@@ -511,34 +510,38 @@ public class Segment {
         }
 
         // CHECK IF TITLE AND DESCRIPTION IS UNIQUE (this is by default not switched on)
-        uniquecheck: for (CollectionSchema[] checkfields: new CollectionSchema[][]{
-                {CollectionSchema.title, CollectionSchema.title_unique_b},
-                {CollectionSchema.description, CollectionSchema.description_unique_b}}) {
-            CollectionSchema checkfield = checkfields[0];
-            CollectionSchema uniquefield = checkfields[1];
-            if (this.fulltext.getDefaultConfiguration().contains(checkfield) && this.fulltext.getDefaultConfiguration().contains(uniquefield)) {
-                // lookup in the index for the same title
-                String checkstring = checkfield == CollectionSchema.title ? document.dc_title() : document.dc_description();
-                if (checkstring.length() == 0) {
-                    vector.setField(uniquefield.getSolrFieldName(), false);
-                    continue uniquecheck;
-                }
-                checkstring = ClientUtils.escapeQueryChars("\"" + checkstring + "\"");
-                try {
-                    if (this.fulltext.getDefaultConnector().existsByQuery(checkfield.getSolrFieldName() + ":\"" + checkstring + "\"")) {
-                        // switch unique attribute in new document
+        if (this.fulltext.getDefaultConfiguration().contains(CollectionSchema.host_id_s)) {
+            String hostid = url.hosthash();
+            uniquecheck: for (CollectionSchema[] checkfields: new CollectionSchema[][]{
+                    {CollectionSchema.title, CollectionSchema.title_exact_signature_l, CollectionSchema.title_unique_b},
+                    {CollectionSchema.description, CollectionSchema.description_exact_signature_l, CollectionSchema.description_unique_b}}) {
+                CollectionSchema checkfield = checkfields[0];
+                CollectionSchema signaturefield = checkfields[1];
+                CollectionSchema uniquefield = checkfields[2];
+                if (this.fulltext.getDefaultConfiguration().contains(checkfield) && this.fulltext.getDefaultConfiguration().contains(signaturefield) && this.fulltext.getDefaultConfiguration().contains(uniquefield)) {
+                    // lookup in the index within the same host for the same title or description signature
+                    Long checkhash = (Long) vector.getFieldValue(signaturefield.getSolrFieldName());
+                    if (checkhash == null) {
                         vector.setField(uniquefield.getSolrFieldName(), false);
-                        // switch attribute also in all existing documents (which should be exactly only one!)
-                        SolrDocumentList docs = this.fulltext.getDefaultConnector().query(checkfield.getSolrFieldName() + ":" + checkstring + " AND " + uniquefield.getSolrFieldName() + ":true", 0, 1000);
-                        for (SolrDocument doc: docs) {
-                            SolrInputDocument sid = this.fulltext.getDefaultConfiguration().toSolrInputDocument(doc);
-                            sid.setField(uniquefield.getSolrFieldName(), false);
-                            this.fulltext.getDefaultConnector().add(sid);
-                        }
-                    } else {
-                        vector.setField(uniquefield.getSolrFieldName(), true);
+                        continue uniquecheck;
                     }
-                } catch (IOException e) {}
+                    String signaturequery = CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + hostid + "\" AND " + signaturefield.getSolrFieldName() + ":\"" + checkhash.toString() + "\"";
+                    try {
+                        if (this.fulltext.getDefaultConnector().existsByQuery(signaturequery)) {
+                            // switch unique attribute in new document
+                            vector.setField(uniquefield.getSolrFieldName(), false);
+                            // switch attribute also in all existing documents (which should be exactly one!)
+                            SolrDocumentList docs = this.fulltext.getDefaultConnector().query(signaturequery + " AND " + uniquefield.getSolrFieldName() + ":true", 0, 1000);
+                            for (SolrDocument doc: docs) {
+                                SolrInputDocument sid = this.fulltext.getDefaultConfiguration().toSolrInputDocument(doc);
+                                sid.setField(uniquefield.getSolrFieldName(), false);
+                                this.fulltext.getDefaultConnector().add(sid);
+                            }
+                        } else {
+                            vector.setField(uniquefield.getSolrFieldName(), true);
+                        }
+                    } catch (IOException e) {} // NOTE(review): index lookup/update failures are silently ignored here
+                }
             }
         }
 
diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java
index 08528bbf8..84a2db16e 100644
--- a/source/net/yacy/search/schema/CollectionConfiguration.java
+++ b/source/net/yacy/search/schema/CollectionConfiguration.java
@@ -44,6 +44,7 @@ import java.util.concurrent.BlockingQueue;
 import net.yacy.cora.document.ASCII;
 import net.yacy.cora.document.MultiProtocolURI;
 import net.yacy.cora.document.UTF8;
+import net.yacy.cora.document.analysis.EnhancedTextProfileSignature;
 import net.yacy.cora.federate.solr.Ranking;
 import net.yacy.cora.federate.solr.SchemaConfiguration;
 import net.yacy.cora.federate.solr.FailType;
@@ -387,7 +388,13 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
         }
 
         List titles = document.titles();
-        if (allAttr || contains(CollectionSchema.title)) add(doc, CollectionSchema.title, titles);
+        if (allAttr || contains(CollectionSchema.title)) {
+            add(doc, CollectionSchema.title, titles);
+            if ((allAttr || contains(CollectionSchema.title_exact_signature_l)) && titles.size() > 0) {
+                add(doc, CollectionSchema.title_exact_signature_l, EnhancedTextProfileSignature.getSignatureLong(titles.get(0)));
+            }
+
+        }
         if (allAttr || contains(CollectionSchema.title_count_i)) add(doc, CollectionSchema.title_count_i, titles.size());
         if (allAttr || contains(CollectionSchema.title_chars_val)) {
             ArrayList cv = new ArrayList(titles.size());
@@ -403,7 +410,12 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
         String description = document.dc_description();
         List descriptions = new ArrayList();
         for (String s: CommonPattern.NEWLINE.split(description)) descriptions.add(s);
-        if (allAttr || contains(CollectionSchema.description)) add(doc, CollectionSchema.description, description);
+        if (allAttr || contains(CollectionSchema.description)) {
+            add(doc, CollectionSchema.description, description);
+            if ((allAttr || contains(CollectionSchema.description_exact_signature_l)) && description != null && description.length() > 0) {
+                add(doc, CollectionSchema.description_exact_signature_l, EnhancedTextProfileSignature.getSignatureLong(description));
+            }
+        }
         if (allAttr || contains(CollectionSchema.description_count_i)) add(doc, CollectionSchema.description_count_i, descriptions.size());
         if (allAttr || contains(CollectionSchema.description_chars_val)) {
             ArrayList cv = new ArrayList(descriptions.size());
diff --git a/source/net/yacy/search/schema/CollectionSchema.java b/source/net/yacy/search/schema/CollectionSchema.java
index 0817e0071..eaae9c535 100644
--- a/source/net/yacy/search/schema/CollectionSchema.java
+++ b/source/net/yacy/search/schema/CollectionSchema.java
@@ -36,6 +36,7 @@ public enum CollectionSchema implements SchemaDeclaration {
     last_modified(SolrType.date, true, true, false, false, false, "last-modified from http header"),
     content_type(SolrType.string, true, true, true, false, false, "mime-type of document"),
     title(SolrType.text_general, true, true, true, false, true, "content of title tag"),
+    title_exact_signature_l(SolrType.num_long, true, true, false, false, false, "the 64 bit hash of the org.apache.solr.update.processor.Lookup3Signature of title, used to compute title_unique_b"),
     title_unique_b(SolrType.bool, true, true, false, false, false, "flag shows if title is unique in the whole index; if yes and another document appears with same title, the unique-flag is set to false"),
     host_id_s(SolrType.string, true, true, false, false, false, "id of the host, a 6-byte hash that is part of the document id"),// String hosthash();
     md5_s(SolrType.string, true, true, false, false, false, "the md5 of the raw source"),// String md5();
@@ -74,6 +75,7 @@ public enum CollectionSchema implements SchemaDeclaration {
     author(SolrType.text_general, true, true, false, false, true, "content of author-tag"),
     author_sxt(SolrType.string, true, true, true, false, false, "content of author-tag as copy-field from author. This is used for facet generation"),
     description(SolrType.text_general, true, true, false, false, true, "content of description-tag"),
+    description_exact_signature_l(SolrType.num_long, true, true, false, false, false, "the 64 bit hash of the org.apache.solr.update.processor.Lookup3Signature of description, used to compute description_unique_b"),
     description_unique_b(SolrType.bool, true, true, false, false, false, "flag shows if description is unique in the whole index; if yes and another document appears with same description, the unique-flag is set to false"),
     keywords(SolrType.text_general, true, true, false, false, true, "content of keywords tag; words are separated by space"),
     charset_s(SolrType.string, true, true, false, false, false, "character encoding"),