Added new Solr fields title_exact_signature_l and
description_exact_signature_l to be able to identify unique title and
unique description fields.
pull/1/head
Michael Peter Christen 12 years ago
parent f24ac518e6
commit 7ab5093321

@ -24,6 +24,9 @@ content_type
## content of title tag, text (mandatory field)
title
## the 64 bit hash of the org.apache.solr.update.processor.Lookup3Signature of title, used to compute title_unique_b
#title_exact_signature_l
## flag shows if title is unique in the whole index; if yes and another document appears with same title, the unique-flag is set to false, boolean
#title_unique_b
@ -123,6 +126,9 @@ author
## content of description-tag, text
description
## the 64 bit hash of the org.apache.solr.update.processor.Lookup3Signature of description, used to compute description_unique_b
#description_exact_signature_l
## flag shows if description is unique in the whole index; if yes and another document appears with same description, the unique-flag is set to false, boolean
#description_unique_b

@ -170,4 +170,17 @@ public class EnhancedTextProfileSignature extends Lookup3Signature {
}
}
/**
 * Convenience overload: feeds the given text into a fresh
 * {@link Lookup3Signature} and returns that signature as a long.
 *
 * @param text the text to be hashed
 * @return the value produced by {@link #getSignatureLong(Lookup3Signature)}
 *         for a signature that received exactly this text
 */
public static long getSignatureLong(String text) {
    final Lookup3Signature exactSignature = new Lookup3Signature();
    exactSignature.add(text);
    final long asLong = getSignatureLong(exactSignature);
    return asLong;
}
/**
 * Packs the first eight bytes of a signature into a single long.
 * The byte at index 0 ends up in the most significant position,
 * the byte at index 7 in the least significant one.
 *
 * @param sig the signature object whose hash bytes are read
 * @return the first eight hash bytes combined into a long
 */
public static long getSignatureLong(Lookup3Signature sig) {
    final byte[] digest = sig.getSignature();
    long packed = 0L;
    int pos = 0;
    while (pos < 8) {
        // after the shift the low byte is zero, so OR-ing in the
        // unsigned byte value is the same as adding it
        packed = (packed << 8) | (digest[pos] & 0xff);
        pos++;
    }
    return packed;
}
}

@ -38,7 +38,6 @@ import java.util.SortedSet;
import java.util.TreeMap;
import org.apache.solr.common.params.MapSolrParams;
import org.apache.solr.update.processor.Lookup3Signature;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.WordCache;
@ -242,15 +241,9 @@ public final class Condenser {
sp.put("minTokenLen", Integer.toString(Ranking.getMinTokenLen()));
fuzzySignatureFactory.init(new MapSolrParams(sp));
fuzzySignatureFactory.add(text);
byte[] fuzzy_signature_hash = fuzzySignatureFactory.getSignature();
long l = 0; for (int i = 0; i < 8; i++) l = (l << 8) + (fuzzy_signature_hash[i] & 0xff);
this.fuzzy_signature = l;
this.fuzzy_signature = EnhancedTextProfileSignature.getSignatureLong(fuzzySignatureFactory);
this.fuzzy_signature_text = fuzzySignatureFactory.getSignatureText().toString();
Lookup3Signature exactSignatureFactory = new Lookup3Signature();
exactSignatureFactory.add(text);
byte[] exact_signature_hash = exactSignatureFactory.getSignature();
l = 0; for (int i = 0; i < 8; i++) l = (l << 8) + (exact_signature_hash[i] & 0xff);
this.exact_signature = l;
this.exact_signature = EnhancedTextProfileSignature.getSignatureLong(text);
}
private Condenser(final String text, final WordCache meaningLib, boolean doAutotagging) {

@ -36,7 +36,6 @@ import java.util.Properties;
import java.util.Set;
import java.util.concurrent.BlockingQueue;
import org.apache.solr.client.solrj.util.ClientUtils;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrInputDocument;
@ -511,25 +510,28 @@ public class Segment {
}
// CHECK IF TITLE AND DESCRIPTION IS UNIQUE (this is by default not switched on)
if (this.fulltext.getDefaultConfiguration().contains(CollectionSchema.host_id_s)) {
String hostid = url.hosthash();
uniquecheck: for (CollectionSchema[] checkfields: new CollectionSchema[][]{
{CollectionSchema.title, CollectionSchema.title_unique_b},
{CollectionSchema.description, CollectionSchema.description_unique_b}}) {
{CollectionSchema.title, CollectionSchema.title_exact_signature_l, CollectionSchema.title_unique_b},
{CollectionSchema.description, CollectionSchema.description_exact_signature_l, CollectionSchema.description_unique_b}}) {
CollectionSchema checkfield = checkfields[0];
CollectionSchema uniquefield = checkfields[1];
if (this.fulltext.getDefaultConfiguration().contains(checkfield) && this.fulltext.getDefaultConfiguration().contains(uniquefield)) {
// lookup in the index for the same title
String checkstring = checkfield == CollectionSchema.title ? document.dc_title() : document.dc_description();
if (checkstring.length() == 0) {
CollectionSchema signaturefield = checkfields[1];
CollectionSchema uniquefield = checkfields[2];
if (this.fulltext.getDefaultConfiguration().contains(checkfield) && this.fulltext.getDefaultConfiguration().contains(signaturefield) && this.fulltext.getDefaultConfiguration().contains(uniquefield)) {
// lookup in the index within the same hosts for the same title or description
//String checkstring = checkfield == CollectionSchema.title ? document.dc_title() : document.dc_description();
Long checkhash = (Long) vector.getFieldValue(signaturefield.getSolrFieldName());
if (checkhash == null) {
vector.setField(uniquefield.getSolrFieldName(), false);
continue uniquecheck;
}
checkstring = ClientUtils.escapeQueryChars("\"" + checkstring + "\"");
try {
if (this.fulltext.getDefaultConnector().existsByQuery(checkfield.getSolrFieldName() + ":\"" + checkstring + "\"")) {
if (this.fulltext.getDefaultConnector().existsByQuery(CollectionSchema.host_id_s + ":\"" + hostid + "\" AND " + signaturefield.getSolrFieldName() + ":\"" + checkhash.toString() + "\"")) {
// switch unique attribute in new document
vector.setField(uniquefield.getSolrFieldName(), false);
// switch attribute also in all existing documents (which should be exactly only one!)
SolrDocumentList docs = this.fulltext.getDefaultConnector().query(checkfield.getSolrFieldName() + ":" + checkstring + " AND " + uniquefield.getSolrFieldName() + ":true", 0, 1000);
SolrDocumentList docs = this.fulltext.getDefaultConnector().query(CollectionSchema.host_id_s + ":\"" + hostid + "\" AND " + signaturefield.getSolrFieldName() + ":\"" + checkhash.toString() + "\" AND " + uniquefield.getSolrFieldName() + ":true", 0, 1000);
for (SolrDocument doc: docs) {
SolrInputDocument sid = this.fulltext.getDefaultConfiguration().toSolrInputDocument(doc);
sid.setField(uniquefield.getSolrFieldName(), false);
@ -541,6 +543,7 @@ public class Segment {
} catch (IOException e) {}
}
}
}
// ENRICH DOCUMENT WITH RANKING INFORMATION
if (this.connectedCitation()) {

@ -44,6 +44,7 @@ import java.util.concurrent.BlockingQueue;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.document.analysis.EnhancedTextProfileSignature;
import net.yacy.cora.federate.solr.Ranking;
import net.yacy.cora.federate.solr.SchemaConfiguration;
import net.yacy.cora.federate.solr.FailType;
@ -387,7 +388,13 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
}
List<String> titles = document.titles();
if (allAttr || contains(CollectionSchema.title)) add(doc, CollectionSchema.title, titles);
if (allAttr || contains(CollectionSchema.title)) {
add(doc, CollectionSchema.title, titles);
if ((allAttr || contains(CollectionSchema.title_exact_signature_l)) && titles.size() > 0) {
add(doc, CollectionSchema.title_exact_signature_l, EnhancedTextProfileSignature.getSignatureLong(titles.get(0)));
}
}
if (allAttr || contains(CollectionSchema.title_count_i)) add(doc, CollectionSchema.title_count_i, titles.size());
if (allAttr || contains(CollectionSchema.title_chars_val)) {
ArrayList<Integer> cv = new ArrayList<Integer>(titles.size());
@ -403,7 +410,12 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
String description = document.dc_description();
List<String> descriptions = new ArrayList<String>();
for (String s: CommonPattern.NEWLINE.split(description)) descriptions.add(s);
if (allAttr || contains(CollectionSchema.description)) add(doc, CollectionSchema.description, description);
if (allAttr || contains(CollectionSchema.description)) {
add(doc, CollectionSchema.description, description);
if ((allAttr || contains(CollectionSchema.description_exact_signature_l)) && description != null && description.length() > 0) {
add(doc, CollectionSchema.description_exact_signature_l, EnhancedTextProfileSignature.getSignatureLong(description));
}
}
if (allAttr || contains(CollectionSchema.description_count_i)) add(doc, CollectionSchema.description_count_i, descriptions.size());
if (allAttr || contains(CollectionSchema.description_chars_val)) {
ArrayList<Integer> cv = new ArrayList<Integer>(descriptions.size());

@ -36,6 +36,7 @@ public enum CollectionSchema implements SchemaDeclaration {
last_modified(SolrType.date, true, true, false, false, false, "last-modified from http header"),
content_type(SolrType.string, true, true, true, false, false, "mime-type of document"),
title(SolrType.text_general, true, true, true, false, true, "content of title tag"),
title_exact_signature_l(SolrType.num_long, true, true, false, false, false, "the 64 bit hash of the org.apache.solr.update.processor.Lookup3Signature of title, used to compute title_unique_b"),
title_unique_b(SolrType.bool, true, true, false, false, false, "flag shows if title is unique in the whole index; if yes and another document appears with same title, the unique-flag is set to false"),
host_id_s(SolrType.string, true, true, false, false, false, "id of the host, a 6-byte hash that is part of the document id"),// String hosthash();
md5_s(SolrType.string, true, true, false, false, false, "the md5 of the raw source"),// String md5();
@ -74,6 +75,7 @@ public enum CollectionSchema implements SchemaDeclaration {
author(SolrType.text_general, true, true, false, false, true, "content of author-tag"),
author_sxt(SolrType.string, true, true, true, false, false, "content of author-tag as copy-field from author. This is used for facet generation"),
description(SolrType.text_general, true, true, false, false, true, "content of description-tag"),
description_exact_signature_l(SolrType.num_long, true, true, false, false, false, "the 64 bit hash of the org.apache.solr.update.processor.Lookup3Signature of description, used to compute description_unique_b"),
description_unique_b(SolrType.bool, true, true, false, false, false, "flag shows if description is unique in the whole index; if yes and another document appears with same description, the unique-flag is set to false"),
keywords(SolrType.text_general, true, true, false, false, true, "content of keywords tag; words are separated by space"),
charset_s(SolrType.string, true, true, false, false, false, "character encoding"),

Loading…
Cancel
Save