## the 64 bit hash of the org.apache.solr.update.processor.Lookup3Signature of title, used to compute title_unique_b
#title_exact_signature_l
## flag shows whether the title is unique in the whole index; if it is and another document with the same title appears, the unique-flag is set to false, boolean
## flag shows whether the title is unique among all indexable documents of the same host with status code 200; if it is and another document with the same title appears, the unique-flag is set to false, boolean
#title_unique_b
## id of the host, a 6-byte hash that is part of the document id (mandatory field)
@ -144,7 +144,7 @@ description_txt
## the 64 bit hash of the org.apache.solr.update.processor.Lookup3Signature of description, used to compute description_unique_b
#description_exact_signature_l
## flag shows whether the description is unique in the whole index; if it is and another document with the same description appears, the unique-flag is set to false, boolean
## flag shows whether the description is unique among all indexable documents of the same host with status code 200; if it is and another document with the same description appears, the unique-flag is set to false, boolean
#description_unique_b
## content of keywords tag; words are separated by space
longcount=segment.fulltext().getDefaultConnector().getCountByQuery(CollectionSchema.host_id_s.getSolrFieldName()+":\""+hostid+"\" AND "+checkfield.getSolrFieldName()+":\""+Long.toString(signature)+"\"");
if(count>1){
Stringurlhash=ASCII.String(url.hash());
if(uniqueURLs.contains(urlhash)){
// this is not the first appearance, therefore this is a non-unique document
SolrDocumentListdocs=segment.fulltext().getDefaultConnector().getDocumentListByQuery("-"+CollectionSchema.id.getSolrFieldName()+":\""+urlhash+"\" AND "+CollectionSchema.host_id_s.getSolrFieldName()+":\""+hostid+"\" AND "+signaturefield.getSolrFieldName()+":\""+signature.toString()+"\"",null,0,100,CollectionSchema.id.getSolrFieldName());
finalSolrDocumentListdocs=segment.fulltext().getDefaultConnector().getDocumentListByQuery(CollectionSchema.host_id_s.getSolrFieldName()+":\""+hostid+"\" AND "+signaturefield.getSolrFieldName()+":\""+checkhash.toString()+"\"",null,0,1);
SolrDocumentListdocs=segment.fulltext().getDefaultConnector().getDocumentListByQuery("-"+CollectionSchema.id.getSolrFieldName()+":\""+urlhash+"\" AND "+CollectionSchema.host_id_s.getSolrFieldName()+":\""+hostid+"\" AND "+signaturefield.getSolrFieldName()+":\""+signature.toString()+"\"",null,0,100,CollectionSchema.id.getSolrFieldName());
@ -1308,7 +1306,6 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
if(count!=countcheck)ConcurrentLog.warn("CollectionConfiguration","ambiguous collection document count for harvestkey "+harvestkey+": expected="+count+", counted="+countcheck);// big gap for harvestkey = null
ConcurrentLog.info("CollectionConfiguration","cleanup_processing: re-calculated "+proccount+" new documents, "+
@ -40,7 +40,7 @@ public enum CollectionSchema implements SchemaDeclaration {
www_unique_b(SolrType.bool,true,true,false,false,false,"unique-field which is true when an url appears the first time. If the same url within the subdomain www then appears without that subdomain (or vice versa) then the field is false"),
title(SolrType.text_general,true,true,true,false,true,"content of title tag"),
title_exact_signature_l(SolrType.num_long,true,true,false,false,false,"the 64 bit hash of the org.apache.solr.update.processor.Lookup3Signature of title, used to compute title_unique_b"),
title_unique_b(SolrType.bool,true,true,false,false,false,"flag shows if title is unique in the whole index; if yes and another document appears with same title, the unique-flag is set to false"),
title_unique_b(SolrType.bool,true,true,false,false,false,"flag shows if title is unique within all indexable documents of the same host with status code 200; if yes and another document appears with same title, the unique-flag is set to false"),
host_id_s(SolrType.string,true,true,false,false,false,"id of the host, a 6-byte hash that is part of the document id"),// String hosthash();
md5_s(SolrType.string,true,true,false,false,false,"the md5 of the raw source"),// String md5();
exact_signature_l(SolrType.num_long,true,true,false,false,false,"the 64 bit hash of the org.apache.solr.update.processor.Lookup3Signature of text_t"),
@ -82,7 +82,7 @@ public enum CollectionSchema implements SchemaDeclaration {
author_sxt(SolrType.string,true,true,true,false,false,"content of author-tag as copy-field from author. This is used for facet generation"),
description_txt(SolrType.text_general,true,true,true,false,true,"content of description-tag(s)"),
description_exact_signature_l(SolrType.num_long,true,true,false,false,false,"the 64 bit hash of the org.apache.solr.update.processor.Lookup3Signature of description, used to compute description_unique_b"),
description_unique_b(SolrType.bool,true,true,false,false,false,"flag shows if description is unique in the whole index; if yes and another document appears with same description, the unique-flag is set to false"),
description_unique_b(SolrType.bool,true,true,false,false,false,"flag shows if description is unique within all indexable documents of the same host with status code 200; if yes and another document appears with same description, the unique-flag is set to false"),
keywords(SolrType.text_general,true,true,false,false,true,"content of keywords tag; words are separated by space"),