@ -82,6 +82,7 @@ import net.yacy.crawler.retrieval.Response;
import net.yacy.document.Condenser ;
import net.yacy.document.Condenser ;
import net.yacy.document.Document ;
import net.yacy.document.Document ;
import net.yacy.document.SentenceReader ;
import net.yacy.document.SentenceReader ;
import net.yacy.document.Tokenizer ;
import net.yacy.document.content.DCEntry ;
import net.yacy.document.content.DCEntry ;
import net.yacy.document.parser.html.ContentScraper ;
import net.yacy.document.parser.html.ContentScraper ;
import net.yacy.document.parser.html.ImageEntry ;
import net.yacy.document.parser.html.ImageEntry ;
@ -301,7 +302,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
String keywords = md . dc_subject ( ) ;
String keywords = md . dc_subject ( ) ;
Bitfield flags = md . flags ( ) ;
Bitfield flags = md . flags ( ) ;
if ( flags . get ( Condens er. flag_cat_indexof ) ) {
if ( flags . get ( Tokeniz er. flag_cat_indexof ) ) {
if ( keywords = = null | | keywords . isEmpty ( ) ) keywords = "indexof" ; else {
if ( keywords = = null | | keywords . isEmpty ( ) ) keywords = "indexof" ; else {
if ( keywords . indexOf ( ',' ) > 0 ) keywords + = ", indexof" ; else keywords + = " indexof" ;
if ( keywords . indexOf ( ',' ) > 0 ) keywords + = ", indexof" ; else keywords + = " indexof" ;
}
}
@ -511,10 +512,6 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
}
}
add ( doc , CollectionSchema . keywords , keywords ) ;
add ( doc , CollectionSchema . keywords , keywords ) ;
}
}
if ( allAttr | | contains ( CollectionSchema . synonyms_sxt ) ) {
List < String > synonyms = condenser . synonyms ( ) ;
add ( doc , CollectionSchema . synonyms_sxt , synonyms ) ;
}
// unique-fields; these values must be corrected during postprocessing. (the following logic is !^ (not-xor) but I prefer to write it that way as it is)
// unique-fields; these values must be corrected during postprocessing. (the following logic is !^ (not-xor) but I prefer to write it that way as it is)
add ( doc , CollectionSchema . http_unique_b , setUnique | | UNIQUE_HEURISTIC_PREFER_HTTPS ? digestURL . isHTTPS ( ) : digestURL . isHTTP ( ) ) ; // this must be corrected afterwards during storage!
add ( doc , CollectionSchema . http_unique_b , setUnique | | UNIQUE_HEURISTIC_PREFER_HTTPS ? digestURL . isHTTPS ( ) : digestURL . isHTTP ( ) ) ; // this must be corrected afterwards during storage!
@ -993,12 +990,29 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
if ( allAttr | | contains ( CollectionSchema . videolinkscount_i ) ) add ( doc , CollectionSchema . videolinkscount_i , document . getVideolinks ( ) . size ( ) ) ;
if ( allAttr | | contains ( CollectionSchema . videolinkscount_i ) ) add ( doc , CollectionSchema . videolinkscount_i , document . getVideolinks ( ) . size ( ) ) ;
if ( allAttr | | contains ( CollectionSchema . applinkscount_i ) ) add ( doc , CollectionSchema . applinkscount_i , document . getApplinks ( ) . size ( ) ) ;
if ( allAttr | | contains ( CollectionSchema . applinkscount_i ) ) add ( doc , CollectionSchema . applinkscount_i , document . getApplinks ( ) . size ( ) ) ;
// document post-processing
if ( ( allAttr | | contains ( CollectionSchema . process_sxt ) ) & & processTypes . size ( ) > 0 ) {
List < String > p = new ArrayList < String > ( ) ;
for ( ProcessType t : processTypes ) p . add ( t . name ( ) ) ;
add ( doc , CollectionSchema . process_sxt , p ) ;
if ( allAttr | | contains ( CollectionSchema . harvestkey_s ) ) {
add ( doc , CollectionSchema . harvestkey_s , sourceName ) ;
}
}
// document enrichments (synonyms, facets)
enrich ( doc , condenser . synonyms ( ) , document . getGenericFacets ( ) ) ;
return doc ;
}
public void enrich ( SolrInputDocument doc , List < String > synonyms , Map < String , Set < String > > genericFacets ) {
if ( this . isEmpty ( ) | | contains ( CollectionSchema . vocabularies_sxt ) ) {
// write generic navigation
// write generic navigation
// there are no pre-defined solr fields for navigation because the vocabulary is generic
// there are no pre-defined solr fields for navigation because the vocabulary is generic
// we use dynamically allocated solr fields for this.
// we use dynamically allocated solr fields for this.
// It must be a multi-value string/token field, therefore we use _sxt extensions for the field names
// It must be a multi-value string/token field, therefore we use _sxt extensions for the field names
List < String > vocabularies = new ArrayList < > ( ) ;
List < String > vocabularies = new ArrayList < > ( ) ;
for ( Map . Entry < String , Set < String > > facet : document . getGenericFacets ( ) . entrySet ( ) ) {
for ( Map . Entry < String , Set < String > > facet : genericFacets. entrySet ( ) ) {
String facetName = facet . getKey ( ) ;
String facetName = facet . getKey ( ) ;
Set < String > facetValues = facet . getValue ( ) ;
Set < String > facetValues = facet . getValue ( ) ;
int count = facetValues . size ( ) ;
int count = facetValues . size ( ) ;
@ -1011,21 +1025,12 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
doc . setField ( CollectionSchema . VOCABULARY_PREFIX + facetName + CollectionSchema . VOCABULARY_LOGCOUNTS_SUFFIX , counts ) ;
doc . setField ( CollectionSchema . VOCABULARY_PREFIX + facetName + CollectionSchema . VOCABULARY_LOGCOUNTS_SUFFIX , counts ) ;
vocabularies . add ( facetName ) ;
vocabularies . add ( facetName ) ;
}
}
if ( ( allAttr | | contains ( CollectionSchema . vocabularies_sxt ) ) & & vocabularies . size ( ) > 0 ) {
if ( vocabularies . size ( ) > 0 ) add ( doc , CollectionSchema . vocabularies_sxt , vocabularies ) ;
add ( doc , CollectionSchema . vocabularies_sxt , vocabularies ) ;
}
}
if ( this . isEmpty ( ) | | contains ( CollectionSchema . synonyms_sxt ) ) {
if ( synonyms . size ( ) > 0 ) add ( doc , CollectionSchema . synonyms_sxt , synonyms ) ;
if ( ( allAttr | | contains ( CollectionSchema . process_sxt ) ) & & processTypes . size ( ) > 0 ) {
List < String > p = new ArrayList < String > ( ) ;
for ( ProcessType t : processTypes ) p . add ( t . name ( ) ) ;
add ( doc , CollectionSchema . process_sxt , p ) ;
if ( allAttr | | contains ( CollectionSchema . harvestkey_s ) ) {
add ( doc , CollectionSchema . harvestkey_s , sourceName ) ;
}
}
}
}
return doc ;
}
public static boolean postprocessingRunning = false ;
public static boolean postprocessingRunning = false ;
public static String postprocessingActivity = "" ;
public static String postprocessingActivity = "" ;