@@ -1227,27 +1227,40 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
         }
         // process all documents in collection
-        Map<String, Long> hostExtentCache = new HashMap<String, Long>(); // a mapping from the host id to the number of documents which contain this host-id
-        Set<String> uniqueURLs = new HashSet<String>();
+        final Map<String, Long> hostExtentCache = new HashMap<String, Long>(); // a mapping from the host id to the number of documents which contain this host-id
+        final Set<String> uniqueURLs = new HashSet<String>();
         try {
-            Set<String> omitFields = new HashSet<String>();
+            final Set<String> omitFields = new HashSet<String>();
             omitFields.add(CollectionSchema.process_sxt.getSolrFieldName());
             omitFields.add(CollectionSchema.harvestkey_s.getSolrFieldName());
-            int proccount = 0, proccount_referencechange = 0, proccount_citationchange = 0;
-            long count = collectionConnector.getCountByQuery(collection1query);
-            long start = System.currentTimeMillis();
+            final long count = collectionConnector.getCountByQuery(collection1query);
+            final long start = System.currentTimeMillis();
+            final int concurrency = Runtime.getRuntime().availableProcessors();
+            final boolean reference_computation = this.contains(CollectionSchema.references_i) &&
+                    this.contains(CollectionSchema.references_internal_i) &&
+                    this.contains(CollectionSchema.references_external_i) &&
+                    this.contains(CollectionSchema.references_exthosts_i);
             postprocessingActivity = "collecting " + count + " documents from the collection for harvestkey " + harvestkey;
             ConcurrentLog.info("CollectionConfiguration", postprocessingActivity);
-            BlockingQueue<SolrDocument> docs = collectionConnector.concurrentDocumentsByQuery(
+            final BlockingQueue<SolrDocument> docs = collectionConnector.concurrentDocumentsByQuery(
                     collection1query,
                     (this.contains(CollectionSchema.http_unique_b) || this.contains(CollectionSchema.www_unique_b)) ?
                         CollectionSchema.host_subdomain_s.getSolrFieldName() + " asc," + // sort on subdomain to get hosts without subdomain first; that gives an opportunity to set www_unique_b flag to false
                         CollectionSchema.url_protocol_s.getSolrFieldName() + " asc" // sort on protocol to get http before https; that gives an opportunity to set http_unique_b flag to false
                     : null, // null sort is faster!
-                    0, 100000000, Long.MAX_VALUE, 200, 1);
-            int countcheck = 0;
-            Collection<String> failids = new ArrayList<String>();
+                    0, 100000000, Long.MAX_VALUE, 100, concurrency);
+            final AtomicInteger proccount = new AtomicInteger();
+            final AtomicInteger proccount_referencechange = new AtomicInteger();
+            final AtomicInteger proccount_citationchange = new AtomicInteger();
+            final AtomicInteger countcheck = new AtomicInteger(0);
+            final Collection<String> failids = new ArrayList<String>();
+            final Thread rewriteThread[] = new Thread[concurrency];
+            for (int rewrite_start = 0; rewrite_start < concurrency; rewrite_start++) {
+                rewriteThread[rewrite_start] = new Thread() {
+                    @Override
+                    public void run() {
+                        SolrDocument doc;
+                        try {
+                            while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
                 // for each to-be-processed entry work on the process tag
                 Collection<Object> proctags = doc.getFieldValues(CollectionSchema.process_sxt.getSolrFieldName());
@@ -1272,7 +1285,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
                         sid.setField(CollectionSchema.cr_host_count_i.getSolrFieldName(), crv.count);
                         sid.setField(CollectionSchema.cr_host_chance_d.getSolrFieldName(), crv.cr);
                         sid.setField(CollectionSchema.cr_host_norm_i.getSolrFieldName(), crv.crn);
-                        proccount_citationchange++;
+                        proccount_citationchange.incrementAndGet();
                     }
                 }
@@ -1284,7 +1297,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
                 } catch (IllegalArgumentException e) {}
                 // refresh the link count; it's 'cheap' to do this here
                 // compute references
+                if (reference_computation) {
                     String hosthash = url.hosthash();
                     if (!hostExtentCache.containsKey(hosthash)) {
                         StringBuilder q = new StringBuilder();
@@ -1292,11 +1306,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
                         long hostExtentCount = segment.fulltext().getDefaultConnector().getCountByQuery(q.toString());
                         hostExtentCache.put(hosthash, hostExtentCount);
                     }
-                if (this.contains(CollectionSchema.references_i) &&
-                    this.contains(CollectionSchema.references_internal_i) &&
-                    this.contains(CollectionSchema.references_external_i) &&
-                    this.contains(CollectionSchema.references_exthosts_i)) {
-                    if (postprocessing_references(rrCache, sid, url, hostExtentCache)) proccount_referencechange++;
+                    if (postprocessing_references(rrCache, sid, url, hostExtentCache)) proccount_referencechange.incrementAndGet();
                 }
                 // all processing steps checked, remove the processing and harvesting key
@@ -1307,24 +1317,34 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
                 //collectionConnector.deleteById(i);
                 collectionConnector.add(sid);
-                proccount++; allcount.incrementAndGet();
-                if (proccount % 100 == 0) {
-                    postprocessingActivity = "postprocessed " + proccount + " from " + count + " collection documents; " +
-                        (proccount * 60000 / (System.currentTimeMillis() - start)) + " ppm; " +
-                        ((System.currentTimeMillis() - start) * (count - proccount) / proccount / 60000) + " minutes remaining";
+                int thiscount = proccount.incrementAndGet(); allcount.incrementAndGet();
+                if (thiscount % 100 == 0) {
+                    postprocessingActivity = "postprocessed " + thiscount + " from " + count + " collection documents; " +
+                        (thiscount * 60000 / (System.currentTimeMillis() - start)) + " ppm; " +
+                        ((System.currentTimeMillis() - start) * (count - thiscount) / thiscount / 60000) + " minutes remaining";
                     ConcurrentLog.info("CollectionConfiguration", postprocessingActivity);
                 }
             } catch (final Throwable e1) {
                 ConcurrentLog.logException(e1);
                 failids.add(i);
             }
-            countcheck++;
+            countcheck.incrementAndGet();
         }
+                        } catch (InterruptedException e) {
+                            ConcurrentLog.logException(e);
+                        }
+                    }
+                };
+                rewriteThread[rewrite_start].start();
+            }
+            // wait for termination
+            for (int rewrite_start = 0; rewrite_start < concurrency; rewrite_start++) rewriteThread[rewrite_start].join();
             if (failids.size() > 0) {
                 ConcurrentLog.info("CollectionConfiguration", "cleanup_processing: deleting " + failids.size() + " documents which have permanent execution fails");
                 collectionConnector.deleteByIds(failids);
             }
-            if (count != countcheck) ConcurrentLog.warn("CollectionConfiguration", "ambiguous collection document count for harvestkey " + harvestkey + ": expected=" + count + ", counted=" + countcheck); // big gap for harvestkey = null
+            if (count != countcheck.get()) ConcurrentLog.warn("CollectionConfiguration", "ambiguous collection document count for harvestkey " + harvestkey + ": expected=" + count + ", counted=" + countcheck); // big gap for harvestkey = null
             ConcurrentLog.info("CollectionConfiguration", "cleanup_processing: re-calculated " + proccount + " new documents, " +
                 proccount_referencechange + " reference-count changes, " +
                 proccount_citationchange + " citation ranking changes.");
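
For reference, the concurrency pattern this patch introduces is the classic poison-pill worker pool: `concurrency` threads drain a shared BlockingQueue until each takes a sentinel element, and the caller joins them all before reading the shared AtomicInteger counters. The sketch below shows the same pattern in isolation; it is a minimal illustration under assumptions, not YaCy code: the PoisonPillSketch class, the String payload, and the POISON sentinel are placeholders for SolrDocument and AbstractSolrConnector.POISON_DOCUMENT.

import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.atomic.AtomicInteger;

public class PoisonPillSketch {

    // hypothetical sentinel; the patch uses AbstractSolrConnector.POISON_DOCUMENT.
    // 'new String' guarantees a unique identity, so the reference comparison
    // below cannot match a regular payload by accident.
    static final String POISON = new String("POISON");

    public static void main(String[] args) throws InterruptedException {
        final BlockingQueue<String> docs = new LinkedBlockingQueue<String>();
        final AtomicInteger proccount = new AtomicInteger(); // shared counter, as in the patch
        final int concurrency = Runtime.getRuntime().availableProcessors();

        final Thread[] worker = new Thread[concurrency];
        for (int t = 0; t < concurrency; t++) {
            worker[t] = new Thread() {
                @Override
                public void run() {
                    String doc;
                    try {
                        // identity comparison, as in the patch's check against POISON_DOCUMENT
                        while ((doc = docs.take()) != POISON) {
                            proccount.incrementAndGet(); // stand-in for the per-document rewrite work
                        }
                    } catch (InterruptedException e) {
                        Thread.currentThread().interrupt();
                    }
                }
            };
            worker[t].start();
        }

        for (int i = 0; i < 1000; i++) docs.put("doc-" + i); // producer side
        for (int t = 0; t < concurrency; t++) docs.put(POISON); // one pill per worker
        for (final Thread t : worker) t.join(); // wait for termination, as after the rewrite loop
        System.out.println("processed " + proccount.get() + " documents");
    }
}

One sentinel per consumer is what makes the blocking take() safe to use here; the patch presumably relies on concurrentDocumentsByQuery to enqueue a POISON_DOCUMENT for each of the `concurrency` readers named in its changed final argument.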