@@ -39,6 +39,7 @@ import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
+import java.util.concurrent.BlockingQueue;
 import net.yacy.cora.document.ASCII;
 import net.yacy.cora.document.MultiProtocolURI;
@@ -48,11 +49,12 @@ import net.yacy.cora.federate.solr.SchemaConfiguration;
 import net.yacy.cora.federate.solr.FailType;
 import net.yacy.cora.federate.solr.ProcessType;
 import net.yacy.cora.federate.solr.SchemaDeclaration;
+import net.yacy.cora.federate.solr.connector.AbstractSolrConnector;
+import net.yacy.cora.federate.solr.connector.SolrConnector;
 import net.yacy.cora.protocol.Domains;
 import net.yacy.cora.protocol.HeaderFramework;
 import net.yacy.cora.protocol.ResponseHeader;
 import net.yacy.cora.util.CommonPattern;
-import net.yacy.cora.util.SpaceExceededException;
 import net.yacy.crawler.data.CrawlProfile;
 import net.yacy.crawler.retrieval.Response;
 import net.yacy.document.Condenser;
@@ -62,12 +64,10 @@ import net.yacy.document.parser.html.ImageEntry;
 import net.yacy.kelondro.data.citation.CitationReference;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.data.meta.URIMetadataRow;
-import net.yacy.kelondro.index.RowHandleSet;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.kelondro.rwi.IndexCell;
 import net.yacy.kelondro.rwi.ReferenceContainer;
 import net.yacy.kelondro.util.Bitfield;
-import net.yacy.kelondro.util.ByteBuffer;
 import net.yacy.search.index.Segment;
 import org.apache.solr.common.SolrDocument;
 import org.apache.solr.common.SolrInputDocument;
@@ -761,63 +761,77 @@ public class CollectionConfiguration extends SchemaConfiguration implements Serializable {
         return doc;
     }

     /**
-     * compute the click level using the citation reference database
-     * @param citations the citation database
-     * @param searchhash the hash of the url to be checked
-     * @return the clickdepth level or 999 if the root url cannot be found or a recursion limit is reached
-     * @throws IOException
+     * post-processing steps for all entries that have a process tag assigned
+     * @param segment the index segment which holds the fulltext index and the citation index
      */
-    public static int getClickDepth(final IndexCell<CitationReference> citations, final DigestURI url) throws IOException {
-        final byte[] searchhash = url.hash();
-        RowHandleSet rootCandidates = url.getPossibleRootHashes();
-        RowHandleSet ignore = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 100); // a set of url hashes to be ignored; collects all hashes seen during recursion to prevent endless loops
-        RowHandleSet levelhashes = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 1); // all hashes of one clickdepth level; the first call contains the target hash only and therefore just one entry
-        try {levelhashes.put(searchhash);} catch (SpaceExceededException e) {throw new IOException(e);}
-        int leveldepth = 0; // the recursion depth and therefore the result depth - 1; shall be 0 for the first call
-        final byte[] hosthash = new byte[6]; // the host of the url to be checked
-        System.arraycopy(searchhash, 6, hosthash, 0, 6);
-        long timeout = System.currentTimeMillis() + 10000;
-        for (int maxdepth = 0; maxdepth < 10 && System.currentTimeMillis() < timeout; maxdepth++) {
-            RowHandleSet checknext = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 100);
-            // loop over all hashes at this clickdepth; the first call to this loop should contain only one hash and a leveldepth = 0
-            checkloop: for (byte[] urlhash : levelhashes) {
-                // get all the citations for this url and iterate
-                ReferenceContainer<CitationReference> references = citations.get(urlhash, null);
-                if (references == null || references.size() == 0) continue checkloop; // no citations known; the click depth cannot be determined along this path
-                Iterator<CitationReference> i = references.entries();
-                nextloop: while (i.hasNext()) {
-                    CitationReference ref = i.next();
-                    if (ref == null) continue nextloop;
-                    byte[] u = ref.urlhash();
+    public void postprocessing(Segment segment) {
+        if (!this.contains(CollectionSchema.process_sxt)) return;
+        if (!segment.connectedCitation()) return;
+        SolrConnector connector = segment.fulltext().getDefaultConnector();
+        // we must search the index for all documents which have a process tag assigned
+        connector.commit(true); // make sure that we have the latest information that can be found
+        //BlockingQueue<SolrDocument> docs = index.fulltext().getSolr().concurrentQuery("*:*", 0, 1000, 60000, 10);
+        BlockingQueue<SolrDocument> docs = connector.concurrentQuery(CollectionSchema.process_sxt.getSolrFieldName() + ":[* TO *]", 0, 10000, 60000, 50);
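+        // the query is evaluated concurrently: results are pushed into the blocking queue,
+        // and the consumer loop below runs until it takes the poison document from the queue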
+        SolrDocument doc;
+        int proccount_clickdepth = 0;
+        int proccount_clickdepthchange = 0;
+        int proccount_referencechange = 0;
+        try {
+            while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
+                // for each to-be-processed entry work on the process tag
+                Collection<Object> proctags = doc.getFieldValues(CollectionSchema.process_sxt.getSolrFieldName());
+                for (Object tag : proctags) {
+                    String tagname = (String) tag;
+                    ProcessType tagtype = ProcessType.valueOf(tagname);
+                    // switch over tag types
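+                    // CLICKDEPTH tags are re-computed from the citation graph and written back to the index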
+                    if (tagtype == ProcessType.CLICKDEPTH) {
+                        //proctags.remove(tag);
+                        if (this.contains(CollectionSchema.clickdepth_i)) {
+                            DigestURI url;
+                            try {
+                                // get new click depth and compare with old
+                                Integer oldclickdepth = (Integer) doc.getFieldValue(CollectionSchema.clickdepth_i.getSolrFieldName());
+                                url = new DigestURI((String) doc.getFieldValue(CollectionSchema.sku.getSolrFieldName()), ASCII.getBytes((String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName())));
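+                                // the url is reconstructed from the sku field (the url string) and the id field (the url hash)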
+                                int clickdepth = segment.getClickDepth(url);
+                                if (oldclickdepth == null || oldclickdepth.intValue() != clickdepth) {
+                                    //log.logInfo("new clickdepth " + clickdepth + " for " + url.toNormalform(true));
+                                    proccount_clickdepthchange++;
+                                }
+                                SolrInputDocument sid = this.toSolrInputDocument(doc);
+                                sid.setField(CollectionSchema.clickdepth_i.getSolrFieldName(), clickdepth);

-                    // check ignore
-                    if (ignore.has(u)) continue nextloop;
+                                // refresh the link count; it's 'cheap' to do this here
+                                if (this.contains(CollectionSchema.references_i)) {
+                                    Integer oldreferences = (Integer) doc.getFieldValue(CollectionSchema.references_i.getSolrFieldName());
+                                    int references = segment.urlCitation().count(url.hash());
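+                                    // only positive counts are written back; a zero count leaves the old value untouched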
+                                    if (references > 0) {
+                                        if (oldreferences == null || oldreferences.intValue() != references) proccount_referencechange++;
+                                        sid.setField(CollectionSchema.references_i.getSolrFieldName(), references);
+                                    }
+                                }

-                    // check if this is from the same host
-                    if (!ByteBuffer.equals(u, 6, hosthash, 0, 6)) continue nextloop;
+                                // remove the processing tag
+                                sid.removeField(CollectionSchema.process_sxt.getSolrFieldName());
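+                                // without the process tag, the document is not selected again by the postprocessing query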

-                    // check if the url is a root url
-                    if (rootCandidates.has(u)) {
-                        return leveldepth + 1;
-                    }
+                                // send back to index
+                                connector.add(sid);
+                                proccount_clickdepth++;
+                            } catch (Throwable e) {
+                                Log.logException(e);
+                            }
+                        }
+                    }
+                }

-                    // step to next depth level
-                    try {checknext.put(u);} catch (SpaceExceededException e) {}
-                    try {ignore.put(u);} catch (SpaceExceededException e) {}
-                }
-            }
-            leveldepth++;
-            levelhashes = checknext;
-        }
-        return 999;
-    }
+            }
+            Log.logInfo("CollectionConfiguration", "cleanup_processing: re-calculated " + proccount_clickdepth + " new clickdepth values, " + proccount_clickdepthchange + " clickdepth values changed, " + proccount_referencechange + " reference-count values changed.");
+        } catch (InterruptedException e) {
+        }
+    }

     /**