/**
 *  WebgraphConfiguration
 *  Copyright 2011 by Michael Peter Christen
 *  First released 14.04.2011 at http://yacy.net
 *
 *  $LastChangedDate: 2011-04-14 22:05:04 +0200 (Do, 14 Apr 2011) $
 *  $LastChangedRevision: 7654 $
 *  $LastChangedBy: orbiter $
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program in the file lgpl21.txt
 *  If not, see <http://www.gnu.org/licenses/>.
 */
package net.yacy.search.schema ;
import java.io.File ;
import java.io.IOException ;
import java.io.Serializable ;
import java.util.ArrayList ;
import java.util.Collection ;
import java.util.Date ;
import java.util.Iterator ;
import java.util.LinkedHashSet ;
import java.util.List ;
import java.util.Map ;
import java.util.Set ;
import java.util.concurrent.BlockingQueue ;
import java.util.regex.Pattern ;
import org.apache.solr.common.SolrDocument ;
import org.apache.solr.common.SolrInputDocument ;
import net.yacy.cora.document.encoding.ASCII ;
import net.yacy.cora.document.id.AnchorURL ;
import net.yacy.cora.document.id.DigestURL ;
import net.yacy.cora.document.id.MultiProtocolURL ;
import net.yacy.cora.federate.solr.ProcessType ;
import net.yacy.cora.federate.solr.SchemaConfiguration ;
import net.yacy.cora.federate.solr.SchemaDeclaration ;
import net.yacy.cora.federate.solr.connector.AbstractSolrConnector ;
import net.yacy.cora.federate.solr.connector.SolrConnector ;
import net.yacy.cora.protocol.Domains ;
import net.yacy.cora.protocol.ResponseHeader ;
import net.yacy.cora.util.CommonPattern ;
import net.yacy.cora.util.ConcurrentLog ;
import net.yacy.document.parser.html.ImageEntry ;
import net.yacy.search.index.Segment ;
import net.yacy.search.index.Segment.ClickdepthCache ;
public class WebgraphConfiguration extends SchemaConfiguration implements Serializable {
private static final long serialVersionUID = - 499100932212840385L ;
/**
 * Creates a configuration without reading a configuration file.
 * The resulting configuration set is empty, which has the effect that
 * all index attributes are used.
 *
 * @param lazy value stored in the inherited {@code lazy} flag
 *             (NOTE(review): presumably controls whether empty/zero values are skipped
 *             when fields are added - confirm against SchemaConfiguration)
 */
public WebgraphConfiguration(final boolean lazy) {
    super();
    this.lazy = lazy;
}
/**
 * Initializes the schema with a given configuration file.
 * <p>
 * The configuration file simply contains a list of lines with keywords
 * or {@code keyword = value} lines, where the value is a custom Solr field name.
 * After loading, the configuration is checked against the {@link WebgraphSchema} enum
 * in both directions:
 * <ul>
 * <li>every configured entry must name a {@link WebgraphSchema} constant; its Solr field
 *     name is pushed into that constant, and unknown entries are logged and removed,</li>
 * <li>every {@link WebgraphSchema} constant that is missing from the file is logged as a warning.</li>
 * </ul>
 * An empty configuration skips both checks.
 *
 * @param configurationFile the schema configuration file
 * @param lazy value stored in the inherited {@code lazy} flag
 * @throws IOException propagated from the superclass constructor
 */
public WebgraphConfiguration(final File configurationFile, boolean lazy) throws IOException {
    super(configurationFile);
    this.lazy = lazy;
    // check consistency: compare with the WebgraphSchema enum
    if (this.isEmpty()) return;
    // A plain hasNext()/next() loop is required here: the former
    // "for (etr = it.next(); it.hasNext(); etr = it.next())" form never
    // visited the last entry (and visited nothing for a single entry).
    final Iterator<Entry> it = this.entryIterator();
    while (it.hasNext()) {
        final SchemaConfiguration.Entry etr = it.next();
        try {
            WebgraphSchema f = WebgraphSchema.valueOf(etr.key());
            f.setSolrFieldName(etr.getValue());
        } catch (final IllegalArgumentException e) {
            ConcurrentLog.fine("SolrWebgraphWriter", "solr schema file " + configurationFile.getAbsolutePath() + " defines unknown attribute '" + etr.toString() + "'");
            it.remove();
        }
    }
    // check consistency the other way: look if all enum constants in WebgraphSchema appear in the configuration file
    for (SchemaDeclaration field : WebgraphSchema.values()) {
        if (this.get(field.name()) == null) {
            ConcurrentLog.warn("SolrWebgraphWriter", " solr schema file " + configurationFile.getAbsolutePath() + " is missing declaration for '" + field.name() + "'");
        }
    }
}
/**
 * Collector for the edges of one document together with the protocol, url stub and
 * anchor text of every link target.
 * Each of the three string arrays has exactly two slots: index 0 holds the values of
 * inbound links, index 1 the values of outbound links.
 */
public static class Subgraph {
    public final ArrayList<String>[] urlProtocols, urlStubs, urlAnchorTexts;
    public final ArrayList<SolrInputDocument> edges;

    /**
     * @param inboundSize  initial capacity of the inbound (index 0) lists
     * @param outboundSize initial capacity of the outbound (index 1) lists
     */
    public Subgraph(int inboundSize, int outboundSize) {
        this.urlProtocols = listPair(inboundSize, outboundSize);
        this.urlStubs = listPair(inboundSize, outboundSize);
        this.urlAnchorTexts = listPair(inboundSize, outboundSize);
        this.edges = new ArrayList<SolrInputDocument>(inboundSize + outboundSize);
    }

    /** creates the two-slot {inbound, outbound} array of string lists with the given initial capacities */
    @SuppressWarnings("unchecked")
    private static ArrayList<String>[] listPair(final int inboundCapacity, final int outboundCapacity) {
        final ArrayList<String> inboundList = new ArrayList<String>(inboundCapacity);
        final ArrayList<String> outboundList = new ArrayList<String>(outboundCapacity);
        return new ArrayList[] { inboundList, outboundList };
    }
}
public void addEdges (
final Subgraph subgraph ,
final DigestURL source , final ResponseHeader responseHeader , Map < String , Pattern > collections , int clickdepth_source ,
final List < ImageEntry > images , final boolean inbound , final Collection < AnchorURL > links ,
final String sourceName ) {
boolean allAttr = this . isEmpty ( ) ;
boolean generalNofollow = responseHeader = = null ? false : responseHeader . get ( "X-Robots-Tag" , "" ) . indexOf ( "nofollow" ) > = 0 ;
int target_order = 0 ;
for ( final AnchorURL target_url : links ) {
SolrInputDocument edge = getEdge (
subgraph , source , responseHeader , collections , clickdepth_source , images , inbound ,
sourceName , allAttr , generalNofollow , target_order , target_url ) ;
target_order + + ;
// add the edge to the subgraph
subgraph . edges . add ( edge ) ;
}
}
public SolrInputDocument getEdge (
final Subgraph subgraph ,
final DigestURL source , final ResponseHeader responseHeader , Map < String , Pattern > collections , int clickdepth_source ,
final List < ImageEntry > images , final boolean inbound ,
final String sourceName , boolean allAttr , boolean generalNofollow , int target_order , AnchorURL target_url ) {
Set < ProcessType > processTypes = new LinkedHashSet < ProcessType > ( ) ;
final String name = target_url . getNameProperty ( ) ; // the name attribute
final String text = target_url . getTextProperty ( ) ; // the text between the <a></a> tag
String rel = target_url . getRelProperty ( ) ; // the rel-attribute
int ioidx = inbound ? 0 : 1 ;
if ( generalNofollow ) {
// patch the rel attribute since the header makes nofollow valid for all links
if ( rel . length ( ) = = 0 ) rel = "nofollow" ; else if ( rel . indexOf ( "nofollow" ) < 0 ) rel + = ",nofollow" ;
}
// index organization
StringBuilder idi = new StringBuilder ( 8 ) ;
idi . append ( Integer . toHexString ( ( name + text + rel ) . hashCode ( ) ) . toLowerCase ( ) ) ;
while ( idi . length ( ) < 8 ) idi . insert ( 0 , '0' ) ;
String source_id = ASCII . String ( source . hash ( ) ) ;
String target_id = ASCII . String ( target_url . hash ( ) ) ;
StringBuilder id = new StringBuilder ( source_id ) . append ( target_id ) . append ( idi ) ;
SolrInputDocument edge = new SolrInputDocument ( ) ;
add ( edge , WebgraphSchema . id , id . toString ( ) ) ;
add ( edge , WebgraphSchema . target_order_i , target_order ) ;
if ( allAttr | | contains ( WebgraphSchema . load_date_dt ) ) {
Date loadDate = new Date ( ) ;
Date modDate = responseHeader = = null ? new Date ( ) : responseHeader . lastModified ( ) ;
if ( modDate . getTime ( ) > loadDate . getTime ( ) ) modDate = loadDate ;
add ( edge , WebgraphSchema . load_date_dt , loadDate ) ;
}
if ( allAttr | | contains ( WebgraphSchema . last_modified ) ) add ( edge , WebgraphSchema . last_modified , responseHeader = = null ? new Date ( ) : responseHeader . lastModified ( ) ) ;
final String source_url_string = source . toNormalform ( false ) ;
if ( allAttr | | contains ( CollectionSchema . collection_sxt ) & & collections ! = null & & collections . size ( ) > 0 ) {
List < String > cs = new ArrayList < String > ( ) ;
for ( Map . Entry < String , Pattern > e : collections . entrySet ( ) ) {
if ( e . getValue ( ) . matcher ( source_url_string ) . matches ( ) ) cs . add ( e . getKey ( ) ) ;
}
add ( edge , WebgraphSchema . collection_sxt , cs ) ;
}
// add the source attributes
add ( edge , WebgraphSchema . source_id_s , source_id ) ;
int pr_source = source_url_string . indexOf ( "://" , 0 ) ;
if ( allAttr | | contains ( WebgraphSchema . source_protocol_s ) ) add ( edge , WebgraphSchema . source_protocol_s , source_url_string . substring ( 0 , pr_source ) ) ;
if ( allAttr | | contains ( WebgraphSchema . source_urlstub_s ) ) add ( edge , WebgraphSchema . source_urlstub_s , source_url_string . substring ( pr_source + 3 ) ) ;
Map < String , String > source_searchpart = source . getSearchpartMap ( ) ;
if ( source_searchpart = = null ) {
if ( allAttr | | contains ( WebgraphSchema . source_parameter_count_i ) ) add ( edge , WebgraphSchema . source_parameter_count_i , 0 ) ;
} else {
if ( allAttr | | contains ( WebgraphSchema . source_parameter_count_i ) ) add ( edge , WebgraphSchema . source_parameter_count_i , source_searchpart . size ( ) ) ;
if ( allAttr | | contains ( WebgraphSchema . source_parameter_key_sxt ) ) add ( edge , WebgraphSchema . source_parameter_key_sxt , source_searchpart . keySet ( ) . toArray ( new String [ source_searchpart . size ( ) ] ) ) ;
if ( allAttr | | contains ( WebgraphSchema . source_parameter_value_sxt ) ) add ( edge , WebgraphSchema . source_parameter_value_sxt , source_searchpart . values ( ) . toArray ( new String [ source_searchpart . size ( ) ] ) ) ;
}
if ( allAttr | | contains ( WebgraphSchema . source_chars_i ) ) add ( edge , WebgraphSchema . source_chars_i , source_url_string . length ( ) ) ;
String source_host = null ;
if ( ( source_host = source . getHost ( ) ) ! = null ) {
String dnc = Domains . getDNC ( source_host ) ;
String subdomOrga = source_host . length ( ) - dnc . length ( ) < = 0 ? "" : source_host . substring ( 0 , source_host . length ( ) - dnc . length ( ) - 1 ) ;
int pp = subdomOrga . lastIndexOf ( '.' ) ;
String subdom = ( pp < 0 ) ? "" : subdomOrga . substring ( 0 , pp ) ;
String orga = ( pp < 0 ) ? subdomOrga : subdomOrga . substring ( pp + 1 ) ;
if ( allAttr | | contains ( WebgraphSchema . source_host_s ) ) add ( edge , WebgraphSchema . source_host_s , source_host ) ;
if ( allAttr | | contains ( WebgraphSchema . source_host_id_s ) ) add ( edge , WebgraphSchema . source_host_id_s , source . hosthash ( ) ) ;
if ( allAttr | | contains ( WebgraphSchema . source_host_dnc_s ) ) add ( edge , WebgraphSchema . source_host_dnc_s , dnc ) ;
if ( allAttr | | contains ( WebgraphSchema . source_host_organization_s ) ) add ( edge , WebgraphSchema . source_host_organization_s , orga ) ;
if ( allAttr | | contains ( WebgraphSchema . source_host_organizationdnc_s ) ) add ( edge , WebgraphSchema . source_host_organizationdnc_s , orga + '.' + dnc ) ;
if ( allAttr | | contains ( WebgraphSchema . source_host_subdomain_s ) ) add ( edge , WebgraphSchema . source_host_subdomain_s , subdom ) ;
}
if ( allAttr | | contains ( WebgraphSchema . source_file_ext_s ) | | contains ( WebgraphSchema . source_file_name_s ) ) {
String source_file_name = source . getFileName ( ) ;
String source_file_ext = MultiProtocolURL . getFileExtension ( source_file_name ) ;
add ( edge , WebgraphSchema . source_file_name_s , source_file_name . toLowerCase ( ) . endsWith ( "." + source_file_ext ) ? source_file_name . substring ( 0 , source_file_name . length ( ) - source_file_ext . length ( ) - 1 ) : source_file_name ) ;
add ( edge , WebgraphSchema . source_file_ext_s , source_file_ext ) ;
}
if ( allAttr | | contains ( WebgraphSchema . source_path_s ) ) add ( edge , WebgraphSchema . source_path_s , source . getPath ( ) ) ;
if ( allAttr | | contains ( WebgraphSchema . source_path_folders_count_i ) | | contains ( WebgraphSchema . source_path_folders_sxt ) ) {
String [ ] paths = source . getPaths ( ) ;
add ( edge , WebgraphSchema . source_path_folders_count_i , paths . length ) ;
add ( edge , WebgraphSchema . source_path_folders_sxt , paths ) ;
}
if ( this . contains ( WebgraphSchema . source_clickdepth_i ) & & this . contains ( WebgraphSchema . source_protocol_s ) & & this . contains ( WebgraphSchema . source_urlstub_s ) & & this . contains ( WebgraphSchema . source_id_s ) ) {
add ( edge , WebgraphSchema . source_clickdepth_i , clickdepth_source ) ;
if ( clickdepth_source < 0 | | clickdepth_source > 1 ) processTypes . add ( ProcessType . CLICKDEPTH ) ;
}
// add the source attributes about the target
if ( allAttr | | contains ( WebgraphSchema . target_inbound_b ) ) add ( edge , WebgraphSchema . target_inbound_b , inbound ) ;
if ( allAttr | | contains ( WebgraphSchema . target_name_t ) ) add ( edge , WebgraphSchema . target_name_t , name . length ( ) > 0 ? name : "" ) ;
if ( allAttr | | contains ( WebgraphSchema . target_rel_s ) ) add ( edge , WebgraphSchema . target_rel_s , rel . length ( ) > 0 ? rel : "" ) ;
if ( allAttr | | contains ( WebgraphSchema . target_relflags_i ) ) add ( edge , WebgraphSchema . target_relflags_i , relEval ( rel . length ( ) > 0 ? rel : "" ) ) ;
if ( allAttr | | contains ( WebgraphSchema . target_linktext_t ) ) add ( edge , WebgraphSchema . target_linktext_t , text . length ( ) > 0 ? text : "" ) ;
if ( allAttr | | contains ( WebgraphSchema . target_linktext_charcount_i ) ) add ( edge , WebgraphSchema . target_linktext_charcount_i , text . length ( ) ) ;
if ( allAttr | | contains ( WebgraphSchema . target_linktext_wordcount_i ) ) add ( edge , WebgraphSchema . target_linktext_wordcount_i , text . length ( ) > 0 ? CommonPattern . SPACE . split ( text ) . length : 0 ) ;
ImageEntry ientry = null ;
for ( ImageEntry ie : images ) {
if ( ie . linkurl ( ) ! = null & & ie . linkurl ( ) . equals ( target_url ) ) { ientry = ie ; break ; }
}
String alttext = ientry = = null ? "" : ientry . alt ( ) ;
if ( allAttr | | contains ( WebgraphSchema . target_alt_t ) ) add ( edge , WebgraphSchema . target_alt_t , alttext ) ;
if ( allAttr | | contains ( WebgraphSchema . target_alt_charcount_i ) ) add ( edge , WebgraphSchema . target_alt_charcount_i , alttext . length ( ) ) ;
if ( allAttr | | contains ( WebgraphSchema . target_alt_wordcount_i ) ) add ( edge , WebgraphSchema . target_alt_wordcount_i , alttext . length ( ) > 0 ? CommonPattern . SPACE . split ( alttext ) . length : 0 ) ;
// add the target attributes
add ( edge , WebgraphSchema . target_id_s , target_id ) ;
final String target_url_string = target_url . toNormalform ( false ) ;
int pr_target = target_url_string . indexOf ( "://" , 0 ) ;
subgraph . urlProtocols [ ioidx ] . add ( target_url_string . substring ( 0 , pr_target ) ) ;
subgraph . urlStubs [ ioidx ] . add ( target_url_string . substring ( pr_target + 3 ) ) ;
subgraph . urlAnchorTexts [ ioidx ] . add ( text ) ;
if ( allAttr | | contains ( WebgraphSchema . target_protocol_s ) ) add ( edge , WebgraphSchema . target_protocol_s , target_url_string . substring ( 0 , pr_target ) ) ;
if ( allAttr | | contains ( WebgraphSchema . target_urlstub_s ) ) add ( edge , WebgraphSchema . target_urlstub_s , target_url_string . substring ( pr_target + 3 ) ) ;
Map < String , String > target_searchpart = target_url . getSearchpartMap ( ) ;
if ( target_searchpart = = null ) {
if ( allAttr | | contains ( WebgraphSchema . target_parameter_count_i ) ) add ( edge , WebgraphSchema . target_parameter_count_i , 0 ) ;
} else {
if ( allAttr | | contains ( WebgraphSchema . target_parameter_count_i ) ) add ( edge , WebgraphSchema . target_parameter_count_i , target_searchpart . size ( ) ) ;
if ( allAttr | | contains ( WebgraphSchema . target_parameter_key_sxt ) ) add ( edge , WebgraphSchema . target_parameter_key_sxt , target_searchpart . keySet ( ) . toArray ( new String [ target_searchpart . size ( ) ] ) ) ;
if ( allAttr | | contains ( WebgraphSchema . target_parameter_value_sxt ) ) add ( edge , WebgraphSchema . target_parameter_value_sxt , target_searchpart . values ( ) . toArray ( new String [ target_searchpart . size ( ) ] ) ) ;
}
if ( allAttr | | contains ( WebgraphSchema . target_chars_i ) ) add ( edge , WebgraphSchema . target_chars_i , target_url_string . length ( ) ) ;
String target_host = null ;
if ( ( target_host = target_url . getHost ( ) ) ! = null ) {
String dnc = Domains . getDNC ( target_host ) ;
String subdomOrga = target_host . length ( ) - dnc . length ( ) < = 0 ? "" : target_host . substring ( 0 , target_host . length ( ) - dnc . length ( ) - 1 ) ;
int pp = subdomOrga . lastIndexOf ( '.' ) ;
String subdom = ( pp < 0 ) ? "" : subdomOrga . substring ( 0 , pp ) ;
String orga = ( pp < 0 ) ? subdomOrga : subdomOrga . substring ( pp + 1 ) ;
if ( allAttr | | contains ( WebgraphSchema . target_host_s ) ) add ( edge , WebgraphSchema . target_host_s , target_host ) ;
if ( allAttr | | contains ( WebgraphSchema . target_host_id_s ) ) add ( edge , WebgraphSchema . target_host_id_s , target_url . hosthash ( ) ) ;
if ( allAttr | | contains ( WebgraphSchema . target_host_dnc_s ) ) add ( edge , WebgraphSchema . target_host_dnc_s , dnc ) ;
if ( allAttr | | contains ( WebgraphSchema . target_host_organization_s ) ) add ( edge , WebgraphSchema . target_host_organization_s , orga ) ;
if ( allAttr | | contains ( WebgraphSchema . target_host_organizationdnc_s ) ) add ( edge , WebgraphSchema . target_host_organizationdnc_s , orga + '.' + dnc ) ;
if ( allAttr | | contains ( WebgraphSchema . target_host_subdomain_s ) ) add ( edge , WebgraphSchema . target_host_subdomain_s , subdom ) ;
}
if ( allAttr | | contains ( WebgraphSchema . target_file_ext_s ) | | contains ( WebgraphSchema . target_file_name_s ) ) {
String target_file_name = target_url . getFileName ( ) ;
String target_file_ext = MultiProtocolURL . getFileExtension ( target_file_name ) ;
add ( edge , WebgraphSchema . target_file_name_s , target_file_name . toLowerCase ( ) . endsWith ( "." + target_file_ext ) ? target_file_name . substring ( 0 , target_file_name . length ( ) - target_file_ext . length ( ) - 1 ) : target_file_name ) ;
add ( edge , WebgraphSchema . target_file_ext_s , target_file_ext ) ;
}
if ( allAttr | | contains ( WebgraphSchema . target_path_s ) ) add ( edge , WebgraphSchema . target_path_s , target_url . getPath ( ) ) ;
if ( allAttr | | contains ( WebgraphSchema . target_path_folders_count_i ) | | contains ( WebgraphSchema . target_path_folders_sxt ) ) {
String [ ] paths = target_url . getPaths ( ) ;
add ( edge , WebgraphSchema . target_path_folders_count_i , paths . length ) ;
add ( edge , WebgraphSchema . target_path_folders_sxt , paths ) ;
}
if ( this . contains ( WebgraphSchema . target_protocol_s ) & & this . contains ( WebgraphSchema . target_urlstub_s ) & & this . contains ( WebgraphSchema . target_id_s ) ) {
if ( ( allAttr | | contains ( WebgraphSchema . target_clickdepth_i ) ) ) {
if ( target_url . probablyRootURL ( ) ) {
boolean lc = this . lazy ; this . lazy = false ;
add ( edge , WebgraphSchema . target_clickdepth_i , 0 ) ;
this . lazy = lc ;
} else {
add ( edge , WebgraphSchema . target_clickdepth_i , 999 ) ;
processTypes . add ( ProcessType . CLICKDEPTH ) ; // postprocessing needed; this is also needed if the depth is positive; there could be a shortcut
}
}
}
if ( allAttr | | contains ( WebgraphSchema . process_sxt ) ) {
List < String > pr = new ArrayList < String > ( ) ;
for ( ProcessType t : processTypes ) pr . add ( t . name ( ) ) ;
add ( edge , WebgraphSchema . process_sxt , pr ) ;
if ( allAttr | | contains ( CollectionSchema . harvestkey_s ) ) {
add ( edge , CollectionSchema . harvestkey_s , sourceName ) ;
}
}
// return the edge
return edge ;
}
public int postprocessing ( final Segment segment , ClickdepthCache clickdepthCache , final String harvestkey ) {
if ( ! this . contains ( WebgraphSchema . process_sxt ) ) return 0 ;
if ( ! segment . fulltext ( ) . useWebgraph ( ) ) return 0 ;
SolrConnector webgraphConnector = segment . fulltext ( ) . getWebgraphConnector ( ) ;
// that means we must search for those entries.
webgraphConnector . commit ( true ) ; // make sure that we have latest information that can be found
//BlockingQueue<SolrDocument> docs = index.fulltext().getSolr().concurrentQuery("*:*", 0, 1000, 60000, 10);
String query = ( harvestkey = = null | | ! this . contains ( WebgraphSchema . harvestkey_s ) ? "" : WebgraphSchema . harvestkey_s . getSolrFieldName ( ) + ":\"" + harvestkey + "\" AND " ) + WebgraphSchema . process_sxt . getSolrFieldName ( ) + AbstractSolrConnector . CATCHALL_DTERM ;
BlockingQueue < SolrDocument > docs = webgraphConnector . concurrentDocumentsByQuery ( query , 0 , 10000000 , 1800000 , 100 ) ;
SolrDocument doc ;
String protocol , urlstub , id ;
DigestURL url ;
int proccount = 0 , proccount_clickdepthchange = 0 ;
try {
while ( ( doc = docs . take ( ) ) ! = AbstractSolrConnector . POISON_DOCUMENT ) {
// for each to-be-processed entry work on the process tag
Collection < Object > proctags = doc . getFieldValues ( WebgraphSchema . process_sxt . getSolrFieldName ( ) ) ;
try {
SolrInputDocument sid = this . toSolrInputDocument ( doc ) ;
//boolean changed = false;
for ( Object tag : proctags ) {
// switch over tag types
ProcessType tagtype = ProcessType . valueOf ( ( String ) tag ) ;
if ( tagtype = = ProcessType . CLICKDEPTH ) {
if ( this . contains ( WebgraphSchema . source_clickdepth_i ) & & this . contains ( WebgraphSchema . source_protocol_s ) & & this . contains ( WebgraphSchema . source_urlstub_s ) & & this . contains ( WebgraphSchema . source_id_s ) ) {
protocol = ( String ) doc . getFieldValue ( WebgraphSchema . source_protocol_s . getSolrFieldName ( ) ) ;
urlstub = ( String ) doc . getFieldValue ( WebgraphSchema . source_urlstub_s . getSolrFieldName ( ) ) ;
id = ( String ) doc . getFieldValue ( WebgraphSchema . source_id_s . getSolrFieldName ( ) ) ;
url = new DigestURL ( protocol + "://" + urlstub , ASCII . getBytes ( id ) ) ;
if ( postprocessing_clickdepth ( clickdepthCache , sid , url , WebgraphSchema . source_clickdepth_i , 100 ) ) {
proccount_clickdepthchange + + ;
//changed = true;
}
//ConcurrentLog.info("WebgraphConfiguration", "postprocessing webgraph source id " + id + ", url=" + protocol + "://" + urlstub + ", result: " + (changed ? "changed" : "not changed"));
}
if ( this . contains ( WebgraphSchema . target_clickdepth_i ) & & this . contains ( WebgraphSchema . target_protocol_s ) & & this . contains ( WebgraphSchema . target_urlstub_s ) & & this . contains ( WebgraphSchema . target_id_s ) ) {
protocol = ( String ) doc . getFieldValue ( WebgraphSchema . target_protocol_s . getSolrFieldName ( ) ) ;
urlstub = ( String ) doc . getFieldValue ( WebgraphSchema . target_urlstub_s . getSolrFieldName ( ) ) ;
id = ( String ) doc . getFieldValue ( WebgraphSchema . target_id_s . getSolrFieldName ( ) ) ;
url = new DigestURL ( protocol + "://" + urlstub , ASCII . getBytes ( id ) ) ;
if ( postprocessing_clickdepth ( clickdepthCache , sid , url , WebgraphSchema . target_clickdepth_i , 100 ) ) {
proccount_clickdepthchange + + ;
//changed = true;
}
//ConcurrentLog.info("WebgraphConfiguration", "postprocessing webgraph target id " + id + ", url=" + protocol + "://" + urlstub + ", result: " + (changed ? "changed" : "not changed"));
}
}
}
// all processing steps checked, remove the processing tag
sid . removeField ( WebgraphSchema . process_sxt . getSolrFieldName ( ) ) ;
if ( this . contains ( WebgraphSchema . harvestkey_s ) ) sid . removeField ( WebgraphSchema . harvestkey_s . getSolrFieldName ( ) ) ;
// send back to index
webgraphConnector . add ( sid ) ;
proccount + + ;
} catch ( Throwable e1 ) {
ConcurrentLog . warn ( WebgraphConfiguration . class . getName ( ) , "postprocessing failed" , e1 ) ;
}
}
ConcurrentLog . info ( "WebgraphConfiguration" , "cleanup_processing: re-calculated " + proccount + " new documents, " + proccount_clickdepthchange + " clickdepth values changed." ) ;
} catch ( final InterruptedException e ) {
}
return proccount ;
}
/ * *
* encode a string containing attributes from anchor rel properties binary :
* bit 0 : "me" contained in rel
* bit 1 : "nofollow" contained in rel
* @param rel
* @return binary encoded information about rel
* /
private static int relEval ( final String rels ) {
int i = 0 ;
final String s0 = rels . toLowerCase ( ) . trim ( ) ;
if ( "me" . equals ( s0 ) ) i + = 1 ;
if ( "nofollow" . equals ( s0 ) ) i + = 2 ;
return i ;
}
/ * *
* save configuration to file and update enum SolrFields
* @throws IOException
* /
@Override
public void commit ( ) throws IOException {
try {
super . commit ( ) ;
// make sure the enum SolrField.SolrFieldName is current
Iterator < Entry > it = this . entryIterator ( ) ;
for ( SchemaConfiguration . Entry etr = it . next ( ) ; it . hasNext ( ) ; etr = it . next ( ) ) {
try {
SchemaDeclaration f = WebgraphSchema . valueOf ( etr . key ( ) ) ;
f . setSolrFieldName ( etr . getValue ( ) ) ;
} catch ( final IllegalArgumentException e ) {
continue ;
}
}
} catch ( final IOException e ) { }
}
/ * *
* Convert a SolrDocument to a SolrInputDocument .
* This is useful if a document from the search index shall be modified and indexed again .
* This shall be used as replacement of ClientUtils . toSolrInputDocument because we remove some fields
* which are created automatically during the indexing process .
* @param doc the solr document
* @return a solr input document
* /
public SolrInputDocument toSolrInputDocument ( SolrDocument doc ) {
SolrInputDocument sid = new SolrInputDocument ( ) ;
for ( String name : doc . getFieldNames ( ) ) {
if ( this . contains ( name ) ) { // check each field if enabled in local Solr schema
sid . addField ( name , doc . getFieldValue ( name ) , 1.0f ) ;
}
}
return sid ;
}
}