@@ -54,6 +54,10 @@ import net.yacy.server.serverSwitch;
 public class HostBrowser {
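+    /** classification of a URL shown in the host browser: merely linked, stored in the index, or recorded with a load error */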
+    public static enum StoreType {
+        LINK, INDEX, ERROR;
+    }
     public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
         // return variable that accumulates replacements
         final Switchboard sb = (Switchboard) env;
@@ -131,27 +135,35 @@ public class HostBrowser {
         if (post.containsKey("hosts")) {
             // generate host list
             try {
-                int maxcount = 360; // == 6!/2 which makes nice matrixes for 3, 4, 5, 6 rows/colums
+                int maxcount = admin ? 2 * 3 * 2 * 5 * 7 * 2 * 3 : 360; // = 2520, which makes nice matrices for 2, 3, 4, 5, 6, 7, 8, 9 rows/columns
-                // collect from index
-                ReversibleScoreMap<String> score = fulltext.getSolr().getFacet(YaCySchema.host_s.name(), maxcount);
+                // collect hosts from index
+                ReversibleScoreMap<String> hostscore = fulltext.getSolr().getFacets("*:*", new String[]{YaCySchema.host_s.name()}, maxcount).get(YaCySchema.host_s.name());
+                if (hostscore == null) hostscore = new ClusteredScoreMap<String>();
-                // collect from crawler
+                // collect hosts from crawler
                 final Map<String, Integer[]> crawler = (admin) ? sb.crawlQueues.noticeURL.getDomainStackHosts(StackType.LOCAL, sb.robots) : new HashMap<String, Integer[]>();
                 for (Map.Entry<String, Integer[]> host: crawler.entrySet()) {
-                    score.inc(host.getKey(), host.getValue()[0]);
+                    hostscore.inc(host.getKey(), host.getValue()[0]);
                 }
+                // collect the error urls: host facet over all documents that have a fail reason set
+                ReversibleScoreMap<String> errorscore = admin ? fulltext.getSolr().getFacets(YaCySchema.failreason_t.name() + ":[* TO *]", new String[]{YaCySchema.host_s.name()}, maxcount).get(YaCySchema.host_s.name()) : null;
+                if (errorscore == null) errorscore = new ClusteredScoreMap<String>();
                 int c = 0;
-                Iterator<String> i = score.keys(false);
+                Iterator<String> i = hostscore.keys(false);
                 String host;
                 while (i.hasNext() && c < maxcount) {
                     host = i.next();
                     prop.put("hosts_list_" + c + "_host", host);
-                    prop.put("hosts_list_" + c + "_count", score.get(host));
+                    prop.put("hosts_list_" + c + "_count", hostscore.get(host));
                     boolean inCrawler = crawler.containsKey(host);
                     prop.put("hosts_list_" + c + "_crawler", inCrawler ? 1 : 0);
                     if (inCrawler) prop.put("hosts_list_" + c + "_crawler_pending", crawler.get(host)[0]);
+                    int errors = errorscore.get(host);
+                    prop.put("hosts_list_" + c + "_errors", errors > 0 ? 1 : 0);
+                    if (errors > 0) prop.put("hosts_list_" + c + "_errors_count", errors);
                     c++;
                 }
                 prop.put("hosts_list", c);
@@ -201,10 +213,10 @@ public class HostBrowser {
             } else {
                 if (facetcount > 1000 && !post.containsKey("nepr")) q.append(" AND ").append(YaCySchema.url_paths_sxt.name()).append(":[* TO *]");
             }
-            q.append(" AND -").append(YaCySchema.failreason_t.name()).append(":[* TO *]");
             BlockingQueue<SolrDocument> docs = fulltext.getSolr().concurrentQuery(q.toString(), 0, 100000, 3000, 100);
             SolrDocument doc;
             Set<String> storedDocs = new HashSet<String>();
+            Map<String, String> errorDocs = new HashMap<String, String>();
             Set<String> inboundLinks = new HashSet<String>();
             Map<String, ReversibleScoreMap<String>> outboundHosts = new HashMap<String, ReversibleScoreMap<String>>();
             int hostsize = 0;
@@ -212,16 +224,18 @@ public class HostBrowser {
             long timeout = System.currentTimeMillis() + 3000;
             while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
                 String u = (String) doc.getFieldValue(YaCySchema.sku.getSolrFieldName());
-                hostsize++;
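+                // failreason_t is set for documents whose load failed; these become error entries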
+                String error = (String) doc.getFieldValue(YaCySchema.failreason_t.name());
                 if (u.startsWith(path)) {
                     if (delete) {
                         deleteIDs.add(ASCII.getBytes((String) doc.getFieldValue(YaCySchema.id.name())));
                     } else {
-                        storedDocs.add(u);
+                        if (error == null) storedDocs.add(u); else if (admin) errorDocs.put(u, error);
                     }
                 } else if (complete) {
-                    storedDocs.add(u);
+                    if (error == null) storedDocs.add(u); else if (admin) errorDocs.put(u, error);
                 }
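+                // only documents without a load error count for the host size and contribute links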
+                if (error == null) {
+                    hostsize++;
                 // collect inboundlinks to browse the host
                 Iterator<String> links = URIMetadataNode.getLinks(doc, true);
                 while (links.hasNext()) {
@@ -245,9 +259,13 @@ public class HostBrowser {
                         }
                     } catch (MalformedURLException e) {}
                 }
+                }
                 if (System.currentTimeMillis() > timeout) break;
             }
-            if (deleteIDs.size() > 0) sb.index.fulltext().remove(deleteIDs, true);
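+            // on deletion, also remove the urls from the crawl queues so they are not loaded again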
+            if (deleteIDs.size() > 0) {
+                for (byte[] b: deleteIDs) sb.crawlQueues.urlRemove(b);
+                sb.index.fulltext().remove(deleteIDs, true);
+            }
             // collect from crawler
             List<Request> domainStackReferences = (admin) ? sb.crawlQueues.noticeURL.getDomainStackReferences(StackType.LOCAL, host, 1000, 3000) : new ArrayList<Request>(0);
@@ -255,43 +273,46 @@ public class HostBrowser {
             for (Request crawlEntry: domainStackReferences) loadingLinks.add(crawlEntry.url().toNormalform(true));
             // now combine all lists into one
-            Map<String, Boolean> files = new HashMap<String, Boolean>();
-            for (String u: storedDocs) files.put(u, true);
-            for (String u: inboundLinks) if (!storedDocs.contains(u)) files.put(u, false);
-            for (String u: loadingLinks) if (u.startsWith(path) && !storedDocs.contains(u)) files.put(u, false);
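+            // classify each url: INDEX = stored, ERROR = load failed, LINK = only linked or pending in the crawler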
+            Map<String, StoreType> files = new HashMap<String, StoreType>();
+            for (String u: storedDocs) files.put(u, StoreType.INDEX);
+            for (String u: errorDocs.keySet()) files.put(u, StoreType.ERROR);
+            for (String u: inboundLinks) if (!storedDocs.contains(u)) files.put(u, StoreType.LINK);
+            for (String u: loadingLinks) if (u.startsWith(path) && !storedDocs.contains(u)) files.put(u, StoreType.LINK);
             Log.logInfo("HostBrowser", "collected " + files.size() + " urls for path " + path);
             // distinguish files and folders
-            Map<String, Object> list = new TreeMap<String, Object>(); // a directory list; if object is boolean, its a file; if its a int[], then its a folder
+            Map<String, Object> list = new TreeMap<String, Object>(); // a directory list; if the value is a StoreType, the key is a file; if it is an int[], the key is a folder
             int pl = path.length();
             String file;
-            boolean loaded;
-            for (Map.Entry<String, Boolean> entry: files.entrySet()) {
+            for (Map.Entry<String, StoreType> entry: files.entrySet()) {
                 if (entry.getKey().length() < pl) continue; // this is not inside the path
                 if (!entry.getKey().startsWith(path)) continue;
                 file = entry.getKey().substring(pl);
-                loaded = entry.getValue().booleanValue();
+                StoreType type = entry.getValue();
                 p = file.indexOf('/');
                 if (p < 0) {
                     // this is a file
-                    list.put(entry.getKey(), loaded); // Boolean value: this is a file; true -> file is in index; false -> not in index, maybe in crawler
+                    list.put(entry.getKey(), type); // StoreType value: this is a file; INDEX -> in index, ERROR -> load failed, LINK -> not in index, maybe in crawler
                 } else {
                     // this is a directory path or a file in a subdirectory
                     String remainingPath = file.substring(0, p + 1);
                     if (complete && remainingPath.indexOf('.') > 0) {
-                        list.put(entry.getKey(), loaded); // Boolean value: this is a file
+                        list.put(entry.getKey(), type); // StoreType value: this is a file
                     } else {
                         String dir = path + remainingPath;
                         Object c = list.get(dir);
-                        boolean incrawler = loadingLinks.contains(entry.getKey());
                         if (c == null) {
-                            int[] linkedStored = new int[]{0, 0, 0};
-                            linkedStored[loaded ? 1 : 0]++;
-                            if (incrawler) linkedStored[2]++;
-                            list.put(dir, linkedStored);
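+                            // folder counters: [0] = only linked, [1] = stored in index, [2] = pending in crawler, [3] = load errors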
+                            int[] linkedStoredIncrawlerError = new int[]{0, 0, 0, 0};
+                            if (type == StoreType.LINK) linkedStoredIncrawlerError[0]++;
+                            if (type == StoreType.INDEX) linkedStoredIncrawlerError[1]++;
+                            if (loadingLinks.contains(entry.getKey())) linkedStoredIncrawlerError[2]++;
+                            if (errorDocs.containsKey(entry.getKey())) linkedStoredIncrawlerError[3]++;
+                            list.put(dir, linkedStoredIncrawlerError);
                         } else if (c instanceof int[]) {
-                            ((int[]) c)[loaded ? 1 : 0]++;
-                            if (incrawler) ((int[]) c)[2]++;
+                            if (type == StoreType.LINK) ((int[]) c)[0]++;
+                            if (type == StoreType.INDEX) ((int[]) c)[1]++;
+                            if (loadingLinks.contains(entry.getKey())) ((int[]) c)[2]++;
+                            if (errorDocs.containsKey(entry.getKey())) ((int[]) c)[3]++;
                         }
                     }
                 }
@@ -300,16 +321,17 @@ public class HostBrowser {
             int maxcount = 1000;
             int c = 0;
             for (Map.Entry<String, Object> entry: list.entrySet()) {
-                if (entry.getValue() instanceof Boolean) {
+                if (entry.getValue() instanceof StoreType) {
                     // this is a file
                     prop.put("files_list_" + c + "_type", 0);
                     prop.put("files_list_" + c + "_type_url", entry.getKey());
-                    boolean indexed = ((Boolean) entry.getValue()).booleanValue();
+                    StoreType type = (StoreType) entry.getValue();
                     try {uri = new DigestURI(entry.getKey());} catch (MalformedURLException e) {uri = null;}
                     boolean loading = load.equals(entry.getKey()) || (uri != null && sb.crawlQueues.urlExists(uri.hash()) != null);
                     //String failr = fulltext.failReason(ASCII.String(uri.hash()));
prop . put ( "files_list_" + c + "_type_stored" , indexed ? 1 : loading ? 2 : 0 ) ;
prop . put ( "files_list_" + c + "_type_stored" , type = = StoreType . INDEX ? 1 : loading ? 2 : type = = StoreType . ERROR ? 3 : 0 /*linked*/ ) ;
prop . put ( "files_list_" + c + "_type_stored_load" , loadRight ? 1 : 0 ) ;
if ( type = = StoreType . ERROR ) prop . put ( "files_list_" + c + "_type_stored_error" , errorDocs . get ( entry . getKey ( ) ) ) ;
                     if (loadRight) {
                         prop.put("files_list_" + c + "_type_stored_load_url", entry.getKey());
                         prop.put("files_list_" + c + "_type_stored_load_path", path);
@@ -321,7 +343,8 @@ public class HostBrowser {
                     int linked = ((int[]) entry.getValue())[0];
                     int stored = ((int[]) entry.getValue())[1];
                     int crawler = ((int[]) entry.getValue())[2];
-                    prop.put("files_list_" + c + "_type_count", stored + " stored / " + linked + " linked" + (crawler > 0 ? (" / " + crawler + " pending") : ""));
+                    int error = ((int[]) entry.getValue())[3];
+                    prop.put("files_list_" + c + "_type_count", stored + " stored / " + linked + " linked" + (crawler > 0 ? (" / " + crawler + " pending") : "") + (error > 0 ? (" / " + error + " errors") : ""));
                 }
                 if (++c >= maxcount) break;
             }