/**
 *  HyperlinkGraph
 *  Copyright 2014 by Michael Peter Christen
 *  First released 08.04.2014 at http://yacy.net
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program in the file lgpl21.txt
 *  If not, see <http://www.gnu.org/licenses/>.
 */
package net.yacy.search.schema ;
import java.net.MalformedURLException ;
import java.util.HashMap ;
import java.util.HashSet ;
import java.util.Iterator ;
import java.util.Map ;
import java.util.Set ;
import java.util.concurrent.BlockingQueue ;
import net.yacy.cora.document.id.DigestURL ;
import net.yacy.cora.document.id.MultiProtocolURL ;
import net.yacy.cora.federate.solr.FailType ;
import net.yacy.cora.federate.solr.connector.AbstractSolrConnector ;
import net.yacy.cora.federate.solr.connector.SolrConnector ;
import net.yacy.cora.util.ConcurrentLog ;
import net.yacy.kelondro.data.meta.URIMetadataNode ;
import net.yacy.search.index.Segment ;
import org.apache.solr.common.SolrDocument ;
public class HyperlinkGraph implements Iterable < HyperlinkEdge > {
public final static Set < String > ROOTFNS = new HashSet < String > ( ) ;
static {
for ( String s : new String [ ] { "/" , "/index.htm" , "/index.html" , "/index.php" , "/home.htm" , "/home.html" , "/home.php" , "/default.htm" , "/default.html" , "/default.php" } ) {
ROOTFNS . add ( s ) ;
}
}
HyperlinkEdges edges ;
String hostname ;
public HyperlinkGraph ( ) {
this . edges = new HyperlinkEdges ( ) ;
this . hostname = null ;
}
public void fill ( final SolrConnector solrConnector , String hostname , final DigestURL stopURL , final long maxtime , final int maxnodes ) {
this . hostname = hostname ;
if ( hostname . startsWith ( "www." ) ) hostname = hostname . substring ( 4 ) ;
StringBuilder q = new StringBuilder ( ) ;
q . append ( CollectionSchema . host_s . getSolrFieldName ( ) ) . append ( ':' ) . append ( hostname ) . append ( " OR " ) . append ( CollectionSchema . host_s . getSolrFieldName ( ) ) . append ( ':' ) . append ( "www." ) . append ( hostname ) ;
BlockingQueue < SolrDocument > docs = solrConnector . concurrentDocumentsByQuery ( q . toString ( ) , CollectionSchema . url_chars_i . getSolrFieldName ( ) + " asc" , 0 , maxnodes , maxtime , 100 , 1 , true ,
CollectionSchema . id . getSolrFieldName ( ) ,
CollectionSchema . sku . getSolrFieldName ( ) ,
CollectionSchema . failreason_s . getSolrFieldName ( ) ,
CollectionSchema . failtype_s . getSolrFieldName ( ) ,
CollectionSchema . inboundlinks_protocol_sxt . getSolrFieldName ( ) ,
CollectionSchema . inboundlinks_urlstub_sxt . getSolrFieldName ( ) ,
CollectionSchema . outboundlinks_protocol_sxt . getSolrFieldName ( ) ,
CollectionSchema . outboundlinks_urlstub_sxt . getSolrFieldName ( )
) ;
SolrDocument doc ;
Map < String , FailType > errorDocs = new HashMap < String , FailType > ( ) ;
HyperlinkEdges inboundEdges = new HyperlinkEdges ( ) ;
HyperlinkEdges outboundEdges = new HyperlinkEdges ( ) ;
HyperlinkEdges errorEdges = new HyperlinkEdges ( ) ;
try {
retrieval : while ( ( doc = docs . take ( ) ) ! = AbstractSolrConnector . POISON_DOCUMENT ) {
String u = ( String ) doc . getFieldValue ( CollectionSchema . sku . getSolrFieldName ( ) ) ;
MultiProtocolURL from = new MultiProtocolURL ( u ) ;
String errortype = ( String ) doc . getFieldValue ( CollectionSchema . failtype_s . getSolrFieldName ( ) ) ;
FailType error = errortype = = null ? null : FailType . valueOf ( errortype ) ;
if ( error ! = null ) {
errorDocs . put ( u , error ) ;
} else {
Iterator < String > links = URIMetadataNode . getLinks ( doc , true ) ; // inbound
String link ;
while ( links . hasNext ( ) ) {
link = links . next ( ) ;
try {
HyperlinkEdge . Target linkurl = new HyperlinkEdge . Target ( link , HyperlinkType . Inbound ) ;
inboundEdges . addEdge ( from , linkurl ) ;
if ( stopURL ! = null & & linkurl . equals ( stopURL ) ) break retrieval ;
} catch ( MalformedURLException e ) { }
}
links = URIMetadataNode . getLinks ( doc , false ) ; // outbound
while ( links . hasNext ( ) ) {
link = links . next ( ) ;
try {
HyperlinkEdge . Target linkurl = new HyperlinkEdge . Target ( link , HyperlinkType . Outbound ) ;
outboundEdges . addEdge ( from , linkurl ) ;
if ( stopURL ! = null & & linkurl . equals ( stopURL ) ) break retrieval ;
} catch ( MalformedURLException e ) { }
}
}
if ( inboundEdges . size ( ) + outboundEdges . size ( ) > maxnodes ) {
break retrieval ;
}
}
} catch ( InterruptedException e ) {
} catch ( MalformedURLException e ) {
}
// we use the errorDocs to mark all edges with endpoint to error documents
Iterator < HyperlinkEdge > i = inboundEdges . iterator ( ) ;
HyperlinkEdge edge ;
while ( i . hasNext ( ) ) {
edge = i . next ( ) ;
if ( errorDocs . containsKey ( edge . target . toNormalform ( true ) ) ) {
i . remove ( ) ;
edge . target . type = HyperlinkType . Dead ;
errorEdges . add ( edge ) ;
}
}
i = outboundEdges . iterator ( ) ;
while ( i . hasNext ( ) ) {
edge = i . next ( ) ;
if ( errorDocs . containsKey ( edge . target . toNormalform ( true ) ) ) {
i . remove ( ) ;
edge . target . type = HyperlinkType . Dead ;
errorEdges . add ( edge ) ;
}
}
// we put all edges together in a specific order which is used to create nodes in a svg display:
// notes that appear first are possible painted over by nodes coming later.
// less important nodes shall appear therefore first
this . edges . addAll ( outboundEdges ) ;
this . edges . addAll ( inboundEdges ) ;
this . edges . addAll ( errorEdges ) ;
}
public void path ( final Segment segment , DigestURL from , DigestURL to , final int maxtime , final int maxnodes ) {
// two steps to find the graph: (1) create a HyperlinkGraph (to-down) and (2) backtrack backlinks up to an element of the graph (bottom-up)
if ( this . edges . size ( ) = = 0 ) {
fill ( segment . fulltext ( ) . getDefaultConnector ( ) , from = = null ? to . getHost ( ) : from . getHost ( ) , to , maxtime , maxnodes ) ;
}
if ( getDepth ( to ) > = 0 & & ( from = = null | | getDepth ( from ) > = 0 ) ) return ; // nothing to do.
// now find the link bottom-up
}
public int findLinkDepth ( ) {
int remaining = this . edges . size ( ) ;
// first find root nodes
Set < MultiProtocolURL > nodes = new HashSet < MultiProtocolURL > ( ) ;
Set < MultiProtocolURL > nextnodes = new HashSet < MultiProtocolURL > ( ) ;
for ( HyperlinkEdge edge : this . edges ) {
String path = edge . source . getPath ( ) ;
if ( ROOTFNS . contains ( path ) ) {
this . edges . updateDepth ( edge . source , 0 ) ;
if ( edge . target . type = = HyperlinkType . Inbound ) this . edges . updateDepth ( edge . target , 1 ) ;
nodes . add ( edge . source ) ;
nextnodes . add ( edge . target ) ;
remaining - - ;
}
}
if ( nodes . size ( ) = = 0 & & this . edges . size ( ) > 0 ) {
ConcurrentLog . warn ( "HyperlinkGraph" , "could not find a root node for " + hostname + " in " + this . edges . size ( ) + " edges" ) ;
}
// add virtual nodes
for ( String rootpath : ROOTFNS ) {
try {
this . edges . updateDepth ( new DigestURL ( "http://" + hostname + rootpath ) , 0 ) ;
} catch ( MalformedURLException e ) { }
}
// recursively step into depth and find next level
int depth = 1 ;
while ( remaining > 0 ) {
boolean found = false ;
nodes = nextnodes ;
nextnodes = new HashSet < MultiProtocolURL > ( ) ;
for ( HyperlinkEdge edge : this . edges ) {
if ( nodes . contains ( edge . source ) ) {
this . edges . updateDepth ( edge . source , depth ) ;
if ( edge . target . type = = HyperlinkType . Inbound ) this . edges . updateDepth ( edge . target , depth + 1 ) ;
nextnodes . add ( edge . target ) ;
remaining - - ;
found = true ;
}
}
depth + + ;
if ( ! found ) break ; // terminating in case that not all edges are linked together
}
if ( remaining > 0 ) ConcurrentLog . warn ( "HyperlinkGraph" , "could not find all edges for " + hostname + ", " + remaining + " remaining." ) ;
return depth ;
}
public Integer getDepth ( MultiProtocolURL url ) {
return this . edges . getDepth ( url ) ;
}
@Override
public Iterator < HyperlinkEdge > iterator ( ) {
return this . edges . iterator ( ) ;
}
}