@@ -1570,14 +1570,31 @@ public final class Switchboard extends serverSwitch {
         return false;
     }
 
+    /**
+     * tests if a hash occurs in any database.
+     * @param hash
+     * @return if it exists, the name of the database is returned; if it does not exist, null is returned
+     */
+    @Deprecated
     public HarvestProcess urlExists(final String hash) {
-        // tests if hash occurrs in any database
-        // if it exists, the name of the database is returned,
-        // if it not exists, null is returned
         if (this.index.exists(hash)) return HarvestProcess.LOADED;
         return this.crawlQueues.urlExists(ASCII.getBytes(hash));
     }
 
+    /**
+     * tests if hashes occur in any database.
+     * @param ids a collection of url hashes
+     * @return a map from each hash id to the name of the database where it exists, otherwise to null
+     */
+    public Map<String, HarvestProcess> urlExists(final Collection<String> ids) {
+        Set<String> e = this.index.exists(ids);
+        Map<String, HarvestProcess> m = new HashMap<String, HarvestProcess>();
+        for (String id: ids) {
+            m.put(id, e.contains(id) ? HarvestProcess.LOADED : this.crawlQueues.urlExists(ASCII.getBytes(id)));
+        }
+        return m;
+    }
+
     public void urlRemove(final Segment segment, final byte[] hash) {
         segment.fulltext().remove(hash);
         ResultURLs.remove(ASCII.String(hash));
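Note: the new collection-based urlExists resolves a whole batch of url hashes against the fulltext index with a single exists() round trip; only ids missing from the index fall through to the per-id crawl-queue lookup. A minimal caller sketch, assuming sb is the running Switchboard instance and the hash strings are hypothetical placeholders:

    Collection<String> ids = Arrays.asList("hash-aaaa", "hash-bbbb"); // hypothetical hashes
    Map<String, HarvestProcess> states = sb.urlExists(ids);
    for (Map.Entry<String, HarvestProcess> entry : states.entrySet()) {
        if (entry.getValue() == null) {
            // unknown in both index and crawl queues: a candidate for loading
        } else if (entry.getValue() == HarvestProcess.LOADED) {
            // already in the fulltext index
        } // any other value: still waiting in a crawl queue
    }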
@@ -2768,36 +2785,25 @@ public final class Switchboard extends serverSwitch {
             final String heuristicName,
             final String[] collections) {
+        List<DigestURI> urls = new ArrayList<DigestURI>();
         // add the landing page to the index. should not load that again since it should be in the cache
         if (url != null) {
-            try {
-                addToIndex(url, searchEvent, heuristicName, collections);
-            } catch (final IOException e) {
-            } catch (final Parser.Failure e) {
-            }
+            urls.add(url);
         }
 
         // check if some of the links match with the query
         final Map<DigestURI, String> matcher = searchEvent.query.separateMatches(links);
 
         // take the matcher and load them all
         for (final Map.Entry<DigestURI, String> entry: matcher.entrySet()) {
-            try {
-                addToIndex(new DigestURI(entry.getKey(), (byte[]) null), searchEvent, heuristicName, collections);
-            } catch (final IOException e) {
-            } catch (final Parser.Failure e) {
-            }
+            urls.add(new DigestURI(entry.getKey(), (byte[]) null));
         }
 
         // take then the no-matcher and load them also
         for (final Map.Entry<DigestURI, String> entry: links.entrySet()) {
-            try {
-                addToIndex(new DigestURI(entry.getKey(), (byte[]) null), searchEvent, heuristicName, collections);
-            } catch (final IOException e) {
-            } catch (final Parser.Failure e) {
-            }
+            urls.add(new DigestURI(entry.getKey(), (byte[]) null));
         }
+        addToIndex(urls, searchEvent, heuristicName, collections);
     }
 
     public void remove(final Collection<String> deleteIDs) {
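Note: the per-URL try/catch blocks disappear here because the collection-based addToIndex (see the hunk at line 2953 below) no longer declares IOException or Parser.Failure; load and parse failures are logged from its background thread instead. The calling convention reduces to collect-then-submit, sketched here with hypothetical values:

    List<DigestURI> batch = new ArrayList<DigestURI>();
    if (landingPage != null) batch.add(landingPage); // hypothetical landing page
    batch.addAll(matchedLinks);                      // hypothetical link collection
    sb.addToIndex(batch, searchEvent, heuristicName, collections);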
@@ -2837,6 +2843,7 @@ public final class Switchboard extends serverSwitch {
      * @param url
      * @return null if this was ok. If this failed, return a string with a fail reason
      */
+    @SuppressWarnings("deprecation")
     public String stackUrl(CrawlProfile profile, DigestURI url) {
 
         byte[] handle = ASCII.getBytes(profile.handle());
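Note: the @SuppressWarnings("deprecation") is presumably required because the body of stackUrl still calls the single-hash urlExists(String), which this commit marks @Deprecated in favour of the collection variant.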
@@ -2946,73 +2953,72 @@ public final class Switchboard extends serverSwitch {
      * @throws IOException
      * @throws Parser.Failure
      */
-    public void addToIndex(final DigestURI url, final SearchEvent searchEvent, final String heuristicName, final String[] collections)
-        throws IOException,
-        Parser.Failure {
+    public void addToIndex(final Collection<DigestURI> urls, final SearchEvent searchEvent, final String heuristicName, final String[] collections) {
+        Map<String, DigestURI> urlmap = new HashMap<String, DigestURI>();
+        for (DigestURI url: urls) urlmap.put(ASCII.String(url.hash()), url);
         if (searchEvent != null) {
-            searchEvent.addHeuristic(url.hash(), heuristicName, true);
+            for (String id: urlmap.keySet()) searchEvent.addHeuristic(ASCII.getBytes(id), heuristicName, true);
         }
-        if (this.index.exists(ASCII.String(url.hash()))) {
-            return; // don't do double-work
-        }
-        final Request request = this.loader.request(url, true, true);
-        final CrawlProfile profile = this.crawler.getActive(ASCII.getBytes(request.profileHandle()));
-        final String acceptedError = this.crawlStacker.checkAcceptance(url, profile, 0);
-        final String urls = url.toNormalform(true);
-        if (acceptedError != null) {
-            this.log.logWarning("addToIndex: cannot load "
-                + urls
-                + ": "
-                + acceptedError);
-            return;
-        }
+        final Set<String> existing = this.index.exists(urlmap.keySet());
+        final List<Request> requests = new ArrayList<Request>();
+        for (Map.Entry<String, DigestURI> e: urlmap.entrySet()) {
+            final String urlName = e.getValue().toNormalform(true);
+            if (existing.contains(e.getKey())) {
+                this.log.logInfo("addToIndex: double " + urlName);
+                continue;
+            }
+            final Request request = this.loader.request(e.getValue(), true, true);
+            final CrawlProfile profile = this.crawler.getActive(ASCII.getBytes(request.profileHandle()));
+            final String acceptedError = this.crawlStacker.checkAcceptance(e.getValue(), profile, 0);
+            if (acceptedError != null) {
+                this.log.logWarning("addToIndex: cannot load " + urlName + ": " + acceptedError);
+                continue;
+            }
+            requests.add(request);
+        }
+
         new Thread() {
             @Override
             public void run() {
-                Thread.currentThread().setName("Switchboard.addToIndex:" + urls);
-                try {
-                    final Response response =
-                        Switchboard.this.loader.load(request, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay);
-                    if (response == null) {
-                        throw new IOException("response == null");
-                    }
-                    if (response.getContent() == null) {
-                        throw new IOException("content == null");
-                    }
-                    if (response.getResponseHeader() == null) {
-                        throw new IOException("header == null");
-                    }
-                    final Document[] documents = response.parse();
-                    if (documents != null) {
-                        for (final Document document: documents) {
-                            if (document.indexingDenied()) {
-                                throw new Parser.Failure("indexing is denied", url);
-                            }
-                            final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib, LibraryProvider.synonyms, true);
-                            ResultImages.registerImages(url, document, true);
-                            Switchboard.this.webStructure.generateCitationReference(url, document);
-                            storeDocumentIndex(
-                                response,
-                                collections,
-                                document,
-                                condenser,
-                                searchEvent,
-                                "heuristic:" + heuristicName);
-                            Switchboard.this.log.logInfo("addToIndex fill of url "
-                                + url.toNormalform(true)
-                                + " finished");
-                        }
-                    }
-                } catch (final IOException e) {
-                    Switchboard.this.log.logWarning("addToIndex: failed loading "
-                        + url.toNormalform(true)
-                        + ": "
-                        + e.getMessage());
-                } catch (final Parser.Failure e) {
-                    Switchboard.this.log.logWarning("addToIndex: failed parsing "
-                        + url.toNormalform(true)
-                        + ": "
-                        + e.getMessage());
+                for (Request request: requests) {
+                    DigestURI url = request.url();
+                    String urlName = url.toNormalform(true);
+                    Thread.currentThread().setName("Switchboard.addToIndex:" + urlName);
+                    try {
+                        final Response response = Switchboard.this.loader.load(request, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay);
+                        if (response == null) {
+                            throw new IOException("response == null");
+                        }
+                        if (response.getContent() == null) {
+                            throw new IOException("content == null");
+                        }
+                        if (response.getResponseHeader() == null) {
+                            throw new IOException("header == null");
+                        }
+                        final Document[] documents = response.parse();
+                        if (documents != null) {
+                            for (final Document document: documents) {
+                                if (document.indexingDenied()) {
+                                    throw new Parser.Failure("indexing is denied", url);
+                                }
+                                final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib, LibraryProvider.synonyms, true);
+                                ResultImages.registerImages(url, document, true);
+                                Switchboard.this.webStructure.generateCitationReference(url, document);
+                                storeDocumentIndex(
+                                    response,
+                                    collections,
+                                    document,
+                                    condenser,
+                                    searchEvent,
+                                    "heuristic:" + heuristicName);
+                                Switchboard.this.log.logInfo("addToIndex fill of url " + urlName + " finished");
+                            }
+                        }
+                    } catch (final IOException e) {
+                        Switchboard.this.log.logWarning("addToIndex: failed loading " + urlName + ": " + e.getMessage());
+                    } catch (final Parser.Failure e) {
+                        Switchboard.this.log.logWarning("addToIndex: failed parsing " + urlName + ": " + e.getMessage());
+                    }
                 }
             }
         }.start();
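Note: this hunk changes the concurrency shape. The old method started a thread per call (hence per URL); the new one checks all ids against the index with one exists() query, filters out duplicates and unacceptable URLs up front, and then starts a single worker thread that loads, parses and indexes the surviving requests one after another. Stripped to its skeleton, under the assumption that validation has already produced the requests list:

    final List<Request> requests = /* accepted requests, one per previously unknown URL */;
    new Thread() {
        @Override
        public void run() {
            for (final Request request : requests) {
                try {
                    // load via Switchboard.this.loader, then parse, condense and store
                } catch (final Exception e) {
                    // log the failure and continue with the next request
                }
            }
        }
    }.start();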
@@ -3026,33 +3032,30 @@ public final class Switchboard extends serverSwitch {
      * @param url the url that shall be indexed
      * @param asglobal true adds the url to global crawl queue (for remote crawling), false to the local crawler
      */
-    public void addToCrawler(final DigestURI url, final boolean asglobal) {
-
-        if (this.index.exists(ASCII.String(url.hash()))) {
-            return; // don't do double-work
-        }
-        final Request request = this.loader.request(url, true, true);
-        final CrawlProfile profile = this.crawler.getActive(ASCII.getBytes(request.profileHandle()));
-        final String acceptedError = this.crawlStacker.checkAcceptance(url, profile, 0);
-        if (acceptedError != null) {
-            this.log.logInfo("addToCrawler: cannot load "
-                + url.toNormalform(true)
-                + ": "
-                + acceptedError);
-            return;
-        }
-        final String s;
-        if (asglobal) {
-            s = this.crawlQueues.noticeURL.push(StackType.GLOBAL, request, this.robots);
-        } else {
-            s = this.crawlQueues.noticeURL.push(StackType.LOCAL, request, this.robots);
-        }
-        if (s != null) {
-            Switchboard.this.log.logInfo("addToCrawler: failed to add "
-                + url.toNormalform(true)
-                + ": "
-                + s);
+    public void addToCrawler(final Collection<DigestURI> urls, final boolean asglobal) {
+        Map<String, DigestURI> urlmap = new HashMap<String, DigestURI>();
+        for (DigestURI url: urls) urlmap.put(ASCII.String(url.hash()), url);
+        Set<String> existingids = this.index.exists(urlmap.keySet());
+        for (Map.Entry<String, DigestURI> e: urlmap.entrySet()) {
+            if (existingids.contains(e.getKey())) continue; // double
+            DigestURI url = e.getValue();
+            final Request request = this.loader.request(url, true, true);
+            final CrawlProfile profile = this.crawler.getActive(ASCII.getBytes(request.profileHandle()));
+            final String acceptedError = this.crawlStacker.checkAcceptance(url, profile, 0);
+            if (acceptedError != null) {
+                this.log.logInfo("addToCrawler: cannot load " + url.toNormalform(true) + ": " + acceptedError);
+                return;
+            }
+            final String s;
+            if (asglobal) {
+                s = this.crawlQueues.noticeURL.push(StackType.GLOBAL, request, this.robots);
+            } else {
+                s = this.crawlQueues.noticeURL.push(StackType.LOCAL, request, this.robots);
+            }
+            if (s != null) {
+                Switchboard.this.log.logInfo("addToCrawler: failed to add " + url.toNormalform(true) + ": " + s);
+            }
         }
     }
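Note: addToCrawler gets the same batching treatment: one exists() lookup for the whole map of hashes, known ids are skipped, and the rest are pushed to the local or global notice queue. One subtlety carried over from the single-URL version: a URL rejected by checkAcceptance still triggers return rather than continue, so it appears to abort processing of the remaining batch. A hypothetical caller:

    Collection<DigestURI> externalLinks = new ArrayList<DigestURI>();
    // ... fill with links that point to other hosts ...
    sb.addToCrawler(externalLinks, /* asglobal */ false);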
@@ -3413,14 +3416,16 @@ public final class Switchboard extends serverSwitch {
                     if (links.size() < 1000) { // limit to 1000 to skip large index pages
                         final Iterator<DigestURI> i = links.keySet().iterator();
                         final boolean globalcrawljob = Switchboard.this.getConfigBool("heuristic.searchresults.crawlglobal", false);
+                        Collection<DigestURI> urls = new ArrayList<DigestURI>();
                         while (i.hasNext()) {
                             url = i.next();
                             boolean islocal = url.getHost().contentEquals(startUrl.getHost());
                             // add all external links or links to different page to crawler
                             if (!islocal) { // || (!startUrl.getPath().endsWith(url.getPath()))) {
-                                addToCrawler(url, globalcrawljob);
+                                urls.add(url);
                             }
                         }
+                        addToCrawler(urls, globalcrawljob);
                     }
                 }
             } catch (final Throwable e) {
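Note: with this last hunk the searchresults heuristic follows the same accumulate-then-submit pattern: external links are gathered while iterating and handed to addToCrawler in one batch, so the duplicate check and the queue pushes run once per result page rather than once per link.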