@@ -98,14 +98,13 @@ public final class HTTPLoader {
 * @return a response with full meta data and embedding an open input stream on content. Don't forget to close the stream.
 * @throws IOException when an error occurred
 */
public StreamResponse openInputStream ( final Request request , CrawlProfile profile , final int retryCount ,
final int maxFileSize , final BlacklistType blacklistType , final ClientIdentification . Agent agent )
throws IOException {
public StreamResponse openInputStream (
final Request request , CrawlProfile profile , final int retryCount ,
final int maxFileSize , final BlacklistType blacklistType , final ClientIdentification . Agent agent
) throws IOException {
if ( retryCount < 0 ) {
this . sb . crawlQueues . errorURL . push ( request . url ( ) , request . depth ( ) , profile ,
FailCategory . TEMPORARY_NETWORK_FAILURE , "retry counter exceeded" , - 1 ) ;
throw new IOException (
"retry counter exceeded for URL " + request . url ( ) . toString ( ) + ". Processing aborted.$" ) ;
this . sb . crawlQueues . errorURL . push ( request . url ( ) , request . depth ( ) , profile , FailCategory . TEMPORARY_NETWORK_FAILURE , "retry counter exceeded" , - 1 ) ;
throw new IOException ( "retry counter exceeded for URL " + request . url ( ) . toString ( ) + ". Processing aborted.$" ) ;
}
DigestURL url = request . url ( ) ;
@@ -158,8 +157,7 @@ public final class HTTPLoader {
if ( statusCode > 299 & & statusCode < 310 ) {
client . finish ( ) ;
final DigestURL redirectionUrl = extractRedirectURL ( request , profile , url , statusline ,
responseHeader , requestURLString ) ;
final DigestURL redirectionUrl = extractRedirectURL ( request , profile , url , statusline , responseHeader , requestURLString ) ;
if ( this . sb . getConfigBool ( SwitchboardConstants . CRAWLER_FOLLOW_REDIRECTS , true ) ) {
// we have two use cases here: loading from a crawl or just
@@ -196,15 +194,20 @@ public final class HTTPLoader {
"CRAWLER Redirect of URL=" + requestURLString + " aborted because of server shutdown.$" ) ;
}
// check if the redirected URL is the same as the requested URL
// this shortcuts a time-out using retryCount
if ( redirectionUrl . equals ( url ) ) {
this . sb . crawlQueues . errorURL . push ( request . url ( ) , request . depth ( ) , profile , FailCategory . TEMPORARY_NETWORK_FAILURE , "redirect to same url" , - 1 ) ;
throw new IOException ( "retry counter exceeded for URL " + request . url ( ) . toString ( ) + ". Processing aborted.$" ) ;
}
// retry crawling with new url
request . redirectURL ( redirectionUrl ) ;
return openInputStream ( request , profile , retryCount - 1 , maxFileSize , blacklistType , agent ) ;
}
// we don't want to follow redirects
this . sb . crawlQueues . errorURL . push ( request . url ( ) , request . depth ( ) , profile ,
FailCategory . FINAL_PROCESS_CONTEXT , "redirection not wanted" , statusCode ) ;
throw new IOException ( "REJECTED UNWANTED REDIRECTION '" + statusline
+ "' for URL '" + requestURLString + "'$" ) ;
this . sb . crawlQueues . errorURL . push ( request . url ( ) , request . depth ( ) , profile , FailCategory . FINAL_PROCESS_CONTEXT , "redirection not wanted" , statusCode ) ;
throw new IOException ( "REJECTED UNWANTED REDIRECTION '" + statusline + "' for URL '" + requestURLString + "'$" ) ;
} else if ( statusCode = = HttpStatus . SC_OK | | statusCode = = HttpStatus . SC_NON_AUTHORITATIVE_INFORMATION ) {
// the transfer is ok
@@ -397,8 +400,6 @@ public final class HTTPLoader {
throw new IOException ( "CRAWLER Redirect of URL=" + requestURLString + " aborted. Reason : " + rejectReason ) ;
}
throw new IOException ( "CRAWLER Redirect of URL=" + requestURLString + " to " + redirectionUrl . toNormalform ( false ) + " placed on crawler queue for double-check" ) ;
}
// if we are already doing a shutdown we don't need to retry crawling