@@ -85,8 +85,8 @@ public final class HTTPLoader {
         Latency.updateAfterLoad(entry.url(), System.currentTimeMillis() - start);
         return doc;
     }
 
     /**
      * Open an input stream on a requested HTTP resource. When the resource content size is small
      * (lower than {@link Response#CRAWLER_MAX_SIZE_TO_CACHE}, fully load it and use a ByteArrayInputStream instance.
      * @param request
@@ -98,228 +98,231 @@ public final class HTTPLoader {
      * @return a response with full meta data and embedding on open input stream on content. Don't forget to close the stream.
      * @throws IOException when an error occurred
      */
-    public StreamResponse openInputStream(final Request request, CrawlProfile profile, final int retryCount,
-            final int maxFileSize, final BlacklistType blacklistType, final ClientIdentification.Agent agent)
-            throws IOException {
+    public StreamResponse openInputStream(
+            final Request request, CrawlProfile profile, final int retryCount,
+            final int maxFileSize, final BlacklistType blacklistType, final ClientIdentification.Agent agent
+            ) throws IOException {
         if (retryCount < 0) {
-            this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile,
-                    FailCategory.TEMPORARY_NETWORK_FAILURE, "retry counter exceeded", -1);
-            throw new IOException(
-                    "retry counter exceeded for URL " + request.url().toString() + ". Processing aborted.$");
+            this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "retry counter exceeded", -1);
+            throw new IOException("retry counter exceeded for URL " + request.url().toString() + ". Processing aborted.$");
         }
         DigestURL url = request.url();
         final String host = url.getHost();
         if (host == null || host.length() < 2) {
             throw new IOException("host is not well-formed: '" + host + "'");
         }
         final String path = url.getFile();
         int port = url.getPort();
         final boolean ssl = url.getProtocol().equals("https");
         if (port < 0)
             port = (ssl) ? 443 : 80;
 
         // check if url is in blacklist
         final String hostlow = host.toLowerCase(Locale.ROOT);
         if (blacklistType != null && Switchboard.urlBlacklist.isListed(blacklistType, hostlow, path)) {
             this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_LOAD_CONTEXT,
                     "url in blacklist", -1);
             throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.$");
         }
 
         // resolve yacy and yacyh domains
         final AlternativeDomainNames yacyResolver = this.sb.peers;
         if (yacyResolver != null) {
             final String yAddress = yacyResolver.resolve(host);
             if (yAddress != null) {
                 url = new DigestURL(url.getProtocol() + "://" + yAddress + path);
             }
         }
 
         // create a request header
         final RequestHeader requestHeader = createRequestheader(request, agent);
 
         // HTTP-Client
         final HTTPClient client = new HTTPClient(agent);
         client.setRedirecting(false); // we want to handle redirection
                                       // ourselves, so we don't index pages
                                       // twice
         client.setTimout(this.socketTimeout);
         client.setHeader(requestHeader.entrySet());
 
         // send request
         client.GET(url, false);
         final StatusLine statusline = client.getHttpResponse().getStatusLine();
         final int statusCode = statusline.getStatusCode();
         final ResponseHeader responseHeader = new ResponseHeader(statusCode, client.getHttpResponse().getAllHeaders());
         String requestURLString = request.url().toNormalform(true);
 
         // check redirection
         if (statusCode > 299 && statusCode < 310) {
             client.finish();
-            final DigestURL redirectionUrl = extractRedirectURL(request, profile, url, statusline, responseHeader, requestURLString);
+            final DigestURL redirectionUrl = extractRedirectURL(request, profile, url, statusline,
+                    responseHeader, requestURLString);
 
             if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true)) {
                 // we have two use cases here: loading from a crawl or just
                 // loading the url. Check this:
                 if (profile != null && !CrawlSwitchboard.DEFAULT_PROFILES.contains(profile.name())) {
                     // put redirect url on the crawler queue to repeat a
                     // double-check
                     /* We have to clone the request instance and not to modify directly its URL,
                      * otherwise the stackCrawl() function would reject it, because detecting it as already in the activeWorkerEntries */
                     Request redirectedRequest = new Request(request.initiator(),
                             redirectionUrl,
                             request.referrerhash(),
                             request.name(),
                             request.appdate(),
                             request.profileHandle(),
                             request.depth(),
                             request.timezoneOffset());
                     String rejectReason = this.sb.crawlStacker.stackCrawl(redirectedRequest);
                     if (rejectReason != null) {
                         throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " aborted. Reason : " + rejectReason);
                     }
                     // in the end we must throw an exception (even if this is
                     // not an error, just to abort the current process
                     throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " to "
                             + redirectionUrl.toNormalform(false) + " placed on crawler queue for double-check");
                 }
 
                 // if we are already doing a shutdown we don't need to retry
                 // crawling
                 if (Thread.currentThread().isInterrupted()) {
                     this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile,
                             FailCategory.FINAL_LOAD_CONTEXT, "server shutdown", statusCode);
                     throw new IOException(
                             "CRAWLER Redirect of URL=" + requestURLString + " aborted because of server shutdown.$");
                 }
+                // check if the redirected URL is the same as the requested URL
+                // this shortcuts a time-out using retryCount
+                if (redirectionUrl.equals(url)) {
+                    this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "redirect to same url", -1);
+                    throw new IOException("retry counter exceeded for URL " + request.url().toString() + ". Processing aborted.$");
+                }
+
                 // retry crawling with new url
                 request.redirectURL(redirectionUrl);
                 return openInputStream(request, profile, retryCount - 1, maxFileSize, blacklistType, agent);
             }
             // we don't want to follow redirects
-            this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile,
-                    FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode);
-            throw new IOException("REJECTED UNWANTED REDIRECTION '" + statusline
-                    + "' for URL '" + requestURLString + "'$");
+            this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode);
+            throw new IOException("REJECTED UNWANTED REDIRECTION '" + statusline + "' for URL '" + requestURLString + "'$");
         } else if (statusCode == HttpStatus.SC_OK || statusCode == HttpStatus.SC_NON_AUTHORITATIVE_INFORMATION) {
             // the transfer is ok
             /*
              * When content is not large (less than Response.CRAWLER_MAX_SIZE_TO_CACHE), we have better cache it if cache is enabled and url is not local
              */
             long contentLength = client.getHttpResponse().getEntity().getContentLength();
             InputStream contentStream;
             if (profile != null && profile.storeHTCache() && contentLength > 0 && contentLength < (Response.CRAWLER_MAX_SIZE_TO_CACHE) && !url.isLocal()) {
                 byte[] content = null;
                 try {
                     content = HTTPClient.getByteArray(client.getHttpResponse().getEntity(), maxFileSize);
                     Cache.store(url, responseHeader, content);
                 } catch (final IOException e) {
                     this.log.warn("cannot write " + url + " to Cache (3): " + e.getMessage(), e);
                 } finally {
                     client.finish();
                 }
 
                 contentStream = new ByteArrayInputStream(content);
             } else {
                 /*
                  * Content length may already be known now : check it before opening a stream
                  */
                 if (maxFileSize >= 0 && contentLength > maxFileSize) {
                     throw new IOException("Content to download exceed maximum value of " + maxFileSize + " bytes");
                 }
                 /*
                  * Create a HTTPInputStream delegating to
                  * client.getContentstream(). Close method will ensure client is
                  * properly closed.
                  */
                 contentStream = new HTTPInputStream(client);
                 /* Anticipated content length may not be already known or incorrect : let's apply now the same eventual content size restriction as when loading in a byte array */
                 if (maxFileSize >= 0) {
                     contentStream = new StrictLimitInputStream(contentStream, maxFileSize,
                             "Content to download exceed maximum value of " + Formatter.bytesToString(maxFileSize));
                 }
             }
 
             return new StreamResponse(new Response(request, requestHeader, responseHeader, profile, false, null), contentStream);
         } else {
             client.finish();
             // if the response has not the right response type then reject file
             this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile,
                     FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", statusCode);
             throw new IOException("REJECTED WRONG STATUS TYPE '" + statusline
                     + "' for URL '" + requestURLString + "'$");
         }
     }
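
Note, not part of the patch: the block added above makes a response that redirects to its own URL fail fast, instead of recursing through openInputStream() until retryCount is exhausted at one network round-trip per attempt. Separately, the javadoc's warning about closing the stream matters because the HTTPInputStream branch keeps the underlying client connection open until close(). A minimal caller sketch; getContentStream() is assumed to be the StreamResponse accessor for the wrapped InputStream, and every name outside this diff is illustrative:

    // Hedged sketch of the caller contract; "loader", "request" and "profile"
    // stand for real instances. getContentStream() is the assumed accessor
    // for the InputStream wrapped by StreamResponse.
    final StreamResponse stream = loader.openInputStream(request, profile,
            2,                // retryCount: tolerate at most two redirect hops
            10 * 1024 * 1024, // maxFileSize: 10 MiB cap
            BlacklistType.CRAWLER, ClientIdentification.yacyInternetCrawlerAgent);
    try (final InputStream body = stream.getContentStream()) {
        // consume body here; closing it releases the HTTP connection when the
        // stream is an HTTPInputStream rather than a ByteArrayInputStream
    }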
 
     /**
      * Extract redirect URL from response header. Status code is supposed to be between 299 and 310. Parameters must not be null.
      * @return redirect URL
      * @throws IOException when an error occured
      */
     private DigestURL extractRedirectURL(final Request request, CrawlProfile profile, DigestURL url,
             final StatusLine statusline, final ResponseHeader responseHeader, String requestURLString)
             throws IOException {
         // read redirection URL
         String redirectionUrlString = responseHeader.get(HeaderFramework.LOCATION);
         redirectionUrlString = redirectionUrlString == null ? "" : redirectionUrlString.trim();
 
         if (redirectionUrlString.isEmpty()) {
             this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile,
                     FailCategory.TEMPORARY_NETWORK_FAILURE,
                     "no redirection url provided, field '" + HeaderFramework.LOCATION + "' is empty", statusline.getStatusCode());
             throw new IOException("REJECTED EMTPY REDIRECTION '" + statusline
                     + "' for URL '" + requestURLString + "'$");
         }
 
         // normalize URL
         final DigestURL redirectionUrl = DigestURL.newURL(request.url(), redirectionUrlString);
 
         // restart crawling with new url
         this.log.info("CRAWLER Redirection detected ('" + statusline + "') for URL "
                 + requestURLString);
         this.log.info("CRAWLER ..Redirecting request to: " + redirectionUrl.toNormalform(false));
 
         this.sb.webStructure.generateCitationReference(url, redirectionUrl);
 
         if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_RECORD_REDIRECTS, true)) {
             this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile,
                     FailCategory.FINAL_REDIRECT_RULE, "redirect to " + redirectionUrlString, statusline.getStatusCode());
         }
 
         return redirectionUrl;
     }
 
     /**
      * Create request header for loading content.
      * @param request search request
      * @param agent agent identification information
      * @return a request header
      * @throws IOException when an error occured
      */
     private RequestHeader createRequestheader(final Request request, final ClientIdentification.Agent agent)
             throws IOException {
         final RequestHeader requestHeader = new RequestHeader();
         requestHeader.put(HeaderFramework.USER_AGENT, agent.userAgent);
         if (request.referrerhash() != null) {
             DigestURL refererURL = this.sb.getURL(request.referrerhash());
             if (refererURL != null) {
                 requestHeader.put(RequestHeader.REFERER, refererURL.toNormalform(true));
             }
         }
         requestHeader.put(HeaderFramework.ACCEPT, this.sb.getConfig("crawler.http.accept", DEFAULT_ACCEPT));
         requestHeader.put(HeaderFramework.ACCEPT_LANGUAGE,
                 this.sb.getConfig("crawler.http.acceptLanguage", DEFAULT_LANGUAGE));
         requestHeader.put(HeaderFramework.ACCEPT_CHARSET,
                 this.sb.getConfig("crawler.http.acceptCharset", DEFAULT_CHARSET));
         requestHeader.put(HeaderFramework.ACCEPT_ENCODING,
                 this.sb.getConfig("crawler.http.acceptEncoding", DEFAULT_ENCODING));
         return requestHeader;
     }
 
     private Response load(final Request request, CrawlProfile profile, final int retryCount, final int maxFileSize, final BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
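
Note, not part of the patch: createRequestheader() above resolves its Accept* headers from four crawler.http.* configuration keys, falling back to the DEFAULT_* constants defined elsewhere in this class. A hedged sketch of pinning them at runtime, assuming the setConfig(String, String) setter that serverSwitch-derived classes like Switchboard expose; the values shown are examples, not the shipped defaults:

    // Example values only; the keys are taken from createRequestheader() above.
    sb.setConfig("crawler.http.acceptLanguage", "en-US,en;q=0.8");
    sb.setConfig("crawler.http.acceptEncoding", "gzip, deflate");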
@@ -347,10 +350,10 @@ public final class HTTPLoader {
         // resolve yacy and yacyh domains
         final AlternativeDomainNames yacyResolver = this.sb.peers;
         if (yacyResolver != null) {
             final String yAddress = yacyResolver.resolve(host);
             if (yAddress != null) {
                 url = new DigestURL(url.getProtocol() + "://" + yAddress + path);
             }
         }
 
         // take a file from the net
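
Note, not part of the patch: this resolver block is duplicated verbatim in openInputStream() further up. A hedged refactoring sketch that isolates the rewrite, built only from calls already used in this diff; the helper name is hypothetical:

    // Hypothetical helper, not in the patch: rewrite *.yacy/*.yacyh hosts to a
    // peer address; URLs with a regular DNS name pass through unchanged.
    private static DigestURL resolvePeerHost(final AlternativeDomainNames resolver,
            final DigestURL url) throws MalformedURLException {
        final String yAddress = (resolver == null) ? null : resolver.resolve(url.getHost());
        if (yAddress == null) return url;
        return new DigestURL(url.getProtocol() + "://" + yAddress + url.getFile());
    }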
@@ -366,41 +369,39 @@ public final class HTTPLoader {
         client.setHeader(requestHeader.entrySet());
 
         // send request
         final byte[] responseBody = client.GETbytes(url, sb.getConfig(SwitchboardConstants.ADMIN_ACCOUNT_USER_NAME, "admin"), sb.getConfig(SwitchboardConstants.ADMIN_ACCOUNT_B64MD5, ""), maxFileSize, false);
         final int statusCode = client.getHttpResponse().getStatusLine().getStatusCode();
         final ResponseHeader responseHeader = new ResponseHeader(statusCode, client.getHttpResponse().getAllHeaders());
         String requestURLString = request.url().toNormalform(true);
 
         // check redirection
         if (statusCode > 299 && statusCode < 310) {
             final DigestURL redirectionUrl = extractRedirectURL(request, profile, url, client.getHttpResponse().getStatusLine(),
                     responseHeader, requestURLString);
 
             if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true)) {
                 // we have two use cases here: loading from a crawl or just loading the url. Check this:
                 if (profile != null && !CrawlSwitchboard.DEFAULT_PROFILES.contains(profile.name())) {
                     // put redirect url on the crawler queue to repeat a double-check
                     /* We have to clone the request instance and not to modify directly its URL,
                      * otherwise the stackCrawl() function would reject it, because detecting it as already in the activeWorkerEntries */
                     Request redirectedRequest = new Request(request.initiator(),
                             redirectionUrl,
                             request.referrerhash(),
                             request.name(),
                             request.appdate(),
                             request.profileHandle(),
                             request.depth(),
                             request.timezoneOffset());
                     String rejectReason = this.sb.crawlStacker.stackCrawl(redirectedRequest);
                     // in the end we must throw an exception (even if this is not an error, just to abort the current process
                     if (rejectReason != null) {
                         throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " aborted. Reason : " + rejectReason);
                     }
                     throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " to " + redirectionUrl.toNormalform(false) + " placed on crawler queue for double-check");
                 }
 
                 // if we are already doing a shutdown we don't need to retry crawling
                 if (Thread.currentThread().isInterrupted()) {
                     this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_LOAD_CONTEXT, "server shutdown", statusCode);
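
Note, not part of the patch: the Request clone above exists because stackCrawl() rejects a request whose URL is already among the activeWorkerEntries, so mutating the original request's URL would defeat the double-check. The same eight-argument clone appears twice in this file; a hedged consolidation sketch using exactly the constructor arguments from the diff, with a hypothetical helper name:

    // Hypothetical helper, not in the patch: clone a Request onto the redirect
    // target so crawlStacker.stackCrawl() treats it as a fresh entry.
    private static Request cloneForRedirect(final Request request, final DigestURL target) {
        return new Request(request.initiator(), target, request.referrerhash(),
                request.name(), request.appdate(), request.profileHandle(),
                request.depth(), request.timezoneOffset());
    }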
@@ -410,15 +411,15 @@ public final class HTTPLoader {
 
                 // retry crawling with new url
                 request.redirectURL(redirectionUrl);
                 return load(request, profile, retryCount - 1, maxFileSize, blacklistType, agent);
             }
             // we don't want to follow redirects
             this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode);
             throw new IOException("REJECTED UNWANTED REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL '" + requestURLString + "'$");
         } else if (responseBody == null) {
             // no response, reject file
             this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "no response body", statusCode);
             throw new IOException("REJECTED EMPTY RESPONSE BODY '" + client.getHttpResponse().getStatusLine() + "' for URL '" + requestURLString + "'$");
         } else if (statusCode == 200 || statusCode == 203) {
             // the transfer is ok
             // we write the new cache entry to file system directly
@@ -427,8 +428,8 @@ public final class HTTPLoader {
 
             // check length again in case it was not possible to get the length before loading
             if (maxFileSize >= 0 && contentLength > maxFileSize) {
                 this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "file size limit exceeded", statusCode);
                 throw new IOException("REJECTED URL " + request.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes. (GET)$");
             }
 
             // create a new cache entry
@@ -442,9 +443,9 @@ public final class HTTPLoader {
             );
 
             return response;
         } else {
             // if the response has not the right response type then reject file
             this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", statusCode);
             throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL '" + requestURLString + "'$");
         }
     }
@@ -485,17 +486,17 @@ public final class HTTPLoader {
         final HTTPClient client = new HTTPClient(agent);
         client.setTimout(20000);
         client.setHeader(requestHeader.entrySet());
         final byte[] responseBody = client.GETbytes(request.url(), null, null, false);
         final int code = client.getHttpResponse().getStatusLine().getStatusCode();
         final ResponseHeader header = new ResponseHeader(code, client.getHttpResponse().getAllHeaders());
 
         // FIXME: 30*-handling (bottom) is never reached
         // we always get the final content because httpClient.followRedirects = true
 
         if (responseBody != null && (code == 200 || code == 203)) {
             // the transfer is ok
 
             //statistics:
             ByteCount.addAccountCount(ByteCount.CRAWLER, responseBody.length);
 
             // we write the new cache entry to file system directly
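
Note, not part of the patch: the FIXME above means this method never sees 30x responses because the HTTP client follows redirects itself, leaving the redirect branch below dead. The fix it hints at would mirror openInputStream() earlier in this diff; a hedged sketch, using only setters that already appear above:

    // Sketch of the fix the FIXME suggests: disable automatic redirect handling
    // so the code > 299 branch below becomes reachable, as in openInputStream().
    final HTTPClient client = new HTTPClient(agent);
    client.setRedirecting(false); // handle 30x manually instead of transparently
    client.setTimout(20000);
    client.setHeader(requestHeader.entrySet());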
@@ -513,7 +514,7 @@ public final class HTTPLoader {
         } else if (code > 299 && code < 310) {
             if (header.containsKey(HeaderFramework.LOCATION)) {
                 // getting redirection URL
                 String redirectionUrlString = header.get(HeaderFramework.LOCATION);
                 redirectionUrlString = redirectionUrlString.trim();
 
                 if (redirectionUrlString.isEmpty()) {
@@ -535,7 +536,7 @@ public final class HTTPLoader {
             }
         } else {
             // if the response has not the right response type then reject file
             throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString());
         }
         return response;
     }