@@ -1,4 +1,4 @@
 // HTTPLoader.java
 // ---------------
 // (C) by Michael Peter Christen; mc@yacy.net
 // first published on http://yacy.net
@@ -39,7 +39,6 @@ import net.yacy.kelondro.logging.Log;
 import net.yacy.repository.Blacklist;
 import net.yacy.search.Switchboard;
 import net.yacy.search.index.Segments;
 import de.anomic.crawler.CrawlProfile;
 import de.anomic.crawler.Latency;
 import de.anomic.crawler.ZURL.FailCategory;
@@ -51,80 +50,82 @@ public final class HTTPLoader {
     private static final String DEFAULT_ENCODING = "gzip,deflate";
     private static final String DEFAULT_LANGUAGE = "en-us,en;q=0.5";
     private static final String DEFAULT_CHARSET = "ISO-8859-1,utf-8;q=0.7,*;q=0.7";
+    private static final String DEFAULT_ACCEPT = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
     public static final int DEFAULT_MAXFILESIZE = 1024 * 1024 * 10;
     public static final int DEFAULT_CRAWLING_RETRY_COUNT = 5;
     /**
      * The socket timeout that should be used
      */
     private final int socketTimeout;
     private final Switchboard sb;
     private final Log log;
     public HTTPLoader(final Switchboard sb, final Log theLog) {
         this.sb = sb;
         this.log = theLog;
         // refreshing timeout value
         this.socketTimeout = (int) sb.getConfigLong("crawler.clientTimeout", 10000);
     }
     public Response load(final Request entry, final int maxFileSize, final boolean checkBlacklist) throws IOException {
-        long start = System.currentTimeMillis();
-        Response doc = load(entry, DEFAULT_CRAWLING_RETRY_COUNT, maxFileSize, checkBlacklist);
+        final long start = System.currentTimeMillis();
+        final Response doc = load(entry, DEFAULT_CRAWLING_RETRY_COUNT, maxFileSize, checkBlacklist);
         Latency.update(entry.url(), System.currentTimeMillis() - start);
         return doc;
     }
     private Response load(final Request request, final int retryCount, final int maxFileSize, final boolean checkBlacklist) throws IOException {
         if (retryCount < 0) {
-            sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "redirection counter exceeded", -1);
+            this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "redirection counter exceeded", -1);
             throw new IOException("Redirection counter exceeded for URL " + request.url().toString() + ". Processing aborted.");
         }
         DigestURI url = request.url();
         final String host = url.getHost();
         if (host == null || host.length() < 2) throw new IOException("host is not well-formed: '" + host + "'");
         final String path = url.getFile();
         int port = url.getPort();
         final boolean ssl = url.getProtocol().equals("https");
         if (port < 0) port = (ssl) ? 443 : 80;
         // check if url is in blacklist
         final String hostlow = host.toLowerCase();
         if (checkBlacklist && Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, hostlow, path)) {
-            sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
+            this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
             throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.");
         }
         // resolve yacy and yacyh domains
-        AlternativeDomainNames yacyResolver = HTTPDemon.getAlternativeResolver();
+        final AlternativeDomainNames yacyResolver = HTTPDemon.getAlternativeResolver();
         if (yacyResolver != null) {
-            String yAddress = yacyResolver.resolve(host);
+            final String yAddress = yacyResolver.resolve(host);
             if (yAddress != null) {
                 url = new DigestURI(url.getProtocol() + "://" + yAddress + path);
             }
         }
         // take a file from the net
         Response response = null;
         // create a request header
         final RequestHeader requestHeader = new RequestHeader();
         requestHeader.put(HeaderFramework.USER_AGENT, ClientIdentification.getUserAgent());
         DigestURI refererURL = null;
-        if (request.referrerhash() != null) refererURL = sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash());
+        if (request.referrerhash() != null) refererURL = this.sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash());
         if (refererURL != null) requestHeader.put(RequestHeader.REFERER, refererURL.toNormalform(true, true));
-        requestHeader.put(HeaderFramework.ACCEPT_LANGUAGE, sb.getConfig("crawler.http.acceptLanguage", DEFAULT_LANGUAGE));
-        requestHeader.put(HeaderFramework.ACCEPT_CHARSET, sb.getConfig("crawler.http.acceptCharset", DEFAULT_CHARSET));
-        requestHeader.put(HeaderFramework.ACCEPT_ENCODING, sb.getConfig("crawler.http.acceptEncoding", DEFAULT_ENCODING));
+        requestHeader.put(HeaderFramework.ACCEPT, this.sb.getConfig("crawler.http.accept", DEFAULT_ACCEPT));
+        requestHeader.put(HeaderFramework.ACCEPT_LANGUAGE, this.sb.getConfig("crawler.http.acceptLanguage", DEFAULT_LANGUAGE));
+        requestHeader.put(HeaderFramework.ACCEPT_CHARSET, this.sb.getConfig("crawler.http.acceptCharset", DEFAULT_CHARSET));
+        requestHeader.put(HeaderFramework.ACCEPT_ENCODING, this.sb.getConfig("crawler.http.acceptEncoding", DEFAULT_ENCODING));
         // HTTP-Client
         final HTTPClient client = new HTTPClient();
         client.setRedirecting(false); // we want to handle redirection ourselves, so we don't index pages twice
-        client.setTimout(socketTimeout);
+        client.setTimout(this.socketTimeout);
         client.setHeader(requestHeader.entrySet());
         // send request
         final byte[] responseBody = client.GETbytes(url, maxFileSize);
@@ -139,10 +140,10 @@ public final class HTTPLoader {
                 redirectionUrlString = redirectionUrlString.trim();
                 if (redirectionUrlString.length() == 0) {
-                    sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "redirection header empy", code);
+                    this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "redirection header empy", code);
                     throw new IOException("CRAWLER Redirection of URL=" + request.url().toString() + " aborted. Location header is empty.");
                 }
                 // normalizing URL
                 final DigestURI redirectionUrl = new DigestURI(MultiProtocolURI.newURL(request.url(), redirectionUrlString));
@@ -152,48 +153,48 @@ public final class HTTPLoader {
                 // if we are already doing a shutdown we don't need to retry crawling
                 if (Thread.currentThread().isInterrupted()) {
-                    sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "server shutdown", code);
+                    this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "server shutdown", code);
                     throw new IOException("CRAWLER Retry of URL=" + request.url().toString() + " aborted because of server shutdown.");
                 }
                 // check if the url was already indexed
-                final String dbname = sb.urlExists(Segments.Process.LOCALCRAWLING, redirectionUrl.hash());
+                final String dbname = this.sb.urlExists(Segments.Process.LOCALCRAWLING, redirectionUrl.hash());
                 if (dbname != null) {
-                    sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "redirection to double content", code);
+                    this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "redirection to double content", code);
                     throw new IOException("CRAWLER Redirection of URL=" + request.url().toString() + " ignored. The url appears already in db " + dbname);
                 }
                 // retry crawling with new url
                 request.redirectURL(redirectionUrl);
                 return load(request, retryCount - 1, maxFileSize, checkBlacklist);
             } else {
                 // no redirection url provided
-                sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "no redirection url provided", code);
+                this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "no redirection url provided", code);
                 throw new IOException("REJECTED EMTPY REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString());
             }
         } else if (responseBody == null) {
             // no response, reject file
-            sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "no response body", code);
+            this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "no response body", code);
            throw new IOException("REJECTED EMPTY RESPONSE BODY '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString());
         } else if (code == 200 || code == 203) {
             // the transfer is ok
             // we write the new cache entry to file system directly
-            long contentLength = responseBody.length;
+            final long contentLength = responseBody.length;
             ByteCount.addAccountCount(ByteCount.CRAWLER, contentLength);
             // check length again in case it was not possible to get the length before loading
             if (maxFileSize > 0 && contentLength > maxFileSize) {
-                sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.FINAL_PROCESS_CONTEXT, "file size limit exceeded", code);
+                this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.FINAL_PROCESS_CONTEXT, "file size limit exceeded", code);
                 throw new IOException("REJECTED URL " + request.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes. (GET)");
             }
             // create a new cache entry
-            final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
+            final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes());
             response = new Response(
                     request,
                     requestHeader,
                     header,
                     Integer.toString(code),
                     profile,
                     responseBody
@@ -202,37 +203,37 @@ public final class HTTPLoader {
             return response;
         } else {
             // if the response has not the right response type then reject file
-            sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", code);
+            this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", code);
             throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString());
         }
     }
     public static Response load(final Request request) throws IOException {
         return load(request, 3);
     }
-    private static Response load(final Request request, int retryCount) throws IOException {
+    private static Response load(final Request request, final int retryCount) throws IOException {
         if (retryCount < 0) {
             throw new IOException("Redirection counter exceeded for URL " + request.url().toString() + ". Processing aborted.");
         }
         final String host = request.url().getHost();
         if (host == null || host.length() < 2) throw new IOException("host is not well-formed: '" + host + "'");
         final String path = request.url().getFile();
         int port = request.url().getPort();
         final boolean ssl = request.url().getProtocol().equals("https");
         if (port < 0) port = (ssl) ? 443 : 80;
         // check if url is in blacklist
         final String hostlow = host.toLowerCase();
         if (Switchboard.urlBlacklist != null && Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, hostlow, path)) {
             throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.");
         }
         // take a file from the net
         Response response = null;
         // create a request header
         final RequestHeader requestHeader = new RequestHeader();
         requestHeader.put(HeaderFramework.USER_AGENT, ClientIdentification.getUserAgent());
@@ -251,17 +252,17 @@ public final class HTTPLoader {
         if (responseBody != null && (code == 200 || code == 203)) {
             // the transfer is ok
             // statistics:
             ByteCount.addAccountCount(ByteCount.CRAWLER, responseBody.length);
             // we write the new cache entry to file system directly
             // create a new cache entry
             response = new Response(
                     request,
                     requestHeader,
                     header,
                     Integer.toString(code),
                     null,
                     responseBody
@@ -277,16 +278,16 @@ public final class HTTPLoader {
             if (redirectionUrlString.length() == 0) {
                 throw new IOException("CRAWLER Redirection of URL=" + request.url().toString() + " aborted. Location header is empty.");
             }
             // normalizing URL
             final DigestURI redirectionUrl = new DigestURI(MultiProtocolURI.newURL(request.url(), redirectionUrlString));
             // if we are already doing a shutdown we don't need to retry crawling
             if (Thread.currentThread().isInterrupted()) {
                 throw new IOException("CRAWLER Retry of URL=" + request.url().toString() + " aborted because of server shutdown.");
             }
             // retry crawling with new url
             request.redirectURL(redirectionUrl);
             return load(request, retryCount - 1);
@@ -297,5 +298,5 @@ public final class HTTPLoader {
         }
         return response;
     }
 }
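
A minimal usage sketch of the patched entry point, not part of the patch itself: it assumes an already-initialized Switchboard sb and a dequeued crawler Request req (both created elsewhere in YaCy), the log channel name "CRAWLER" is illustrative, and only the constructor and the public load(Request, int, boolean) signature shown in the diff above are used.

    // Usage sketch (assumption: sb and req are valid YaCy objects created elsewhere).
    final HTTPLoader loader = new HTTPLoader(sb, new Log("CRAWLER"));
    // Fetch with the default 10 MB size cap and blacklist checking enabled;
    // redirects are followed internally, up to DEFAULT_CRAWLING_RETRY_COUNT hops,
    // and an IOException is thrown on rejection (blacklist, empty body, bad status).
    final Response response = loader.load(req, HTTPLoader.DEFAULT_MAXFILESIZE, true);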