@@ -72,7 +72,11 @@ import de.anomic.yacy.yacyURL;
 
 public final class HTTPLoader {
 
+    private static final String DEFAULT_ENCODING = "gzip,deflate";
+    private static final String DEFAULT_LANGUAGE = "en-us,en;q=0.5";
+    private static final String DEFAULT_CHARSET = "ISO-8859-1,utf-8;q=0.7,*;q=0.7";
+    private static final long   DEFAULT_MAXFILESIZE = 1024 * 1024 * 10;
     public static final int DEFAULT_CRAWLING_RETRY_COUNT = 5;
     private static final String crawlerUserAgent = "yacybot (" + HttpClient.getSystemOST() + ") http://yacy.net/bot.html";
 
     /**
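This hunk hoists the hard-coded fallback values into named constants; the hunks below pass them as the default argument to sb.getConfig(...)/sb.getConfigLong(...). A minimal sketch of that lookup pattern, using java.util.Properties as a stand-in for the YaCy switchboard (the helper names here are illustrative, not YaCy API):

    import java.util.Properties;

    public class ConfigDefaults {
        private static final String DEFAULT_ENCODING = "gzip,deflate";
        private static final long DEFAULT_MAXFILESIZE = 1024 * 1024 * 10; // 10 MB

        // stand-in for plasmaSwitchboard.getConfig(key, dflt)
        static String getConfig(Properties cfg, String key, String dflt) {
            return cfg.getProperty(key, dflt);
        }

        // stand-in for plasmaSwitchboard.getConfigLong(key, dflt)
        static long getConfigLong(Properties cfg, String key, long dflt) {
            try {
                return Long.parseLong(cfg.getProperty(key, Long.toString(dflt)));
            } catch (NumberFormatException e) {
                return dflt;
            }
        }

        public static void main(String[] args) {
            Properties cfg = new Properties(); // empty config: every read falls back to the constant
            System.out.println(getConfig(cfg, "crawler.http.acceptEncoding", DEFAULT_ENCODING));
            System.out.println(getConfigLong(cfg, "crawler.http.maxFileSize", DEFAULT_MAXFILESIZE));
        }
    }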
@@ -83,11 +87,11 @@ public final class HTTPLoader {
 
     /**
      * The maximum allowed file size
      */
-    private long maxFileSize = -1;
-    private String acceptEncoding;
-    private String acceptLanguage;
-    private String acceptCharset;
+    //private long maxFileSize = -1;
+    //private String acceptEncoding;
+    //private String acceptLanguage;
+    //private String acceptCharset;
     private plasmaSwitchboard sb;
     private serverLog log;
 
@@ -97,15 +101,7 @@ public final class HTTPLoader {
 
         // refreshing timeout value
         this.socketTimeout = (int) sb.getConfigLong("crawler.clientTimeout", 10000);
-
-        // maximum allowed file size
-        this.maxFileSize = sb.getConfigLong("crawler.http.maxFileSize", -1);
-
-        // some http header values
-        this.acceptEncoding = sb.getConfig("crawler.http.acceptEncoding", "gzip,deflate");
-        this.acceptLanguage = sb.getConfig("crawler.http.acceptLanguage", "en-us,en;q=0.5");
-        this.acceptCharset = sb.getConfig("crawler.http.acceptCharset", "ISO-8859-1,utf-8;q=0.7,*;q=0.7");
     }
 
     /**
      * @param entry
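Taken together, the two hunks above stop copying configuration into instance fields at construction time; only the socket timeout is still cached. The practical difference is that a limit edited in the running peer's configuration now applies to the next download without recreating the loader. A hedged sketch of the contrast (stand-in types, not YaCy code):

    import java.util.Properties;

    public class OnDemandConfig {
        static final long DEFAULT_MAXFILESIZE = 1024 * 1024 * 10;

        static long read(Properties cfg) {
            return Long.parseLong(cfg.getProperty("crawler.http.maxFileSize",
                    Long.toString(DEFAULT_MAXFILESIZE)));
        }

        public static void main(String[] args) {
            Properties cfg = new Properties();
            long cachedMax = read(cfg);          // old style: frozen at construction
            cfg.setProperty("crawler.http.maxFileSize", Long.toString(1024L * 1024));
            long liveMax = read(cfg);            // new style: read when needed
            System.out.println("cached: " + cachedMax); // 10485760
            System.out.println("live:   " + liveMax);   // 1048576
        }
    }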
@@ -164,14 +160,10 @@ public final class HTTPLoader {
         requestHeader.put(httpHeader.USER_AGENT, crawlerUserAgent);
         yacyURL refererURL = null;
         if (entry.referrerhash() != null) refererURL = sb.getURL(entry.referrerhash());
-        if (refererURL != null)
-            requestHeader.put(httpHeader.REFERER, refererURL.toNormalform(true, true));
-        if (this.acceptLanguage != null && this.acceptLanguage.length() > 0)
-            requestHeader.put(httpHeader.ACCEPT_LANGUAGE, this.acceptLanguage);
-        if (this.acceptCharset != null && this.acceptCharset.length() > 0)
-            requestHeader.put(httpHeader.ACCEPT_CHARSET, this.acceptCharset);
-        if (this.acceptEncoding != null && this.acceptEncoding.length() > 0)
-            requestHeader.put(httpHeader.ACCEPT_ENCODING, this.acceptEncoding);
+        if (refererURL != null) requestHeader.put(httpHeader.REFERER, refererURL.toNormalform(true, true));
+        requestHeader.put(httpHeader.ACCEPT_LANGUAGE, sb.getConfig("crawler.http.acceptLanguage", DEFAULT_LANGUAGE));
+        requestHeader.put(httpHeader.ACCEPT_CHARSET, sb.getConfig("crawler.http.acceptCharset", DEFAULT_CHARSET));
+        requestHeader.put(httpHeader.ACCEPT_ENCODING, sb.getConfig("crawler.http.acceptEncoding", DEFAULT_ENCODING));
 
         // HTTP-Client
         JakartaCommonsHttpClient client = new JakartaCommonsHttpClient(socketTimeout, requestHeader, null);
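Two notes on this hunk: the Referer condition is collapsed onto one line, and the three Accept-* headers are now set unconditionally from the configuration. The old guards (value != null && value.length() > 0) are gone, so an empty configured value would now be sent as an empty header rather than omitted; whether that is intended is worth confirming. For reference, the resulting header assembly reduces to the following, with a plain Map standing in for httpHeader (whose USER_AGENT, REFERER, etc. constants resolve to the standard header names):

    import java.util.LinkedHashMap;
    import java.util.Map;

    public class CrawlerHeaders {
        public static void main(String[] args) {
            Map<String, String> requestHeader = new LinkedHashMap<>();
            requestHeader.put("User-Agent", "yacybot (example) http://yacy.net/bot.html");
            String refererURL = "http://example.org/start.html"; // illustrative referrer
            if (refererURL != null) requestHeader.put("Referer", refererURL);
            // the defaults mirror the constants introduced in the first hunk
            requestHeader.put("Accept-Language", "en-us,en;q=0.5");
            requestHeader.put("Accept-Charset", "ISO-8859-1,utf-8;q=0.7,*;q=0.7");
            requestHeader.put("Accept-Encoding", "gzip,deflate");
            requestHeader.forEach((k, v) -> System.out.println(k + ": " + v));
        }
    }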
@@ -224,15 +216,13 @@ public final class HTTPLoader {
             // getting content length
             long contentLength = res.getResponseHeader().contentLength();
 
             // check the maximum allowed file size
-            if (this.maxFileSize > -1) {
             if (contentLength == -1) {
-                fos = new httpdBoundedSizeOutputStream(fos, this.maxFileSize);
-            } else if (contentLength > this.maxFileSize) {
-                this.log.logInfo("REJECTED URL " + entry.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + this.maxFileSize + " bytes.");
+                fos = new httpdBoundedSizeOutputStream(fos, sb.getConfigLong("crawler.http.maxFileSize", DEFAULT_MAXFILESIZE));
+            } else if (contentLength > sb.getConfigLong("crawler.http.maxFileSize", DEFAULT_MAXFILESIZE)) {
+                this.log.logInfo("REJECTED URL " + entry.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + sb.getConfigLong("crawler.http.maxFileSize", DEFAULT_MAXFILESIZE) + " bytes.");
                 sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, ErrorURL.DENIED_FILESIZE_LIMIT_EXCEEDED);
                 return null;
             }
-            }
 
             // we write the new cache entry to file system directly
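The logic here has two branches: if the server sends no Content-Length, the output stream is wrapped so the download aborts once the limit is hit; if the length is known up front, the URL is rejected before any bytes are fetched. Also note the limit now effectively defaults to DEFAULT_MAXFILESIZE (10 MB) instead of -1 (unlimited) when the key is unset, a behavior change beyond pure refactoring. A minimal sketch of such a bounded stream (a hypothetical re-implementation; YaCy's httpdBoundedSizeOutputStream throws the httpdLimitExceededException handled in the next hunk):

    import java.io.ByteArrayOutputStream;
    import java.io.FilterOutputStream;
    import java.io.IOException;
    import java.io.OutputStream;

    public class BoundedSizeOutputStream extends FilterOutputStream {
        private final long limit;
        private long written;

        public BoundedSizeOutputStream(OutputStream out, long limit) {
            super(out);
            this.limit = limit;
        }

        @Override
        public void write(int b) throws IOException {
            if (written + 1 > limit) throw new IOException("size limit of " + limit + " bytes exceeded");
            written++;
            out.write(b);
        }

        @Override
        public void write(byte[] b, int off, int len) throws IOException {
            if (written + len > limit) throw new IOException("size limit of " + limit + " bytes exceeded");
            written += len;
            out.write(b, off, len);
        }

        public static void main(String[] args) throws IOException {
            OutputStream fos = new BoundedSizeOutputStream(new ByteArrayOutputStream(), 8);
            fos.write(new byte[8]);              // fits exactly
            try {
                fos.write(1);                    // ninth byte: rejected
            } catch (IOException expected) {
                System.out.println(expected.getMessage());
            }
        }
    }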
@@ -332,7 +322,7 @@ public final class HTTPLoader {
                 this.log.logInfo("CRAWLER Interruption detected because of server shutdown.");
                 failreason = ErrorURL.DENIED_SERVER_SHUTDOWN;
             } else if (e instanceof httpdLimitExceededException) {
-                this.log.logWarning("CRAWLER Max file size limit '" + this.maxFileSize + "' exceeded while downloading URL " + entry.url());
+                this.log.logWarning("CRAWLER Max file size limit '" + sb.getConfigLong("crawler.http.maxFileSize", DEFAULT_MAXFILESIZE) + "' exceeded while downloading URL " + entry.url());
                 failreason = ErrorURL.DENIED_FILESIZE_LIMIT_EXCEEDED;
             } else if (e instanceof MalformedURLException) {
                 this.log.logWarning("CRAWLER Malformed URL '" + entry.url().toString() + "' detected. ");
@@ -358,7 +348,6 @@ public final class HTTPLoader {
                 this.log.logWarning("CRAWLER Problems detected while receiving gzip encoded content from '" + entry.url().toString() +
                         "'. Retrying request without using gzip content encoding.");
                 failreason = ErrorURL.DENIED_CONTENT_DECODING_ERROR;
-                this.acceptEncoding = null;
             } else if ((errorMsg != null) && (errorMsg.indexOf("The host did not accept the connection within timeout of") >= 0)) {
                 this.log.logWarning("CRAWLER Timeout while trying to connect to '" + entry.url().toString() +
                         "'. Retrying request.");