@ -38,6 +38,7 @@ import net.yacy.kelondro.io.ByteCount;
import net.yacy.kelondro.logging.Log ;
import net.yacy.repository.Blacklist ;
import net.yacy.search.Switchboard ;
import net.yacy.search.SwitchboardConstants ;
import net.yacy.search.index.Segments ;
import de.anomic.crawler.CrawlProfile ;
import de.anomic.crawler.Latency ;
@ -127,6 +128,7 @@ public final class HTTPLoader {
client . setRedirecting ( false ) ; // we want to handle redirection ourselves, so we don't index pages twice
client . setTimout ( this . socketTimeout ) ;
client . setHeader ( requestHeader . entrySet ( ) ) ;
// send request
final byte [ ] responseBody = client . GETbytes ( url , maxFileSize ) ;
final ResponseHeader header = new ResponseHeader ( client . getHttpResponse ( ) . getAllHeaders ( ) ) ;
@ -134,6 +136,7 @@ public final class HTTPLoader {
if ( code > 299 & & code < 310 ) {
// redirection (content may be empty)
if ( this . sb . getConfigBool ( SwitchboardConstants . CRAWLER_FOLLOW_REDIRECTS , true ) ) {
if ( header . containsKey ( HeaderFramework . LOCATION ) ) {
// getting redirection URL
String redirectionUrlString = header . get ( HeaderFramework . LOCATION ) ;
@ -172,6 +175,11 @@ public final class HTTPLoader {
this . sb . crawlQueues . errorURL . push ( request , this . sb . peers . mySeed ( ) . hash . getBytes ( ) , new Date ( ) , 1 , FailCategory . TEMPORARY_NETWORK_FAILURE , "no redirection url provided" , code ) ;
throw new IOException ( "REJECTED EMTPY REDIRECTION '" + client . getHttpResponse ( ) . getStatusLine ( ) + "' for URL " + request . url ( ) . toString ( ) ) ;
}
} else {
// we don't want to follow redirects
this . sb . crawlQueues . errorURL . push ( request , this . sb . peers . mySeed ( ) . hash . getBytes ( ) , new Date ( ) , 1 , FailCategory . FINAL_PROCESS_CONTEXT , "redirection not wanted" , code ) ;
throw new IOException ( "REJECTED UNWANTED REDIRECTION '" + client . getHttpResponse ( ) . getStatusLine ( ) + "' for URL " + request . url ( ) . toString ( ) ) ;
}
} else if ( responseBody = = null ) {
// no response, reject file
this . sb . crawlQueues . errorURL . push ( request , this . sb . peers . mySeed ( ) . hash . getBytes ( ) , new Date ( ) , 1 , FailCategory . TEMPORARY_NETWORK_FAILURE , "no response body" , code ) ;