|
|
|
@ -73,14 +73,14 @@ public final class HTTPLoader {
|
|
|
|
|
this.socketTimeout = (int) sb.getConfigLong("crawler.clientTimeout", 10000);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public Response load(final Request entry) throws IOException {
|
|
|
|
|
public Response load(final Request entry, final boolean acceptOnlyParseable) throws IOException {
|
|
|
|
|
long start = System.currentTimeMillis();
|
|
|
|
|
Response doc = load(entry, DEFAULT_CRAWLING_RETRY_COUNT);
|
|
|
|
|
Response doc = load(entry, acceptOnlyParseable, DEFAULT_CRAWLING_RETRY_COUNT);
|
|
|
|
|
Latency.update(entry.url().hash().substring(6), entry.url().getHost(), System.currentTimeMillis() - start);
|
|
|
|
|
return doc;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private Response load(final Request request, final int retryCount) throws IOException {
|
|
|
|
|
private Response load(final Request request, boolean acceptOnlyParseable, final int retryCount) throws IOException {
|
|
|
|
|
|
|
|
|
|
if (retryCount < 0) {
|
|
|
|
|
sb.crawlQueues.errorURL.newEntry(request, sb.peers.mySeed().hash, new Date(), 1, "redirection counter exceeded").store();
|
|
|
|
@ -94,11 +94,13 @@ public final class HTTPLoader {
|
|
|
|
|
if (port < 0) port = (ssl) ? 443 : 80;
|
|
|
|
|
|
|
|
|
|
// if not the right file type then reject file
|
|
|
|
|
String supportError = Parser.supportsExtension(request.url());
|
|
|
|
|
if (supportError != null) {
|
|
|
|
|
sb.crawlQueues.errorURL.newEntry(request, sb.peers.mySeed().hash, new Date(), 1, supportError);
|
|
|
|
|
throw new IOException("REJECTED WRONG EXTENSION TYPE: " + supportError);
|
|
|
|
|
}
|
|
|
|
|
if (acceptOnlyParseable) {
|
|
|
|
|
String supportError = Parser.supportsExtension(request.url());
|
|
|
|
|
if (supportError != null) {
|
|
|
|
|
sb.crawlQueues.errorURL.newEntry(request, sb.peers.mySeed().hash, new Date(), 1, supportError);
|
|
|
|
|
throw new IOException("REJECTED WRONG EXTENSION TYPE: " + supportError);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// check if url is in blacklist
|
|
|
|
|
final String hostlow = host.toLowerCase();
|
|
|
|
@ -134,13 +136,15 @@ public final class HTTPLoader {
|
|
|
|
|
if (res.getStatusCode() == 200 || res.getStatusCode() == 203) {
|
|
|
|
|
// the transfer is ok
|
|
|
|
|
|
|
|
|
|
// if the response has not the right file type then reject file
|
|
|
|
|
supportError = Parser.supports(request.url(), res.getResponseHeader().mime());
|
|
|
|
|
if (supportError != null) {
|
|
|
|
|
sb.crawlQueues.errorURL.newEntry(request, sb.peers.mySeed().hash, new Date(), 1, supportError);
|
|
|
|
|
throw new IOException("REJECTED WRONG MIME TYPE: " + supportError);
|
|
|
|
|
if (acceptOnlyParseable) {
|
|
|
|
|
// if the response has not the right file type then reject file
|
|
|
|
|
String supportError = Parser.supports(request.url(), res.getResponseHeader().mime());
|
|
|
|
|
if (supportError != null) {
|
|
|
|
|
sb.crawlQueues.errorURL.newEntry(request, sb.peers.mySeed().hash, new Date(), 1, supportError);
|
|
|
|
|
throw new IOException("REJECTED WRONG MIME TYPE: " + supportError);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// we write the new cache entry to file system directly
|
|
|
|
|
res.setAccountingName("CRAWLER");
|
|
|
|
|
final byte[] responseBody = res.getData();
|
|
|
|
@ -199,7 +203,7 @@ public final class HTTPLoader {
|
|
|
|
|
|
|
|
|
|
// retry crawling with new url
|
|
|
|
|
request.redirectURL(redirectionUrl);
|
|
|
|
|
return load(request, retryCount - 1);
|
|
|
|
|
return load(request, acceptOnlyParseable, retryCount - 1);
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
// if the response has not the right response type then reject file
|
|
|
|
|