fixed problem with not http client

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5801 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 16 years ago
parent 6504b21cea
commit c0e8ed5461

@ -144,106 +144,105 @@ public final class HTTPLoader {
final httpClient client = new httpClient(socketTimeout, requestHeader);
httpResponse res = null;
//try {
try {
// send request
res = client.GET(entry.url().toString());
if (res.getStatusCode() == 200 || res.getStatusCode() == 203) {
// the transfer is ok
// create a new cache entry
htCache = createCacheEntry(entry, requestDate, requestHeader, res.getResponseHeader(), res.getStatusLine());
// request has been placed and result has been returned. work off response
//try {
if (plasmaParser.supportedContent(parserMode, entry.url(), res.getResponseHeader().mime())) {
// get the content length and check if the length is allowed
long contentLength = res.getResponseHeader().getContentLength();
if (maxFileSize >= 0 && contentLength > maxFileSize) {
sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.peers().mySeed().hash, new Date(), 1, "file size limit exceeded");
throw new IOException("REJECTED URL " + entry.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes.");
}
// we write the new cache entry to file system directly
res.setAccountingName("CRAWLER");
final byte[] responseBody = res.getData();
contentLength = responseBody.length;
// check length again in case it was not possible to get the length before loading
if (maxFileSize >= 0 && contentLength > maxFileSize) {
sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.peers().mySeed().hash, new Date(), 1, "file size limit exceeded");
throw new IOException("REJECTED URL " + entry.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes.");
}
htCache.setCacheArray(responseBody);
} else {
// if the response has not the right file type then reject file
sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.peers().mySeed().hash, new Date(), 1, "wrong mime type or wrong extension");
throw new IOException("REJECTED WRONG MIME/EXT TYPE " + res.getResponseHeader().mime() + " for URL " + entry.url().toString());
}
return htCache;
/*
} catch (final SocketException e) {
// this may happen if the client suddenly closes its connection
// maybe the user has stopped loading
// in that case, we are not responsible and just forget it
// but we clean the cache also, since it may be only partial
// and most possible corrupted
this.log.logSevere("CRAWLER LOADER ERROR1: with URL=" + entry.url().toString() + ": " + e.toString());
sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, ErrorURL.DENIED_CONNECTION_ERROR);
htCache = null;
}*/
} else if (res.getStatusLine().startsWith("30")) {
if (res.getResponseHeader().containsKey(httpRequestHeader.LOCATION)) {
// getting redirection URL
String redirectionUrlString = res.getResponseHeader().get(httpRequestHeader.LOCATION);
redirectionUrlString = redirectionUrlString.trim();
if (redirectionUrlString.length() == 0) {
sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.peers().mySeed().hash, new Date(), 1, "redirection header empy");
throw new IOException("CRAWLER Redirection of URL=" + entry.url().toString() + " aborted. Location header is empty.");
}
// normalizing URL
final yacyURL redirectionUrl = yacyURL.newURL(entry.url(), redirectionUrlString);
// restart crawling with new url
this.log.logInfo("CRAWLER Redirection detected ('" + res.getStatusLine() + "') for URL " + entry.url().toString());
this.log.logInfo("CRAWLER ..Redirecting request to: " + redirectionUrl);
// if we are already doing a shutdown we don't need to retry crawling
if (Thread.currentThread().isInterrupted()) {
sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.peers().mySeed().hash, new Date(), 1, "server shutdown");
throw new IOException("CRAWLER Retry of URL=" + entry.url().toString() + " aborted because of server shutdown.");
if (res.getStatusCode() == 200 || res.getStatusCode() == 203) {
// the transfer is ok
// create a new cache entry
htCache = createCacheEntry(entry, requestDate, requestHeader, res.getResponseHeader(), res.getStatusLine());
// request has been placed and result has been returned. work off response
//try {
if (plasmaParser.supportedContent(parserMode, entry.url(), res.getResponseHeader().mime())) {
// get the content length and check if the length is allowed
long contentLength = res.getResponseHeader().getContentLength();
if (maxFileSize >= 0 && contentLength > maxFileSize) {
sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.peers().mySeed().hash, new Date(), 1, "file size limit exceeded");
throw new IOException("REJECTED URL " + entry.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes.");
}
// we write the new cache entry to file system directly
res.setAccountingName("CRAWLER");
final byte[] responseBody = res.getData();
contentLength = responseBody.length;
// check length again in case it was not possible to get the length before loading
if (maxFileSize >= 0 && contentLength > maxFileSize) {
sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.peers().mySeed().hash, new Date(), 1, "file size limit exceeded");
throw new IOException("REJECTED URL " + entry.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes.");
}
htCache.setCacheArray(responseBody);
} else {
// if the response has not the right file type then reject file
sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.peers().mySeed().hash, new Date(), 1, "wrong mime type or wrong extension");
throw new IOException("REJECTED WRONG MIME/EXT TYPE " + res.getResponseHeader().mime() + " for URL " + entry.url().toString());
}
// generating url hash
final String urlhash = redirectionUrl.hash();
// check if the url was already indexed
final String dbname = sb.urlExists(urlhash);
if (dbname != null) {
sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.peers().mySeed().hash, new Date(), 1, "redirection to double content");
throw new IOException("CRAWLER Redirection of URL=" + entry.url().toString() + " ignored. The url appears already in db " + dbname);
return htCache;
/*
} catch (final SocketException e) {
// this may happen if the client suddenly closes its connection
// maybe the user has stopped loading
// in that case, we are not responsible and just forget it
// but we clean the cache also, since it may be only partial
// and most possible corrupted
this.log.logSevere("CRAWLER LOADER ERROR1: with URL=" + entry.url().toString() + ": " + e.toString());
sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, ErrorURL.DENIED_CONNECTION_ERROR);
htCache = null;
}*/
} else if (res.getStatusLine().startsWith("30")) {
if (res.getResponseHeader().containsKey(httpRequestHeader.LOCATION)) {
// getting redirection URL
String redirectionUrlString = res.getResponseHeader().get(httpRequestHeader.LOCATION);
redirectionUrlString = redirectionUrlString.trim();
if (redirectionUrlString.length() == 0) {
sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.peers().mySeed().hash, new Date(), 1, "redirection header empy");
throw new IOException("CRAWLER Redirection of URL=" + entry.url().toString() + " aborted. Location header is empty.");
}
// normalizing URL
final yacyURL redirectionUrl = yacyURL.newURL(entry.url(), redirectionUrlString);
// restart crawling with new url
this.log.logInfo("CRAWLER Redirection detected ('" + res.getStatusLine() + "') for URL " + entry.url().toString());
this.log.logInfo("CRAWLER ..Redirecting request to: " + redirectionUrl);
// if we are already doing a shutdown we don't need to retry crawling
if (Thread.currentThread().isInterrupted()) {
sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.peers().mySeed().hash, new Date(), 1, "server shutdown");
throw new IOException("CRAWLER Retry of URL=" + entry.url().toString() + " aborted because of server shutdown.");
}
// generating url hash
final String urlhash = redirectionUrl.hash();
// check if the url was already indexed
final String dbname = sb.urlExists(urlhash);
if (dbname != null) {
sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.peers().mySeed().hash, new Date(), 1, "redirection to double content");
throw new IOException("CRAWLER Redirection of URL=" + entry.url().toString() + " ignored. The url appears already in db " + dbname);
}
// retry crawling with new url
entry.redirectURL(redirectionUrl);
return load(entry, plasmaParser.PARSER_MODE_URLREDIRECTOR, retryCount - 1);
}
// retry crawling with new url
entry.redirectURL(redirectionUrl);
return load(entry, plasmaParser.PARSER_MODE_URLREDIRECTOR, retryCount - 1);
}
} else {
// if the response has not the right response type then reject file
sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.peers().mySeed().hash, new Date(), 1, "wrong http status code " + res.getStatusCode() + ")");
throw new IOException("REJECTED WRONG STATUS TYPE '" + res.getStatusLine() + "' for URL " + entry.url().toString());
}
/*
} else {
// if the response has not the right response type then reject file
sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.peers().mySeed().hash, new Date(), 1, "wrong http status code " + res.getStatusCode() + ")");
throw new IOException("REJECTED WRONG STATUS TYPE '" + res.getStatusLine() + "' for URL " + entry.url().toString());
}
} finally {
if(res != null) {
// release connection
res.closeStream();
}
}*/
}
return htCache;
}

@ -299,18 +299,6 @@ public class httpClient {
return execute(post);
}
/**
* This method sends several data at once via a POST request (multipart-message)
*
* @param uri
* @param multiparts
* @return
* @throws IOException
*/
public httpResponse POST(final String uri, final List<Part> multiparts) throws IOException {
return POST(uri, multiparts, false);
}
/**
* This method sends several data at once via a POST request (multipart-message), maybe compressed
*
@ -648,7 +636,7 @@ public class httpClient {
"this is not a binary file ;)".getBytes())));
System.out.println("POST " + files.size() + " elements to " + url);
final httpClient client = new httpClient(1000);
resp = client.POST(url, files);
resp = client.POST(url, files, false);
System.out.println("----- Header: -----");
System.out.println(resp.getResponseHeader().toString());
System.out.println("----- Body: -----");
@ -763,11 +751,17 @@ public class httpClient {
final httpClient client = new httpClient(timeout, header);
// do the request
httpResponse response = null;
try {
final httpResponse response = client.GET(uri);
response = client.GET(uri);
return response.getData();
} catch (final IOException e) {
Log.logWarning("HTTPC", "wget(" + uri + ") failed: " + e.getMessage());
} finally {
// release connection
if (response != null) {
response.closeStream();
}
}
return null;
}

Loading…
Cancel
Save