fixed problem with http client connections not being released

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5801 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 16 years ago
parent 6504b21cea
commit c0e8ed5461

@ -144,106 +144,105 @@ public final class HTTPLoader {
final httpClient client = new httpClient(socketTimeout, requestHeader); final httpClient client = new httpClient(socketTimeout, requestHeader);
httpResponse res = null; httpResponse res = null;
//try { try {
// send request // send request
res = client.GET(entry.url().toString()); res = client.GET(entry.url().toString());
if (res.getStatusCode() == 200 || res.getStatusCode() == 203) { if (res.getStatusCode() == 200 || res.getStatusCode() == 203) {
// the transfer is ok // the transfer is ok
// create a new cache entry // create a new cache entry
htCache = createCacheEntry(entry, requestDate, requestHeader, res.getResponseHeader(), res.getStatusLine()); htCache = createCacheEntry(entry, requestDate, requestHeader, res.getResponseHeader(), res.getStatusLine());
// request has been placed and result has been returned. work off response // request has been placed and result has been returned. work off response
//try { //try {
if (plasmaParser.supportedContent(parserMode, entry.url(), res.getResponseHeader().mime())) { if (plasmaParser.supportedContent(parserMode, entry.url(), res.getResponseHeader().mime())) {
// get the content length and check if the length is allowed // get the content length and check if the length is allowed
long contentLength = res.getResponseHeader().getContentLength(); long contentLength = res.getResponseHeader().getContentLength();
if (maxFileSize >= 0 && contentLength > maxFileSize) { if (maxFileSize >= 0 && contentLength > maxFileSize) {
sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.peers().mySeed().hash, new Date(), 1, "file size limit exceeded"); sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.peers().mySeed().hash, new Date(), 1, "file size limit exceeded");
throw new IOException("REJECTED URL " + entry.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes."); throw new IOException("REJECTED URL " + entry.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes.");
} }
// we write the new cache entry to file system directly // we write the new cache entry to file system directly
res.setAccountingName("CRAWLER"); res.setAccountingName("CRAWLER");
final byte[] responseBody = res.getData(); final byte[] responseBody = res.getData();
contentLength = responseBody.length; contentLength = responseBody.length;
// check length again in case it was not possible to get the length before loading // check length again in case it was not possible to get the length before loading
if (maxFileSize >= 0 && contentLength > maxFileSize) { if (maxFileSize >= 0 && contentLength > maxFileSize) {
sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.peers().mySeed().hash, new Date(), 1, "file size limit exceeded"); sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.peers().mySeed().hash, new Date(), 1, "file size limit exceeded");
throw new IOException("REJECTED URL " + entry.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes."); throw new IOException("REJECTED URL " + entry.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes.");
} }
htCache.setCacheArray(responseBody); htCache.setCacheArray(responseBody);
} else { } else {
// if the response has not the right file type then reject file // if the response has not the right file type then reject file
sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.peers().mySeed().hash, new Date(), 1, "wrong mime type or wrong extension"); sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.peers().mySeed().hash, new Date(), 1, "wrong mime type or wrong extension");
throw new IOException("REJECTED WRONG MIME/EXT TYPE " + res.getResponseHeader().mime() + " for URL " + entry.url().toString()); throw new IOException("REJECTED WRONG MIME/EXT TYPE " + res.getResponseHeader().mime() + " for URL " + entry.url().toString());
}
return htCache;
/*
} catch (final SocketException e) {
// this may happen if the client suddenly closes its connection
// maybe the user has stopped loading
// in that case, we are not responsible and just forget it
// but we clean the cache also, since it may be only partial
// and most possible corrupted
this.log.logSevere("CRAWLER LOADER ERROR1: with URL=" + entry.url().toString() + ": " + e.toString());
sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, ErrorURL.DENIED_CONNECTION_ERROR);
htCache = null;
}*/
} else if (res.getStatusLine().startsWith("30")) {
if (res.getResponseHeader().containsKey(httpRequestHeader.LOCATION)) {
// getting redirection URL
String redirectionUrlString = res.getResponseHeader().get(httpRequestHeader.LOCATION);
redirectionUrlString = redirectionUrlString.trim();
if (redirectionUrlString.length() == 0) {
sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.peers().mySeed().hash, new Date(), 1, "redirection header empy");
throw new IOException("CRAWLER Redirection of URL=" + entry.url().toString() + " aborted. Location header is empty.");
}
// normalizing URL
final yacyURL redirectionUrl = yacyURL.newURL(entry.url(), redirectionUrlString);
// restart crawling with new url
this.log.logInfo("CRAWLER Redirection detected ('" + res.getStatusLine() + "') for URL " + entry.url().toString());
this.log.logInfo("CRAWLER ..Redirecting request to: " + redirectionUrl);
// if we are already doing a shutdown we don't need to retry crawling
if (Thread.currentThread().isInterrupted()) {
sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.peers().mySeed().hash, new Date(), 1, "server shutdown");
throw new IOException("CRAWLER Retry of URL=" + entry.url().toString() + " aborted because of server shutdown.");
} }
return htCache;
// generating url hash /*
final String urlhash = redirectionUrl.hash(); } catch (final SocketException e) {
// this may happen if the client suddenly closes its connection
// check if the url was already indexed // maybe the user has stopped loading
final String dbname = sb.urlExists(urlhash); // in that case, we are not responsible and just forget it
if (dbname != null) { // but we clean the cache also, since it may be only partial
sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.peers().mySeed().hash, new Date(), 1, "redirection to double content"); // and most possible corrupted
throw new IOException("CRAWLER Redirection of URL=" + entry.url().toString() + " ignored. The url appears already in db " + dbname); this.log.logSevere("CRAWLER LOADER ERROR1: with URL=" + entry.url().toString() + ": " + e.toString());
sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, ErrorURL.DENIED_CONNECTION_ERROR);
htCache = null;
}*/
} else if (res.getStatusLine().startsWith("30")) {
if (res.getResponseHeader().containsKey(httpRequestHeader.LOCATION)) {
// getting redirection URL
String redirectionUrlString = res.getResponseHeader().get(httpRequestHeader.LOCATION);
redirectionUrlString = redirectionUrlString.trim();
if (redirectionUrlString.length() == 0) {
sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.peers().mySeed().hash, new Date(), 1, "redirection header empy");
throw new IOException("CRAWLER Redirection of URL=" + entry.url().toString() + " aborted. Location header is empty.");
}
// normalizing URL
final yacyURL redirectionUrl = yacyURL.newURL(entry.url(), redirectionUrlString);
// restart crawling with new url
this.log.logInfo("CRAWLER Redirection detected ('" + res.getStatusLine() + "') for URL " + entry.url().toString());
this.log.logInfo("CRAWLER ..Redirecting request to: " + redirectionUrl);
// if we are already doing a shutdown we don't need to retry crawling
if (Thread.currentThread().isInterrupted()) {
sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.peers().mySeed().hash, new Date(), 1, "server shutdown");
throw new IOException("CRAWLER Retry of URL=" + entry.url().toString() + " aborted because of server shutdown.");
}
// generating url hash
final String urlhash = redirectionUrl.hash();
// check if the url was already indexed
final String dbname = sb.urlExists(urlhash);
if (dbname != null) {
sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.peers().mySeed().hash, new Date(), 1, "redirection to double content");
throw new IOException("CRAWLER Redirection of URL=" + entry.url().toString() + " ignored. The url appears already in db " + dbname);
}
// retry crawling with new url
entry.redirectURL(redirectionUrl);
return load(entry, plasmaParser.PARSER_MODE_URLREDIRECTOR, retryCount - 1);
} }
} else {
// retry crawling with new url // if the response has not the right response type then reject file
entry.redirectURL(redirectionUrl); sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.peers().mySeed().hash, new Date(), 1, "wrong http status code " + res.getStatusCode() + ")");
return load(entry, plasmaParser.PARSER_MODE_URLREDIRECTOR, retryCount - 1); throw new IOException("REJECTED WRONG STATUS TYPE '" + res.getStatusLine() + "' for URL " + entry.url().toString());
} }
} else {
// if the response has not the right response type then reject file
sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.peers().mySeed().hash, new Date(), 1, "wrong http status code " + res.getStatusCode() + ")");
throw new IOException("REJECTED WRONG STATUS TYPE '" + res.getStatusLine() + "' for URL " + entry.url().toString());
}
/*
} finally { } finally {
if(res != null) { if(res != null) {
// release connection // release connection
res.closeStream(); res.closeStream();
} }
}*/ }
return htCache; return htCache;
} }

@ -299,18 +299,6 @@ public class httpClient {
return execute(post); return execute(post);
} }
/**
* This method sends several data at once via a POST request (multipart-message)
*
* @param uri
* @param multiparts
* @return
* @throws IOException
*/
public httpResponse POST(final String uri, final List<Part> multiparts) throws IOException {
return POST(uri, multiparts, false);
}
/** /**
* This method sends several data at once via a POST request (multipart-message), maybe compressed * This method sends several data at once via a POST request (multipart-message), maybe compressed
* *
@ -648,7 +636,7 @@ public class httpClient {
"this is not a binary file ;)".getBytes()))); "this is not a binary file ;)".getBytes())));
System.out.println("POST " + files.size() + " elements to " + url); System.out.println("POST " + files.size() + " elements to " + url);
final httpClient client = new httpClient(1000); final httpClient client = new httpClient(1000);
resp = client.POST(url, files); resp = client.POST(url, files, false);
System.out.println("----- Header: -----"); System.out.println("----- Header: -----");
System.out.println(resp.getResponseHeader().toString()); System.out.println(resp.getResponseHeader().toString());
System.out.println("----- Body: -----"); System.out.println("----- Body: -----");
@ -763,11 +751,17 @@ public class httpClient {
final httpClient client = new httpClient(timeout, header); final httpClient client = new httpClient(timeout, header);
// do the request // do the request
httpResponse response = null;
try { try {
final httpResponse response = client.GET(uri); response = client.GET(uri);
return response.getData(); return response.getData();
} catch (final IOException e) { } catch (final IOException e) {
Log.logWarning("HTTPC", "wget(" + uri + ") failed: " + e.getMessage()); Log.logWarning("HTTPC", "wget(" + uri + ") failed: " + e.getMessage());
} finally {
// release connection
if (response != null) {
response.closeStream();
}
} }
return null; return null;
} }

Loading…
Cancel
Save