@@ -144,106 +144,105 @@ public final class HTTPLoader {
        final httpClient client = new httpClient(socketTimeout, requestHeader);

        httpResponse res = null;
        //try {
        try {
            // send request
            res = client.GET(entry.url().toString());

            if (res.getStatusCode() == 200 || res.getStatusCode() == 203) {
                // the transfer is ok

                // create a new cache entry
                htCache = createCacheEntry(entry, requestDate, requestHeader, res.getResponseHeader(), res.getStatusLine());

                // request has been placed and result has been returned. work off response
                //try {
                if (plasmaParser.supportedContent(parserMode, entry.url(), res.getResponseHeader().mime())) {

                    // get the content length and check if the length is allowed
                    long contentLength = res.getResponseHeader().getContentLength();
                    if (maxFileSize >= 0 && contentLength > maxFileSize) {
                        sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.peers().mySeed().hash, new Date(), 1, "file size limit exceeded");
                        throw new IOException("REJECTED URL " + entry.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes.");
                    }

                    // we write the new cache entry to file system directly
                    res.setAccountingName("CRAWLER");
                    final byte[] responseBody = res.getData();
                    contentLength = responseBody.length;

                    // check length again in case it was not possible to get the length before loading
                    if (maxFileSize >= 0 && contentLength > maxFileSize) {
                        sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.peers().mySeed().hash, new Date(), 1, "file size limit exceeded");
                        throw new IOException("REJECTED URL " + entry.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes.");
                    }

                    htCache.setCacheArray(responseBody);
                } else {
                    // if the response has not the right file type then reject file
                    sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.peers().mySeed().hash, new Date(), 1, "wrong mime type or wrong extension");
                    throw new IOException("REJECTED WRONG MIME/EXT TYPE " + res.getResponseHeader().mime() + " for URL " + entry.url().toString());
                }
                return htCache;
                /*
                } catch (final SocketException e) {
                    // this may happen if the client suddenly closes its connection
                    // maybe the user has stopped loading
                    // in that case, we are not responsible and just forget it
                    // but we clean the cache also, since it may be only partial
                    // and most possible corrupted
                    this.log.logSevere("CRAWLER LOADER ERROR1: with URL=" + entry.url().toString() + ": " + e.toString());
                    sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, ErrorURL.DENIED_CONNECTION_ERROR);
                    htCache = null;
                }*/
            } else if (res.getStatusLine().startsWith("30")) {
                if (res.getResponseHeader().containsKey(httpRequestHeader.LOCATION)) {
                    // getting redirection URL
                    String redirectionUrlString = res.getResponseHeader().get(httpRequestHeader.LOCATION);
                    redirectionUrlString = redirectionUrlString.trim();

                    if (redirectionUrlString.length() == 0) {
                        sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.peers().mySeed().hash, new Date(), 1, "redirection header empy");
                        throw new IOException("CRAWLER Redirection of URL=" + entry.url().toString() + " aborted. Location header is empty.");
                    }

                    // normalizing URL
                    final yacyURL redirectionUrl = yacyURL.newURL(entry.url(), redirectionUrlString);

                    // restart crawling with new url
                    this.log.logInfo("CRAWLER Redirection detected ('" + res.getStatusLine() + "') for URL " + entry.url().toString());
                    this.log.logInfo("CRAWLER ..Redirecting request to: " + redirectionUrl);

                    // if we are already doing a shutdown we don't need to retry crawling
                    if (Thread.currentThread().isInterrupted()) {
                        sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.peers().mySeed().hash, new Date(), 1, "server shutdown");
                        throw new IOException("CRAWLER Retry of URL=" + entry.url().toString() + " aborted because of server shutdown.");
                    }

                    // generating url hash
                    final String urlhash = redirectionUrl.hash();

                    // check if the url was already indexed
                    final String dbname = sb.urlExists(urlhash);
                    if (dbname != null) {
                        sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.peers().mySeed().hash, new Date(), 1, "redirection to double content");
                        throw new IOException("CRAWLER Redirection of URL=" + entry.url().toString() + " ignored. The url appears already in db " + dbname);
                    }

                    // retry crawling with new url
                    entry.redirectURL(redirectionUrl);
                    return load(entry, plasmaParser.PARSER_MODE_URLREDIRECTOR, retryCount - 1);
                }
            } else {
                // if the response has not the right response type then reject file
                sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.peers().mySeed().hash, new Date(), 1, "wrong http status code " + res.getStatusCode() + ")");
                throw new IOException("REJECTED WRONG STATUS TYPE '" + res.getStatusLine() + "' for URL " + entry.url().toString());
            }
        /*
        } finally {
            if(res != null) {
                // release connection
                res.closeStream();
            }
        }*/
        }
        return htCache;
    }
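
For readers following the control flow of the hunk, here is a minimal, self-contained sketch of the same guard-and-release pattern it applies: reject oversized downloads using the announced Content-Length, re-check the size while reading the body, and always release the connection in a finally block. This is illustrative only, not YaCy code; the class FetchSketch, the method fetchWithLimit and the constant MAX_FILE_SIZE are hypothetical names, and plain java.net.HttpURLConnection stands in for YaCy's httpClient/httpResponse.

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;

final class FetchSketch {

    // hypothetical limit, analogous to maxFileSize in the hunk
    static final long MAX_FILE_SIZE = 1024L * 1024L;

    static byte[] fetchWithLimit(final String url) throws IOException {
        final HttpURLConnection con = (HttpURLConnection) new URL(url).openConnection();
        InputStream in = null;
        try {
            // first check: reject early if the server announces an oversized body
            final long announced = con.getContentLengthLong();
            if (announced > MAX_FILE_SIZE) {
                throw new IOException("REJECTED " + url + " because announced size " + announced + " exceeds limit of " + MAX_FILE_SIZE + " bytes.");
            }

            // download the body
            in = con.getInputStream();
            final ByteArrayOutputStream body = new ByteArrayOutputStream();
            final byte[] buffer = new byte[8192];
            int read;
            while ((read = in.read(buffer)) != -1) {
                body.write(buffer, 0, read);
                // second check: the length may not have been known before loading
                if (body.size() > MAX_FILE_SIZE) {
                    throw new IOException("REJECTED " + url + " because body exceeds limit of " + MAX_FILE_SIZE + " bytes.");
                }
            }
            return body.toByteArray();
        } finally {
            // always release the connection, like res.closeStream() in the finally above
            if (in != null) in.close();
            con.disconnect();
        }
    }
}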