some enhancements in web caching: avoid double loading of response metadata and/or content

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6491 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 16 years ago
parent 06d0dcde20
commit fe41a84330
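The change in a nutshell: callers previously fetched the cached body as an InputStream and then asked the cache a second time for the content length, so the (gzipped) cache entry could be located and decompressed twice. After this commit the content is fetched once as a byte[] and the length is simply b.length. A minimal sketch of the new access pattern, using only the Cache API visible in this diff (the helper method itself is hypothetical):

```java
import java.io.IOException;

import de.anomic.http.client.Cache;
import net.yacy.kelondro.data.meta.DigestURI;

class SingleLoadSketch {
    // hypothetical helper, not part of the commit: one cache access
    // yields both the content and its length
    static byte[] loadOnce(final DigestURI url) {
        byte[] b;
        try {
            // before: Cache.getContentStream(url) followed by
            //         Cache.getResourceContentLength(url) -- two loads
            b = Cache.getContent(url); // now: one load; the length is b.length
        } catch (final IOException e) {
            b = null;
        }
        return b;
    }
}
```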

@@ -25,8 +25,8 @@
//javac -classpath .:../Classes Status.java
//if the shell's current path is HTROOT
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URLDecoder;
@@ -43,7 +43,6 @@ import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.repository.LoaderDispatcher;
import de.anomic.crawler.retrieval.Response;
@@ -158,18 +157,16 @@ public class ViewFile {
}
// loading the resource content as byte array
InputStream resource = null;
long resourceLength = -1;
byte[] resource = null;
ResponseHeader responseHeader = null;
String resMime = null;
// trying to load the resource body
try {
resource = Cache.getContentStream(url);
resource = Cache.getContent(url);
} catch (IOException e) {
Log.logException(e);
resource = null;
}
resourceLength = Cache.getResourceContentLength(url);
responseHeader = Cache.getResponseHeader(url);
// if the resource body was not cached we try to load it from web
@@ -185,13 +182,7 @@
}
if (entry != null) {
try {
resource = Cache.getContentStream(url);
} catch (IOException e) {
Log.logException(e);
resource = null;
}
resourceLength = Cache.getResourceContentLength(url);
resource = entry.getContent();
}
if (resource == null) {
@@ -241,19 +232,14 @@
// TODO: how to handle very large files here ?
String content;
try {
content = new String(FileUtils.read(resource), "UTF-8");
content = new String(resource, "UTF-8");
} catch (final Exception e) {
prop.put("error", "4");
prop.putHTML("error_errorText", e.getMessage());
prop.put("viewMode", VIEW_MODE_NO_TEXT);
return prop;
} finally {
if (resource != null)
try {
resource.close();
} catch (final Exception e) {
/* ignore this */
}
resource = null;
}
prop.put("error", "0");
@@ -268,7 +254,7 @@
// parsing the resource content
Document document = null;
try {
document = LoaderDispatcher.parseDocument(url, resourceLength, resource, null);
document = LoaderDispatcher.parseDocument(url, resource.length, new ByteArrayInputStream(resource), null);
if (document == null) {
prop.put("error", "5");
prop.put("error_errorText", "Unknown error");
@@ -281,12 +267,7 @@
prop.put("viewMode", VIEW_MODE_NO_TEXT);
return prop;
} finally {
if (resource != null)
try {
resource.close();
} catch (final Exception e) {
/* ignore this */
}
resource = null;
}
resMime = document.dc_format();
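With the body held as byte[], ViewFile can hand the same bytes to both the plain-text view and the parser; a ByteArrayInputStream is created only at the single point where an InputStream is actually required. A sketch of that pattern, assuming the parseDocument signature shown in this diff (the Document package is assumed from the imports visible above):

```java
import java.io.ByteArrayInputStream;

import net.yacy.document.Document; // package assumed from the imports in this diff
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.repository.LoaderDispatcher;

class ViewFileSketch {
    // illustrative only: one byte[] serves both the text view and the parser
    static Document useBoth(final DigestURI url, final byte[] resource) throws Exception {
        // plain-text view: decode the bytes directly
        final String content = new String(resource, "UTF-8");
        System.out.println(content.length() + " characters decoded");
        // parsed view: wrap the same bytes in a stream only where one is required
        return LoaderDispatcher.parseDocument(
                url, resource.length, new ByteArrayInputStream(resource), null);
    }
}
```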

@@ -23,6 +23,7 @@
import java.awt.Container;
import java.awt.Image;
import java.awt.MediaTracker;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
@@ -84,17 +85,17 @@ public class ViewImage {
int maxheight = post.getInt("maxheight", 0);
final int timeout = post.getInt("timeout", 5000);
// getting the image as stream
// get the image as stream
Image scaled = iconcache.get(urlString);
if (scaled == null) {
Object[] resource = null;
byte[] resourceb = null;
if (url != null) try {
resource = sb.loader.getResource(url, true, timeout, false, true);
resourceb = sb.loader.getResource(url, true, timeout, false, true);
} catch (IOException e) {
Log.logWarning("ViewImage", "cannot load: " + e.getMessage());
}
byte[] imgb = null;
if (resource == null) {
if (resourceb == null) {
if (urlString.endsWith(".ico")) {
// load default favicon dfltfvcn.ico
if (defaulticonb == null) try {
@@ -108,7 +109,7 @@ public class ViewImage {
return null;
}
} else {
final InputStream imgStream = (InputStream) resource[0];
final InputStream imgStream = new ByteArrayInputStream(resourceb);
if (imgStream == null) return null;
// read image data
@@ -138,8 +139,8 @@ public class ViewImage {
maxwidth = (maxwidth == 0) ? w : maxwidth;
maxheight = (maxheight == 0) ? h : maxheight;
} else if ((w > 16) || (h > 16)) {
maxwidth = (int) Math.min(64.0, w * 0.6);
maxheight = (int) Math.min(64.0, h * 0.6);
maxwidth = Math.min(96, w);
maxheight = Math.min(96, h);
} else {
maxwidth = 16;
maxheight = 16;
@@ -151,7 +152,7 @@ public class ViewImage {
final double hs = (w <= maxwidth) ? 1.0 : ((double) maxwidth) / ((double) w);
final double vs = (h <= maxheight) ? 1.0 : ((double) maxheight) / ((double) h);
double scale = Math.min(hs, vs);
if (!auth) scale = Math.min(scale, 0.6); // this is for copyright purpose
//if (!auth) scale = Math.min(scale, 0.6); // this is for copyright purpose
if (scale < 1.0) {
width = Math.max(1, (int) (w * scale));
height = Math.max(1, (int) (h * scale));
@@ -172,7 +173,7 @@ public class ViewImage {
scaled = image;
}
if ((height == 16) && (width == 16) && (resource != null)) {
if ((height == 16) && (width == 16) && (resourceb != null)) {
// this might be a favicon, store image to cache for faster re-load later on
iconcache.put(urlString, scaled);
}
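ViewImage now receives the raw bytes and wraps them only for decoding. The scaling rule is unchanged: a horizontal and a vertical shrink factor are computed independently and the smaller one is applied to both axes, so the aspect ratio is preserved. A sketch of decode-and-fit from a byte[], under the assumption that ImageIO can handle the format (ViewImage itself decodes via Toolkit/MediaTracker):

```java
import java.awt.Image;
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.IOException;

import javax.imageio.ImageIO;

class ImageScaleSketch {
    // illustrative only: decode from byte[] and shrink to fit a bounding box
    static Image fit(final byte[] imgb, final int maxwidth, final int maxheight) throws IOException {
        final BufferedImage image = ImageIO.read(new ByteArrayInputStream(imgb));
        if (image == null) return null; // undecodable format
        final int w = image.getWidth(), h = image.getHeight();
        // per-axis shrink factors; the smaller one keeps the aspect ratio
        final double hs = (w <= maxwidth) ? 1.0 : ((double) maxwidth) / ((double) w);
        final double vs = (h <= maxheight) ? 1.0 : ((double) maxheight) / ((double) h);
        final double scale = Math.min(hs, vs);
        if (scale >= 1.0) return image; // already fits
        return image.getScaledInstance(
                Math.max(1, (int) (w * scale)), Math.max(1, (int) (h * scale)),
                Image.SCALE_SMOOTH); // quality hint assumed; ViewImage may differ
    }
}
```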

@@ -34,10 +34,8 @@
package de.anomic.http.client;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.HashMap;
import java.util.Map;
@@ -195,12 +193,14 @@ public final class Cache {
* is returned.
* @throws IOException
*/
/*
public static InputStream getContentStream(final DigestURI url) throws IOException {
// load the url as resource from the cache
byte[] b = getContent(url);
if (b == null) return null;
return new ByteArrayInputStream(b);
}
*/
/**
* Returns the content of a cached resource as byte[]
@@ -228,6 +228,7 @@ public final class Cache {
* @param url
* @return the size of the cached content
*/
/*
public static long getResourceContentLength(final DigestURI url) {
// first try to get the length from the response header,
// this is less costly than loading the content from its gzipped cache
@@ -245,7 +246,8 @@
return -1;
}
}
*/
/**
* removed response header and cached content from the database
* @param url
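With getResourceContentLength commented out, callers that only need the size first consult the stored ResponseHeader (cheap) and fall back to loading the content, one decompression at most, only when the header reports no length; that is exactly what HTTPDProxyHandler does in the next hunk. A sketch of the fallback with a hypothetical helper name (the ResponseHeader import path is assumed for this revision):

```java
import java.io.IOException;

import de.anomic.http.client.Cache;
import de.anomic.http.metadata.ResponseHeader; // import path assumed for this revision
import net.yacy.kelondro.data.meta.DigestURI;

class CacheSizeSketch {
    // hypothetical helper: size from the header when present, content otherwise
    static long cachedSize(final DigestURI url) throws IOException {
        final ResponseHeader rh = Cache.getResponseHeader(url);
        long size = (rh == null) ? 0 : rh.getContentLength();
        if (size == 0) {
            final byte[] b = Cache.getContent(url); // the single, costlier load
            if (b != null) size = b.length;
        }
        return size;
    }
}
```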

@@ -495,7 +495,11 @@ public final class HTTPDProxyHandler {
long sizeBeforeDelete = -1;
if (cachedResponseHeader != null) {
// delete the cache
sizeBeforeDelete = Cache.getResourceContentLength(url);
ResponseHeader rh = Cache.getResponseHeader(url);
if (rh != null && (sizeBeforeDelete = rh.getContentLength()) == 0) {
byte[] b = Cache.getContent(url);
if (b != null) sizeBeforeDelete = b.length;
}
Cache.delete(url);
conProp.setProperty(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_CODE, "TCP_REFRESH_MISS");
}

@@ -26,6 +26,7 @@
package de.anomic.search;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
@@ -374,22 +375,22 @@ public class Segment {
InputStream resourceContent = null;
try {
// get the resource content
Object[] resource = null;
byte[] resourceb = null;
try {
resource = loader.getResource(metadata.url(), fetchOnline, 10000, true, false);
resourceb = loader.getResource(metadata.url(), fetchOnline, 10000, true, false);
} catch (IOException e) {
Log.logWarning("removeAllUrlReferences", "cannot load: " + e.getMessage());
}
if (resource == null) {
if (resourceb == null) {
// delete just the url entry
urlMetadata().remove(urlhash);
return 0;
} else {
resourceContent = (InputStream) resource[0];
final Long resourceContentLength = (Long) resource[1];
resourceContent = new ByteArrayInputStream(resourceb);
final long resourceContentLength = resourceb.length;
// parse the resource
final Document document = LoaderDispatcher.parseDocument(metadata.url(), resourceContentLength.longValue(), resourceContent, null);
final Document document = LoaderDispatcher.parseDocument(metadata.url(), resourceContentLength, resourceContent, null);
// get the word set
Set<String> words = null;

@@ -26,7 +26,6 @@ package de.anomic.search;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Set;
@@ -329,8 +328,7 @@ public class TextSnippet {
* LOADING RESOURCE DATA
* =========================================================================== */
// if the snippet is not in the cache, we can try to get it from the htcache
long resContentLength = 0;
InputStream resContent = null;
byte[] resContent = null;
ResponseHeader responseHeader = null;
try {
// first try to get the snippet from metadata
@@ -349,11 +347,11 @@ public class TextSnippet {
return new TextSnippet(url, loc, SOURCE_METADATA, null, null, faviconCache.get(url.hash()));
} else {
// trying to load the resource from the cache
resContent = Cache.getContentStream(url);
resContent = Cache.getContent(url);
responseHeader = Cache.getResponseHeader(url);
if (resContent != null && ((resContentLength = Cache.getResourceContentLength(url)) > maxDocLen) && (!fetchOnline)) {
if (resContent != null && !fetchOnline && resContent.length > maxDocLen) {
// content may be too large to be parsed here. To be fast, we omit calculation of snippet here
return new TextSnippet(url, null, ERROR_SOURCE_LOADING, queryhashes, "resource available, but too large: " + resContentLength + " bytes");
return new TextSnippet(url, null, ERROR_SOURCE_LOADING, queryhashes, "resource available, but too large: " + resContent.length + " bytes");
} else if (fetchOnline) {
// if not found try to download it
@@ -368,11 +366,9 @@ public class TextSnippet {
// read resource body (if it is there)
final byte[] resourceArray = entry.getContent();
if (resourceArray != null) {
resContent = new ByteArrayInputStream(resourceArray);
resContentLength = resourceArray.length;
resContent = resourceArray;
} else {
resContent = Cache.getContentStream(url);
resContentLength = Cache.getResourceContentLength(url);
resContent = Cache.getContent(url);
}
}
@@ -394,11 +390,11 @@
* =========================================================================== */
Document document = null;
try {
document = LoaderDispatcher.parseDocument(url, resContentLength, resContent, responseHeader);
document = LoaderDispatcher.parseDocument(url, resContent.length, new ByteArrayInputStream(resContent), responseHeader);
} catch (final ParserException e) {
return new TextSnippet(url, null, ERROR_PARSER_FAILED, queryhashes, e.getMessage()); // cannot be parsed
} finally {
try { resContent.close(); } catch (final Exception e) {/* ignore this */}
resContent = null;
}
if (document == null) return new TextSnippet(url, null, ERROR_PARSER_FAILED, queryhashes, "parser error/failed"); // cannot be parsed
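The too-large guard also becomes cheaper: instead of a second cache query for the length, the check is now a field access on the bytes already in hand. A sketch of the guard with maxDocLen as in the snippet code above:

```java
class SnippetGuardSketch {
    // illustrative only: skip snippet extraction for oversized cached bodies
    static boolean tooLargeForSnippet(final byte[] resContent, final long maxDocLen,
            final boolean fetchOnline) {
        // offline we must not parse huge bodies just to build a snippet;
        // online the resource is re-fetched and handled elsewhere
        return resContent != null && !fetchOnline && resContent.length > maxDocLen;
    }
}
```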

@@ -30,7 +30,6 @@ import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.util.Random;
@@ -81,13 +80,13 @@ public class OSMTile {
return null;
}
//System.out.println("*** DEBUG: fetching OSM tile: " + tileURL.toNormalform(true, true));
InputStream tileStream = null;
byte[] tileb = null;
try {
tileStream = Cache.getContentStream(tileURL);
tileb = Cache.getContent(tileURL);
} catch (IOException e1) {
Log.logException(e1);
}
if (tileStream == null) {
if (tileb == null) {
// download resource using the crawler and keep resource in memory if possible
Response entry = null;
try {
@@ -96,11 +95,11 @@ public class OSMTile {
Log.logWarning("yamyOSM", "cannot load: " + e.getMessage());
return null;
}
if ((entry == null) || (entry.getContent() == null)) return null;
tileStream = new ByteArrayInputStream(entry.getContent());
tileb = entry.getContent();
if (entry == null) return null;
}
try {
return ImageIO.read(tileStream);
return ImageIO.read(new ByteArrayInputStream(tileb));
} catch (final EOFException e) {
return null;
} catch (final IOException e) {

@@ -273,48 +273,26 @@ public final class LoaderDispatcher {
}
/**
*
* load the url as resource from the web or the cache
* @param url
* @param fetchOnline
* @param socketTimeout
* @param forText
* @return an Object array containing
* <table>
* <tr><td>[0]</td><td>the content as {@link InputStream}</td></tr>
* <tr><td>[1]</td><td>the content-length as {@link Integer}</td></tr>
* </table>
* @return the content as {@link byte[]}
* @throws IOException
*/
public Object[] getResource(final DigestURI url, final boolean fetchOnline, final int socketTimeout, final boolean forText, final boolean reindexing) throws IOException {
// load the url as resource from the web
long contentLength = -1;
// trying to load the resource body from cache
InputStream resource = Cache.getContentStream(url);
if (resource != null) {
contentLength = Cache.getResourceContentLength(url);
} else if (fetchOnline) {
// if the content is not available in cache try to download it from web
// try to download the resource using the loader
final Response entry = load(url, forText, reindexing);
if (entry == null) return null; // not found in web
// read resource body (if it is there)
final byte[] resourceArray = entry.getContent();
public byte[] getResource(final DigestURI url, final boolean fetchOnline, final int socketTimeout, final boolean forText, final boolean reindexing) throws IOException {
byte[] resource = Cache.getContent(url);
if (resource != null) return resource;
// in case that the resource was not in ram, read it from disk
if (resourceArray == null) {
resource = Cache.getContentStream(url);
contentLength = Cache.getResourceContentLength(url);
} else {
resource = new ByteArrayInputStream(resourceArray);
contentLength = resourceArray.length;
}
} else {
return null;
}
return new Object[]{resource, Long.valueOf(contentLength)};
if (!fetchOnline) return null;
// try to download the resource using the loader
final Response entry = load(url, forText, reindexing);
if (entry == null) return null; // not found in web
// read resource body (if it is there)
return entry.getContent();
}
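Call sites change accordingly: getResource now returns the bytes directly instead of an Object[] holding a stream and a boxed length, so the casts at every caller disappear. An assumed usage sketch of the new call shape, with the parameter values taken from the Segment hunk above:

```java
import java.io.IOException;

import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.repository.LoaderDispatcher;

class GetResourceSketch {
    // illustrative only: the new, cast-free call shape
    static byte[] fetch(final LoaderDispatcher loader, final DigestURI url) {
        try {
            // before: Object[] r = loader.getResource(...);
            //         InputStream s = (InputStream) r[0]; long len = (Long) r[1];
            return loader.getResource(url, true, 10000, true, false);
        } catch (final IOException e) {
            return null; // not cached and not loadable from the web
        }
    }
}
```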
/**
@@ -332,16 +310,14 @@ public final class LoaderDispatcher {
public static Document retrieveDocument(final DigestURI url, final boolean fetchOnline, final int timeout, final boolean forText, final boolean global) {
// load resource
long resContentLength = 0;
InputStream resContent = null;
byte[] resContent = null;
ResponseHeader responseHeader = null;
try {
// trying to load the resource from the cache
resContent = Cache.getContentStream(url);
resContent = Cache.getContent(url);
responseHeader = Cache.getResponseHeader(url);
if (resContent != null) {
// if the content was found
resContentLength = Cache.getResourceContentLength(url);
} else if (fetchOnline) {
// if not found try to download it
@@ -354,11 +330,9 @@ public final class LoaderDispatcher {
// read resource body (if it is there)
final byte[] resourceArray = entry.getContent();
if (resourceArray != null) {
resContent = new ByteArrayInputStream(resourceArray);
resContentLength = resourceArray.length;
resContent = resourceArray;
} else {
resContent = Cache.getContentStream(url);
resContentLength = Cache.getResourceContentLength(url);
resContent = Cache.getContent(url);
}
}
@@ -379,12 +353,12 @@
// parse resource
Document document = null;
try {
document = parseDocument(url, resContentLength, resContent, responseHeader);
document = parseDocument(url, resContent.length, new ByteArrayInputStream(resContent), responseHeader);
} catch (final ParserException e) {
Log.logFine("snippet fetch", "parser error " + e.getMessage() + " for url " + url);
return null;
} finally {
try { resContent.close(); } catch (final Exception e) {}
resContent = null;
}
return document;
}
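A side effect of the byte[] representation is that the finally blocks shrink: a ByteArrayInputStream holds no operating-system resources and its close() is documented to have no effect, so the old guarded close-and-ignore dance can be replaced by simply dropping the reference, as the hunks above do. A small illustration of that JDK fact:

```java
import java.io.ByteArrayInputStream;
import java.io.IOException;

class StreamCloseSketch {
    // illustrative only
    static void demo(final byte[] resContent) throws IOException {
        final ByteArrayInputStream in = new ByteArrayInputStream(resContent);
        // before this commit: try { resContent.close(); } catch (Exception e) { /* ignore */ }
        // now: setting resContent = null (as the diff does) is sufficient
        in.close(); // no-op: closing a ByteArrayInputStream has no effect
    }
}
```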
