From fe41a843303f29891c50427e21d14042c45179d2 Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 19 Nov 2009 10:17:26 +0000 Subject: [PATCH] some enhancements in web caching: avoid double loading of response metadata and/or content git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6491 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/ViewFile.java | 35 +++------- htroot/ViewImage.java | 19 +++--- source/de/anomic/http/client/Cache.java | 8 ++- .../anomic/http/server/HTTPDProxyHandler.java | 6 +- source/de/anomic/search/Segment.java | 13 ++-- source/de/anomic/search/TextSnippet.java | 20 +++--- source/de/anomic/yacy/graphics/OSMTile.java | 13 ++-- .../net/yacy/repository/LoaderDispatcher.java | 64 ++++++------------- 8 files changed, 68 insertions(+), 110 deletions(-) diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java index ee5e3bf33..c3c1145af 100644 --- a/htroot/ViewFile.java +++ b/htroot/ViewFile.java @@ -25,8 +25,8 @@ //javac -classpath .:../Classes Status.java //if the shell's current path is HTROOT +import java.io.ByteArrayInputStream; import java.io.IOException; -import java.io.InputStream; import java.io.UnsupportedEncodingException; import java.net.MalformedURLException; import java.net.URLDecoder; @@ -43,7 +43,6 @@ import net.yacy.document.parser.html.ImageEntry; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.logging.Log; -import net.yacy.kelondro.util.FileUtils; import net.yacy.repository.LoaderDispatcher; import de.anomic.crawler.retrieval.Response; @@ -158,18 +157,16 @@ public class ViewFile { } // loading the resource content as byte array - InputStream resource = null; - long resourceLength = -1; + byte[] resource = null; ResponseHeader responseHeader = null; String resMime = null; // trying to load the resource body try { - resource = Cache.getContentStream(url); + resource = Cache.getContent(url); } catch (IOException e) { Log.logException(e); resource = null; } 
- resourceLength = Cache.getResourceContentLength(url); responseHeader = Cache.getResponseHeader(url); // if the resource body was not cached we try to load it from web @@ -185,13 +182,7 @@ public class ViewFile { } if (entry != null) { - try { - resource = Cache.getContentStream(url); - } catch (IOException e) { - Log.logException(e); - resource = null; - } - resourceLength = Cache.getResourceContentLength(url); + resource = entry.getContent(); } if (resource == null) { @@ -241,19 +232,14 @@ public class ViewFile { // TODO: how to handle very large files here ? String content; try { - content = new String(FileUtils.read(resource), "UTF-8"); + content = new String(resource, "UTF-8"); } catch (final Exception e) { prop.put("error", "4"); prop.putHTML("error_errorText", e.getMessage()); prop.put("viewMode", VIEW_MODE_NO_TEXT); return prop; } finally { - if (resource != null) - try { - resource.close(); - } catch (final Exception e) { - /* ignore this */ - } + resource = null; } prop.put("error", "0"); @@ -268,7 +254,7 @@ public class ViewFile { // parsing the resource content Document document = null; try { - document = LoaderDispatcher.parseDocument(url, resourceLength, resource, null); + document = LoaderDispatcher.parseDocument(url, resource.length, new ByteArrayInputStream(resource), null); if (document == null) { prop.put("error", "5"); prop.put("error_errorText", "Unknown error"); @@ -281,12 +267,7 @@ public class ViewFile { prop.put("viewMode", VIEW_MODE_NO_TEXT); return prop; } finally { - if (resource != null) - try { - resource.close(); - } catch (final Exception e) { - /* ignore this */ - } + resource = null; } resMime = document.dc_format(); diff --git a/htroot/ViewImage.java b/htroot/ViewImage.java index 21732e8d8..dbf42e51d 100644 --- a/htroot/ViewImage.java +++ b/htroot/ViewImage.java @@ -23,6 +23,7 @@ import java.awt.Container; import java.awt.Image; import java.awt.MediaTracker; +import java.io.ByteArrayInputStream; import java.io.File; import 
java.io.IOException; import java.io.InputStream; @@ -84,17 +85,17 @@ public class ViewImage { int maxheight = post.getInt("maxheight", 0); final int timeout = post.getInt("timeout", 5000); - // getting the image as stream + // get the image as stream Image scaled = iconcache.get(urlString); if (scaled == null) { - Object[] resource = null; + byte[] resourceb = null; if (url != null) try { - resource = sb.loader.getResource(url, true, timeout, false, true); + resourceb = sb.loader.getResource(url, true, timeout, false, true); } catch (IOException e) { Log.logWarning("ViewImage", "cannot load: " + e.getMessage()); } byte[] imgb = null; - if (resource == null) { + if (resourceb == null) { if (urlString.endsWith(".ico")) { // load default favicon dfltfvcn.ico if (defaulticonb == null) try { @@ -108,7 +109,7 @@ public class ViewImage { return null; } } else { - final InputStream imgStream = (InputStream) resource[0]; + final InputStream imgStream = new ByteArrayInputStream(resourceb); if (imgStream == null) return null; // read image data @@ -138,8 +139,8 @@ public class ViewImage { maxwidth = (maxwidth == 0) ? w : maxwidth; maxheight = (maxheight == 0) ? h : maxheight; } else if ((w > 16) || (h > 16)) { - maxwidth = (int) Math.min(64.0, w * 0.6); - maxheight = (int) Math.min(64.0, h * 0.6); + maxwidth = Math.min(96, w); + maxheight = Math.min(96, h); } else { maxwidth = 16; maxheight = 16; @@ -151,7 +152,7 @@ public class ViewImage { final double hs = (w <= maxwidth) ? 1.0 : ((double) maxwidth) / ((double) w); final double vs = (h <= maxheight) ? 
1.0 : ((double) maxheight) / ((double) h); double scale = Math.min(hs, vs); - if (!auth) scale = Math.min(scale, 0.6); // this is for copyright purpose + //if (!auth) scale = Math.min(scale, 0.6); // this is for copyright purpose if (scale < 1.0) { width = Math.max(1, (int) (w * scale)); height = Math.max(1, (int) (h * scale)); @@ -172,7 +173,7 @@ public class ViewImage { scaled = image; } - if ((height == 16) && (width == 16) && (resource != null)) { + if ((height == 16) && (width == 16) && (resourceb != null)) { // this might be a favicon, store image to cache for faster re-load later on iconcache.put(urlString, scaled); } diff --git a/source/de/anomic/http/client/Cache.java b/source/de/anomic/http/client/Cache.java index 83b61df59..e013ad4c2 100644 --- a/source/de/anomic/http/client/Cache.java +++ b/source/de/anomic/http/client/Cache.java @@ -34,10 +34,8 @@ package de.anomic.http.client; -import java.io.ByteArrayInputStream; import java.io.File; import java.io.IOException; -import java.io.InputStream; import java.io.UnsupportedEncodingException; import java.util.HashMap; import java.util.Map; @@ -195,12 +193,14 @@ public final class Cache { * is returned. 
* @throws IOException */ + /* public static InputStream getContentStream(final DigestURI url) throws IOException { // load the url as resource from the cache byte[] b = getContent(url); if (b == null) return null; return new ByteArrayInputStream(b); } + */ /** * Returns the content of a cached resource as byte[] @@ -228,6 +228,7 @@ public final class Cache { * @param url * @return the size of the cached content */ + /* public static long getResourceContentLength(final DigestURI url) { // first try to get the length from the response header, // this is less costly than loading the content from its gzipped cache @@ -245,7 +246,8 @@ public final class Cache { return -1; } } - + */ + /** * removed response header and cached content from the database * @param url diff --git a/source/de/anomic/http/server/HTTPDProxyHandler.java b/source/de/anomic/http/server/HTTPDProxyHandler.java index 09b81b0f3..3857d1fc8 100644 --- a/source/de/anomic/http/server/HTTPDProxyHandler.java +++ b/source/de/anomic/http/server/HTTPDProxyHandler.java @@ -495,7 +495,11 @@ public final class HTTPDProxyHandler { long sizeBeforeDelete = -1; if (cachedResponseHeader != null) { // delete the cache - sizeBeforeDelete = Cache.getResourceContentLength(url); + ResponseHeader rh = Cache.getResponseHeader(url); + if (rh != null && (sizeBeforeDelete = rh.getContentLength()) == 0) { + byte[] b = Cache.getContent(url); + if (b != null) sizeBeforeDelete = b.length; + } Cache.delete(url); conProp.setProperty(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_CODE, "TCP_REFRESH_MISS"); } diff --git a/source/de/anomic/search/Segment.java b/source/de/anomic/search/Segment.java index 9404de4ff..e3e8a712a 100644 --- a/source/de/anomic/search/Segment.java +++ b/source/de/anomic/search/Segment.java @@ -26,6 +26,7 @@ package de.anomic.search; +import java.io.ByteArrayInputStream; import java.io.File; import java.io.IOException; import java.io.InputStream; @@ -374,22 +375,22 @@ public class Segment { InputStream 
resourceContent = null; try { // get the resource content - Object[] resource = null; + byte[] resourceb = null; try { - resource = loader.getResource(metadata.url(), fetchOnline, 10000, true, false); + resourceb = loader.getResource(metadata.url(), fetchOnline, 10000, true, false); } catch (IOException e) { Log.logWarning("removeAllUrlReferences", "cannot load: " + e.getMessage()); } - if (resource == null) { + if (resourceb == null) { // delete just the url entry urlMetadata().remove(urlhash); return 0; } else { - resourceContent = (InputStream) resource[0]; - final Long resourceContentLength = (Long) resource[1]; + resourceContent = new ByteArrayInputStream(resourceb); + final long resourceContentLength = resourceb.length; // parse the resource - final Document document = LoaderDispatcher.parseDocument(metadata.url(), resourceContentLength.longValue(), resourceContent, null); + final Document document = LoaderDispatcher.parseDocument(metadata.url(), resourceContentLength, resourceContent, null); // get the word set Set words = null; diff --git a/source/de/anomic/search/TextSnippet.java b/source/de/anomic/search/TextSnippet.java index 52f34c26f..93cff50be 100644 --- a/source/de/anomic/search/TextSnippet.java +++ b/source/de/anomic/search/TextSnippet.java @@ -26,7 +26,6 @@ package de.anomic.search; import java.io.ByteArrayInputStream; import java.io.IOException; -import java.io.InputStream; import java.util.ArrayList; import java.util.Iterator; import java.util.Set; @@ -329,8 +328,7 @@ public class TextSnippet { * LOADING RESOURCE DATA * =========================================================================== */ // if the snippet is not in the cache, we can try to get it from the htcache - long resContentLength = 0; - InputStream resContent = null; + byte[] resContent = null; ResponseHeader responseHeader = null; try { // first try to get the snippet from metadata @@ -349,11 +347,11 @@ public class TextSnippet { return new TextSnippet(url, loc, SOURCE_METADATA, 
null, null, faviconCache.get(url.hash())); } else { // trying to load the resource from the cache - resContent = Cache.getContentStream(url); + resContent = Cache.getContent(url); responseHeader = Cache.getResponseHeader(url); - if (resContent != null && ((resContentLength = Cache.getResourceContentLength(url)) > maxDocLen) && (!fetchOnline)) { + if (resContent != null && !fetchOnline && resContent.length > maxDocLen) { // content may be too large to be parsed here. To be fast, we omit calculation of snippet here - return new TextSnippet(url, null, ERROR_SOURCE_LOADING, queryhashes, "resource available, but too large: " + resContentLength + " bytes"); + return new TextSnippet(url, null, ERROR_SOURCE_LOADING, queryhashes, "resource available, but too large: " + resContent.length + " bytes"); } else if (fetchOnline) { // if not found try to download it @@ -368,11 +366,9 @@ public class TextSnippet { // read resource body (if it is there) final byte[] resourceArray = entry.getContent(); if (resourceArray != null) { - resContent = new ByteArrayInputStream(resourceArray); - resContentLength = resourceArray.length; + resContent = resourceArray; } else { - resContent = Cache.getContentStream(url); - resContentLength = Cache.getResourceContentLength(url); + resContent = Cache.getContent(url); } } @@ -394,11 +390,11 @@ public class TextSnippet { * =========================================================================== */ Document document = null; try { - document = LoaderDispatcher.parseDocument(url, resContentLength, resContent, responseHeader); + document = LoaderDispatcher.parseDocument(url, resContent.length, new ByteArrayInputStream(resContent), responseHeader); } catch (final ParserException e) { return new TextSnippet(url, null, ERROR_PARSER_FAILED, queryhashes, e.getMessage()); // cannot be parsed } finally { - try { resContent.close(); } catch (final Exception e) {/* ignore this */} + resContent = null; } if (document == null) return new TextSnippet(url, null, 
ERROR_PARSER_FAILED, queryhashes, "parser error/failed"); // cannot be parsed diff --git a/source/de/anomic/yacy/graphics/OSMTile.java b/source/de/anomic/yacy/graphics/OSMTile.java index a42774c5f..1cfcc0ea5 100644 --- a/source/de/anomic/yacy/graphics/OSMTile.java +++ b/source/de/anomic/yacy/graphics/OSMTile.java @@ -30,7 +30,6 @@ import java.awt.image.BufferedImage; import java.io.ByteArrayInputStream; import java.io.EOFException; import java.io.IOException; -import java.io.InputStream; import java.net.MalformedURLException; import java.util.Random; @@ -81,13 +80,13 @@ public class OSMTile { return null; } //System.out.println("*** DEBUG: fetching OSM tile: " + tileURL.toNormalform(true, true)); - InputStream tileStream = null; + byte[] tileb = null; try { - tileStream = Cache.getContentStream(tileURL); + tileb = Cache.getContent(tileURL); } catch (IOException e1) { Log.logException(e1); } - if (tileStream == null) { + if (tileb == null) { // download resource using the crawler and keep resource in memory if possible Response entry = null; try { @@ -96,11 +95,11 @@ Log.logWarning("yamyOSM", "cannot load: " + e.getMessage()); return null; } - if ((entry == null) || (entry.getContent() == null)) return null; - tileStream = new ByteArrayInputStream(entry.getContent()); + if (entry == null) return null; + if ((tileb = entry.getContent()) == null) return null; } try { - return ImageIO.read(tileStream); + return ImageIO.read(new ByteArrayInputStream(tileb)); } catch (final EOFException e) { return null; } catch (final IOException e) { diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java index 4f3900c1a..ea22043a8 100644 --- a/source/net/yacy/repository/LoaderDispatcher.java +++ b/source/net/yacy/repository/LoaderDispatcher.java @@ -273,48 +273,26 @@ } /** - * + * load the url as resource from the web or the cache * @param url * @param fetchOnline * @param socketTimeout * @param
forText - * @return an Object array containing - * - * - * - *
[0]the content as {@link InputStream}
[1]the content-length as {@link Integer}
+ * @return the content as {@code byte[]} * @throws IOException */ - public Object[] getResource(final DigestURI url, final boolean fetchOnline, final int socketTimeout, final boolean forText, final boolean reindexing) throws IOException { - // load the url as resource from the web - long contentLength = -1; - - // trying to load the resource body from cache - InputStream resource = Cache.getContentStream(url); - if (resource != null) { - contentLength = Cache.getResourceContentLength(url); - } else if (fetchOnline) { - // if the content is not available in cache try to download it from web - - // try to download the resource using the loader - final Response entry = load(url, forText, reindexing); - if (entry == null) return null; // not found in web - - // read resource body (if it is there) - final byte[] resourceArray = entry.getContent(); + public byte[] getResource(final DigestURI url, final boolean fetchOnline, final int socketTimeout, final boolean forText, final boolean reindexing) throws IOException { + byte[] resource = Cache.getContent(url); + if (resource != null) return resource; - // in case that the resource was not in ram, read it from disk - if (resourceArray == null) { - resource = Cache.getContentStream(url); - contentLength = Cache.getResourceContentLength(url); - } else { - resource = new ByteArrayInputStream(resourceArray); - contentLength = resourceArray.length; - } - } else { - return null; - } - return new Object[]{resource, Long.valueOf(contentLength)}; + if (!fetchOnline) return null; + + // try to download the resource using the loader + final Response entry = load(url, forText, reindexing); + if (entry == null) return null; // not found in web + + // read resource body (if it is there) + return entry.getContent(); } /** @@ -332,16 +310,14 @@ public static Document retrieveDocument(final DigestURI url, final boolean fetchOnline, final int timeout, final boolean forText, final boolean global) { // 
load resource - long resContentLength = 0; - InputStream resContent = null; + byte[] resContent = null; ResponseHeader responseHeader = null; try { // trying to load the resource from the cache - resContent = Cache.getContentStream(url); + resContent = Cache.getContent(url); responseHeader = Cache.getResponseHeader(url); if (resContent != null) { // if the content was found - resContentLength = Cache.getResourceContentLength(url); } else if (fetchOnline) { // if not found try to download it @@ -354,11 +330,9 @@ public final class LoaderDispatcher { // read resource body (if it is there) final byte[] resourceArray = entry.getContent(); if (resourceArray != null) { - resContent = new ByteArrayInputStream(resourceArray); - resContentLength = resourceArray.length; + resContent = resourceArray; } else { - resContent = Cache.getContentStream(url); - resContentLength = Cache.getResourceContentLength(url); + resContent = Cache.getContent(url); } } @@ -379,12 +353,12 @@ public final class LoaderDispatcher { // parse resource Document document = null; try { - document = parseDocument(url, resContentLength, resContent, responseHeader); + document = parseDocument(url, resContent.length, new ByteArrayInputStream(resContent), responseHeader); } catch (final ParserException e) { Log.logFine("snippet fetch", "parser error " + e.getMessage() + " for url " + url); return null; } finally { - try { resContent.close(); } catch (final Exception e) {} + resContent = null; } return document; }