From fe41a843303f29891c50427e21d14042c45179d2 Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 19 Nov 2009 10:17:26 +0000 Subject: [PATCH] some enhancements in web caching: avoid double loading of response metadata and/or content git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6491 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/ViewFile.java | 35 +++------- htroot/ViewImage.java | 19 +++--- source/de/anomic/http/client/Cache.java | 8 ++- .../anomic/http/server/HTTPDProxyHandler.java | 6 +- source/de/anomic/search/Segment.java | 13 ++-- source/de/anomic/search/TextSnippet.java | 20 +++--- source/de/anomic/yacy/graphics/OSMTile.java | 13 ++-- .../net/yacy/repository/LoaderDispatcher.java | 64 ++++++------------- 8 files changed, 68 insertions(+), 110 deletions(-) diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java index ee5e3bf33..c3c1145af 100644 --- a/htroot/ViewFile.java +++ b/htroot/ViewFile.java @@ -25,8 +25,8 @@ //javac -classpath .:../Classes Status.java //if the shell's current path is HTROOT +import java.io.ByteArrayInputStream; import java.io.IOException; -import java.io.InputStream; import java.io.UnsupportedEncodingException; import java.net.MalformedURLException; import java.net.URLDecoder; @@ -43,7 +43,6 @@ import net.yacy.document.parser.html.ImageEntry; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataRow; import net.yacy.kelondro.logging.Log; -import net.yacy.kelondro.util.FileUtils; import net.yacy.repository.LoaderDispatcher; import de.anomic.crawler.retrieval.Response; @@ -158,18 +157,16 @@ public class ViewFile { } // loading the resource content as byte array - InputStream resource = null; - long resourceLength = -1; + byte[] resource = null; ResponseHeader responseHeader = null; String resMime = null; // trying to load the resource body try { - resource = Cache.getContentStream(url); + resource = Cache.getContent(url); } catch (IOException e) { Log.logException(e); resource = null; } 
- resourceLength = Cache.getResourceContentLength(url); responseHeader = Cache.getResponseHeader(url); // if the resource body was not cached we try to load it from web @@ -185,13 +182,7 @@ public class ViewFile { } if (entry != null) { - try { - resource = Cache.getContentStream(url); - } catch (IOException e) { - Log.logException(e); - resource = null; - } - resourceLength = Cache.getResourceContentLength(url); + resource = entry.getContent(); } if (resource == null) { @@ -241,19 +232,14 @@ public class ViewFile { // TODO: how to handle very large files here ? String content; try { - content = new String(FileUtils.read(resource), "UTF-8"); + content = new String(resource, "UTF-8"); } catch (final Exception e) { prop.put("error", "4"); prop.putHTML("error_errorText", e.getMessage()); prop.put("viewMode", VIEW_MODE_NO_TEXT); return prop; } finally { - if (resource != null) - try { - resource.close(); - } catch (final Exception e) { - /* ignore this */ - } + resource = null; } prop.put("error", "0"); @@ -268,7 +254,7 @@ public class ViewFile { // parsing the resource content Document document = null; try { - document = LoaderDispatcher.parseDocument(url, resourceLength, resource, null); + document = LoaderDispatcher.parseDocument(url, resource.length, new ByteArrayInputStream(resource), null); if (document == null) { prop.put("error", "5"); prop.put("error_errorText", "Unknown error"); @@ -281,12 +267,7 @@ public class ViewFile { prop.put("viewMode", VIEW_MODE_NO_TEXT); return prop; } finally { - if (resource != null) - try { - resource.close(); - } catch (final Exception e) { - /* ignore this */ - } + resource = null; } resMime = document.dc_format(); diff --git a/htroot/ViewImage.java b/htroot/ViewImage.java index 21732e8d8..dbf42e51d 100644 --- a/htroot/ViewImage.java +++ b/htroot/ViewImage.java @@ -23,6 +23,7 @@ import java.awt.Container; import java.awt.Image; import java.awt.MediaTracker; +import java.io.ByteArrayInputStream; import java.io.File; import 
java.io.IOException; import java.io.InputStream; @@ -84,17 +85,17 @@ public class ViewImage { int maxheight = post.getInt("maxheight", 0); final int timeout = post.getInt("timeout", 5000); - // getting the image as stream + // get the image as stream Image scaled = iconcache.get(urlString); if (scaled == null) { - Object[] resource = null; + byte[] resourceb = null; if (url != null) try { - resource = sb.loader.getResource(url, true, timeout, false, true); + resourceb = sb.loader.getResource(url, true, timeout, false, true); } catch (IOException e) { Log.logWarning("ViewImage", "cannot load: " + e.getMessage()); } byte[] imgb = null; - if (resource == null) { + if (resourceb == null) { if (urlString.endsWith(".ico")) { // load default favicon dfltfvcn.ico if (defaulticonb == null) try { @@ -108,7 +109,7 @@ public class ViewImage { return null; } } else { - final InputStream imgStream = (InputStream) resource[0]; + final InputStream imgStream = new ByteArrayInputStream(resourceb); if (imgStream == null) return null; // read image data @@ -138,8 +139,8 @@ public class ViewImage { maxwidth = (maxwidth == 0) ? w : maxwidth; maxheight = (maxheight == 0) ? h : maxheight; } else if ((w > 16) || (h > 16)) { - maxwidth = (int) Math.min(64.0, w * 0.6); - maxheight = (int) Math.min(64.0, h * 0.6); + maxwidth = Math.min(96, w); + maxheight = Math.min(96, h); } else { maxwidth = 16; maxheight = 16; @@ -151,7 +152,7 @@ public class ViewImage { final double hs = (w <= maxwidth) ? 1.0 : ((double) maxwidth) / ((double) w); final double vs = (h <= maxheight) ? 
1.0 : ((double) maxheight) / ((double) h); double scale = Math.min(hs, vs); - if (!auth) scale = Math.min(scale, 0.6); // this is for copyright purpose + //if (!auth) scale = Math.min(scale, 0.6); // this is for copyright purpose if (scale < 1.0) { width = Math.max(1, (int) (w * scale)); height = Math.max(1, (int) (h * scale)); @@ -172,7 +173,7 @@ public class ViewImage { scaled = image; } - if ((height == 16) && (width == 16) && (resource != null)) { + if ((height == 16) && (width == 16) && (resourceb != null)) { // this might be a favicon, store image to cache for faster re-load later on iconcache.put(urlString, scaled); } diff --git a/source/de/anomic/http/client/Cache.java b/source/de/anomic/http/client/Cache.java index 83b61df59..e013ad4c2 100644 --- a/source/de/anomic/http/client/Cache.java +++ b/source/de/anomic/http/client/Cache.java @@ -34,10 +34,8 @@ package de.anomic.http.client; -import java.io.ByteArrayInputStream; import java.io.File; import java.io.IOException; -import java.io.InputStream; import java.io.UnsupportedEncodingException; import java.util.HashMap; import java.util.Map; @@ -195,12 +193,14 @@ public final class Cache { * is returned. 
* @throws IOException */ + /* public static InputStream getContentStream(final DigestURI url) throws IOException { // load the url as resource from the cache byte[] b = getContent(url); if (b == null) return null; return new ByteArrayInputStream(b); } + */ /** * Returns the content of a cached resource as byte[] @@ -228,6 +228,7 @@ public final class Cache { * @param url * @return the size of the cached content */ + /* public static long getResourceContentLength(final DigestURI url) { // first try to get the length from the response header, // this is less costly than loading the content from its gzipped cache @@ -245,7 +246,8 @@ public final class Cache { return -1; } } - + */ + /** * removed response header and cached content from the database * @param url diff --git a/source/de/anomic/http/server/HTTPDProxyHandler.java b/source/de/anomic/http/server/HTTPDProxyHandler.java index 09b81b0f3..3857d1fc8 100644 --- a/source/de/anomic/http/server/HTTPDProxyHandler.java +++ b/source/de/anomic/http/server/HTTPDProxyHandler.java @@ -495,7 +495,11 @@ public final class HTTPDProxyHandler { long sizeBeforeDelete = -1; if (cachedResponseHeader != null) { // delete the cache - sizeBeforeDelete = Cache.getResourceContentLength(url); + ResponseHeader rh = Cache.getResponseHeader(url); + if (rh != null && (sizeBeforeDelete = rh.getContentLength()) == 0) { + byte[] b = Cache.getContent(url); + if (b != null) sizeBeforeDelete = b.length; + } Cache.delete(url); conProp.setProperty(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_CODE, "TCP_REFRESH_MISS"); } diff --git a/source/de/anomic/search/Segment.java b/source/de/anomic/search/Segment.java index 9404de4ff..e3e8a712a 100644 --- a/source/de/anomic/search/Segment.java +++ b/source/de/anomic/search/Segment.java @@ -26,6 +26,7 @@ package de.anomic.search; +import java.io.ByteArrayInputStream; import java.io.File; import java.io.IOException; import java.io.InputStream; @@ -374,22 +375,22 @@ public class Segment { InputStream 
resourceContent = null; try { // get the resource content - Object[] resource = null; + byte[] resourceb = null; try { - resource = loader.getResource(metadata.url(), fetchOnline, 10000, true, false); + resourceb = loader.getResource(metadata.url(), fetchOnline, 10000, true, false); } catch (IOException e) { Log.logWarning("removeAllUrlReferences", "cannot load: " + e.getMessage()); } - if (resource == null) { + if (resourceb == null) { // delete just the url entry urlMetadata().remove(urlhash); return 0; } else { - resourceContent = (InputStream) resource[0]; - final Long resourceContentLength = (Long) resource[1]; + resourceContent = new ByteArrayInputStream(resourceb); + final long resourceContentLength = resourceb.length; // parse the resource - final Document document = LoaderDispatcher.parseDocument(metadata.url(), resourceContentLength.longValue(), resourceContent, null); + final Document document = LoaderDispatcher.parseDocument(metadata.url(), resourceContentLength, resourceContent, null); // get the word set Set words = null; diff --git a/source/de/anomic/search/TextSnippet.java b/source/de/anomic/search/TextSnippet.java index 52f34c26f..93cff50be 100644 --- a/source/de/anomic/search/TextSnippet.java +++ b/source/de/anomic/search/TextSnippet.java @@ -26,7 +26,6 @@ package de.anomic.search; import java.io.ByteArrayInputStream; import java.io.IOException; -import java.io.InputStream; import java.util.ArrayList; import java.util.Iterator; import java.util.Set; @@ -329,8 +328,7 @@ public class TextSnippet { * LOADING RESOURCE DATA * =========================================================================== */ // if the snippet is not in the cache, we can try to get it from the htcache - long resContentLength = 0; - InputStream resContent = null; + byte[] resContent = null; ResponseHeader responseHeader = null; try { // first try to get the snippet from metadata @@ -349,11 +347,11 @@ public class TextSnippet { return new TextSnippet(url, loc, SOURCE_METADATA, 
null, null, faviconCache.get(url.hash())); } else { // trying to load the resource from the cache - resContent = Cache.getContentStream(url); + resContent = Cache.getContent(url); responseHeader = Cache.getResponseHeader(url); - if (resContent != null && ((resContentLength = Cache.getResourceContentLength(url)) > maxDocLen) && (!fetchOnline)) { + if (resContent != null && !fetchOnline && resContent.length > maxDocLen) { // content may be too large to be parsed here. To be fast, we omit calculation of snippet here - return new TextSnippet(url, null, ERROR_SOURCE_LOADING, queryhashes, "resource available, but too large: " + resContentLength + " bytes"); + return new TextSnippet(url, null, ERROR_SOURCE_LOADING, queryhashes, "resource available, but too large: " + resContent.length + " bytes"); } else if (fetchOnline) { // if not found try to download it @@ -368,11 +366,9 @@ public class TextSnippet { // read resource body (if it is there) final byte[] resourceArray = entry.getContent(); if (resourceArray != null) { - resContent = new ByteArrayInputStream(resourceArray); - resContentLength = resourceArray.length; + resContent = resourceArray; } else { - resContent = Cache.getContentStream(url); - resContentLength = Cache.getResourceContentLength(url); + resContent = Cache.getContent(url); } } @@ -394,11 +390,11 @@ public class TextSnippet { * =========================================================================== */ Document document = null; try { - document = LoaderDispatcher.parseDocument(url, resContentLength, resContent, responseHeader); + document = LoaderDispatcher.parseDocument(url, resContent.length, new ByteArrayInputStream(resContent), responseHeader); } catch (final ParserException e) { return new TextSnippet(url, null, ERROR_PARSER_FAILED, queryhashes, e.getMessage()); // cannot be parsed } finally { - try { resContent.close(); } catch (final Exception e) {/* ignore this */} + resContent = null; } if (document == null) return new TextSnippet(url, null, 
ERROR_PARSER_FAILED, queryhashes, "parser error/failed"); // cannot be parsed diff --git a/source/de/anomic/yacy/graphics/OSMTile.java b/source/de/anomic/yacy/graphics/OSMTile.java index a42774c5f..1cfcc0ea5 100644 --- a/source/de/anomic/yacy/graphics/OSMTile.java +++ b/source/de/anomic/yacy/graphics/OSMTile.java @@ -30,7 +30,6 @@ import java.awt.image.BufferedImage; import java.io.ByteArrayInputStream; import java.io.EOFException; import java.io.IOException; -import java.io.InputStream; import java.net.MalformedURLException; import java.util.Random; @@ -81,13 +80,13 @@ public class OSMTile { return null; } //System.out.println("*** DEBUG: fetching OSM tile: " + tileURL.toNormalform(true, true)); - InputStream tileStream = null; + byte[] tileb = null; try { - tileStream = Cache.getContentStream(tileURL); + tileb = Cache.getContent(tileURL); } catch (IOException e1) { Log.logException(e1); } - if (tileStream == null) { + if (tileb == null) { // download resource using the crawler and keep resource in memory if possible Response entry = null; try { @@ -96,11 +95,11 @@ Log.logWarning("yamyOSM", "cannot load: " + e.getMessage()); return null; } - if ((entry == null) || (entry.getContent() == null)) return null; - tileStream = new ByteArrayInputStream(entry.getContent()); + if (entry == null) return null; + if ((tileb = entry.getContent()) == null) return null; } try { - return ImageIO.read(tileStream); + return ImageIO.read(new ByteArrayInputStream(tileb)); } catch (final EOFException e) { return null; } catch (final IOException e) { diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java index 4f3900c1a..ea22043a8 100644 --- a/source/net/yacy/repository/LoaderDispatcher.java +++ b/source/net/yacy/repository/LoaderDispatcher.java @@ -273,48 +273,26 @@ } /** - * + * load the url as resource from the web or the cache * @param url * @param fetchOnline * @param socketTimeout * @param
forText - * @return an Object array containing - * - * - * - *
[0]the content as {@link InputStream}
[1]the content-length as {@link Integer}
+ * @return the content as {@code byte[]} * @throws IOException */ - public Object[] getResource(final DigestURI url, final boolean fetchOnline, final int socketTimeout, final boolean forText, final boolean reindexing) throws IOException { - // load the url as resource from the web - long contentLength = -1; - - // trying to load the resource body from cache - InputStream resource = Cache.getContentStream(url); - if (resource != null) { - contentLength = Cache.getResourceContentLength(url); - } else if (fetchOnline) { - // if the content is not available in cache try to download it from web - - // try to download the resource using the loader - final Response entry = load(url, forText, reindexing); - if (entry == null) return null; // not found in web - - // read resource body (if it is there) - final byte[] resourceArray = entry.getContent(); + public byte[] getResource(final DigestURI url, final boolean fetchOnline, final int socketTimeout, final boolean forText, final boolean reindexing) throws IOException { + byte[] resource = Cache.getContent(url); + if (resource != null) return resource; - // in case that the resource was not in ram, read it from disk - if (resourceArray == null) { - resource = Cache.getContentStream(url); - contentLength = Cache.getResourceContentLength(url); - } else { - resource = new ByteArrayInputStream(resourceArray); - contentLength = resourceArray.length; - } - } else { - return null; - } - return new Object[]{resource, Long.valueOf(contentLength)}; + if (!fetchOnline) return null; + + // try to download the resource using the loader + final Response entry = load(url, forText, reindexing); + if (entry == null) return null; // not found in web + + // read resource body (if it is there) + return entry.getContent(); } /** @@ -332,16 +310,14 @@ public static Document retrieveDocument(final DigestURI url, final boolean fetchOnline, final int timeout, final boolean forText, final boolean global) { // 
load resource - long resContentLength = 0; - InputStream resContent = null; + byte[] resContent = null; ResponseHeader responseHeader = null; try { // trying to load the resource from the cache - resContent = Cache.getContentStream(url); + resContent = Cache.getContent(url); responseHeader = Cache.getResponseHeader(url); if (resContent != null) { // if the content was found - resContentLength = Cache.getResourceContentLength(url); } else if (fetchOnline) { // if not found try to download it @@ -354,11 +330,9 @@ public final class LoaderDispatcher { // read resource body (if it is there) final byte[] resourceArray = entry.getContent(); if (resourceArray != null) { - resContent = new ByteArrayInputStream(resourceArray); - resContentLength = resourceArray.length; + resContent = resourceArray; } else { - resContent = Cache.getContentStream(url); - resContentLength = Cache.getResourceContentLength(url); + resContent = Cache.getContent(url); } } @@ -379,12 +353,12 @@ public final class LoaderDispatcher { // parse resource Document document = null; try { - document = parseDocument(url, resContentLength, resContent, responseHeader); + document = parseDocument(url, resContent.length, new ByteArrayInputStream(resContent), responseHeader); } catch (final ParserException e) { Log.logFine("snippet fetch", "parser error " + e.getMessage() + " for url " + url); return null; } finally { - try { resContent.close(); } catch (final Exception e) {} + resContent = null; } return document; }