some enhancements in web caching: avoid double loading of response metadata and/or content

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6491 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 16 years ago
parent 06d0dcde20
commit fe41a84330
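The change in a nutshell: callers previously fetched the cached body as an InputStream and then asked the cache a second time for the content length, so the (gzipped) cache entry could be located and decompressed twice. After this commit the content is fetched once as a byte[] and the length is simply b.length. A minimal sketch of the new access pattern, using only the Cache API visible in this diff (the helper method itself is hypothetical):

```java
import java.io.IOException;

import de.anomic.http.client.Cache;
import net.yacy.kelondro.data.meta.DigestURI;

class SingleLoadSketch {
    // hypothetical helper, not part of the commit: one cache access
    // yields both the content and its length
    static byte[] loadOnce(final DigestURI url) {
        byte[] b;
        try {
            // before: Cache.getContentStream(url) followed by
            //         Cache.getResourceContentLength(url) -- two loads
            b = Cache.getContent(url); // now: one load; the length is b.length
        } catch (final IOException e) {
            b = null;
        }
        return b;
    }
}
```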

@@ -25,8 +25,8 @@
//javac -classpath .:../Classes Status.java
//if the shell's current path is HTROOT
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URLDecoder;
@@ -43,7 +43,6 @@ import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.repository.LoaderDispatcher;
import de.anomic.crawler.retrieval.Response;
@@ -158,18 +157,16 @@ public class ViewFile {
}
// loading the resource content as byte array
InputStream resource = null;
long resourceLength = -1;
byte[] resource = null;
ResponseHeader responseHeader = null;
String resMime = null;
// trying to load the resource body
try {
resource = Cache.getContentStream(url);
resource = Cache.getContent(url);
} catch (IOException e) {
Log.logException(e);
resource = null;
}
resourceLength = Cache.getResourceContentLength(url);
responseHeader = Cache.getResponseHeader(url);
// if the resource body was not cached we try to load it from web
@@ -185,13 +182,7 @@
}
if (entry != null) {
try {
resource = Cache.getContentStream(url);
} catch (IOException e) {
Log.logException(e);
resource = null;
}
resourceLength = Cache.getResourceContentLength(url);
resource = entry.getContent();
}
if (resource == null) {
@@ -241,19 +232,14 @@
// TODO: how to handle very large files here ?
String content;
try {
content = new String(FileUtils.read(resource), "UTF-8");
content = new String(resource, "UTF-8");
} catch (final Exception e) {
prop.put("error", "4");
prop.putHTML("error_errorText", e.getMessage());
prop.put("viewMode", VIEW_MODE_NO_TEXT);
return prop;
} finally {
if (resource != null)
try {
resource.close();
} catch (final Exception e) {
/* ignore this */
}
resource = null;
}
prop.put("error", "0");
@@ -268,7 +254,7 @@
// parsing the resource content
Document document = null;
try {
document = LoaderDispatcher.parseDocument(url, resourceLength, resource, null);
document = LoaderDispatcher.parseDocument(url, resource.length, new ByteArrayInputStream(resource), null);
if (document == null) {
prop.put("error", "5");
prop.put("error_errorText", "Unknown error");
@@ -281,12 +267,7 @@
prop.put("viewMode", VIEW_MODE_NO_TEXT);
return prop;
} finally {
if (resource != null)
try {
resource.close();
} catch (final Exception e) {
/* ignore this */
}
resource = null;
}
resMime = document.dc_format();
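With the body held as byte[], ViewFile can hand the same bytes to both the plain-text view and the parser; a ByteArrayInputStream is created only at the single point where an InputStream is actually required. A sketch of that pattern, assuming the parseDocument signature shown in this diff (the Document package is assumed from the imports visible above):

```java
import java.io.ByteArrayInputStream;

import net.yacy.document.Document; // package assumed from the imports in this diff
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.repository.LoaderDispatcher;

class ViewFileSketch {
    // illustrative only: one byte[] serves both the text view and the parser
    static Document useBoth(final DigestURI url, final byte[] resource) throws Exception {
        // plain-text view: decode the bytes directly
        final String content = new String(resource, "UTF-8");
        System.out.println(content.length() + " characters decoded");
        // parsed view: wrap the same bytes in a stream only where one is required
        return LoaderDispatcher.parseDocument(
                url, resource.length, new ByteArrayInputStream(resource), null);
    }
}
```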

@@ -23,6 +23,7 @@
import java.awt.Container;
import java.awt.Image;
import java.awt.MediaTracker;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
@@ -84,17 +85,17 @@ public class ViewImage {
int maxheight = post.getInt("maxheight", 0);
final int timeout = post.getInt("timeout", 5000);
// getting the image as stream
// get the image as stream
Image scaled = iconcache.get(urlString);
if (scaled == null) {
Object[] resource = null;
byte[] resourceb = null;
if (url != null) try {
resource = sb.loader.getResource(url, true, timeout, false, true);
resourceb = sb.loader.getResource(url, true, timeout, false, true);
} catch (IOException e) {
Log.logWarning("ViewImage", "cannot load: " + e.getMessage());
}
byte[] imgb = null;
if (resource == null) {
if (resourceb == null) {
if (urlString.endsWith(".ico")) {
// load default favicon dfltfvcn.ico
if (defaulticonb == null) try {
@@ -108,7 +109,7 @@ public class ViewImage {
return null;
}
} else {
final InputStream imgStream = (InputStream) resource[0];
final InputStream imgStream = new ByteArrayInputStream(resourceb);
if (imgStream == null) return null;
// read image data
@@ -138,8 +139,8 @@ public class ViewImage {
maxwidth = (maxwidth == 0) ? w : maxwidth;
maxheight = (maxheight == 0) ? h : maxheight;
} else if ((w > 16) || (h > 16)) {
maxwidth = (int) Math.min(64.0, w * 0.6);
maxheight = (int) Math.min(64.0, h * 0.6);
maxwidth = Math.min(96, w);
maxheight = Math.min(96, h);
} else {
maxwidth = 16;
maxheight = 16;
@@ -151,7 +152,7 @@ public class ViewImage {
final double hs = (w <= maxwidth) ? 1.0 : ((double) maxwidth) / ((double) w);
final double vs = (h <= maxheight) ? 1.0 : ((double) maxheight) / ((double) h);
double scale = Math.min(hs, vs);
if (!auth) scale = Math.min(scale, 0.6); // this is for copyright purpose
//if (!auth) scale = Math.min(scale, 0.6); // this is for copyright purpose
if (scale < 1.0) {
width = Math.max(1, (int) (w * scale));
height = Math.max(1, (int) (h * scale));
@@ -172,7 +173,7 @@ public class ViewImage {
scaled = image;
}
if ((height == 16) && (width == 16) && (resource != null)) {
if ((height == 16) && (width == 16) && (resourceb != null)) {
// this might be a favicon, store image to cache for faster re-load later on
iconcache.put(urlString, scaled);
}
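ViewImage now receives the raw bytes and wraps them only for decoding. The scaling rule is unchanged: a horizontal and a vertical shrink factor are computed independently and the smaller one is applied to both axes, so the aspect ratio is preserved. A sketch of decode-and-fit from a byte[], under the assumption that ImageIO can handle the format (ViewImage itself decodes via Toolkit/MediaTracker):

```java
import java.awt.Image;
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.IOException;

import javax.imageio.ImageIO;

class ImageScaleSketch {
    // illustrative only: decode from byte[] and shrink to fit a bounding box
    static Image fit(final byte[] imgb, final int maxwidth, final int maxheight) throws IOException {
        final BufferedImage image = ImageIO.read(new ByteArrayInputStream(imgb));
        if (image == null) return null; // undecodable format
        final int w = image.getWidth(), h = image.getHeight();
        // per-axis shrink factors; the smaller one keeps the aspect ratio
        final double hs = (w <= maxwidth) ? 1.0 : ((double) maxwidth) / ((double) w);
        final double vs = (h <= maxheight) ? 1.0 : ((double) maxheight) / ((double) h);
        final double scale = Math.min(hs, vs);
        if (scale >= 1.0) return image; // already fits
        return image.getScaledInstance(
                Math.max(1, (int) (w * scale)), Math.max(1, (int) (h * scale)),
                Image.SCALE_SMOOTH); // quality hint assumed; ViewImage may differ
    }
}
```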

@@ -34,10 +34,8 @@
package de.anomic.http.client;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.HashMap;
import java.util.Map;
@@ -195,12 +193,14 @@ public final class Cache {
* is returned.
* @throws IOException
*/
/*
public static InputStream getContentStream(final DigestURI url) throws IOException {
// load the url as resource from the cache
byte[] b = getContent(url);
if (b == null) return null;
return new ByteArrayInputStream(b);
}
*/
/**
* Returns the content of a cached resource as byte[]
@@ -228,6 +228,7 @@ public final class Cache {
* @param url
* @return the size of the cached content
*/
/*
public static long getResourceContentLength(final DigestURI url) {
// first try to get the length from the response header,
// this is less costly than loading the content from its gzipped cache
@@ -245,7 +246,8 @@
return -1;
}
}
*/
/**
* removed response header and cached content from the database
* @param url
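With getResourceContentLength commented out, callers that only need the size first consult the stored ResponseHeader (cheap) and fall back to loading the content, one decompression at most, only when the header reports no length; that is exactly what HTTPDProxyHandler does in the next hunk. A sketch of the fallback with a hypothetical helper name (the ResponseHeader import path is assumed for this revision):

```java
import java.io.IOException;

import de.anomic.http.client.Cache;
import de.anomic.http.metadata.ResponseHeader; // import path assumed for this revision
import net.yacy.kelondro.data.meta.DigestURI;

class CacheSizeSketch {
    // hypothetical helper: size from the header when present, content otherwise
    static long cachedSize(final DigestURI url) throws IOException {
        final ResponseHeader rh = Cache.getResponseHeader(url);
        long size = (rh == null) ? 0 : rh.getContentLength();
        if (size == 0) {
            final byte[] b = Cache.getContent(url); // the single, costlier load
            if (b != null) size = b.length;
        }
        return size;
    }
}
```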

@@ -495,7 +495,11 @@ public final class HTTPDProxyHandler {
long sizeBeforeDelete = -1;
if (cachedResponseHeader != null) {
// delete the cache
sizeBeforeDelete = Cache.getResourceContentLength(url);
ResponseHeader rh = Cache.getResponseHeader(url);
if (rh != null && (sizeBeforeDelete = rh.getContentLength()) == 0) {
byte[] b = Cache.getContent(url);
if (b != null) sizeBeforeDelete = b.length;
}
Cache.delete(url);
conProp.setProperty(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_CODE, "TCP_REFRESH_MISS");
}

@@ -26,6 +26,7 @@
package de.anomic.search;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
@@ -374,22 +375,22 @@ public class Segment {
InputStream resourceContent = null;
try {
// get the resource content
Object[] resource = null;
byte[] resourceb = null;
try {
resource = loader.getResource(metadata.url(), fetchOnline, 10000, true, false);
resourceb = loader.getResource(metadata.url(), fetchOnline, 10000, true, false);
} catch (IOException e) {
Log.logWarning("removeAllUrlReferences", "cannot load: " + e.getMessage());
}
if (resource == null) {
if (resourceb == null) {
// delete just the url entry
urlMetadata().remove(urlhash);
return 0;
} else {
resourceContent = (InputStream) resource[0];
final Long resourceContentLength = (Long) resource[1];
resourceContent = new ByteArrayInputStream(resourceb);
final long resourceContentLength = resourceb.length;
// parse the resource
final Document document = LoaderDispatcher.parseDocument(metadata.url(), resourceContentLength.longValue(), resourceContent, null);
final Document document = LoaderDispatcher.parseDocument(metadata.url(), resourceContentLength, resourceContent, null);
// get the word set
Set<String> words = null;

@@ -26,7 +26,6 @@ package de.anomic.search;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Set;
@@ -329,8 +328,7 @@ public class TextSnippet {
* LOADING RESOURCE DATA
* =========================================================================== */
// if the snippet is not in the cache, we can try to get it from the htcache
long resContentLength = 0;
InputStream resContent = null;
byte[] resContent = null;
ResponseHeader responseHeader = null;
try {
// first try to get the snippet from metadata
@@ -349,11 +347,11 @@ public class TextSnippet {
return new TextSnippet(url, loc, SOURCE_METADATA, null, null, faviconCache.get(url.hash()));
} else {
// trying to load the resource from the cache
resContent = Cache.getContentStream(url);
resContent = Cache.getContent(url);
responseHeader = Cache.getResponseHeader(url);
if (resContent != null && ((resContentLength = Cache.getResourceContentLength(url)) > maxDocLen) && (!fetchOnline)) {
if (resContent != null && !fetchOnline && resContent.length > maxDocLen) {
// content may be too large to be parsed here. To be fast, we omit calculation of snippet here
return new TextSnippet(url, null, ERROR_SOURCE_LOADING, queryhashes, "resource available, but too large: " + resContentLength + " bytes");
return new TextSnippet(url, null, ERROR_SOURCE_LOADING, queryhashes, "resource available, but too large: " + resContent.length + " bytes");
} else if (fetchOnline) {
// if not found try to download it
@@ -368,11 +366,9 @@ public class TextSnippet {
// read resource body (if it is there)
final byte[] resourceArray = entry.getContent();
if (resourceArray != null) {
resContent = new ByteArrayInputStream(resourceArray);
resContentLength = resourceArray.length;
resContent = resourceArray;
} else {
resContent = Cache.getContentStream(url);
resContentLength = Cache.getResourceContentLength(url);
resContent = Cache.getContent(url);
}
}
@@ -394,11 +390,11 @@
* =========================================================================== */
Document document = null;
try {
document = LoaderDispatcher.parseDocument(url, resContentLength, resContent, responseHeader);
document = LoaderDispatcher.parseDocument(url, resContent.length, new ByteArrayInputStream(resContent), responseHeader);
} catch (final ParserException e) {
return new TextSnippet(url, null, ERROR_PARSER_FAILED, queryhashes, e.getMessage()); // cannot be parsed
} finally {
try { resContent.close(); } catch (final Exception e) {/* ignore this */}
resContent = null;
}
if (document == null) return new TextSnippet(url, null, ERROR_PARSER_FAILED, queryhashes, "parser error/failed"); // cannot be parsed
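The too-large guard also becomes cheaper: instead of a second cache query for the length, the check is now a field access on the bytes already in hand. A sketch of the guard with maxDocLen as in the snippet code above:

```java
class SnippetGuardSketch {
    // illustrative only: skip snippet extraction for oversized cached bodies
    static boolean tooLargeForSnippet(final byte[] resContent, final long maxDocLen,
            final boolean fetchOnline) {
        // offline we must not parse huge bodies just to build a snippet;
        // online the resource is re-fetched and handled elsewhere
        return resContent != null && !fetchOnline && resContent.length > maxDocLen;
    }
}
```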

@@ -30,7 +30,6 @@ import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.util.Random;
@@ -81,13 +80,13 @@ public class OSMTile {
return null;
}
//System.out.println("*** DEBUG: fetching OSM tile: " + tileURL.toNormalform(true, true));
InputStream tileStream = null;
byte[] tileb = null;
try {
tileStream = Cache.getContentStream(tileURL);
tileb = Cache.getContent(tileURL);
} catch (IOException e1) {
Log.logException(e1);
}
if (tileStream == null) {
if (tileb == null) {
// download resource using the crawler and keep resource in memory if possible
Response entry = null;
try {
@@ -96,11 +95,11 @@ public class OSMTile {
Log.logWarning("yamyOSM", "cannot load: " + e.getMessage());
return null;
}
if ((entry == null) || (entry.getContent() == null)) return null;
tileStream = new ByteArrayInputStream(entry.getContent());
tileb = entry.getContent();
if (entry == null) return null;
}
try {
return ImageIO.read(tileStream);
return ImageIO.read(new ByteArrayInputStream(tileb));
} catch (final EOFException e) {
return null;
} catch (final IOException e) {

@@ -273,48 +273,26 @@ public final class LoaderDispatcher {
}
/**
*
* load the url as resource from the web or the cache
* @param url
* @param fetchOnline
* @param socketTimeout
* @param forText
* @return an Object array containing
* <table>
* <tr><td>[0]</td><td>the content as {@link InputStream}</td></tr>
* <tr><td>[1]</td><td>the content-length as {@link Integer}</td></tr>
* </table>
* @return the content as {@link byte[]}
* @throws IOException
*/
public Object[] getResource(final DigestURI url, final boolean fetchOnline, final int socketTimeout, final boolean forText, final boolean reindexing) throws IOException {
// load the url as resource from the web
long contentLength = -1;
// trying to load the resource body from cache
InputStream resource = Cache.getContentStream(url);
if (resource != null) {
contentLength = Cache.getResourceContentLength(url);
} else if (fetchOnline) {
// if the content is not available in cache try to download it from web
// try to download the resource using the loader
final Response entry = load(url, forText, reindexing);
if (entry == null) return null; // not found in web
// read resource body (if it is there)
final byte[] resourceArray = entry.getContent();
public byte[] getResource(final DigestURI url, final boolean fetchOnline, final int socketTimeout, final boolean forText, final boolean reindexing) throws IOException {
byte[] resource = Cache.getContent(url);
if (resource != null) return resource;
// in case that the resource was not in ram, read it from disk
if (resourceArray == null) {
resource = Cache.getContentStream(url);
contentLength = Cache.getResourceContentLength(url);
} else {
resource = new ByteArrayInputStream(resourceArray);
contentLength = resourceArray.length;
}
} else {
return null;
}
return new Object[]{resource, Long.valueOf(contentLength)};
if (!fetchOnline) return null;
// try to download the resource using the loader
final Response entry = load(url, forText, reindexing);
if (entry == null) return null; // not found in web
// read resource body (if it is there)
return entry.getContent();
}
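Call sites change accordingly: getResource now returns the bytes directly instead of an Object[] holding a stream and a boxed length, so the casts at every caller disappear. An assumed usage sketch of the new call shape, with the parameter values taken from the Segment hunk above:

```java
import java.io.IOException;

import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.repository.LoaderDispatcher;

class GetResourceSketch {
    // illustrative only: the new, cast-free call shape
    static byte[] fetch(final LoaderDispatcher loader, final DigestURI url) {
        try {
            // before: Object[] r = loader.getResource(...);
            //         InputStream s = (InputStream) r[0]; long len = (Long) r[1];
            return loader.getResource(url, true, 10000, true, false);
        } catch (final IOException e) {
            return null; // not cached and not loadable from the web
        }
    }
}
```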
/**
@@ -332,16 +310,14 @@ public final class LoaderDispatcher {
public static Document retrieveDocument(final DigestURI url, final boolean fetchOnline, final int timeout, final boolean forText, final boolean global) {
// load resource
long resContentLength = 0;
InputStream resContent = null;
byte[] resContent = null;
ResponseHeader responseHeader = null;
try {
// trying to load the resource from the cache
resContent = Cache.getContentStream(url);
resContent = Cache.getContent(url);
responseHeader = Cache.getResponseHeader(url);
if (resContent != null) {
// if the content was found
resContentLength = Cache.getResourceContentLength(url);
} else if (fetchOnline) {
// if not found try to download it
@@ -354,11 +330,9 @@ public final class LoaderDispatcher {
// read resource body (if it is there)
final byte[] resourceArray = entry.getContent();
if (resourceArray != null) {
resContent = new ByteArrayInputStream(resourceArray);
resContentLength = resourceArray.length;
resContent = resourceArray;
} else {
resContent = Cache.getContentStream(url);
resContentLength = Cache.getResourceContentLength(url);
resContent = Cache.getContent(url);
}
}
@@ -379,12 +353,12 @@
// parse resource
Document document = null;
try {
document = parseDocument(url, resContentLength, resContent, responseHeader);
document = parseDocument(url, resContent.length, new ByteArrayInputStream(resContent), responseHeader);
} catch (final ParserException e) {
Log.logFine("snippet fetch", "parser error " + e.getMessage() + " for url " + url);
return null;
} finally {
try { resContent.close(); } catch (final Exception e) {}
resContent = null;
}
return document;
}
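A side effect of the byte[] representation is that the finally blocks shrink: a ByteArrayInputStream holds no operating-system resources and its close() is documented to have no effect, so the old guarded close-and-ignore dance can be replaced by simply dropping the reference, as the hunks above do. A small illustration of that JDK fact:

```java
import java.io.ByteArrayInputStream;
import java.io.IOException;

class StreamCloseSketch {
    // illustrative only
    static void demo(final byte[] resContent) throws IOException {
        final ByteArrayInputStream in = new ByteArrayInputStream(resContent);
        // before this commit: try { resContent.close(); } catch (Exception e) { /* ignore */ }
        // now: setting resContent = null (as the diff does) is sufficient
        in.close(); // no-op: closing a ByteArrayInputStream has no effect
    }
}
```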
