From b1a8d0c020d958895a930fd340adf8c1b391dc90 Mon Sep 17 00:00:00 2001
From: orbiter
Date: Tue, 22 Mar 2011 10:35:26 +0000
Subject: [PATCH] enhancements to web cache and less strict caching rules

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7620 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 defaults/yacy.init                                   | 2 +-
 htroot/CacheResource_p.java                          | 4 ++--
 htroot/api/util/getpageinfo_p.java                   | 2 +-
 source/de/anomic/crawler/CrawlProfile.java           | 6 +++---
 source/de/anomic/crawler/CrawlSwitchboard.java       | 2 +-
 source/de/anomic/http/client/Cache.java              | 9 ++++-----
 source/de/anomic/http/server/HTTPDProxyHandler.java  | 8 ++++----
 source/de/anomic/search/Switchboard.java             | 2 +-
 source/de/anomic/yacy/graphics/OSMTile.java          | 2 +-
 source/net/yacy/kelondro/blob/HeapReader.java        | 1 +
 source/net/yacy/repository/LoaderDispatcher.java     | 8 ++++++--
 11 files changed, 25 insertions(+), 21 deletions(-)

diff --git a/defaults/yacy.init b/defaults/yacy.init
index 48e3b626e..1f054a946 100644
--- a/defaults/yacy.init
+++ b/defaults/yacy.init
@@ -740,7 +740,7 @@ search.navigation=hosts,authors,namespace,topics
 # consider content nevertheless as available and show result without snippet
 # false: no link verification and not snippet generation: all search results are valid without verification
-search.verify = iffresh
+search.verify = ifexist
 # in case that a link verification fails then the corresponding index reference can be
 # deleted to clean up the index. If this property is set then failed index verification in
diff --git a/htroot/CacheResource_p.java b/htroot/CacheResource_p.java
index 3d29ab834..1007cd4e9 100644
--- a/htroot/CacheResource_p.java
+++ b/htroot/CacheResource_p.java
@@ -54,7 +54,7 @@ public class CacheResource_p {
         }
         byte[] resource = null;
-        resource = Cache.getContent(url);
+        resource = Cache.getContent(url.hash());
         if (resource == null) return prop;
         // check request type
@@ -63,7 +63,7 @@
             return ImageParser.parse(u, resource);
         } else {
             // get response header and set mime type
-            ResponseHeader responseHeader = Cache.getResponseHeader(url);
+            ResponseHeader responseHeader = Cache.getResponseHeader(url.hash());
             String resMime = responseHeader == null ? null : responseHeader.mime();
             if (resMime != null) {
                 final ResponseHeader outgoingHeader = new ResponseHeader();
diff --git a/htroot/api/util/getpageinfo_p.java b/htroot/api/util/getpageinfo_p.java
index 137066838..026769b9c 100755
--- a/htroot/api/util/getpageinfo_p.java
+++ b/htroot/api/util/getpageinfo_p.java
@@ -56,7 +56,7 @@ public class getpageinfo_p {
         }
         ContentScraper scraper = null;
         if (u != null) try {
-            scraper = sb.loader.parseResource(u, CrawlProfile.CacheStrategy.IFFRESH);
+            scraper = sb.loader.parseResource(u, CrawlProfile.CacheStrategy.IFEXIST);
         } catch (final IOException e) {
             // now thats a fail, do nothing
         }
diff --git a/source/de/anomic/crawler/CrawlProfile.java b/source/de/anomic/crawler/CrawlProfile.java
index 19c2b519b..5deb25560 100644
--- a/source/de/anomic/crawler/CrawlProfile.java
+++ b/source/de/anomic/crawler/CrawlProfile.java
@@ -159,12 +159,12 @@ public class CrawlProfile extends ConcurrentHashMap implements M
     }
     public CacheStrategy cacheStrategy() {
         final String r = get(CACHE_STRAGEGY);
-        if (r == null) return CacheStrategy.IFFRESH;
+        if (r == null) return CacheStrategy.IFEXIST;
         try {
             return CacheStrategy.decode(Integer.parseInt(r));
         } catch (final NumberFormatException e) {
             Log.logException(e);
-            return CacheStrategy.IFFRESH;
+            return CacheStrategy.IFEXIST;
         }
     }
     public void setCacheStrategy(CacheStrategy newStrategy) {
@@ -260,7 +260,7 @@
         if (name.equals("iffresh")) return IFFRESH;
         if (name.equals("ifexist")) return IFEXIST;
         if (name.equals("cacheonly")) return CACHEONLY;
-        if (name.equals("true")) return IFFRESH;
+        if (name.equals("true")) return IFEXIST;
         if (name.equals("false")) return null; // if this cache strategy is assigned as query attribute, null means "do not create a snippet"
         return null;
     }
diff --git a/source/de/anomic/crawler/CrawlSwitchboard.java b/source/de/anomic/crawler/CrawlSwitchboard.java
index 9090d1416..00f49e66d 100644
--- a/source/de/anomic/crawler/CrawlSwitchboard.java
+++ b/source/de/anomic/crawler/CrawlSwitchboard.java
@@ -225,7 +225,7 @@ public final class CrawlSwitchboard {
         if (this.defaultTextSnippetLocalProfile == null) {
             // generate new default entry for snippet fetch and optional crawling
             this.defaultTextSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
-                    CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, true, false, false, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH);
+                    CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, true, false, false, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
             this.profilesActiveCrawls.put(this.defaultTextSnippetLocalProfile.handle().getBytes(), this.defaultTextSnippetLocalProfile);
         }
         if (this.defaultTextSnippetGlobalProfile == null) {
diff --git a/source/de/anomic/http/client/Cache.java b/source/de/anomic/http/client/Cache.java
index 4f1f887e4..ddcf81db8 100644
--- a/source/de/anomic/http/client/Cache.java
+++ b/source/de/anomic/http/client/Cache.java
@@ -202,11 +202,11 @@
      * @throws UnsupportedProtocolException if the protocol is not supported and therefore the
      *         info object couldn't be created
      */
-    public static ResponseHeader getResponseHeader(final DigestURI url) {
+    public static ResponseHeader getResponseHeader(final byte[] hash) {
         // loading data from database
         Map hdb;
-        hdb = responseHeaderDB.get(url.hash());
+        hdb = responseHeaderDB.get(hash);
         if (hdb == null) return null;
         return new ResponseHeader(null, hdb);
@@ -221,12 +221,11 @@
      *         is returned.
      * @throws IOException
      */
-    public static byte[] getContent(final DigestURI url) {
+    public static byte[] getContent(final byte[] hash) {
         // load the url as resource from the cache
         try {
-            byte[] b = fileDB.get(url.hash());
+            byte[] b = fileDB.get(hash);
             if (b == null) return null;
-            log.logInfo("cache hit for url " + url.toString() + ", " + b.length + " bytes");
             return b;
         } catch (UnsupportedEncodingException e) {
             Log.logException(e);
diff --git a/source/de/anomic/http/server/HTTPDProxyHandler.java b/source/de/anomic/http/server/HTTPDProxyHandler.java
index 96e1804f2..04975cc17 100644
--- a/source/de/anomic/http/server/HTTPDProxyHandler.java
+++ b/source/de/anomic/http/server/HTTPDProxyHandler.java
@@ -363,7 +363,7 @@
             // handle outgoing cookies
             handleOutgoingCookies(requestHeader, host, ip);
             prepareRequestHeader(conProp, requestHeader, hostlow);
-            ResponseHeader cachedResponseHeader = Cache.getResponseHeader(url);
+            ResponseHeader cachedResponseHeader = Cache.getResponseHeader(url.hash());
             // why are files unzipped upon arrival? why not zip all files in cache?
             // This follows from the following premises
@@ -409,7 +409,7 @@
                     "200 OK",
                     sb.crawler.defaultProxyProfile
             );
-            byte[] cacheContent = Cache.getContent(url);
+            byte[] cacheContent = Cache.getContent(url.hash());
             if (cacheContent != null && response.isFreshForProxy()) {
                 if (log.isFinest()) log.logFinest(reqID + " fulfill request from cache");
                 fulfillRequestFromCache(conProp, url, requestHeader, cachedResponseHeader, cacheContent, countedRespond);
@@ -500,9 +500,9 @@
                 long sizeBeforeDelete = -1;
                 if (cachedResponseHeader != null) {
                     // delete the cache
-                    ResponseHeader rh = Cache.getResponseHeader(url);
+                    ResponseHeader rh = Cache.getResponseHeader(url.hash());
                     if (rh != null && (sizeBeforeDelete = rh.getContentLength()) == 0) {
-                        byte[] b = Cache.getContent(url);
+                        byte[] b = Cache.getContent(url.hash());
                         if (b != null) sizeBeforeDelete = b.length;
                     }
                     Cache.delete(url);
diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java
index 467a46d97..54ae7a7da 100644
--- a/source/de/anomic/search/Switchboard.java
+++ b/source/de/anomic/search/Switchboard.java
@@ -1755,7 +1755,7 @@ public final class Switchboard extends serverSwitch {
             byte[] b = response.getContent();
             if (b == null) {
                 // fetch the document from cache
-                b = Cache.getContent(response.url());
+                b = Cache.getContent(response.url().hash());
                 if (b == null) {
                     this.log.logWarning("the resource '" + response.url() + "' is missing in the cache.");
                     addURLtoErrorDB(response.url(), response.referrerHash(), response.initiator(), response.name(), "missing in cache");
diff --git a/source/de/anomic/yacy/graphics/OSMTile.java b/source/de/anomic/yacy/graphics/OSMTile.java
index 5568736da..ee054f82a 100644
--- a/source/de/anomic/yacy/graphics/OSMTile.java
+++ b/source/de/anomic/yacy/graphics/OSMTile.java
@@ -107,7 +107,7 @@ public class OSMTile {
             return null;
         }
         //System.out.println("*** DEBUG: fetching OSM tile: " + tileURL.toNormalform(true, true));
-        byte[] tileb = Cache.getContent(tileURL);
+        byte[] tileb = Cache.getContent(tileURL.hash());
         if (tileb == null) {
             // download resource using the crawler and keep resource in memory if possible
             Response entry = null;
diff --git a/source/net/yacy/kelondro/blob/HeapReader.java b/source/net/yacy/kelondro/blob/HeapReader.java
index fa67aca3b..d0f25ccc2 100644
--- a/source/net/yacy/kelondro/blob/HeapReader.java
+++ b/source/net/yacy/kelondro/blob/HeapReader.java
@@ -440,6 +440,7 @@ public class HeapReader {
      * @throws IOException
      */
     public byte[] get(byte[] key) throws IOException, RowSpaceExceededException {
+        if (this.index == null) return null;
         key = normalizeKey(key);
         synchronized (this.index) {
diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java
index d16665ef7..b2082a256 100644
--- a/source/net/yacy/repository/LoaderDispatcher.java
+++ b/source/net/yacy/repository/LoaderDispatcher.java
@@ -191,8 +191,8 @@ public final class LoaderDispatcher {
         // we have passed a first test if caching is allowed
         // now see if there is a cache entry
-        ResponseHeader cachedResponse = (url.isLocal()) ? null : Cache.getResponseHeader(url);
-        byte[] content = (cachedResponse == null) ? null : Cache.getContent(url);
+        ResponseHeader cachedResponse = (url.isLocal()) ? null : Cache.getResponseHeader(url.hash());
+        byte[] content = (cachedResponse == null) ? null : Cache.getContent(url.hash());
         if (cachedResponse != null && content != null) {
             // yes we have the content
@@ -226,6 +226,10 @@
             } else {
                 log.logInfo("cache hit/stale for: " + url.toNormalform(true, false));
             }
+        } else if (cachedResponse != null) {
+            log.logWarning("HTCACHE contained response header, but not content for url " + url.toNormalform(true, false));
+        } else if (content != null) {
+            log.logWarning("HTCACHE contained content, but not response header for url " + url.toNormalform(true, false));
         }
     }
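
Note (added for illustration, not part of the patch): the sketch below shows how a caller might use the hash-keyed cache accessors introduced above. It is a minimal example under stated assumptions: the package paths for DigestURI and ResponseHeader are guesses based on this source tree, and the class name CacheLookupSketch and the example URL are placeholders, not names from the patch.

    // Illustrative usage sketch; package paths of DigestURI and ResponseHeader are assumptions.
    import java.net.MalformedURLException;

    import de.anomic.http.client.Cache;
    import de.anomic.http.server.ResponseHeader;
    import net.yacy.kelondro.data.meta.DigestURI;

    public class CacheLookupSketch {
        public static void main(final String[] args) throws MalformedURLException {
            // placeholder URL, only used to derive a hash for the lookup
            final DigestURI url = new DigestURI("http://example.net/index.html");

            // after this patch both accessors are keyed by the URL hash instead of the DigestURI itself
            final ResponseHeader header = Cache.getResponseHeader(url.hash());
            final byte[] content = Cache.getContent(url.hash());

            if (header != null && content != null) {
                System.out.println("cache hit: " + content.length + " bytes, mime " + header.mime());
            } else if (header != null || content != null) {
                // header and content can get out of sync; LoaderDispatcher now logs this case
                System.out.println("incomplete cache entry for " + url.toNormalform(true, false));
            } else {
                System.out.println("cache miss");
            }
        }
    }

Keying both lookups by url.hash() lets a caller that only holds the hash probe the HTCACHE without rebuilding a full DigestURI, and the relaxed default strategy (IFEXIST instead of IFFRESH) uses a cached copy whenever one exists, not only while it is still fresh.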