enhancements to web cache and less strict caching rules

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7620 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 14 years ago
parent f3baaca920
commit b1a8d0c020

@@ -740,7 +740,7 @@ search.navigation=hosts,authors,namespace,topics
 #            consider content nevertheless as available and show result without snippet
 # false: no link verification and not snippet generation: all search results are valid without verification
-search.verify = iffresh
+search.verify = ifexist
 # in case that a link verification fails then the corresponding index reference can be
 # deleted to clean up the index. If this property is set then failed index verification in
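
The strategy names above come from CrawlProfile.CacheStrategy, which the later hunks switch from IFFRESH to IFEXIST. As a rough orientation sketch (NOCACHE does not appear in this diff, and the per-value comments are inferred from the names and the changed call sites, not quoted javadoc):

    // rough sketch of the cache strategies behind search.verify and the crawl profiles
    public enum CacheStrategy {
        NOCACHE,   // ignore the web cache and always load online (assumption)
        IFFRESH,   // use a cached copy only while it is still fresh -- the old, stricter default
        IFEXIST,   // use any existing cached copy regardless of age -- the new, laxer default
        CACHEONLY  // never go online; use only what the cache already holds
    }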

@@ -54,7 +54,7 @@ public class CacheResource_p {
         }
         byte[] resource = null;
-        resource = Cache.getContent(url);
+        resource = Cache.getContent(url.hash());
         if (resource == null) return prop;
         // check request type
@@ -63,7 +63,7 @@ public class CacheResource_p {
             return ImageParser.parse(u, resource);
         } else {
             // get response header and set mime type
-            ResponseHeader responseHeader = Cache.getResponseHeader(url);
+            ResponseHeader responseHeader = Cache.getResponseHeader(url.hash());
             String resMime = responseHeader == null ? null : responseHeader.mime();
             if (resMime != null) {
                 final ResponseHeader outgoingHeader = new ResponseHeader();

@@ -56,7 +56,7 @@ public class getpageinfo_p {
         }
         ContentScraper scraper = null;
         if (u != null) try {
-            scraper = sb.loader.parseResource(u, CrawlProfile.CacheStrategy.IFFRESH);
+            scraper = sb.loader.parseResource(u, CrawlProfile.CacheStrategy.IFEXIST);
         } catch (final IOException e) {
             // now thats a fail, do nothing
         }

@@ -159,12 +159,12 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
     }
     public CacheStrategy cacheStrategy() {
         final String r = get(CACHE_STRAGEGY);
-        if (r == null) return CacheStrategy.IFFRESH;
+        if (r == null) return CacheStrategy.IFEXIST;
         try {
             return CacheStrategy.decode(Integer.parseInt(r));
         } catch (final NumberFormatException e) {
             Log.logException(e);
-            return CacheStrategy.IFFRESH;
+            return CacheStrategy.IFEXIST;
         }
     }
     public void setCacheStrategy(CacheStrategy newStrategy) {
@@ -260,7 +260,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
         if (name.equals("iffresh")) return IFFRESH;
         if (name.equals("ifexist")) return IFEXIST;
         if (name.equals("cacheonly")) return CACHEONLY;
-        if (name.equals("true")) return IFFRESH;
+        if (name.equals("true")) return IFEXIST;
         if (name.equals("false")) return null; // if this cache strategy is assigned as query attribute, null means "do not create a snippet"
         return null;
     }

@@ -225,7 +225,7 @@ public final class CrawlSwitchboard {
         if (this.defaultTextSnippetLocalProfile == null) {
             // generate new default entry for snippet fetch and optional crawling
             this.defaultTextSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
-                    CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, true, false, false, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH);
+                    CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, true, false, false, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
             this.profilesActiveCrawls.put(this.defaultTextSnippetLocalProfile.handle().getBytes(), this.defaultTextSnippetLocalProfile);
         }
         if (this.defaultTextSnippetGlobalProfile == null) {

@@ -202,11 +202,11 @@ public final class Cache {
      * @throws <b>UnsupportedProtocolException</b> if the protocol is not supported and therefore the
      * info object couldn't be created
      */
-    public static ResponseHeader getResponseHeader(final DigestURI url) {
+    public static ResponseHeader getResponseHeader(final byte[] hash) {
         // loading data from database
         Map<String, String> hdb;
-        hdb = responseHeaderDB.get(url.hash());
+        hdb = responseHeaderDB.get(hash);
         if (hdb == null) return null;
         return new ResponseHeader(null, hdb);
@@ -221,12 +221,11 @@
      * is returned.
      * @throws IOException
      */
-    public static byte[] getContent(final DigestURI url) {
+    public static byte[] getContent(final byte[] hash) {
         // load the url as resource from the cache
         try {
-            byte[] b = fileDB.get(url.hash());
+            byte[] b = fileDB.get(hash);
             if (b == null) return null;
-            log.logInfo("cache hit for url " + url.toString() + ", " + b.length + " bytes");
             return b;
         } catch (UnsupportedEncodingException e) {
             Log.logException(e);
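
With these signature changes the cache is addressed purely by URL hash; callers obtain the key from DigestURI.hash(), as the remaining hunks show. A minimal call-pattern sketch (the URL is a hypothetical example, not taken from this commit):

    // sketch: looking up a cached page by URL hash with the new Cache API
    try {
        final DigestURI url = new DigestURI("http://example.org/index.html"); // hypothetical URL
        final ResponseHeader header = Cache.getResponseHeader(url.hash());
        final byte[] content = Cache.getContent(url.hash());
        if (header != null && content != null) {
            // header and body are both present, so the entry is usable, e.g. for snippets
            final String mime = header.mime();
        }
    } catch (final MalformedURLException e) {
        // malformed example URL: nothing to look up
    }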

@@ -363,7 +363,7 @@ public final class HTTPDProxyHandler {
         // handle outgoing cookies
         handleOutgoingCookies(requestHeader, host, ip);
         prepareRequestHeader(conProp, requestHeader, hostlow);
-        ResponseHeader cachedResponseHeader = Cache.getResponseHeader(url);
+        ResponseHeader cachedResponseHeader = Cache.getResponseHeader(url.hash());
         // why are files unzipped upon arrival? why not zip all files in cache?
         // This follows from the following premises
@@ -409,7 +409,7 @@
                     "200 OK",
                     sb.crawler.defaultProxyProfile
             );
-            byte[] cacheContent = Cache.getContent(url);
+            byte[] cacheContent = Cache.getContent(url.hash());
             if (cacheContent != null && response.isFreshForProxy()) {
                 if (log.isFinest()) log.logFinest(reqID + " fulfill request from cache");
                 fulfillRequestFromCache(conProp, url, requestHeader, cachedResponseHeader, cacheContent, countedRespond);
@@ -500,9 +500,9 @@
             long sizeBeforeDelete = -1;
             if (cachedResponseHeader != null) {
                 // delete the cache
-                ResponseHeader rh = Cache.getResponseHeader(url);
+                ResponseHeader rh = Cache.getResponseHeader(url.hash());
                 if (rh != null && (sizeBeforeDelete = rh.getContentLength()) == 0) {
-                    byte[] b = Cache.getContent(url);
+                    byte[] b = Cache.getContent(url.hash());
                     if (b != null) sizeBeforeDelete = b.length;
                 }
                 Cache.delete(url);
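
Taken together, the proxy hunks above look up the cache by URL hash, serve the cached body while it is still fresh for proxy use, and otherwise measure and delete the stale entry. A condensed sketch of that flow (simplified; it collapses branches that are spread across the real handler and is not the literal control flow):

    // condensed sketch of the proxy's cache handling shown above
    final ResponseHeader cachedResponseHeader = Cache.getResponseHeader(url.hash());
    final byte[] cacheContent = Cache.getContent(url.hash());
    if (cachedResponseHeader != null && cacheContent != null && response.isFreshForProxy()) {
        // answer the request directly from the cache
        fulfillRequestFromCache(conProp, url, requestHeader, cachedResponseHeader, cacheContent, countedRespond);
    } else if (cachedResponseHeader != null) {
        // stale or incomplete entry: record its size for accounting, then drop it
        long sizeBeforeDelete = cachedResponseHeader.getContentLength();
        if (sizeBeforeDelete == 0 && cacheContent != null) sizeBeforeDelete = cacheContent.length;
        Cache.delete(url);
    }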

@@ -1755,7 +1755,7 @@ public final class Switchboard extends serverSwitch {
         byte[] b = response.getContent();
         if (b == null) {
             // fetch the document from cache
-            b = Cache.getContent(response.url());
+            b = Cache.getContent(response.url().hash());
             if (b == null) {
                 this.log.logWarning("the resource '" + response.url() + "' is missing in the cache.");
                 addURLtoErrorDB(response.url(), response.referrerHash(), response.initiator(), response.name(), "missing in cache");

@@ -107,7 +107,7 @@ public class OSMTile {
             return null;
         }
         //System.out.println("*** DEBUG: fetching OSM tile: " + tileURL.toNormalform(true, true));
-        byte[] tileb = Cache.getContent(tileURL);
+        byte[] tileb = Cache.getContent(tileURL.hash());
         if (tileb == null) {
             // download resource using the crawler and keep resource in memory if possible
             Response entry = null;

@@ -440,6 +440,7 @@ public class HeapReader {
      * @throws IOException
      */
     public byte[] get(byte[] key) throws IOException, RowSpaceExceededException {
+        if (this.index == null) return null;
         key = normalizeKey(key);
         synchronized (this.index) {

@@ -191,8 +191,8 @@ public final class LoaderDispatcher {
         // we have passed a first test if caching is allowed
         // now see if there is a cache entry
-        ResponseHeader cachedResponse = (url.isLocal()) ? null : Cache.getResponseHeader(url);
-        byte[] content = (cachedResponse == null) ? null : Cache.getContent(url);
+        ResponseHeader cachedResponse = (url.isLocal()) ? null : Cache.getResponseHeader(url.hash());
+        byte[] content = (cachedResponse == null) ? null : Cache.getContent(url.hash());
         if (cachedResponse != null && content != null) {
             // yes we have the content
@@ -226,6 +226,10 @@
             } else {
                 log.logInfo("cache hit/stale for: " + url.toNormalform(true, false));
             }
+        } else if (cachedResponse != null) {
+            log.logWarning("HTCACHE contained response header, but not content for url " + url.toNormalform(true, false));
+        } else if (content != null) {
+            log.logWarning("HTCACHE contained content, but not response header for url " + url.toNormalform(true, false));
         }
     }
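
The two new branches make half-complete HTCACHE entries visible in the log instead of silently falling through to an online load. Since content is only fetched when a header was found (see the guarded assignment above), the second warning is effectively a safety net. A tiny hypothetical helper in the same spirit, built only from the Cache calls this commit already uses (checkEntry is not part of the commit):

    // hypothetical helper: classify a cache entry as complete, header-only, body-only, or absent
    static String checkEntry(final byte[] urlHash) {
        final ResponseHeader header = Cache.getResponseHeader(urlHash);
        final byte[] body = Cache.getContent(urlHash);
        if (header != null && body != null) return "complete";
        if (header != null) return "header only"; // the case the loader now warns about
        if (body != null) return "body only";     // the mirror case
        return "absent";
    }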
