diff --git a/htroot/CacheAdmin_p.java b/htroot/CacheAdmin_p.java
index 6bfb66834..526a104c7 100644
--- a/htroot/CacheAdmin_p.java
+++ b/htroot/CacheAdmin_p.java
@@ -57,7 +57,6 @@ import de.anomic.htmlFilter.htmlFilterContentScraper;
 import de.anomic.htmlFilter.htmlFilterWriter;
 import de.anomic.http.httpHeader;
 import de.anomic.net.URL;
-import de.anomic.plasma.plasmaHTCache;
 import de.anomic.plasma.plasmaParserDocument;
 import de.anomic.plasma.plasmaSwitchboard;
 import de.anomic.plasma.cache.IResourceInfo;
diff --git a/source/de/anomic/plasma/plasmaHTCache.java b/source/de/anomic/plasma/plasmaHTCache.java
index f3c640725..df9c4c042 100644
--- a/source/de/anomic/plasma/plasmaHTCache.java
+++ b/source/de/anomic/plasma/plasmaHTCache.java
@@ -103,17 +103,19 @@ public final class plasmaHTCache {
     public final File cachePath;
     public final serverLog log;
     public static final HashSet filesInUse = new HashSet(); // can we delete this file
-    public final boolean useTreeStorage;
+    public String cacheLayout;
+    public boolean cacheMigration;
 
     private ResourceInfoFactory objFactory;
     private serverThread cacheScanThread;
 
-    public plasmaHTCache(File htCachePath, long maxCacheSize, int bufferkb, long preloadTime, boolean useTreeStorage) {
+    public plasmaHTCache(File htCachePath, long maxCacheSize, int bufferkb, long preloadTime, String cacheLayout, boolean cacheMigration) {
         // this.switchboard = switchboard;
 
         this.log = new serverLog("HTCACHE");
         this.cachePath = htCachePath;
-        this.useTreeStorage = useTreeStorage;
+        this.cacheLayout = cacheLayout;
+        this.cacheMigration = cacheMigration;
 
         // create the object factory
         this.objFactory = new ResourceInfoFactory();
@@ -661,23 +663,51 @@ public final class plasmaHTCache {
         if (port >= 0) {
             fileName.append('!').append(port);
         }
-        File FileTree = new File(this.cachePath, fileName.toString() + path);
+
+        // generate cache path according to storage method
+        if (cacheLayout.equals("tree")) {
+            File FileTree = treeFile(fileName, path);
+            if (cacheMigration) {
+                moveCachedObject(hashFile(fileName, extention, url), FileTree);
+            }
+            return FileTree;
+        }
+        if (cacheLayout.equals("hash")) {
+            File FileFlat = hashFile(fileName, extention, url);
+            if (cacheMigration) {
+                moveCachedObject(treeFile(fileName, path), FileFlat);
+            }
+            return FileFlat;
+        }
+        // cacheLayout is neither "tree" nor "hash": no cache path can be derived
+        return null;
+    }
+
+    /**
+     * Cache file for the "tree" layout: path appended to the fileName prefix.
+     */
+    private File treeFile(StringBuffer fileName, String path) {
+        return new File(this.cachePath, fileName.toString() + path);
+    }
+
+    /**
+     * Cache file for the "hash" layout: fileName/xx/yy/hexhash[extention],
+     * where xx and yy are the first two character pairs of the hex-encoded
+     * url hash. fileName is left unmodified because the caller passes the
+     * same buffer to treeFile when migrating.
+     */
+    private File hashFile(StringBuffer fileName, String extention, URL url) {
         String urlHash = indexURL.urlHash(url);
         String hexHash = serverCodings.encodeHex(kelondroBase64Order.enhancedCoder.decode(urlHash));
-        fileName.append('/').append(hexHash.substring(0,2)).append('/').append(hexHash.substring(2,4)).append('/').append(hexHash);
+        StringBuffer f = new StringBuffer(18);
+        f.append('/').append(hexHash.substring(0,2)).append('/').append(hexHash.substring(2,4)).append('/').append(hexHash);
         if (extention != null) {
-            fileName.append(extention);
+            f.append(extention);
         }
-        File FileFlat = new File(this.cachePath, fileName.toString());
-        if (useTreeStorage) {
-            moveCachedObject(FileFlat, FileTree);
-            return FileTree;
-        } else {
-            moveCachedObject(FileTree, FileFlat);
-            return FileFlat;
-        }
+        return new File(this.cachePath, fileName.toString() + f);
     }
-
+
+
     /**
      * This is a helper funktion that extracts the Hash from the filename
      */
@@ -922,7 +952,7 @@ public final class plasmaHTCache {
         private String name; // the name of the link, read as anchor from an -tag
         private String nomalizedURLHash;
         private String nomalizedURLString;
-        private int status; // cache load/hit/stale etc status
+        //private int status; // cache load/hit/stale etc status
         private Date lastModified;
         private char doctype;
         private String language;
@@ -1013,6 +1043,14 @@ public final class plasmaHTCache {
             return this.nomalizedURLHash;
         }
 
+        public Date lastModified() {
+            return this.lastModified;
+        }
+
+        public String language() {
+            return this.language;
+        }
+
         public plasmaCrawlProfile.entry profile() {
             return this.profile;
         }
diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java
index fa9b9005d..e64c00f61 100644
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@@ -449,8 +449,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
         }
         this.log.logInfo("HTCACHE Path = " + htCachePath.getAbsolutePath());
         long maxCacheSize = 1024 * 1024 * Long.parseLong(getConfig("proxyCacheSize", "2")); // this is megabyte
-        boolean useTreeStorage = getConfigBool("proxyCacheTree", true);
-        this.cacheManager = new plasmaHTCache(htCachePath, maxCacheSize, ramHTTP, ramHTTP_time, useTreeStorage);
+        String cacheLayout = getConfig("proxyCacheLayout", "tree");
+        boolean cacheMigration = getConfigBool("proxyCacheMigration", true);
+        this.cacheManager = new plasmaHTCache(htCachePath, maxCacheSize, ramHTTP, ramHTTP_time, cacheLayout, cacheMigration);
 
         // make parser
         log.logConfig("Starting Parser");
diff --git a/yacy.init b/yacy.init
index a0d9ebdf9..6033bb0ba 100644
--- a/yacy.init
+++ b/yacy.init
@@ -145,19 +145,11 @@ messConfig = httpd.messages
 # to enable that function, set proxy=true
 proxy=true
 
-# a path to the proxy's file cache.
+# a path to the file cache, used for the internal proxy and as crawl buffer
 # This will be used if the server is addressed as a proxy
 proxyCache = DATA/HTCACHE
 
-# the proxy's maximum disc cache size in megabytes
-# there should be enough space for the browsing load of an internet caffee
-# running at 56kbit/s modem speed (this time not unusual)
-# during 3 days, 8 hours a day
-# necessary space = 3 * 8 * 60 * 60 * 56 / 8 = 604800 KB = ca. 590 MB
-# since 600 MB is not much these days (it's below one GB!)
-# we recommend using that space
-#proxyCacheSize = 600
-#for testing:
+# the maximum disc cache size for files in proxyCache in megabytes
 proxyCacheSize = 200
 
 # use the mostly direct mapping of URLs to Filenames
@@ -173,7 +165,12 @@ proxyCacheSize = 200
 # files that are present under the previously used layout will be renamed
 # to the new location and thus be accessible immediately. so an accumulated
 # cache is still usable after the switch.
-proxyCacheTree = true
+# possible values are {tree, hash}
+proxyCacheLayout = tree
+
+# the migration flag controls whether files cached under the other layout are moved to the configured layout
+proxyCacheMigration = true
+
 
 # the following mime-types are the whitelist for indexing
 #