more generic cache methods

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2721 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 18 years ago
parent 72482b1426
commit 0f10bdde22

@ -57,7 +57,6 @@ import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterWriter;
import de.anomic.http.httpHeader;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.cache.IResourceInfo;

@ -103,17 +103,19 @@ public final class plasmaHTCache {
public final File cachePath;
public final serverLog log;
public static final HashSet filesInUse = new HashSet(); // can we delete this file
public final boolean useTreeStorage;
public String cacheLayout;
public boolean cacheMigration;
private ResourceInfoFactory objFactory;
private serverThread cacheScanThread;
public plasmaHTCache(File htCachePath, long maxCacheSize, int bufferkb, long preloadTime, boolean useTreeStorage) {
public plasmaHTCache(File htCachePath, long maxCacheSize, int bufferkb, long preloadTime, String cacheLayout, boolean cacheMigration) {
// this.switchboard = switchboard;
this.log = new serverLog("HTCACHE");
this.cachePath = htCachePath;
this.useTreeStorage = useTreeStorage;
this.cacheLayout = cacheLayout;
this.cacheMigration = cacheMigration;
// create the object factory
this.objFactory = new ResourceInfoFactory();
@ -661,23 +663,41 @@ public final class plasmaHTCache {
if (port >= 0) {
fileName.append('!').append(port);
}
File FileTree = new File(this.cachePath, fileName.toString() + path);
// generate cache path according to storage method
if (cacheLayout.equals("tree")) {
File FileTree = treeFile(fileName, path);
if (cacheMigration) {
moveCachedObject(hashFile(fileName, extention, url), FileTree);
}
return FileTree;
}
if (cacheLayout.equals("hash")) {
File FileFlat = hashFile(fileName, extention, url);
if (cacheMigration) {
moveCachedObject(treeFile(fileName, path), FileFlat);
}
return FileFlat;
}
return null;
}
/**
 * Resolves the cache file for the given name prefix and URL path by
 * concatenating both and placing the result below the cache root.
 *
 * @param fileName the leading part of the relative file name
 * @param path     the path part that is appended to fileName
 * @return the file below cachePath named by fileName followed by path
 */
private File treeFile(StringBuffer fileName, String path) {
    final String relativeName = fileName.toString() + path;
    return new File(this.cachePath, relativeName);
}
/**
 * Resolves the cache file for the given URL using the hash of the URL.
 * The resulting relative name is fileName, followed by two directory
 * levels taken from the first four hex digits of the hash, followed by
 * the full hex hash and, if given, the extension.
 *
 * The name is assembled in a local buffer: the passed fileName is not
 * modified, so the caller can keep using it (e.g. to compute the
 * tree-layout file for the same URL afterwards).
 *
 * @param fileName  the leading part of the relative file name; left unchanged
 * @param extention file extension appended after the hash, or null for none
 * @param url       the URL whose hash names the file
 * @return the file below cachePath that belongs to the URL
 */
private File hashFile(StringBuffer fileName, String extention, URL url) {
    String urlHash = indexURL.urlHash(url);
    String hexHash = serverCodings.encodeHex(kelondroBase64Order.enhancedCoder.decode(urlHash));

    // work on a copy so that the caller's buffer stays untouched
    StringBuffer f = new StringBuffer(fileName.length() + hexHash.length() + 16);
    f.append(fileName);
    f.append('/').append(hexHash.substring(0,2)).append('/').append(hexHash.substring(2,4)).append('/').append(hexHash);

    // the extension belongs to the file itself, so it goes after the hash
    if (extention != null) {
        f.append(extention);
    }
    return new File(this.cachePath, f.toString());
}
/**
* This is a helper function that extracts the Hash from the filename
*/
@ -922,7 +942,7 @@ public final class plasmaHTCache {
private String name; // the name of the link, read as anchor from an <a>-tag
private String nomalizedURLHash;
private String nomalizedURLString;
private int status; // cache load/hit/stale etc status
//private int status; // cache load/hit/stale etc status
private Date lastModified;
private char doctype;
private String language;
@ -1013,6 +1033,14 @@ public final class plasmaHTCache {
return this.nomalizedURLHash;
}
/**
 * @return the last-modified date stored in this entry
 */
public Date lastModified() {
    return lastModified;
}
/**
 * @return the language value stored in this entry
 */
public String language() {
    return language;
}
/**
 * @return the crawl profile entry stored in this entry
 */
public plasmaCrawlProfile.entry profile() {
    return profile;
}

@ -449,8 +449,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
this.log.logInfo("HTCACHE Path = " + htCachePath.getAbsolutePath());
long maxCacheSize = 1024 * 1024 * Long.parseLong(getConfig("proxyCacheSize", "2")); // this is megabyte
boolean useTreeStorage = getConfigBool("proxyCacheTree", true);
this.cacheManager = new plasmaHTCache(htCachePath, maxCacheSize, ramHTTP, ramHTTP_time, useTreeStorage);
String cacheLayout = getConfig("proxyCacheLayout", "tree");
boolean cacheMigration = getConfigBool("proxyCacheMigration", true);
this.cacheManager = new plasmaHTCache(htCachePath, maxCacheSize, ramHTTP, ramHTTP_time, cacheLayout, cacheMigration);
// make parser
log.logConfig("Starting Parser");

@ -145,19 +145,11 @@ messConfig = httpd.messages
# to enable that function, set proxy=true
proxy=true
# a path to the proxy's file cache.
# a path to the file cache, used for the internal proxy and as crawl buffer
# This will be used if the server is addressed as a proxy
proxyCache = DATA/HTCACHE
# the proxy's maximum disc cache size in megabytes
# there should be enough space for the browsing load of an internet cafe
# running at 56kbit/s modem speed (this time not unusual)
# during 3 days, 8 hours a day
# necessary space = 3 * 8 * 60 * 60 * 56 / 8 = 604800 KB = ca. 590 MB
# since 600 MB is not much these days (it's below one GB!)
# we recommend using that space
#proxyCacheSize = 600
#for testing:
# the maximum disc cache size for files in proxyCache in megabytes
proxyCacheSize = 200
# use the mostly direct mapping of URLs to Filenames
@ -173,7 +165,12 @@ proxyCacheSize = 200
# files that are present under the previously used layout will be renamed
# to the new location and thus be accessible immediately. so an accumulated
# cache is still usable after the switch.
proxyCacheTree = true
# possible values are {tree, hash}
proxyCacheLayout = tree
# the migration flag controls whether files cached under the other layout are moved to the selected layout
proxyCacheMigration = true
# the following mime-types are the whitelist for indexing
#

Loading…
Cancel
Save