From 440c6ee657b3b1b7f314bb37e3b39b95d2b80778 Mon Sep 17 00:00:00 2001 From: hermens Date: Sun, 8 Oct 2006 17:25:19 +0000 Subject: [PATCH] Implement alternative htcache layout mostly according to: http://www.yacy-forum.de/viewtopic.php?p=26205#26205 git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2718 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/CacheAdmin_p.java | 2 +- source/de/anomic/plasma/plasmaHTCache.java | 136 ++++++++++++++++-- .../de/anomic/plasma/plasmaSwitchboard.java | 3 +- yacy.init | 15 ++ 4 files changed, 144 insertions(+), 12 deletions(-) diff --git a/htroot/CacheAdmin_p.java b/htroot/CacheAdmin_p.java index f3e1041c0..6bfb66834 100644 --- a/htroot/CacheAdmin_p.java +++ b/htroot/CacheAdmin_p.java @@ -92,7 +92,7 @@ public class CacheAdmin_p { final StringBuffer tree = new StringBuffer(); final StringBuffer info = new StringBuffer(); - final URL url = plasmaHTCache.getURL(switchboard.htCachePath, file); + final URL url = switchboard.cacheManager.getURL(file); String urlstr = ""; if (action.equals("info") && !file.isDirectory()) { diff --git a/source/de/anomic/plasma/plasmaHTCache.java b/source/de/anomic/plasma/plasmaHTCache.java index 17d34d372..f3c640725 100644 --- a/source/de/anomic/plasma/plasmaHTCache.java +++ b/source/de/anomic/plasma/plasmaHTCache.java @@ -59,6 +59,7 @@ import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; +import java.lang.StringBuffer; import java.net.InetAddress; import java.net.MalformedURLException; import java.util.Date; @@ -71,14 +72,17 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; import de.anomic.http.httpc; +import de.anomic.http.httpHeader; import de.anomic.index.indexEntryAttribute; import de.anomic.index.indexURL; +import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroDyn; import de.anomic.kelondro.kelondroMScoreCluster; import de.anomic.kelondro.kelondroMap; import 
de.anomic.net.URL; import de.anomic.plasma.cache.IResourceInfo; import de.anomic.plasma.cache.ResourceInfoFactory; +import de.anomic.server.serverCodings; import de.anomic.server.serverFileUtils; import de.anomic.server.serverInstantThread; import de.anomic.server.serverSystem; @@ -99,15 +103,17 @@ public final class plasmaHTCache { public final File cachePath; public final serverLog log; public static final HashSet filesInUse = new HashSet(); // can we delete this file + public final boolean useTreeStorage; private ResourceInfoFactory objFactory; private serverThread cacheScanThread; - - public plasmaHTCache(File htCachePath, long maxCacheSize, int bufferkb, long preloadTime) { + + public plasmaHTCache(File htCachePath, long maxCacheSize, int bufferkb, long preloadTime, boolean useTreeStorage) { // this.switchboard = switchboard; this.log = new serverLog("HTCACHE"); this.cachePath = htCachePath; + this.useTreeStorage = useTreeStorage; // create the object factory this.objFactory = new ResourceInfoFactory(); @@ -345,9 +351,19 @@ public final class plasmaHTCache { if (deleteFileandDirs (obj, "OLD")) { try { // As the file is gone, the entry in responseHeader.db is not needed anymore - this.log.logFinest("Trying to remove responseHeader for URL: " + - getURL(this.cachePath ,obj).toString()); - this.responseHeaderDB.remove(indexURL.urlHash(getURL(this.cachePath ,obj))); + String urlHash = getHash(obj); + if (urlHash != null) { + this.log.logFinest("Trying to remove responseHeader for URLhash: " + + urlHash); + this.responseHeaderDB.remove(urlHash); + } else { + URL url = getURL(obj); + if (url != null) { + this.log.logFinest("Trying to remove responseHeader for URL: " + + url.toString()); + this.responseHeaderDB.remove(indexURL.urlHash(url)); + } + } } catch (IOException e) { this.log.logInfo("IOExeption removing response header from DB: " + e.getMessage(), e); @@ -394,6 +410,7 @@ public final class plasmaHTCache { kelondroMScoreCluster doms = new 
kelondroMScoreCluster(); int fileCount = 0; enumerateFiles fileEnum = new enumerateFiles(this.cachePath, true, false, true, true); + File dbfile = new File(this.cachePath, "responseHeader.db"); while (fileEnum.hasMoreElements()) { if (Thread.currentThread().isInterrupted()) return; fileCount++; @@ -402,7 +419,7 @@ public final class plasmaHTCache { //System.out.println("Cache: " + dom(f)); doms.incScore(dom(nextFile)); this.curCacheSize += nextFile.length(); - this.cacheAge.put(ageString(nextFileModDate, nextFile), nextFile); + if (!dbfile.equals(nextFile)) this.cacheAge.put(ageString(nextFileModDate, nextFile), nextFile); try { Thread.sleep(10); } catch (InterruptedException e) { @@ -542,6 +559,29 @@ public final class plasmaHTCache { return plasmaParser.mediaExtContains(urlString); } + /* + * This function moves an old cached object (if it exists) to the new position + */ + private void moveCachedObject(File oldpath, File newpath) { + try { + if (oldpath.exists() && oldpath.isFile() && (!newpath.exists())) { + long d = oldpath.lastModified(); + newpath.getParentFile().mkdirs(); + if (oldpath.renameTo(newpath)) { + cacheAge.put(ageString(d, newpath), newpath); + File obj = oldpath.getParentFile(); + while ((!(obj.equals(this.cachePath))) && (obj.isDirectory()) && (obj.list().length == 0)) { + if (obj.delete()) this.log.logFine("DELETED EMPTY DIRECTORY : " + obj.toString()); + obj = obj.getParentFile(); + } + } + } + } catch (Exception e) { + log.logFine("moveCachedObject('" + oldpath.toString() + "','" + + newpath.toString() + "')", e); + } + } + private String replaceRegex(String input, String regex, String replacement) { if (input == null) { return ""; } if (input.length() > 0) { @@ -583,6 +623,14 @@ public final class plasmaHTCache { // yes this is not reversible, but that is not needed path = replaceRegex(path, "/\\.\\./", "/!!/"); path = replaceRegex(path, "(\"|\\\\|\\*|\\?|:|<|>|\\|+)", "_"); // hier wird kein '/' gefiltert + String extention = null; + int d 
= path.lastIndexOf("."); + int s = path.lastIndexOf("/"); + if ((d >= 0) && (d > s)) { + extention = path.substring(d); + } else if (path.endsWith("/ndx")) { + extention = new String (".html"); // Just a wild guess + } path = path.concat(replaceRegex(query, "(\"|\\\\|\\*|\\?|/|:|<|>|\\|+)", "_")); // only set NO default ports @@ -608,18 +656,86 @@ public final class plasmaHTCache { } else { host = "other/" + host; } - if (port < 0) { - return new File(this.cachePath, protocol + "/" + host + path); + StringBuffer fileName = new StringBuffer(); + fileName.append(protocol).append('/').append(host); + if (port >= 0) { + fileName.append('!').append(port); + } + File FileTree = new File(this.cachePath, fileName.toString() + path); + String urlHash = indexURL.urlHash(url); + String hexHash = serverCodings.encodeHex(kelondroBase64Order.enhancedCoder.decode(urlHash)); + fileName.append('/').append(hexHash.substring(0,2)).append('/').append(hexHash.substring(2,4)).append('/').append(hexHash); + if (extention != null) { + fileName.append(extention); + } + File FileFlat = new File(this.cachePath, fileName.toString()); + if (useTreeStorage) { + moveCachedObject(FileFlat, FileTree); + return FileTree; + } else { + moveCachedObject(FileTree, FileFlat); + return FileFlat; } - return new File(this.cachePath, protocol + "/" + host + "!" 
+ port + path); + } + + /** + * This is a helper function that extracts the Hash from the filename + */ + public static String getHash(final File f) { + String hexHash, hash; + try { + hexHash = f.getName().substring(0,18); + hash = kelondroBase64Order.enhancedCoder.encode(serverCodings.decodeHex(hexHash)); + } catch (Exception e) { + //log.logWarning("getHash: " + e.getMessage(), e); + return null; + } + if (hash.length() == indexURL.urlHashLength) return hash; + else return null; } /** * this is the reverse function to getCachePath: it constructs the url as string * from a given storage path */ - public static URL getURL(final File cachePath, final File f) { + public URL getURL(final File f) { // this.log.logFinest("plasmaHTCache: getURL: IN: Path=[" + cachePath + "] File=[" + f + "]"); + final String urlHash = getHash(f); + if (urlHash != null) { + URL url = null; + // try the urlPool + try { + url = plasmaSwitchboard.getSwitchboard().urlPool.getURL(urlHash); + } catch (Exception e) { + log.logWarning("getURL(" + urlHash + "): " /*+ e.getMessage()*/, e); + url = null; + } + if (url != null) return url; + // try responseHeaderDB + Map hdb; + try { + hdb = this.responseHeaderDB.get(urlHash); + } catch (IOException e) { + hdb = null; + } + if (hdb != null) { + Object origRequestLine = hdb.get(httpHeader.X_YACY_ORIGINAL_REQUEST_LINE); + if ((origRequestLine != null)&&(origRequestLine instanceof String)) { + int i = ((String)origRequestLine).indexOf(" "); + if (i >= 0) { + String s = ((String)origRequestLine).substring(i).trim(); + i = s.indexOf(" "); + try { + url = new URL((i<0) ? 
s : s.substring(0,i)); + } catch (final Exception e) { + url = null; + } + } + } + } + if (url != null) return url; + } + // If we can't get the URL from the hash, the file must be stored in the tree layout final String c = cachePath.toString().replace('\\', '/'); String path = f.toString().replace('\\', '/'); diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index f4cace38a..fa9b9005d 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -449,7 +449,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } this.log.logInfo("HTCACHE Path = " + htCachePath.getAbsolutePath()); long maxCacheSize = 1024 * 1024 * Long.parseLong(getConfig("proxyCacheSize", "2")); // this is megabyte - this.cacheManager = new plasmaHTCache(htCachePath, maxCacheSize, ramHTTP, ramHTTP_time); + boolean useTreeStorage = getConfigBool("proxyCacheTree", true); + this.cacheManager = new plasmaHTCache(htCachePath, maxCacheSize, ramHTTP, ramHTTP_time, useTreeStorage); // make parser log.logConfig("Starting Parser"); diff --git a/yacy.init b/yacy.init index fb2432284..a0d9ebdf9 100644 --- a/yacy.init +++ b/yacy.init @@ -160,6 +160,21 @@ proxyCache = DATA/HTCACHE #for testing: proxyCacheSize = 200 +# use the mostly direct mapping of URLs to Filenames +# makes it easy to watch the content of the cache using file browsers +# problems arise when a file already exists where a new entry expects a directory +# or vice versa. +# when set to false, the file names are set to the hash of the URL and the +# directory is built from protocol, hostname and port, as with the old +# layout. +# the advantage of this scheme is that no directory/file collisions can +# occur. +# switching this flag will take effect after a restart of yacy. +# files that are present under the previously used layout will be renamed +# to the new location and thus be accessible immediately. 
so an accumulated +# cache is still usable after the switch. +proxyCacheTree = true + # the following mime-types are the whitelist for indexing # # parseableRealtimeMimeTypes: specifies mime-types that can be indexed on the fly