diff --git a/htroot/CacheAdmin_p.java b/htroot/CacheAdmin_p.java index 98ffc74d0..10491a795 100644 --- a/htroot/CacheAdmin_p.java +++ b/htroot/CacheAdmin_p.java @@ -50,7 +50,6 @@ import java.text.SimpleDateFormat; import java.util.Date; import java.util.Iterator; import java.util.Map; - import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.htmlFilter.htmlFilterOutputStream; import de.anomic.http.httpHeader; @@ -67,18 +66,20 @@ public class CacheAdmin_p { private static SimpleDateFormat SimpleFormatter = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss"); public static String dateString(Date date) { - return SimpleFormatter.format(date); + return SimpleFormatter.format(date); } - public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) { plasmaSwitchboard switchboard = (plasmaSwitchboard) env; serverObjects prop = new serverObjects(); String action = ((post == null) ? "info" : post.get("action", "info")); - String pathString = ((post == null) ? "" : post.get("path", "/")); + String pathString = ((post == null) ? "" : post.get("path", "/")); String fileString = pathString; - File cache = new File(switchboard.getRootPath(), switchboard.getConfig("proxyCache", "DATA/HTCACHE")); + + // we dont need check the path, because we have do that in plasmaSwitchboard.java - Borg-0300 + File cache = new File(switchboard.getConfig("proxyCache", "DATA/HTCACHE").toString()); + File file = new File(cache, pathString); File dir; URL url = plasmaHTCache.getURL(cache, file); @@ -89,6 +90,7 @@ public class CacheAdmin_p { dir = file.getParentFile(); pathString = (new File(pathString)).getParent().replace('\\','/'); } + // generate dir listing String[] list = dir.list(); File f; String tree = "Directory of
" + ((pathString.length() == 0) ? "domain list" : linkPathString(pathString)) + "

"; @@ -97,18 +99,19 @@ public class CacheAdmin_p { else { for (int i = 0; i < list.length; i++) { f = new File(dir, list[i]); - if (f.isDirectory()) - tree += "\"Folder\" " + list[i] + "
" + serverCore.crlfString; - else - tree += "\"File\" " + list[i] + "
" + serverCore.crlfString; + if (!f.getName().equalsIgnoreCase("responseHeader.db")) + if (f.isDirectory()) + tree += "\"Folder\" " + list[i] + "
" + serverCore.crlfString; + else + tree += "\"File\" " + list[i] + "
" + serverCore.crlfString; } } - + String info = ""; if (action.equals("info")) { if (!(file.isDirectory())) { - String urls = htmlFilterContentScraper.urlNormalform(url); + String urls = htmlFilterContentScraper.urlNormalform(url); info += "Info for URL " + urls + ":

"; try { httpHeader fileheader = switchboard.cacheManager.getCachedResponse(plasmaURL.urlHash(url)); @@ -140,14 +143,13 @@ public class CacheAdmin_p { } } } - - // - prop.put("cachesize", Long.toString(switchboard.cacheManager.currCacheSize/1024)); - prop.put("cachemax", Long.toString(switchboard.cacheManager.maxCacheSize/1024)); + + prop.put("cachesize", Long.toString(switchboard.cacheManager.currCacheSize/1024)); + prop.put("cachemax", Long.toString(switchboard.cacheManager.maxCacheSize/1024)); prop.put("tree", tree); prop.put("info", info); // return rewrite properties - return prop; + return prop; } private static String formatHeader(httpHeader header) { @@ -193,5 +195,4 @@ public class CacheAdmin_p { } return result; } - } diff --git a/htroot/CacheResource_p.java b/htroot/CacheResource_p.java index 2bb9b0797..6f7660fe1 100644 --- a/htroot/CacheResource_p.java +++ b/htroot/CacheResource_p.java @@ -41,7 +41,7 @@ // You must compile this file with -// javac -classpath .:../Classes Message.java +// javac -classpath .:../classes CacheResource_p.java // if the shell's current path is HTROOT import java.io.File; @@ -56,11 +56,14 @@ import de.anomic.server.serverSwitch; public class CacheResource_p { public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) { - plasmaSwitchboard switchboard = (plasmaSwitchboard) env; - serverObjects prop = new serverObjects(); + plasmaSwitchboard switchboard = (plasmaSwitchboard) env; + serverObjects prop = new serverObjects(); + + String path = ((post == null) ? "" : post.get("path", "")); + + // we dont need check the path, because we have do that in plasmaSwitchboard.java - Borg-0300 + File cache = new File(switchboard.getConfig("proxyCache", "DATA/HTCACHE").toString()); - String path = ((post == null) ? "" : post.get("path", "")); - File cache = new File(switchboard.getRootPath(), switchboard.getConfig("proxyCache", "DATA/HTCACHE")); File f = new File(cache, path); byte[] resource; @@ -70,7 +73,6 @@ public class CacheResource_p { } catch (IOException e) { prop.put("resource", new byte[0]); } - return prop; + return prop; } - } diff --git a/htroot/ProxyIndexingMonitor_p.html b/htroot/ProxyIndexingMonitor_p.html index 3b3140b48..ae495660c 100644 --- a/htroot/ProxyIndexingMonitor_p.html +++ b/htroot/ProxyIndexingMonitor_p.html @@ -18,12 +18,16 @@ and automatically excluded from indexing.

-
Proxy pre-fetch setting: -this is an automated html page loading procedure that takes actual proxy-requested -URLs as crawling start points for crawling.
+ + + + + - + - + + + + + + + + + + + + + + + + + - - - + +
Proxy pre-fetch setting: +this is an automated html page loading procedure that takes actual proxy-requested +URLs as crawling start points for crawling.
Prefetch Depth:Prefetch Depth A prefetch of 0 means no prefetch; a prefetch of 1 means to prefetch all @@ -31,29 +35,52 @@ URLs as crawling start points for crawling. this means that only embedded href-anchors are prefetched additionally.
Store to Cache:Store to Cache It is almost always recommended to set this on. The only exception is that you have another caching proxy running as secondary proxy and YaCy is configured to used that proxy in proxy-proxy - mode.
Proxy generally
PathThe path where the pages are stored (max. length 80)
SizeThe size in MB of the cache.
 

+

#(info)# + :: -
The file DATA/PLASMADB/crawlProfiles0.db is missing or corrupted. + +The file DATA/PLASMADB/crawlProfiles0.db is missing or corrupted. Please delete that file and restart.
:: -
-Proxy pre-fetch is now set to depth-#[message]#.
-Proxy caching is now set #(caching)#off::on#(/caching)#.
+ +Pre-fetch is now set to depth-#[message]#.
+Caching is now #(caching)#off::on#(/caching)#.
+#(path)#::Cachepath is now set to '#[return]#'. Please move the old data in the new directory.
#(/path)# +#(size)#::Cachesize is now set to #[return]#MB.
#(/size)# +#(restart)#::
Changes will take effect after restart only.
#(/restart)# :: -
An error has occurred: #[error]#.
+ +An error has occurred: #[error]#.
#(/info)# +

You can see a snapshot of recently indexed pages on the Proxy Index Monitor Page. diff --git a/htroot/ProxyIndexingMonitor_p.java b/htroot/ProxyIndexingMonitor_p.java index 596e72505..0aaf83d61 100644 --- a/htroot/ProxyIndexingMonitor_p.java +++ b/htroot/ProxyIndexingMonitor_p.java @@ -40,13 +40,14 @@ // Contributions and changes to the program code must be marked as such. // You must compile this file with -// javac -classpath .:../Classes Settings_p.java +// javac -classpath .:../classes ProxyIndexingMonitor_p.java // if the shell's current path is HTROOT +import java.io.File; import java.io.IOException; -import java.text.SimpleDateFormat; -import java.util.Date; -import java.util.Locale; +// import java.text.SimpleDateFormat; +// import java.util.Date; +// import java.util.Locale; import de.anomic.http.httpHeader; import de.anomic.plasma.plasmaCrawlProfile; @@ -54,28 +55,30 @@ import de.anomic.plasma.plasmaSwitchboard; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; import de.anomic.server.logging.serverLog; -import de.anomic.yacy.yacyCore; public class ProxyIndexingMonitor_p { - private static SimpleDateFormat dayFormatter = new SimpleDateFormat("yyyy/MM/dd", Locale.US); - private static String daydate(Date date) { - if (date == null) return ""; else return dayFormatter.format(date); - } - +// private static SimpleDateFormat dayFormatter = new SimpleDateFormat("yyyy/MM/dd", Locale.US); +// private static String daydate(Date date) { +// if (date == null) return ""; else return dayFormatter.format(date); +// } + public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) { - // return variable that accumulates replacements + // return variable that accumulates replacements plasmaSwitchboard switchboard = (plasmaSwitchboard) env; - serverObjects prop = new serverObjects(); - - int showIndexedCount = 20; - boolean se = false; - + serverObjects prop = new serverObjects(); + +// int showIndexedCount = 20; +// boolean se = false; + + String oldProxyCache, newProxyCache; + String oldProxyCacheSize, newProxyCacheSize; + prop.put("info", 0); prop.put("info_message", ""); - + if (post != null) { - + if (post.containsKey("proxyprofileset")) try { // read values and put them in global settings int newProxyPrefetchDepth = Integer.parseInt((String) post.get("proxyPrefetchDepth", "0")); @@ -83,34 +86,73 @@ public class ProxyIndexingMonitor_p { boolean proxyStoreHTCache = ((String) post.get("proxyStoreHTCache", "")).equals("on"); env.setConfig("proxyStoreHTCache", (proxyStoreHTCache) ? "true" : "false"); + // added proxyCache, proxyCacheSize - Borg-0300 + // proxyCache - check and create the directory + oldProxyCache = env.getConfig("proxyCache", "DATA/HTCACHE"); + newProxyCache = ((String) post.get("proxyCache", "DATA/HTCACHE")); + newProxyCache = newProxyCache.replace("\\", "/"); + if (newProxyCache.endsWith("/")) newProxyCache.substring(0, newProxyCache.length() - 1); + File cp = new File(newProxyCache); + if ((!cp.isDirectory()) && (!cp.isFile())) cp.mkdirs(); + env.setConfig("proxyCache", newProxyCache); + + // proxyCacheSize + oldProxyCacheSize = Integer.toString(Integer.parseInt(env.getConfig("proxyCacheSize", "64"))); + newProxyCacheSize = Integer.toString(Integer.parseInt((String) post.get("proxyCacheSize", "64"))); + env.setConfig("proxyCacheSize", newProxyCacheSize); + // implant these settings also into the crawling profile for the proxy plasmaCrawlProfile.entry profile = switchboard.profiles.getEntry(switchboard.getConfig("defaultProxyProfile", "")); if (profile == null) { - prop.put("info", 1);//delete DATA/PLASMADB/crawlProfiles0.db + prop.put("info", 1); //delete DATA/PLASMADB/crawlProfiles0.db } else { try { - profile.changeEntry("generalDepth", Integer.toString(newProxyPrefetchDepth)); + profile.changeEntry("generalDepth", Integer.toString(newProxyPrefetchDepth)); profile.changeEntry("storeHTCache", (proxyStoreHTCache) ? "true": "false"); - prop.put("info", 2);//new proxyPrefetchdepth - prop.put("info_message", newProxyPrefetchDepth); + prop.put("info", 2);//new proxyPrefetchdepth + prop.put("info_message", newProxyPrefetchDepth); prop.put("info_caching", (proxyStoreHTCache) ? 1 : 0); + + // proxyCache - only display on change + if (oldProxyCache.compareTo(newProxyCache) == 0) { + prop.put("info_path", 0); + prop.put("info_path_return", oldProxyCache); + } else { + prop.put("info_path", 1); + prop.put("info_path_return", newProxyCache); + } + // proxyCacheSize - only display on change + if (oldProxyCacheSize.compareTo(newProxyCacheSize) == 0) { + prop.put("info_size", 0); + prop.put("info_size_return", oldProxyCacheSize); + } else { + prop.put("info_size", 1); + prop.put("info_size_return", newProxyCacheSize); + } + // proxyCache, proxyCacheSize we need a restart + prop.put("info_restart", 0); + prop.put("info_restart_return", 0); + if (oldProxyCache.compareTo(newProxyCache) != 0) prop.put("info_restart", 1); + if (oldProxyCacheSize.compareTo(newProxyCacheSize) != 0) prop.put("info_restart", 1); + } catch (IOException e) { - prop.put("info", 3); //Error: errmsg - prop.put("info_error", e.getMessage()); + prop.put("info", 3); //Error: errmsg + prop.put("info_error", e.getMessage()); } } - + } catch (Exception e) { prop.put("info", 2); //Error: errmsg prop.put("info_error", e.getMessage()); serverLog.logError("SERVLET", "ProxyIndexingMonitor.case3", e); } } - + prop.put("proxyPrefetchDepth", env.getConfig("proxyPrefetchDepth", "0")); prop.put("proxyStoreHTCacheChecked", env.getConfig("proxyStoreHTCache", "").equals("true") ? 1 : 0); + prop.put("proxyCache", env.getConfig("proxyCache", "DATA/HTCACHE")); + prop.put("proxyCacheSize", env.getConfig("proxyCacheSize", "64")); // return rewrite properties - return prop; + return prop; } - } diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java index 1d4f0598b..883ef253a 100644 --- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java +++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java @@ -113,8 +113,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen int p; if ((p = us.indexOf("#")) >= 0) us = us.substring(0, p); if (us.endsWith(":80")) us = us.substring(0, us.length() - 3); - p = us.indexOf(":80/"); - if (p >= 0) us = us.substring(0,p).concat(us.substring(p + 3)); + if ((p = us.indexOf(":80/")) >= 0) us = us.substring(0,p).concat(us.substring(p + 3)); if (((us.endsWith("/")) && (us.lastIndexOf('/', us.length() - 2) < 8))) us = us.substring(0, us.length() - 1); return us; } diff --git a/source/de/anomic/plasma/plasmaHTCache.java b/source/de/anomic/plasma/plasmaHTCache.java index 15c431b08..11cca0fb0 100644 --- a/source/de/anomic/plasma/plasmaHTCache.java +++ b/source/de/anomic/plasma/plasmaHTCache.java @@ -59,9 +59,9 @@ import java.util.Date; import java.util.LinkedList; import java.util.Map; import java.util.TreeMap; -import java.util.Calendar; -import java.util.GregorianCalendar; -import java.util.TimeZone; +//import java.util.Calendar; +//import java.util.GregorianCalendar; +//import java.util.TimeZone; import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.http.httpHeader; @@ -79,7 +79,7 @@ public final class plasmaHTCache { private static final int stackLimit = 150; // if we exceed that limit, we do not check idle public static final long oneday = 1000 * 60 * 60 * 24; // milliseconds of a day - + private kelondroMap responseHeaderDB = null; private final LinkedList cacheStack; private final TreeMap cacheAge; // a - relation @@ -89,67 +89,68 @@ public final class plasmaHTCache { public static serverLog log; public plasmaHTCache(File htCachePath, long maxCacheSize, int bufferkb) { - //this.switchboard = switchboard; - + // this.switchboard = switchboard; + this.log = new serverLog("HTCACHE"); this.cachePath = htCachePath; this.maxCacheSize = maxCacheSize; - - // set cache path - if (!(htCachePath.exists())) { - // make the cache path - htCachePath.mkdir(); - } - if (!(htCachePath.isDirectory())) { - // if the cache does not exists or is a file and not a directory, panic - System.out.println("the cache path " + htCachePath.toString() + " is not a directory or does not exists and cannot be created"); - System.exit(0); - } - // open the response header database - File dbfile = new File(cachePath, "responseHeader.db"); - try { + // we dont need check the path, because we have do that in plasmaSwitchboard.java - Borg-0300 +/* // set cache path + if (!(htCachePath.exists())) { + // make the cache path + htCachePath.mkdir(); + } + if (!(htCachePath.isDirectory())) { + // if the cache does not exists or is a file and not a directory, panic + System.out.println("the cache path " + htCachePath.toString() + " is not a directory or does not exists and cannot be created"); + System.exit(0); + }*/ + + // open the response header database + File dbfile = new File(cachePath, "responseHeader.db"); + try { if (dbfile.exists()) - responseHeaderDB = new kelondroMap(new kelondroDyn(dbfile, bufferkb * 0x400)); - else - responseHeaderDB = new kelondroMap(new kelondroDyn(dbfile, bufferkb * 0x400, plasmaCrawlLURL.urlHashLength, 150)); - } catch (IOException e) { - System.out.println("the request header database could not be opened: " + e.getMessage()); - System.exit(0); - } + responseHeaderDB = new kelondroMap(new kelondroDyn(dbfile, bufferkb * 0x400)); + else + responseHeaderDB = new kelondroMap(new kelondroDyn(dbfile, bufferkb * 0x400, plasmaCrawlLURL.urlHashLength, 150)); + } catch (IOException e) { + System.out.println("the request header database could not be opened: " + e.getMessage()); + System.exit(0); + } - // init stack - cacheStack = new LinkedList(); + // init stack + cacheStack = new LinkedList(); // init cache age and size management - cacheAge = new TreeMap(); - currCacheSize = 0; - this.maxCacheSize = maxCacheSize; - - // start the cache startup thread - // this will collect information about the current cache size and elements - serverInstantThread.oneTimeJob(this, "cacheScan", log, 5000); + cacheAge = new TreeMap(); + currCacheSize = 0; + this.maxCacheSize = maxCacheSize; + + // start the cache startup thread + // this will collect information about the current cache size and elements + serverInstantThread.oneTimeJob(this, "cacheScan", log, 5000); } - + public int size() { return cacheStack.size(); } - + public void push(Entry entry) { cacheStack.add(entry); } - + public Entry pop() { if (cacheStack.size() > 0) return (Entry) cacheStack.removeFirst(); else return null; } - + public void storeHeader(String urlHash, httpHeader responseHeader) throws IOException { responseHeaderDB.set(urlHash, responseHeader); } - + private boolean deleteFile(File file) { if (file.exists()) { currCacheSize -= file.length(); @@ -158,11 +159,11 @@ public final class plasmaHTCache { return false; } } - + public boolean deleteFile(URL url) { return deleteFile(getCachePath(url)); } - + public boolean writeFile(URL url, byte[] array) { if (array == null) return false; File file = getCachePath(url); @@ -183,7 +184,7 @@ public final class plasmaHTCache { writeFileAnnouncement(file); return true; } - + public void writeFileAnnouncement(File file) { synchronized (cacheAge) { if (file.exists()) { @@ -193,101 +194,99 @@ public final class plasmaHTCache { } } } - + private void cleanup() { // clean up cache to have enough space for next entries File f; while ((currCacheSize > maxCacheSize) && (cacheAge.size() > 0)) { f = (File) cacheAge.remove(cacheAge.firstKey()); if ((f != null) && (f.exists())) { - currCacheSize -= f.length(); + long size = f.length(); + //currCacheSize -= f.length(); if (f.delete()) { log.logInfo("DELETED OLD CACHE : " + f.toString()); + currCacheSize -= size; f = f.getParentFile(); - if ((f.exists()) && (f.isDirectory())) { - // check size of directory - if (f.list().length == 0) { - // the directory has no files in it; delete it also - if (f.delete()) log.logInfo("DELETED EMPTY DIRECTORY : " + f.toString()); - } + if (f.isDirectory() && (f.list().length == 0)) { + // the directory has no files in it; delete it also + if (f.delete()) log.logInfo("DELETED EMPTY DIRECTORY : " + f.toString()); } } } } } - + public void close() throws IOException { responseHeaderDB.close(); } - + private String ageString(long date, File f) { - StringBuffer sb = new StringBuffer(32); - String s = Long.toHexString(date); - for (int i = s.length(); i < 16; i++) sb.append('0'); - sb.append(s); - s = Integer.toHexString(f.hashCode()); - for (int i = s.length(); i < 8; i++) sb.append('0'); - sb.append(s); - return sb.toString(); + StringBuffer sb = new StringBuffer(32); + String s = Long.toHexString(date); + for (int i = s.length(); i < 16; i++) sb.append('0'); + sb.append(s); + s = Integer.toHexString(f.hashCode()); + for (int i = s.length(); i < 8; i++) sb.append('0'); + sb.append(s); + return sb.toString(); } - - public void cacheScan() { - //log.logSystem("STARTING CACHE SCANNING"); - kelondroMScoreCluster doms = new kelondroMScoreCluster(); - int c = 0; - enumerateFiles ef = new enumerateFiles(cachePath, true, false, true, true); - File f; - while (ef.hasMoreElements()) { - c++; - f = (File) ef.nextElement(); - long d = f.lastModified(); - //System.out.println("Cache: " + dom(f)); - doms.incScore(dom(f)); - currCacheSize += f.length(); - cacheAge.put(ageString(d, f), f); - } - //System.out.println("%" + (String) cacheAge.firstKey() + "=" + cacheAge.get(cacheAge.firstKey())); - long ageHours = 0; - try { - ageHours = (System.currentTimeMillis() - + + public void cacheScan() { + //log.logSystem("STARTING CACHE SCANNING"); + kelondroMScoreCluster doms = new kelondroMScoreCluster(); + int c = 0; + enumerateFiles ef = new enumerateFiles(cachePath, true, false, true, true); + File f; + while (ef.hasMoreElements()) { + c++; + f = (File) ef.nextElement(); + long d = f.lastModified(); + //System.out.println("Cache: " + dom(f)); + doms.incScore(dom(f)); + currCacheSize += f.length(); + cacheAge.put(ageString(d, f), f); + } + //System.out.println("%" + (String) cacheAge.firstKey() + "=" + cacheAge.get(cacheAge.firstKey())); + long ageHours = 0; + try { + ageHours = (System.currentTimeMillis() - Long.parseLong(((String) cacheAge.firstKey()).substring(0, 16), 16)) / 3600000; - } catch (NumberFormatException e) { - //e.printStackTrace(); - } - log.logSystem("CACHE SCANNED, CONTAINS " + c + - " FILES = " + currCacheSize/1048576 + "MB, OLDEST IS " + - ((ageHours < 24) ? (ageHours + " HOURS") : ((ageHours / 24) + " DAYS")) + - " OLD"); - cleanup(); - - // start to prefetch ip's from dns - String dom; - long start = System.currentTimeMillis(); - String ip, result = ""; - c = 0; - while ((doms.size() > 0) && (c < 50) && ((System.currentTimeMillis() - start) < 60000)) { - dom = (String) doms.getMaxObject(); - ip = httpc.dnsResolve(dom); - if (ip == null) break; - result += ", " + dom + "=" + ip; - log.logSystem("PRE-FILLED " + dom + "=" + ip); - c++; - doms.deleteScore(dom); - // wait a short while to prevent that this looks like a DoS - try {Thread.currentThread().sleep(100);} catch (InterruptedException e) {} - } - if (result.length() > 2) log.logSystem("PRE-FILLED DNS CACHE, FETCHED " + c + - " ADDRESSES: " + result.substring(2)); - } + } catch (NumberFormatException e) { + //e.printStackTrace(); + } + log.logSystem("CACHE SCANNED, CONTAINS " + c + + " FILES = " + currCacheSize/1048576 + "MB, OLDEST IS " + + ((ageHours < 24) ? (ageHours + " HOURS") : ((ageHours / 24) + " DAYS")) + " OLD"); + cleanup(); - private String dom(File f) { - String s = f.toString().substring(cachePath.toString().length() + 1); - int p = s.indexOf("/"); - if (p < 0) p = s.indexOf("\\"); - if (p < 0) return null; - return s.substring(0, p); + // start to prefetch ip's from dns + String dom; + long start = System.currentTimeMillis(); + String ip, result = ""; + c = 0; + while ((doms.size() > 0) && (c < 50) && ((System.currentTimeMillis() - start) < 60000)) { + dom = (String) doms.getMaxObject(); + ip = httpc.dnsResolve(dom); + if (ip == null) break; + result += ", " + dom + "=" + ip; + log.logSystem("PRE-FILLED " + dom + "=" + ip); + c++; + doms.deleteScore(dom); + // wait a short while to prevent that this looks like a DoS + try {Thread.currentThread().sleep(100);} catch (InterruptedException e) {} } - + if (result.length() > 2) log.logSystem("PRE-FILLED DNS CACHE, FETCHED " + c + + " ADDRESSES: " + result.substring(2)); + } + + private String dom(File f) { + String s = f.toString().substring(cachePath.toString().length() + 1); + int p = s.indexOf("/"); + if (p < 0) p = s.indexOf("\\"); + if (p < 0) return null; + return s.substring(0, p); + } + public httpHeader getCachedResponse(String urlHash) throws IOException { Map hdb = responseHeaderDB.get(urlHash); if (hdb == null) return null; @@ -295,19 +294,19 @@ public final class plasmaHTCache { } public boolean full() { - return (cacheStack.size() > stackLimit); + return (cacheStack.size() > stackLimit); } public boolean empty() { - return (cacheStack.size() == 0); + return (cacheStack.size() == 0); } - + public static boolean isPicture(httpHeader response) { Object ct = response.get(httpHeader.CONTENT_TYPE); if (ct == null) return false; return ((String)ct).toUpperCase().startsWith("IMAGE"); } - + public static boolean isText(httpHeader response) { // Object ct = response.get(httpHeader.CONTENT_TYPE); // if (ct == null) return false; @@ -336,64 +335,76 @@ public final class plasmaHTCache { // ); int idx = urlString.indexOf("?"); if (idx > 0) urlString = urlString.substring(0,idx); - + idx = urlString.lastIndexOf("."); if (idx > 0) urlString = urlString.substring(idx+1); - + return plasmaParser.mediaExtContains(urlString); } - - // this method creates from a given host and path a cache path + + /** + * this method creates from a given host and path a cache path + * from a given host (which may also be an IPv4 - number, but not IPv6 or + * a domain; all without leading 'http://') and a path (which must start + * with a leading '/', and may also end in an '/') a path to a file + * in the file system with root as given in cachePath is constructed + * it will also be ensured, that the complete path exists; if necessary + * that path will be generated + * @return URL + */ public File getCachePath(URL url) { - // from a given host (which may also be an IPv4 - number, but not IPv6 or - // a domain; all without leading 'http://') and a path (which must start - // with a leading '/', and may also end in an '/') a path to a file - // in the file system with root as given in cachePath is constructed - // it will also be ensured, that the complete path exists; if necessary - // that path will be generated - //System.out.println("DEBUG: getCachedPath=" + url.toString()); - String remotePath = url.getPath(); - if (!(remotePath.startsWith("/"))) remotePath = "/" + remotePath; - if (remotePath.endsWith("/")) remotePath = remotePath + "ndx"; + // System.out.println("DEBUG: getCachePath: IN=" + url.toString()); + String remotePath = url.getPath(); + if (!(remotePath.startsWith("/"))) remotePath = "/" + remotePath; + if (remotePath.endsWith("/")) remotePath = remotePath + "ndx"; if (remotePath.indexOf('#') > 0) remotePath.substring(0, remotePath.indexOf('#')); remotePath = remotePath.replace('?', '_'); remotePath = remotePath.replace('&', '_'); // yes this is not reversible, but that is not needed remotePath = remotePath.replace(':', '_'); // yes this is not reversible, but that is not needed - int port = url.getPort(); - if (port < 0) port = 80; - return new File(this.cachePath, url.getHost() + ((port == 80) ? "" : ("+" + port)) + remotePath); + int port = url.getPort(); + if (port < 0) port = 80; + // System.out.println("DEBUG: getCachePath: OUT=" + url.getHost() + ((port == 80) ? "" : ("+" + port)) + remotePath); + return new File(this.cachePath, url.getHost() + ((port == 80) ? "" : ("+" + port)) + remotePath); } + /** + * this is the reverse function to getCachePath: it constructs the url as string + * from a given storage path + */ public static URL getURL(File cachePath, File f) { - // this is the reverse function to getCachePath: it constructs the url as string - // from a given storage path - String s = f.toString().replace('\\', '/'); - String c = cachePath.toString().replace('\\', '/'); - //System.out.println("DEBUG: getURL for c=" + c + ", s=" + s); - int p = s.lastIndexOf(c); - if (p >= 0) { - s = s.substring(p + c.length()); - while (s.startsWith("/")) s = s.substring(1); - if ((p = s.indexOf("+")) >= 0) { + // System.out.println("DEBUG: getURL: IN: Path=[" + cachePath + "]"); + // System.out.println("DEBUG: getURL: IN: File=[" + f + "]"); + String s = f.toString().replace('\\', '/'); + String c = cachePath.toString().replace('\\', '/'); + int p = s.lastIndexOf(c); + if (p >= 0) { + s = s.substring(p + c.length()); + while (s.startsWith("/")) s = s.substring(1); + if ((p = s.indexOf("+")) >= 0) { s = s.substring(0, p) + ":" + s.substring(p + 1); - } else { +/* } else { p = s.indexOf("/"); if (p < 0) s = s + ":80/"; else - s = s.substring(0, p) + ":80" + s.substring(p); + s = s.substring(0, p) + ":80" + s.substring(p);*/ } - if (s.endsWith("ndx")) s = s.substring(0, s.length() - 3); - //System.out.println("DEBUG: getURL url=" + s); + if (s.endsWith("ndx")) s = s.substring(0, s.length() - 3); + // System.out.println("DEBUG: getURL: OUT=" + s); + try { +/* URL url = null; + url = new URL("http://" + s); + System.out.println("DEBUG: getURL: URL=" + url.toString()); + return url;//new URL("http://" + s); */ return new URL("http://" + s); } catch (Exception e) { return null; } - } - return null; + } + return null; } - + public byte[] loadResource(URL url) { // load the url as resource from the cache File f = getCachePath(url); @@ -405,10 +416,10 @@ public final class plasmaHTCache { return null; } } - + public static boolean isPOST(String urlString) { - return ((urlString.indexOf("?") >= 0) || - (urlString.indexOf("&") >= 0)); + return ((urlString.indexOf("?") >= 0) || + (urlString.indexOf("&") >= 0)); } public static boolean isCGI(String urlString) { @@ -421,8 +432,8 @@ public final class plasmaHTCache { } public Entry newEntry(Date initDate, int depth, URL url, String name, - httpHeader requestHeader, - String responseStatus, httpHeader responseHeader, + httpHeader requestHeader, + String responseStatus, httpHeader responseHeader, String initiator, plasmaCrawlProfile.entry profile) { return new Entry(initDate, depth, url, name, requestHeader, responseStatus, responseHeader, initiator, profile); @@ -430,108 +441,108 @@ public final class plasmaHTCache { public final class Entry { - // the class objects - public Date initDate; // the date when the request happened; will be used as a key - public int depth; // the depth of prefetching - public httpHeader requestHeader; // we carry also the header to prevent too many file system access - public String responseStatus; - public httpHeader responseHeader; // we carry also the header to prevent too many file system access - public File cacheFile; // the cache file - public byte[] cacheArray; // or the cache as byte-array - public URL url; - public String name; // the name of the link, read as anchor from an -tag - public String nomalizedURLHash; - public String nomalizedURLString; - public int status; // cache load/hit/stale etc status - public Date lastModified; - public char doctype; - public String language; - public plasmaCrawlProfile.entry profile; - private String initiator; + // the class objects + public Date initDate; // the date when the request happened; will be used as a key + public int depth; // the depth of prefetching + public httpHeader requestHeader; // we carry also the header to prevent too many file system access + public String responseStatus; + public httpHeader responseHeader; // we carry also the header to prevent too many file system access + public File cacheFile; // the cache file + public byte[] cacheArray; // or the cache as byte-array + public URL url; + public String name; // the name of the link, read as anchor from an -tag + public String nomalizedURLHash; + public String nomalizedURLString; + public int status; // cache load/hit/stale etc status + public Date lastModified; + public char doctype; + public String language; + public plasmaCrawlProfile.entry profile; + private String initiator; - - public Entry(Date initDate, int depth, URL url, String name, - httpHeader requestHeader, - String responseStatus, httpHeader responseHeader, - String initiator, - plasmaCrawlProfile.entry profile) { - - // normalize url - this.nomalizedURLString = htmlFilterContentScraper.urlNormalform(url); - try { - this.url = new URL(nomalizedURLString); - } catch (MalformedURLException e) { - System.out.println("internal error at httpdProxyCache.Entry: " + e); - System.exit(-1); - } - this.name = name; - this.cacheFile = getCachePath(this.url); - this.nomalizedURLHash = plasmaCrawlLURL.urlHash(nomalizedURLString); - - // assigned: - this.initDate = initDate; - this.depth = depth; - this.requestHeader = requestHeader; - this.responseStatus = responseStatus; - this.responseHeader = responseHeader; - this.profile = profile; - this.initiator = (initiator == null) ? null : ((initiator.length() == 0) ? null: initiator); - - // calculated: - if (responseHeader == null) { - try { - throw new RuntimeException("RESPONSE HEADER = NULL"); - } catch (Exception e) { - System.out.println("RESPONSE HEADER = NULL in " + url); - e.printStackTrace(); - System.exit(0); - } - - lastModified = serverDate.correctedGMTDate(); - } else { - lastModified = responseHeader.lastModified(); - if (lastModified == null) lastModified = serverDate.correctedGMTDate(); // does not exist in header - } - this.doctype = plasmaWordIndexEntry.docType(responseHeader.mime()); - if (this.doctype == plasmaWordIndexEntry.DT_UNKNOWN) this.doctype = plasmaWordIndexEntry.docType(url); - this.language = plasmaWordIndexEntry.language(url); + public Entry(Date initDate, int depth, URL url, String name, + httpHeader requestHeader, + String responseStatus, httpHeader responseHeader, + String initiator, + plasmaCrawlProfile.entry profile) { - // to be defined later: - this.cacheArray = null; - } - - public String name() { - return name; - } - public String initiator() { - return initiator; + // normalize url - Borg-0300 + serverLog.logDebug("PLASMA", "Entry: URL=" + url.toString()); + this.nomalizedURLString = htmlFilterContentScraper.urlNormalform(url); + try { + this.url = new URL(nomalizedURLString); + } catch (MalformedURLException e) { + System.out.println("internal error at httpdProxyCache.Entry: " + e); + System.exit(-1); } - public boolean proxy() { - return initiator() == null; + this.name = name; + this.cacheFile = getCachePath(this.url); + this.nomalizedURLHash = plasmaCrawlLURL.urlHash(nomalizedURLString); + + // assigned: + this.initDate = initDate; + this.depth = depth; + this.requestHeader = requestHeader; + this.responseStatus = responseStatus; + this.responseHeader = responseHeader; + this.profile = profile; + this.initiator = (initiator == null) ? null : ((initiator.length() == 0) ? null: initiator); + + // calculated: + if (responseHeader == null) { + try { + throw new RuntimeException("RESPONSE HEADER = NULL"); + } catch (Exception e) { + System.out.println("RESPONSE HEADER = NULL in " + url); + e.printStackTrace(); + System.exit(0); + } + + lastModified = serverDate.correctedGMTDate(); + } else { + lastModified = responseHeader.lastModified(); + if (lastModified == null) lastModified = serverDate.correctedGMTDate(); // does not exist in header } - public long size() { - if (cacheArray == null) return 0; else return cacheArray.length; - } + this.doctype = plasmaWordIndexEntry.docType(responseHeader.mime()); + if (this.doctype == plasmaWordIndexEntry.DT_UNKNOWN) this.doctype = plasmaWordIndexEntry.docType(url); + this.language = plasmaWordIndexEntry.language(url); - public URL referrerURL() { - if (requestHeader == null) return null; - try { - return new URL((String) requestHeader.get(httpHeader.REFERER, "")); - } catch (Exception e) { - return null; - } + // to be defined later: + this.cacheArray = null; + } + + public String name() { + return name; + } + public String initiator() { + return initiator; + } + public boolean proxy() { + return initiator() == null; + } + public long size() { + if (cacheArray == null) return 0; else return cacheArray.length; + } + + public URL referrerURL() { + if (requestHeader == null) return null; + try { + return new URL((String) requestHeader.get(httpHeader.REFERER, "")); + } catch (Exception e) { + return null; } - - /* + } + + /* public boolean update() { return ((status == CACHE_FILL) || (status == CACHE_STALE_RELOAD_GOOD)); } */ - // the following three methods for cache read/write granting shall be as loose as possible - // but also as strict as necessary to enable caching of most items + // the following three methods for cache read/write granting shall be as loose as possible + // but also as strict as necessary to enable caching of most items - public String shallStoreCacheForProxy() { + public String shallStoreCacheForProxy() { // returns NULL if the answer is TRUE // in case of FALSE, the reason as String is returned diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 5887635b9..d1056d393 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -100,25 +100,25 @@ package de.anomic.plasma; -import java.io.BufferedReader; +// import java.io.BufferedReader; import java.io.ByteArrayInputStream; import java.io.File; -import java.io.FileInputStream; +// import java.io.FileInputStream; import java.io.IOException; -import java.io.InputStreamReader; +// import java.io.InputStreamReader; import java.net.MalformedURLException; import java.net.URL; import java.text.SimpleDateFormat; import java.util.Date; -import java.util.Enumeration; +// import java.util.Enumeration; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.Map; import java.util.Set; import java.util.TreeSet; -import java.util.TreeMap; -import java.util.Vector; +// import java.util.TreeMap; +// import java.util.Vector; import de.anomic.data.messageBoard; import de.anomic.data.wikiBoard; @@ -130,24 +130,23 @@ import de.anomic.kelondro.kelondroTables; import de.anomic.server.serverAbstractSwitch; import de.anomic.server.serverCodings; import de.anomic.server.serverCore; -import de.anomic.server.serverDate; +// import de.anomic.server.serverDate; import de.anomic.server.serverInstantThread; import de.anomic.server.serverObjects; import de.anomic.server.serverSemaphore; import de.anomic.server.serverSwitch; import de.anomic.server.logging.serverLog; -import de.anomic.server.serverFileUtils; +// import de.anomic.server.serverFileUtils; import de.anomic.tools.bitfield; import de.anomic.tools.crypt; import de.anomic.yacy.yacyClient; import de.anomic.yacy.yacyCore; import de.anomic.yacy.yacySearch; import de.anomic.yacy.yacySeed; -import de.anomic.yacy.yacySeedDB; +// import de.anomic.yacy.yacySeedDB; public final class plasmaSwitchboard extends serverAbstractSwitch implements serverSwitch { - // load slots public static int crawlSlots = 10; public static int indexingSlots = 100; @@ -158,7 +157,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser public static plasmaURLPattern urlBlacklist; // storage management - private File cachePath; + private File cachePath; // do we need that ? private File plasmaPath; public File listsPath; public plasmaURLPool urlPool; @@ -190,15 +189,15 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser private Object crawlingPausedSync = new Object(); private boolean crawlingIsPaused = false; private static plasmaSwitchboard sb; - + public plasmaSwitchboard(String rootPath, String initPath, String configPath) throws IOException { - super(rootPath, initPath, configPath); - + super(rootPath, initPath, configPath); + // set loglevel and log - setLog(new serverLog("PLASMA")); - - // load values from configs - plasmaPath = new File(rootPath, getConfig("dbPath", "PLASMADB")); + setLog(new serverLog("PLASMA")); + + // load values from configs + plasmaPath = new File(rootPath, getConfig("dbPath", "PLASMADB")); listsPath = new File(rootPath, getConfig("listsPath", "LISTS")); remoteProxyHost = getConfig("remoteProxyHost", ""); try { @@ -216,14 +215,14 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser proxyLastAccess = System.currentTimeMillis() - 60000; if (!(listsPath.exists())) listsPath.mkdirs(); - - // load coloured lists - if (blueList == null) { - // read only once upon first instantiation of this class - String f = getConfig("plasmaBlueList", null); - if (f != null) blueList = kelondroMSetTools.loadList(new File(f)); else blueList= new TreeSet(); - } - + + // load coloured lists + if (blueList == null) { + // read only once upon first instantiation of this class + String f = getConfig("plasmaBlueList", null); + if (f != null) blueList = kelondroMSetTools.loadList(new File(f)); else blueList= new TreeSet(); + } + // load the black-list / inspired by [AS] urlBlacklist = new plasmaURLPattern(new File(getRootPath(), getConfig("listsPath", "DATA/LISTS"))); String f = getConfig("proxyBlackListsActive", null); @@ -237,8 +236,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser if (stopwords == null) { stopwords = kelondroMSetTools.loadList(new File(rootPath, "yacy.stopwords")); } - - // read memory amount + + // read memory amount int ramLURL = Integer.parseInt(getConfig("ramCacheLURL", "1024")) / 1024; int ramNURL = Integer.parseInt(getConfig("ramCacheNURL", "1024")) / 1024; int ramEURL = Integer.parseInt(getConfig("ramCacheEURL", "1024")) / 1024; @@ -254,7 +253,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser log.logSystem("Message Cache memory = " + ppRamString(ramMessage)); log.logSystem("Wiki Cache memory = " + ppRamString(ramWiki)); - // make crawl profiles database and default profiles + // make crawl profiles database and default profiles log.logSystem("Initializing Crawl Profiles"); profiles = new plasmaCrawlProfile(new File(plasmaPath, "crawlProfiles0.db")); initProfiles(); @@ -270,7 +269,21 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // start a cache manager log.logSystem("Starting HT Cache Manager"); - File htCachePath = new File(getRootPath(), getConfig("proxyCache","HTCACHE")); + + // create the Cache directorie - Borg-0300 + String cp = getConfig("proxyCache", "DATA/HTCACHE"); + cp = cp.replace('\\', '/'); + if (cp.endsWith("/")) cp = cp.substring(0,cp.length() - 1); + File htCachePath = new File(cp); + if (!(htCachePath.exists())) htCachePath.mkdirs(); + if (!(htCachePath.isDirectory())) { + // if the cache does not exists or is a file and not a directory, panic + serverLog.logSystem("PLASMA", "the cache path " + htCachePath.toString() + " is not a directory or does not exists and cannot be created"); + System.exit(0); + } else { + serverLog.logInfo("PLASMA", "proxyCache=" + cp); + } + long maxCacheSize = 1024 * 1024 * Long.parseLong(getConfig("proxyCacheSize", "2")); // this is megabyte this.cacheManager = new plasmaHTCache(htCachePath, maxCacheSize, ramHTTP); @@ -309,7 +322,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser messageDB = new messageBoard(new File(getRootPath(), "DATA/SETTINGS/message.db"), ramMessage); log.logSystem("Starting Wiki Board"); wikiDB = new wikiBoard(new File(getRootPath(), "DATA/SETTINGS/wiki.db"), - new File(getRootPath(), "DATA/SETTINGS/wiki-bkp.db"), ramWiki); + new File(getRootPath(), "DATA/SETTINGS/wiki-bkp.db"), ramWiki); // init cookie-Monitor log.logSystem("Starting Cookie Monitor");