diff --git a/htroot/CacheAdmin_p.java b/htroot/CacheAdmin_p.java
index 98ffc74d0..10491a795 100644
--- a/htroot/CacheAdmin_p.java
+++ b/htroot/CacheAdmin_p.java
@@ -50,7 +50,6 @@ import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Iterator;
import java.util.Map;
-
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterOutputStream;
import de.anomic.http.httpHeader;
@@ -67,18 +66,20 @@ public class CacheAdmin_p {
private static SimpleDateFormat SimpleFormatter = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss");
public static String dateString(Date date) {
- return SimpleFormatter.format(date);
+ return SimpleFormatter.format(date);
}
-
public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
plasmaSwitchboard switchboard = (plasmaSwitchboard) env;
serverObjects prop = new serverObjects();
String action = ((post == null) ? "info" : post.get("action", "info"));
- String pathString = ((post == null) ? "" : post.get("path", "/"));
+ String pathString = ((post == null) ? "" : post.get("path", "/"));
String fileString = pathString;
- File cache = new File(switchboard.getRootPath(), switchboard.getConfig("proxyCache", "DATA/HTCACHE"));
+
+ // we don't need to check the path, because we have already done that in plasmaSwitchboard.java - Borg-0300
+ File cache = new File(switchboard.getConfig("proxyCache", "DATA/HTCACHE").toString());
+
File file = new File(cache, pathString);
File dir;
URL url = plasmaHTCache.getURL(cache, file);
@@ -89,6 +90,7 @@ public class CacheAdmin_p {
dir = file.getParentFile();
pathString = (new File(pathString)).getParent().replace('\\','/');
}
+
// generate dir listing
String[] list = dir.list();
File f; String tree = "Directory of
" + ((pathString.length() == 0) ? "domain list" : linkPathString(pathString)) + "
";
@@ -97,18 +99,19 @@ public class CacheAdmin_p {
else {
for (int i = 0; i < list.length; i++) {
f = new File(dir, list[i]);
- if (f.isDirectory())
- tree += "
" + list[i] + "
" + serverCore.crlfString;
- else
- tree += "
" + list[i] + "
" + serverCore.crlfString;
+ if (!f.getName().equalsIgnoreCase("responseHeader.db"))
+ if (f.isDirectory())
+ tree += "
" + list[i] + "
" + serverCore.crlfString;
+ else
+ tree += "
" + list[i] + "
" + serverCore.crlfString;
}
}
-
+
String info = "";
if (action.equals("info")) {
if (!(file.isDirectory())) {
- String urls = htmlFilterContentScraper.urlNormalform(url);
+ String urls = htmlFilterContentScraper.urlNormalform(url);
info += "Info for URL " + urls + ":
";
try {
httpHeader fileheader = switchboard.cacheManager.getCachedResponse(plasmaURL.urlHash(url));
@@ -140,14 +143,13 @@ public class CacheAdmin_p {
}
}
}
-
- //
- prop.put("cachesize", Long.toString(switchboard.cacheManager.currCacheSize/1024));
- prop.put("cachemax", Long.toString(switchboard.cacheManager.maxCacheSize/1024));
+
+ prop.put("cachesize", Long.toString(switchboard.cacheManager.currCacheSize/1024));
+ prop.put("cachemax", Long.toString(switchboard.cacheManager.maxCacheSize/1024));
prop.put("tree", tree);
prop.put("info", info);
// return rewrite properties
- return prop;
+ return prop;
}
private static String formatHeader(httpHeader header) {
@@ -193,5 +195,4 @@ public class CacheAdmin_p {
}
return result;
}
-
}
diff --git a/htroot/CacheResource_p.java b/htroot/CacheResource_p.java
index 2bb9b0797..6f7660fe1 100644
--- a/htroot/CacheResource_p.java
+++ b/htroot/CacheResource_p.java
@@ -41,7 +41,7 @@
// You must compile this file with
-// javac -classpath .:../Classes Message.java
+// javac -classpath .:../classes CacheResource_p.java
// if the shell's current path is HTROOT
import java.io.File;
@@ -56,11 +56,14 @@ import de.anomic.server.serverSwitch;
public class CacheResource_p {
public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
- plasmaSwitchboard switchboard = (plasmaSwitchboard) env;
- serverObjects prop = new serverObjects();
+ plasmaSwitchboard switchboard = (plasmaSwitchboard) env;
+ serverObjects prop = new serverObjects();
+
+ String path = ((post == null) ? "" : post.get("path", ""));
+
+ // we don't need to check the path, because we have already done that in plasmaSwitchboard.java - Borg-0300
+ File cache = new File(switchboard.getConfig("proxyCache", "DATA/HTCACHE").toString());
- String path = ((post == null) ? "" : post.get("path", ""));
- File cache = new File(switchboard.getRootPath(), switchboard.getConfig("proxyCache", "DATA/HTCACHE"));
File f = new File(cache, path);
byte[] resource;
@@ -70,7 +73,6 @@ public class CacheResource_p {
} catch (IOException e) {
prop.put("resource", new byte[0]);
}
- return prop;
+ return prop;
}
-
}
diff --git a/htroot/ProxyIndexingMonitor_p.html b/htroot/ProxyIndexingMonitor_p.html
index 3b3140b48..ae495660c 100644
--- a/htroot/ProxyIndexingMonitor_p.html
+++ b/htroot/ProxyIndexingMonitor_p.html
@@ -18,12 +18,16 @@ and automatically excluded from indexing.
+
#(info)#
+
::
-
The file DATA/PLASMADB/crawlProfiles0.db is missing or corrupted.
+
+The file DATA/PLASMADB/crawlProfiles0.db is missing or corrupted.
Please delete that file and restart.
::
-
-Proxy pre-fetch is now set to depth-#[message]#.
-Proxy caching is now set #(caching)#off::on#(/caching)#.
+
+Pre-fetch is now set to depth-#[message]#.
+Caching is now #(caching)#off::on#(/caching)#.
+#(path)#::Cachepath is now set to '#[return]#'. Please move the old data into the new directory.
#(/path)#
+#(size)#::Cachesize is now set to #[return]#MB.
#(/size)#
+#(restart)#::
Changes will take effect after restart only.
#(/restart)#
::
-
An error has occurred: #[error]#.
+
+An error has occurred: #[error]#.
#(/info)#
+
You can see a snapshot of recently indexed pages
on the Proxy Index Monitor Page.
diff --git a/htroot/ProxyIndexingMonitor_p.java b/htroot/ProxyIndexingMonitor_p.java
index 596e72505..0aaf83d61 100644
--- a/htroot/ProxyIndexingMonitor_p.java
+++ b/htroot/ProxyIndexingMonitor_p.java
@@ -40,13 +40,14 @@
// Contributions and changes to the program code must be marked as such.
// You must compile this file with
-// javac -classpath .:../Classes Settings_p.java
+// javac -classpath .:../classes ProxyIndexingMonitor_p.java
// if the shell's current path is HTROOT
+import java.io.File;
import java.io.IOException;
-import java.text.SimpleDateFormat;
-import java.util.Date;
-import java.util.Locale;
+// import java.text.SimpleDateFormat;
+// import java.util.Date;
+// import java.util.Locale;
import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaCrawlProfile;
@@ -54,28 +55,30 @@ import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.server.logging.serverLog;
-import de.anomic.yacy.yacyCore;
public class ProxyIndexingMonitor_p {
- private static SimpleDateFormat dayFormatter = new SimpleDateFormat("yyyy/MM/dd", Locale.US);
- private static String daydate(Date date) {
- if (date == null) return ""; else return dayFormatter.format(date);
- }
-
+// private static SimpleDateFormat dayFormatter = new SimpleDateFormat("yyyy/MM/dd", Locale.US);
+// private static String daydate(Date date) {
+// if (date == null) return ""; else return dayFormatter.format(date);
+// }
+
public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
- // return variable that accumulates replacements
+ // return variable that accumulates replacements
plasmaSwitchboard switchboard = (plasmaSwitchboard) env;
- serverObjects prop = new serverObjects();
-
- int showIndexedCount = 20;
- boolean se = false;
-
+ serverObjects prop = new serverObjects();
+
+// int showIndexedCount = 20;
+// boolean se = false;
+
+ String oldProxyCache, newProxyCache;
+ String oldProxyCacheSize, newProxyCacheSize;
+
prop.put("info", 0);
prop.put("info_message", "");
-
+
if (post != null) {
-
+
if (post.containsKey("proxyprofileset")) try {
// read values and put them in global settings
int newProxyPrefetchDepth = Integer.parseInt((String) post.get("proxyPrefetchDepth", "0"));
@@ -83,34 +86,73 @@ public class ProxyIndexingMonitor_p {
boolean proxyStoreHTCache = ((String) post.get("proxyStoreHTCache", "")).equals("on");
env.setConfig("proxyStoreHTCache", (proxyStoreHTCache) ? "true" : "false");
+ // added proxyCache, proxyCacheSize - Borg-0300
+ // proxyCache - check and create the directory
+ oldProxyCache = env.getConfig("proxyCache", "DATA/HTCACHE");
+ newProxyCache = ((String) post.get("proxyCache", "DATA/HTCACHE"));
+ newProxyCache = newProxyCache.replace('\\', '/');
+ if (newProxyCache.endsWith("/")) newProxyCache = newProxyCache.substring(0, newProxyCache.length() - 1);
+ File cp = new File(newProxyCache);
+ if ((!cp.isDirectory()) && (!cp.isFile())) cp.mkdirs();
+ env.setConfig("proxyCache", newProxyCache);
+
+ // proxyCacheSize
+ oldProxyCacheSize = Integer.toString(Integer.parseInt(env.getConfig("proxyCacheSize", "64")));
+ newProxyCacheSize = Integer.toString(Integer.parseInt((String) post.get("proxyCacheSize", "64")));
+ env.setConfig("proxyCacheSize", newProxyCacheSize);
+
// implant these settings also into the crawling profile for the proxy
plasmaCrawlProfile.entry profile = switchboard.profiles.getEntry(switchboard.getConfig("defaultProxyProfile", ""));
if (profile == null) {
- prop.put("info", 1);//delete DATA/PLASMADB/crawlProfiles0.db
+ prop.put("info", 1); //delete DATA/PLASMADB/crawlProfiles0.db
} else {
try {
- profile.changeEntry("generalDepth", Integer.toString(newProxyPrefetchDepth));
+ profile.changeEntry("generalDepth", Integer.toString(newProxyPrefetchDepth));
profile.changeEntry("storeHTCache", (proxyStoreHTCache) ? "true": "false");
- prop.put("info", 2);//new proxyPrefetchdepth
- prop.put("info_message", newProxyPrefetchDepth);
+ prop.put("info", 2);//new proxyPrefetchdepth
+ prop.put("info_message", newProxyPrefetchDepth);
prop.put("info_caching", (proxyStoreHTCache) ? 1 : 0);
+
+ // proxyCache - only display on change
+ if (oldProxyCache.compareTo(newProxyCache) == 0) {
+ prop.put("info_path", 0);
+ prop.put("info_path_return", oldProxyCache);
+ } else {
+ prop.put("info_path", 1);
+ prop.put("info_path_return", newProxyCache);
+ }
+ // proxyCacheSize - only display on change
+ if (oldProxyCacheSize.compareTo(newProxyCacheSize) == 0) {
+ prop.put("info_size", 0);
+ prop.put("info_size_return", oldProxyCacheSize);
+ } else {
+ prop.put("info_size", 1);
+ prop.put("info_size_return", newProxyCacheSize);
+ }
+ // proxyCache, proxyCacheSize we need a restart
+ prop.put("info_restart", 0);
+ prop.put("info_restart_return", 0);
+ if (oldProxyCache.compareTo(newProxyCache) != 0) prop.put("info_restart", 1);
+ if (oldProxyCacheSize.compareTo(newProxyCacheSize) != 0) prop.put("info_restart", 1);
+
} catch (IOException e) {
- prop.put("info", 3); //Error: errmsg
- prop.put("info_error", e.getMessage());
+ prop.put("info", 3); //Error: errmsg
+ prop.put("info_error", e.getMessage());
}
}
-
+
} catch (Exception e) {
prop.put("info", 2); //Error: errmsg
prop.put("info_error", e.getMessage());
serverLog.logError("SERVLET", "ProxyIndexingMonitor.case3", e);
}
}
-
+
prop.put("proxyPrefetchDepth", env.getConfig("proxyPrefetchDepth", "0"));
prop.put("proxyStoreHTCacheChecked", env.getConfig("proxyStoreHTCache", "").equals("true") ? 1 : 0);
+ prop.put("proxyCache", env.getConfig("proxyCache", "DATA/HTCACHE"));
+ prop.put("proxyCacheSize", env.getConfig("proxyCacheSize", "64"));
// return rewrite properties
- return prop;
+ return prop;
}
-
}
diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
index 1d4f0598b..883ef253a 100644
--- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
+++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
@@ -113,8 +113,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
int p;
if ((p = us.indexOf("#")) >= 0) us = us.substring(0, p);
if (us.endsWith(":80")) us = us.substring(0, us.length() - 3);
- p = us.indexOf(":80/");
- if (p >= 0) us = us.substring(0,p).concat(us.substring(p + 3));
+ if ((p = us.indexOf(":80/")) >= 0) us = us.substring(0,p).concat(us.substring(p + 3));
if (((us.endsWith("/")) && (us.lastIndexOf('/', us.length() - 2) < 8))) us = us.substring(0, us.length() - 1);
return us;
}
diff --git a/source/de/anomic/plasma/plasmaHTCache.java b/source/de/anomic/plasma/plasmaHTCache.java
index 15c431b08..11cca0fb0 100644
--- a/source/de/anomic/plasma/plasmaHTCache.java
+++ b/source/de/anomic/plasma/plasmaHTCache.java
@@ -59,9 +59,9 @@ import java.util.Date;
import java.util.LinkedList;
import java.util.Map;
import java.util.TreeMap;
-import java.util.Calendar;
-import java.util.GregorianCalendar;
-import java.util.TimeZone;
+//import java.util.Calendar;
+//import java.util.GregorianCalendar;
+//import java.util.TimeZone;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.http.httpHeader;
@@ -79,7 +79,7 @@ public final class plasmaHTCache {
private static final int stackLimit = 150; // if we exceed that limit, we do not check idle
public static final long oneday = 1000 * 60 * 60 * 24; // milliseconds of a day
-
+
private kelondroMap responseHeaderDB = null;
private final LinkedList cacheStack;
private final TreeMap cacheAge; // a - relation
@@ -89,67 +89,68 @@ public final class plasmaHTCache {
public static serverLog log;
public plasmaHTCache(File htCachePath, long maxCacheSize, int bufferkb) {
- //this.switchboard = switchboard;
-
+ // this.switchboard = switchboard;
+
this.log = new serverLog("HTCACHE");
this.cachePath = htCachePath;
this.maxCacheSize = maxCacheSize;
-
- // set cache path
- if (!(htCachePath.exists())) {
- // make the cache path
- htCachePath.mkdir();
- }
- if (!(htCachePath.isDirectory())) {
- // if the cache does not exists or is a file and not a directory, panic
- System.out.println("the cache path " + htCachePath.toString() + " is not a directory or does not exists and cannot be created");
- System.exit(0);
- }
- // open the response header database
- File dbfile = new File(cachePath, "responseHeader.db");
- try {
+ // we don't need to check the path, because we have already done that in plasmaSwitchboard.java - Borg-0300
+/* // set cache path
+ if (!(htCachePath.exists())) {
+ // make the cache path
+ htCachePath.mkdir();
+ }
+ if (!(htCachePath.isDirectory())) {
+ // if the cache does not exists or is a file and not a directory, panic
+ System.out.println("the cache path " + htCachePath.toString() + " is not a directory or does not exists and cannot be created");
+ System.exit(0);
+ }*/
+
+ // open the response header database
+ File dbfile = new File(cachePath, "responseHeader.db");
+ try {
if (dbfile.exists())
- responseHeaderDB = new kelondroMap(new kelondroDyn(dbfile, bufferkb * 0x400));
- else
- responseHeaderDB = new kelondroMap(new kelondroDyn(dbfile, bufferkb * 0x400, plasmaCrawlLURL.urlHashLength, 150));
- } catch (IOException e) {
- System.out.println("the request header database could not be opened: " + e.getMessage());
- System.exit(0);
- }
+ responseHeaderDB = new kelondroMap(new kelondroDyn(dbfile, bufferkb * 0x400));
+ else
+ responseHeaderDB = new kelondroMap(new kelondroDyn(dbfile, bufferkb * 0x400, plasmaCrawlLURL.urlHashLength, 150));
+ } catch (IOException e) {
+ System.out.println("the request header database could not be opened: " + e.getMessage());
+ System.exit(0);
+ }
- // init stack
- cacheStack = new LinkedList();
+ // init stack
+ cacheStack = new LinkedList();
// init cache age and size management
- cacheAge = new TreeMap();
- currCacheSize = 0;
- this.maxCacheSize = maxCacheSize;
-
- // start the cache startup thread
- // this will collect information about the current cache size and elements
- serverInstantThread.oneTimeJob(this, "cacheScan", log, 5000);
+ cacheAge = new TreeMap();
+ currCacheSize = 0;
+ this.maxCacheSize = maxCacheSize;
+
+ // start the cache startup thread
+ // this will collect information about the current cache size and elements
+ serverInstantThread.oneTimeJob(this, "cacheScan", log, 5000);
}
-
+
public int size() {
return cacheStack.size();
}
-
+
public void push(Entry entry) {
cacheStack.add(entry);
}
-
+
public Entry pop() {
if (cacheStack.size() > 0)
return (Entry) cacheStack.removeFirst();
else
return null;
}
-
+
public void storeHeader(String urlHash, httpHeader responseHeader) throws IOException {
responseHeaderDB.set(urlHash, responseHeader);
}
-
+
private boolean deleteFile(File file) {
if (file.exists()) {
currCacheSize -= file.length();
@@ -158,11 +159,11 @@ public final class plasmaHTCache {
return false;
}
}
-
+
public boolean deleteFile(URL url) {
return deleteFile(getCachePath(url));
}
-
+
public boolean writeFile(URL url, byte[] array) {
if (array == null) return false;
File file = getCachePath(url);
@@ -183,7 +184,7 @@ public final class plasmaHTCache {
writeFileAnnouncement(file);
return true;
}
-
+
public void writeFileAnnouncement(File file) {
synchronized (cacheAge) {
if (file.exists()) {
@@ -193,101 +194,99 @@ public final class plasmaHTCache {
}
}
}
-
+
private void cleanup() {
// clean up cache to have enough space for next entries
File f;
while ((currCacheSize > maxCacheSize) && (cacheAge.size() > 0)) {
f = (File) cacheAge.remove(cacheAge.firstKey());
if ((f != null) && (f.exists())) {
- currCacheSize -= f.length();
+ long size = f.length();
+ //currCacheSize -= f.length();
if (f.delete()) {
log.logInfo("DELETED OLD CACHE : " + f.toString());
+ currCacheSize -= size;
f = f.getParentFile();
- if ((f.exists()) && (f.isDirectory())) {
- // check size of directory
- if (f.list().length == 0) {
- // the directory has no files in it; delete it also
- if (f.delete()) log.logInfo("DELETED EMPTY DIRECTORY : " + f.toString());
- }
+ if (f.isDirectory() && (f.list().length == 0)) {
+ // the directory has no files in it; delete it also
+ if (f.delete()) log.logInfo("DELETED EMPTY DIRECTORY : " + f.toString());
}
}
}
}
}
-
+
public void close() throws IOException {
responseHeaderDB.close();
}
-
+
private String ageString(long date, File f) {
- StringBuffer sb = new StringBuffer(32);
- String s = Long.toHexString(date);
- for (int i = s.length(); i < 16; i++) sb.append('0');
- sb.append(s);
- s = Integer.toHexString(f.hashCode());
- for (int i = s.length(); i < 8; i++) sb.append('0');
- sb.append(s);
- return sb.toString();
+ StringBuffer sb = new StringBuffer(32);
+ String s = Long.toHexString(date);
+ for (int i = s.length(); i < 16; i++) sb.append('0');
+ sb.append(s);
+ s = Integer.toHexString(f.hashCode());
+ for (int i = s.length(); i < 8; i++) sb.append('0');
+ sb.append(s);
+ return sb.toString();
}
-
- public void cacheScan() {
- //log.logSystem("STARTING CACHE SCANNING");
- kelondroMScoreCluster doms = new kelondroMScoreCluster();
- int c = 0;
- enumerateFiles ef = new enumerateFiles(cachePath, true, false, true, true);
- File f;
- while (ef.hasMoreElements()) {
- c++;
- f = (File) ef.nextElement();
- long d = f.lastModified();
- //System.out.println("Cache: " + dom(f));
- doms.incScore(dom(f));
- currCacheSize += f.length();
- cacheAge.put(ageString(d, f), f);
- }
- //System.out.println("%" + (String) cacheAge.firstKey() + "=" + cacheAge.get(cacheAge.firstKey()));
- long ageHours = 0;
- try {
- ageHours = (System.currentTimeMillis() -
+
+ public void cacheScan() {
+ //log.logSystem("STARTING CACHE SCANNING");
+ kelondroMScoreCluster doms = new kelondroMScoreCluster();
+ int c = 0;
+ enumerateFiles ef = new enumerateFiles(cachePath, true, false, true, true);
+ File f;
+ while (ef.hasMoreElements()) {
+ c++;
+ f = (File) ef.nextElement();
+ long d = f.lastModified();
+ //System.out.println("Cache: " + dom(f));
+ doms.incScore(dom(f));
+ currCacheSize += f.length();
+ cacheAge.put(ageString(d, f), f);
+ }
+ //System.out.println("%" + (String) cacheAge.firstKey() + "=" + cacheAge.get(cacheAge.firstKey()));
+ long ageHours = 0;
+ try {
+ ageHours = (System.currentTimeMillis() -
Long.parseLong(((String) cacheAge.firstKey()).substring(0, 16), 16)) / 3600000;
- } catch (NumberFormatException e) {
- //e.printStackTrace();
- }
- log.logSystem("CACHE SCANNED, CONTAINS " + c +
- " FILES = " + currCacheSize/1048576 + "MB, OLDEST IS " +
- ((ageHours < 24) ? (ageHours + " HOURS") : ((ageHours / 24) + " DAYS")) +
- " OLD");
- cleanup();
-
- // start to prefetch ip's from dns
- String dom;
- long start = System.currentTimeMillis();
- String ip, result = "";
- c = 0;
- while ((doms.size() > 0) && (c < 50) && ((System.currentTimeMillis() - start) < 60000)) {
- dom = (String) doms.getMaxObject();
- ip = httpc.dnsResolve(dom);
- if (ip == null) break;
- result += ", " + dom + "=" + ip;
- log.logSystem("PRE-FILLED " + dom + "=" + ip);
- c++;
- doms.deleteScore(dom);
- // wait a short while to prevent that this looks like a DoS
- try {Thread.currentThread().sleep(100);} catch (InterruptedException e) {}
- }
- if (result.length() > 2) log.logSystem("PRE-FILLED DNS CACHE, FETCHED " + c +
- " ADDRESSES: " + result.substring(2));
- }
+ } catch (NumberFormatException e) {
+ //e.printStackTrace();
+ }
+ log.logSystem("CACHE SCANNED, CONTAINS " + c +
+ " FILES = " + currCacheSize/1048576 + "MB, OLDEST IS " +
+ ((ageHours < 24) ? (ageHours + " HOURS") : ((ageHours / 24) + " DAYS")) + " OLD");
+ cleanup();
- private String dom(File f) {
- String s = f.toString().substring(cachePath.toString().length() + 1);
- int p = s.indexOf("/");
- if (p < 0) p = s.indexOf("\\");
- if (p < 0) return null;
- return s.substring(0, p);
+ // start to prefetch ip's from dns
+ String dom;
+ long start = System.currentTimeMillis();
+ String ip, result = "";
+ c = 0;
+ while ((doms.size() > 0) && (c < 50) && ((System.currentTimeMillis() - start) < 60000)) {
+ dom = (String) doms.getMaxObject();
+ ip = httpc.dnsResolve(dom);
+ if (ip == null) break;
+ result += ", " + dom + "=" + ip;
+ log.logSystem("PRE-FILLED " + dom + "=" + ip);
+ c++;
+ doms.deleteScore(dom);
+ // wait a short while to prevent that this looks like a DoS
+ try {Thread.currentThread().sleep(100);} catch (InterruptedException e) {}
}
-
+ if (result.length() > 2) log.logSystem("PRE-FILLED DNS CACHE, FETCHED " + c +
+ " ADDRESSES: " + result.substring(2));
+ }
+
+ private String dom(File f) {
+ String s = f.toString().substring(cachePath.toString().length() + 1);
+ int p = s.indexOf("/");
+ if (p < 0) p = s.indexOf("\\");
+ if (p < 0) return null;
+ return s.substring(0, p);
+ }
+
public httpHeader getCachedResponse(String urlHash) throws IOException {
Map hdb = responseHeaderDB.get(urlHash);
if (hdb == null) return null;
@@ -295,19 +294,19 @@ public final class plasmaHTCache {
}
public boolean full() {
- return (cacheStack.size() > stackLimit);
+ return (cacheStack.size() > stackLimit);
}
public boolean empty() {
- return (cacheStack.size() == 0);
+ return (cacheStack.size() == 0);
}
-
+
public static boolean isPicture(httpHeader response) {
Object ct = response.get(httpHeader.CONTENT_TYPE);
if (ct == null) return false;
return ((String)ct).toUpperCase().startsWith("IMAGE");
}
-
+
public static boolean isText(httpHeader response) {
// Object ct = response.get(httpHeader.CONTENT_TYPE);
// if (ct == null) return false;
@@ -336,64 +335,76 @@ public final class plasmaHTCache {
// );
int idx = urlString.indexOf("?");
if (idx > 0) urlString = urlString.substring(0,idx);
-
+
idx = urlString.lastIndexOf(".");
if (idx > 0) urlString = urlString.substring(idx+1);
-
+
return plasmaParser.mediaExtContains(urlString);
}
-
- // this method creates from a given host and path a cache path
+
+ /**
+ * this method creates from a given host and path a cache path
+ * from a given host (which may also be an IPv4 - number, but not IPv6 or
+ * a domain; all without leading 'http://') and a path (which must start
+ * with a leading '/', and may also end in an '/') a path to a file
+ * in the file system with root as given in cachePath is constructed
+ * it will also be ensured, that the complete path exists; if necessary
+ * that path will be generated
+ * @return URL
+ */
public File getCachePath(URL url) {
- // from a given host (which may also be an IPv4 - number, but not IPv6 or
- // a domain; all without leading 'http://') and a path (which must start
- // with a leading '/', and may also end in an '/') a path to a file
- // in the file system with root as given in cachePath is constructed
- // it will also be ensured, that the complete path exists; if necessary
- // that path will be generated
- //System.out.println("DEBUG: getCachedPath=" + url.toString());
- String remotePath = url.getPath();
- if (!(remotePath.startsWith("/"))) remotePath = "/" + remotePath;
- if (remotePath.endsWith("/")) remotePath = remotePath + "ndx";
+ // System.out.println("DEBUG: getCachePath: IN=" + url.toString());
+ String remotePath = url.getPath();
+ if (!(remotePath.startsWith("/"))) remotePath = "/" + remotePath;
+ if (remotePath.endsWith("/")) remotePath = remotePath + "ndx";
if (remotePath.indexOf('#') > 0) remotePath.substring(0, remotePath.indexOf('#'));
remotePath = remotePath.replace('?', '_');
remotePath = remotePath.replace('&', '_'); // yes this is not reversible, but that is not needed
remotePath = remotePath.replace(':', '_'); // yes this is not reversible, but that is not needed
- int port = url.getPort();
- if (port < 0) port = 80;
- return new File(this.cachePath, url.getHost() + ((port == 80) ? "" : ("+" + port)) + remotePath);
+ int port = url.getPort();
+ if (port < 0) port = 80;
+ // System.out.println("DEBUG: getCachePath: OUT=" + url.getHost() + ((port == 80) ? "" : ("+" + port)) + remotePath);
+ return new File(this.cachePath, url.getHost() + ((port == 80) ? "" : ("+" + port)) + remotePath);
}
+ /**
+ * this is the reverse function to getCachePath: it constructs the url as string
+ * from a given storage path
+ */
public static URL getURL(File cachePath, File f) {
- // this is the reverse function to getCachePath: it constructs the url as string
- // from a given storage path
- String s = f.toString().replace('\\', '/');
- String c = cachePath.toString().replace('\\', '/');
- //System.out.println("DEBUG: getURL for c=" + c + ", s=" + s);
- int p = s.lastIndexOf(c);
- if (p >= 0) {
- s = s.substring(p + c.length());
- while (s.startsWith("/")) s = s.substring(1);
- if ((p = s.indexOf("+")) >= 0) {
+ // System.out.println("DEBUG: getURL: IN: Path=[" + cachePath + "]");
+ // System.out.println("DEBUG: getURL: IN: File=[" + f + "]");
+ String s = f.toString().replace('\\', '/');
+ String c = cachePath.toString().replace('\\', '/');
+ int p = s.lastIndexOf(c);
+ if (p >= 0) {
+ s = s.substring(p + c.length());
+ while (s.startsWith("/")) s = s.substring(1);
+ if ((p = s.indexOf("+")) >= 0) {
s = s.substring(0, p) + ":" + s.substring(p + 1);
- } else {
+/* } else {
p = s.indexOf("/");
if (p < 0)
s = s + ":80/";
else
- s = s.substring(0, p) + ":80" + s.substring(p);
+ s = s.substring(0, p) + ":80" + s.substring(p);*/
}
- if (s.endsWith("ndx")) s = s.substring(0, s.length() - 3);
- //System.out.println("DEBUG: getURL url=" + s);
+ if (s.endsWith("ndx")) s = s.substring(0, s.length() - 3);
+ // System.out.println("DEBUG: getURL: OUT=" + s);
+
try {
+/* URL url = null;
+ url = new URL("http://" + s);
+ System.out.println("DEBUG: getURL: URL=" + url.toString());
+ return url;//new URL("http://" + s); */
return new URL("http://" + s);
} catch (Exception e) {
return null;
}
- }
- return null;
+ }
+ return null;
}
-
+
public byte[] loadResource(URL url) {
// load the url as resource from the cache
File f = getCachePath(url);
@@ -405,10 +416,10 @@ public final class plasmaHTCache {
return null;
}
}
-
+
public static boolean isPOST(String urlString) {
- return ((urlString.indexOf("?") >= 0) ||
- (urlString.indexOf("&") >= 0));
+ return ((urlString.indexOf("?") >= 0) ||
+ (urlString.indexOf("&") >= 0));
}
public static boolean isCGI(String urlString) {
@@ -421,8 +432,8 @@ public final class plasmaHTCache {
}
public Entry newEntry(Date initDate, int depth, URL url, String name,
- httpHeader requestHeader,
- String responseStatus, httpHeader responseHeader,
+ httpHeader requestHeader,
+ String responseStatus, httpHeader responseHeader,
String initiator,
plasmaCrawlProfile.entry profile) {
return new Entry(initDate, depth, url, name, requestHeader, responseStatus, responseHeader, initiator, profile);
@@ -430,108 +441,108 @@ public final class plasmaHTCache {
public final class Entry {
- // the class objects
- public Date initDate; // the date when the request happened; will be used as a key
- public int depth; // the depth of prefetching
- public httpHeader requestHeader; // we carry also the header to prevent too many file system access
- public String responseStatus;
- public httpHeader responseHeader; // we carry also the header to prevent too many file system access
- public File cacheFile; // the cache file
- public byte[] cacheArray; // or the cache as byte-array
- public URL url;
- public String name; // the name of the link, read as anchor from an -tag
- public String nomalizedURLHash;
- public String nomalizedURLString;
- public int status; // cache load/hit/stale etc status
- public Date lastModified;
- public char doctype;
- public String language;
- public plasmaCrawlProfile.entry profile;
- private String initiator;
+ // the class objects
+ public Date initDate; // the date when the request happened; will be used as a key
+ public int depth; // the depth of prefetching
+ public httpHeader requestHeader; // we carry also the header to prevent too many file system access
+ public String responseStatus;
+ public httpHeader responseHeader; // we carry also the header to prevent too many file system access
+ public File cacheFile; // the cache file
+ public byte[] cacheArray; // or the cache as byte-array
+ public URL url;
+ public String name; // the name of the link, read as anchor from an -tag
+ public String nomalizedURLHash;
+ public String nomalizedURLString;
+ public int status; // cache load/hit/stale etc status
+ public Date lastModified;
+ public char doctype;
+ public String language;
+ public plasmaCrawlProfile.entry profile;
+ private String initiator;
-
- public Entry(Date initDate, int depth, URL url, String name,
- httpHeader requestHeader,
- String responseStatus, httpHeader responseHeader,
- String initiator,
- plasmaCrawlProfile.entry profile) {
-
- // normalize url
- this.nomalizedURLString = htmlFilterContentScraper.urlNormalform(url);
- try {
- this.url = new URL(nomalizedURLString);
- } catch (MalformedURLException e) {
- System.out.println("internal error at httpdProxyCache.Entry: " + e);
- System.exit(-1);
- }
- this.name = name;
- this.cacheFile = getCachePath(this.url);
- this.nomalizedURLHash = plasmaCrawlLURL.urlHash(nomalizedURLString);
-
- // assigned:
- this.initDate = initDate;
- this.depth = depth;
- this.requestHeader = requestHeader;
- this.responseStatus = responseStatus;
- this.responseHeader = responseHeader;
- this.profile = profile;
- this.initiator = (initiator == null) ? null : ((initiator.length() == 0) ? null: initiator);
-
- // calculated:
- if (responseHeader == null) {
- try {
- throw new RuntimeException("RESPONSE HEADER = NULL");
- } catch (Exception e) {
- System.out.println("RESPONSE HEADER = NULL in " + url);
- e.printStackTrace();
- System.exit(0);
- }
-
- lastModified = serverDate.correctedGMTDate();
- } else {
- lastModified = responseHeader.lastModified();
- if (lastModified == null) lastModified = serverDate.correctedGMTDate(); // does not exist in header
- }
- this.doctype = plasmaWordIndexEntry.docType(responseHeader.mime());
- if (this.doctype == plasmaWordIndexEntry.DT_UNKNOWN) this.doctype = plasmaWordIndexEntry.docType(url);
- this.language = plasmaWordIndexEntry.language(url);
+ public Entry(Date initDate, int depth, URL url, String name,
+ httpHeader requestHeader,
+ String responseStatus, httpHeader responseHeader,
+ String initiator,
+ plasmaCrawlProfile.entry profile) {
- // to be defined later:
- this.cacheArray = null;
- }
-
- public String name() {
- return name;
- }
- public String initiator() {
- return initiator;
+ // normalize url - Borg-0300
+ serverLog.logDebug("PLASMA", "Entry: URL=" + url.toString());
+ this.nomalizedURLString = htmlFilterContentScraper.urlNormalform(url);
+ try {
+ this.url = new URL(nomalizedURLString);
+ } catch (MalformedURLException e) {
+ System.out.println("internal error at httpdProxyCache.Entry: " + e);
+ System.exit(-1);
}
- public boolean proxy() {
- return initiator() == null;
+ this.name = name;
+ this.cacheFile = getCachePath(this.url);
+ this.nomalizedURLHash = plasmaCrawlLURL.urlHash(nomalizedURLString);
+
+ // assigned:
+ this.initDate = initDate;
+ this.depth = depth;
+ this.requestHeader = requestHeader;
+ this.responseStatus = responseStatus;
+ this.responseHeader = responseHeader;
+ this.profile = profile;
+ this.initiator = (initiator == null) ? null : ((initiator.length() == 0) ? null: initiator);
+
+ // calculated:
+ if (responseHeader == null) {
+ try {
+ throw new RuntimeException("RESPONSE HEADER = NULL");
+ } catch (Exception e) {
+ System.out.println("RESPONSE HEADER = NULL in " + url);
+ e.printStackTrace();
+ System.exit(0);
+ }
+
+ lastModified = serverDate.correctedGMTDate();
+ } else {
+ lastModified = responseHeader.lastModified();
+ if (lastModified == null) lastModified = serverDate.correctedGMTDate(); // does not exist in header
}
- public long size() {
- if (cacheArray == null) return 0; else return cacheArray.length;
- }
+ this.doctype = plasmaWordIndexEntry.docType(responseHeader.mime());
+ if (this.doctype == plasmaWordIndexEntry.DT_UNKNOWN) this.doctype = plasmaWordIndexEntry.docType(url);
+ this.language = plasmaWordIndexEntry.language(url);
- public URL referrerURL() {
- if (requestHeader == null) return null;
- try {
- return new URL((String) requestHeader.get(httpHeader.REFERER, ""));
- } catch (Exception e) {
- return null;
- }
+ // to be defined later:
+ this.cacheArray = null;
+ }
+
+ public String name() {
+ return name;
+ }
+ public String initiator() {
+ return initiator;
+ }
+ public boolean proxy() {
+ return initiator() == null;
+ }
+ public long size() {
+ if (cacheArray == null) return 0; else return cacheArray.length;
+ }
+
+ public URL referrerURL() {
+ if (requestHeader == null) return null;
+ try {
+ return new URL((String) requestHeader.get(httpHeader.REFERER, ""));
+ } catch (Exception e) {
+ return null;
}
-
- /*
+ }
+
+ /*
public boolean update() {
return ((status == CACHE_FILL) || (status == CACHE_STALE_RELOAD_GOOD));
}
*/
- // the following three methods for cache read/write granting shall be as loose as possible
- // but also as strict as necessary to enable caching of most items
+ // the following three methods for cache read/write granting shall be as loose as possible
+ // but also as strict as necessary to enable caching of most items
- public String shallStoreCacheForProxy() {
+ public String shallStoreCacheForProxy() {
// returns NULL if the answer is TRUE
// in case of FALSE, the reason as String is returned
diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java
index 5887635b9..d1056d393 100644
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@@ -100,25 +100,25 @@
package de.anomic.plasma;
-import java.io.BufferedReader;
+// import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
-import java.io.FileInputStream;
+// import java.io.FileInputStream;
import java.io.IOException;
-import java.io.InputStreamReader;
+// import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.Date;
-import java.util.Enumeration;
+// import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
-import java.util.TreeMap;
-import java.util.Vector;
+// import java.util.TreeMap;
+// import java.util.Vector;
import de.anomic.data.messageBoard;
import de.anomic.data.wikiBoard;
@@ -130,24 +130,23 @@ import de.anomic.kelondro.kelondroTables;
import de.anomic.server.serverAbstractSwitch;
import de.anomic.server.serverCodings;
import de.anomic.server.serverCore;
-import de.anomic.server.serverDate;
+// import de.anomic.server.serverDate;
import de.anomic.server.serverInstantThread;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSemaphore;
import de.anomic.server.serverSwitch;
import de.anomic.server.logging.serverLog;
-import de.anomic.server.serverFileUtils;
+// import de.anomic.server.serverFileUtils;
import de.anomic.tools.bitfield;
import de.anomic.tools.crypt;
import de.anomic.yacy.yacyClient;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacySearch;
import de.anomic.yacy.yacySeed;
-import de.anomic.yacy.yacySeedDB;
+// import de.anomic.yacy.yacySeedDB;
public final class plasmaSwitchboard extends serverAbstractSwitch implements serverSwitch {
-
// load slots
public static int crawlSlots = 10;
public static int indexingSlots = 100;
@@ -158,7 +157,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
public static plasmaURLPattern urlBlacklist;
// storage management
- private File cachePath;
+ private File cachePath; // do we need that ?
private File plasmaPath;
public File listsPath;
public plasmaURLPool urlPool;
@@ -190,15 +189,15 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
private Object crawlingPausedSync = new Object();
private boolean crawlingIsPaused = false;
private static plasmaSwitchboard sb;
-
+
public plasmaSwitchboard(String rootPath, String initPath, String configPath) throws IOException {
- super(rootPath, initPath, configPath);
-
+ super(rootPath, initPath, configPath);
+
// set loglevel and log
- setLog(new serverLog("PLASMA"));
-
- // load values from configs
- plasmaPath = new File(rootPath, getConfig("dbPath", "PLASMADB"));
+ setLog(new serverLog("PLASMA"));
+
+ // load values from configs
+ plasmaPath = new File(rootPath, getConfig("dbPath", "PLASMADB"));
listsPath = new File(rootPath, getConfig("listsPath", "LISTS"));
remoteProxyHost = getConfig("remoteProxyHost", "");
try {
@@ -216,14 +215,14 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
proxyLastAccess = System.currentTimeMillis() - 60000;
if (!(listsPath.exists())) listsPath.mkdirs();
-
- // load coloured lists
- if (blueList == null) {
- // read only once upon first instantiation of this class
- String f = getConfig("plasmaBlueList", null);
- if (f != null) blueList = kelondroMSetTools.loadList(new File(f)); else blueList= new TreeSet();
- }
-
+
+ // load coloured lists
+ if (blueList == null) {
+ // read only once upon first instantiation of this class
+ String f = getConfig("plasmaBlueList", null);
+ if (f != null) blueList = kelondroMSetTools.loadList(new File(f)); else blueList= new TreeSet();
+ }
+
// load the black-list / inspired by [AS]
urlBlacklist = new plasmaURLPattern(new File(getRootPath(), getConfig("listsPath", "DATA/LISTS")));
String f = getConfig("proxyBlackListsActive", null);
@@ -237,8 +236,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if (stopwords == null) {
stopwords = kelondroMSetTools.loadList(new File(rootPath, "yacy.stopwords"));
}
-
- // read memory amount
+
+ // read memory amount
int ramLURL = Integer.parseInt(getConfig("ramCacheLURL", "1024")) / 1024;
int ramNURL = Integer.parseInt(getConfig("ramCacheNURL", "1024")) / 1024;
int ramEURL = Integer.parseInt(getConfig("ramCacheEURL", "1024")) / 1024;
@@ -254,7 +253,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
log.logSystem("Message Cache memory = " + ppRamString(ramMessage));
log.logSystem("Wiki Cache memory = " + ppRamString(ramWiki));
- // make crawl profiles database and default profiles
+ // make crawl profiles database and default profiles
log.logSystem("Initializing Crawl Profiles");
profiles = new plasmaCrawlProfile(new File(plasmaPath, "crawlProfiles0.db"));
initProfiles();
@@ -270,7 +269,21 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// start a cache manager
log.logSystem("Starting HT Cache Manager");
- File htCachePath = new File(getRootPath(), getConfig("proxyCache","HTCACHE"));
+
+ // create the Cache directory - Borg-0300
+ String cp = getConfig("proxyCache", "DATA/HTCACHE");
+ cp = cp.replace('\\', '/');
+ if (cp.endsWith("/")) cp = cp.substring(0,cp.length() - 1);
+ File htCachePath = new File(cp);
+ if (!(htCachePath.exists())) htCachePath.mkdirs();
+ if (!(htCachePath.isDirectory())) {
+ // if the cache does not exist or is a file and not a directory, panic
+ serverLog.logSystem("PLASMA", "the cache path " + htCachePath.toString() + " is not a directory or does not exists and cannot be created");
+ System.exit(0);
+ } else {
+ serverLog.logInfo("PLASMA", "proxyCache=" + cp);
+ }
+
long maxCacheSize = 1024 * 1024 * Long.parseLong(getConfig("proxyCacheSize", "2")); // this is megabyte
this.cacheManager = new plasmaHTCache(htCachePath, maxCacheSize, ramHTTP);
@@ -309,7 +322,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
messageDB = new messageBoard(new File(getRootPath(), "DATA/SETTINGS/message.db"), ramMessage);
log.logSystem("Starting Wiki Board");
wikiDB = new wikiBoard(new File(getRootPath(), "DATA/SETTINGS/wiki.db"),
- new File(getRootPath(), "DATA/SETTINGS/wiki-bkp.db"), ramWiki);
+ new File(getRootPath(), "DATA/SETTINGS/wiki-bkp.db"), ramWiki);
// init cookie-Monitor
log.logSystem("Starting Cookie Monitor");