*) proxyCache, proxyCacheSize can be changed under 'Proxy Indexing'

- paths are now absolute
*) move path check from plasmaHTCache to plasmaSwitchboard
   - only one path check when starting
*) other small changes

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@606 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
borg-0300 20 years ago
parent 3dfda1c9da
commit bf14e6def5

@ -50,7 +50,6 @@ import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Iterator;
import java.util.Map;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterOutputStream;
import de.anomic.http.httpHeader;
@ -70,7 +69,6 @@ public class CacheAdmin_p {
return SimpleFormatter.format(date);
}
public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
plasmaSwitchboard switchboard = (plasmaSwitchboard) env;
serverObjects prop = new serverObjects();
@ -78,7 +76,10 @@ public class CacheAdmin_p {
String action = ((post == null) ? "info" : post.get("action", "info"));
String pathString = ((post == null) ? "" : post.get("path", "/"));
String fileString = pathString;
File cache = new File(switchboard.getRootPath(), switchboard.getConfig("proxyCache", "DATA/HTCACHE"));
// we don't need to check the path here, because that is already done in plasmaSwitchboard.java - Borg-0300
File cache = new File(switchboard.getConfig("proxyCache", "DATA/HTCACHE").toString());
File file = new File(cache, pathString);
File dir;
URL url = plasmaHTCache.getURL(cache, file);
@ -89,6 +90,7 @@ public class CacheAdmin_p {
dir = file.getParentFile();
pathString = (new File(pathString)).getParent().replace('\\','/');
}
// generate dir listing
String[] list = dir.list();
File f; String tree = "Directory of<br>" + ((pathString.length() == 0) ? "domain list" : linkPathString(pathString)) + "<br><br>";
@ -97,6 +99,7 @@ public class CacheAdmin_p {
else {
for (int i = 0; i < list.length; i++) {
f = new File(dir, list[i]);
if (!f.getName().equalsIgnoreCase("responseHeader.db"))
if (f.isDirectory())
tree += "<img src=\"/env/grafics/folderIconSmall.gif\" align=\"top\" alt=\"Folder\">&nbsp;<a href=\"CacheAdmin_p.html?action=info&path=" + pathString + "/" + list[i] + "\" class=\"tt\">" + list[i] + "</a><br>" + serverCore.crlfString;
else
@ -141,7 +144,6 @@ public class CacheAdmin_p {
}
}
//
prop.put("cachesize", Long.toString(switchboard.cacheManager.currCacheSize/1024));
prop.put("cachemax", Long.toString(switchboard.cacheManager.maxCacheSize/1024));
prop.put("tree", tree);
@ -193,5 +195,4 @@ public class CacheAdmin_p {
}
return result;
}
}

@ -41,7 +41,7 @@
// You must compile this file with
// javac -classpath .:../Classes Message.java
// javac -classpath .:../classes CacheResource_p.java
// if the shell's current path is HTROOT
import java.io.File;
@ -60,7 +60,10 @@ public class CacheResource_p {
serverObjects prop = new serverObjects();
String path = ((post == null) ? "" : post.get("path", ""));
File cache = new File(switchboard.getRootPath(), switchboard.getConfig("proxyCache", "DATA/HTCACHE"));
// we don't need to check the path here, because that is already done in plasmaSwitchboard.java - Borg-0300
File cache = new File(switchboard.getConfig("proxyCache", "DATA/HTCACHE").toString());
File f = new File(cache, path);
byte[] resource;
@ -72,5 +75,4 @@ public class CacheResource_p {
}
return prop;
}
}

@ -18,12 +18,16 @@ and automatically excluded from indexing.
</p>
<p><form action="ProxyIndexingMonitor_p.html" method="post" enctype="multipart/form-data">
<div class=small><b>Proxy pre-fetch setting:</b>
this is an automated html page loading procedure that takes actual proxy-requested
URLs as crawling start points for crawling.</div>
<table border="0" cellpadding="5" cellspacing="0" width="100%">
<tr class="TableCellLight">
<td colspan="3"><div class=small><b>Proxy pre-fetch setting:</b>
this is an automated html page loading procedure that takes actual proxy-requested
URLs as crawling start points for crawling.</div></td>
</tr>
<tr valign="top" class="TableCellDark">
<td class=small>Prefetch Depth:</td>
<td class=small>Prefetch Depth</td>
<td class=small><input name="proxyPrefetchDepth" type="text" size="2" maxlength="2" value="#[proxyPrefetchDepth]#"></td>
<td class=small>
A prefetch of 0 means no prefetch; a prefetch of 1 means to prefetch all
@ -31,29 +35,52 @@ URLs as crawling start points for crawling.</div>
this means that only embedded href-anchors are prefetched additionally.</td>
</tr>
<tr valign="top" class="TableCellDark">
<td class=small>Store to Cache:</td>
<td class=small>Store to Cache</td>
<td class=small><input type="checkbox" name="proxyStoreHTCache" align="top" #(proxyStoreHTCacheChecked)#::checked#(/proxyStoreHTCacheChecked)#></td>
<td class=small>It is almost always recommended to turn this on. The only exception is when you have another caching proxy running as a secondary proxy and YaCy is configured to use that proxy in proxy-proxy mode.</td>
</tr>
<tr class="TableCellLight">
<td colspan="3"><div class=small><b>Proxy generally</b></div></td>
</tr>
<tr valign="top" class="TableCellDark">
<td class=small>Path</td>
<td class=small><input name="proxyCache" type="text" size="20" maxlength="80" value="#[proxyCache]#"></td>
<td class=small>The path where the pages are stored (max. length 80)</td>
</tr>
<tr valign="top" class="TableCellDark">
<td class=small>Size</td>
<td class=small><input name="proxyCacheSize" type="text" size="8" maxlength="24" value="#[proxyCacheSize]#"></td>
<td class=small>The size in MB of the cache.</td>
</tr>
<tr valign="top" class="TableCellLight">
<td class=small></td>
<td class=small></td>
<td class=small><input type="submit" name="proxyprofileset" value="set proxy profile"></td>
<td class=small colspan="1">&nbsp;</td>
<td class=small colspan="2"><input type="submit" name="proxyprofileset" value="set proxy profile"></td>
</tr>
</table>
</form></p>
<p>
#(info)#
<!-- info 0 -->
::
<br><b>The file DATA/PLASMADB/crawlProfiles0.db is missing or corrupted.
<!-- info 1 -->
<b>The file DATA/PLASMADB/crawlProfiles0.db is missing or corrupted.
Please delete that file and restart.</b><br>
::
<br>
<b>Proxy pre-fetch is now set to depth-#[message]#.</b><br>
<b>Proxy caching is now set #(caching)#off::on#(/caching)#.</b><br>
<!-- info 2 -->
<b>Pre-fetch is now set to depth-#[message]#.</b><br>
<b>Caching is now #(caching)#off::on#(/caching)#.</b><br>
#(path)#::<b>Cachepath is now set to '#[return]#'.</b> Please move the old data into the new directory.<br>#(/path)#
#(size)#::<b>Cachesize is now set to #[return]#MB.</b><br>#(/size)#
#(restart)#::<br><font color="red"><b>Changes will take effect after restart only.</b></font><br>#(/restart)#
::
<br><b>An error has occurred: #[error]#.</b><br>
<!-- info 3 -->
<b>An error has occurred: #[error]#.</b><br>
#(/info)#
</p>
<p>You can see a snapshot of recently indexed pages
on the <a href="/IndexMonitor.html?process=4">Proxy Index Monitor</a> Page.

@ -40,13 +40,14 @@
// Contributions and changes to the program code must be marked as such.
// You must compile this file with
// javac -classpath .:../Classes Settings_p.java
// javac -classpath .:../classes ProxyIndexingMonitor_p.java
// if the shell's current path is HTROOT
import java.io.File;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;
// import java.text.SimpleDateFormat;
// import java.util.Date;
// import java.util.Locale;
import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaCrawlProfile;
@ -54,22 +55,24 @@ import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacyCore;
public class ProxyIndexingMonitor_p {
private static SimpleDateFormat dayFormatter = new SimpleDateFormat("yyyy/MM/dd", Locale.US);
private static String daydate(Date date) {
if (date == null) return ""; else return dayFormatter.format(date);
}
// private static SimpleDateFormat dayFormatter = new SimpleDateFormat("yyyy/MM/dd", Locale.US);
// private static String daydate(Date date) {
// if (date == null) return ""; else return dayFormatter.format(date);
// }
public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
// return variable that accumulates replacements
plasmaSwitchboard switchboard = (plasmaSwitchboard) env;
serverObjects prop = new serverObjects();
int showIndexedCount = 20;
boolean se = false;
// int showIndexedCount = 20;
// boolean se = false;
String oldProxyCache, newProxyCache;
String oldProxyCacheSize, newProxyCacheSize;
prop.put("info", 0);
prop.put("info_message", "");
@ -83,10 +86,25 @@ public class ProxyIndexingMonitor_p {
boolean proxyStoreHTCache = ((String) post.get("proxyStoreHTCache", "")).equals("on");
env.setConfig("proxyStoreHTCache", (proxyStoreHTCache) ? "true" : "false");
// added proxyCache, proxyCacheSize - Borg-0300
// proxyCache - check and create the directory
oldProxyCache = env.getConfig("proxyCache", "DATA/HTCACHE");
newProxyCache = ((String) post.get("proxyCache", "DATA/HTCACHE"));
newProxyCache = newProxyCache.replace("\\", "/");
if (newProxyCache.endsWith("/")) newProxyCache.substring(0, newProxyCache.length() - 1);
File cp = new File(newProxyCache);
if ((!cp.isDirectory()) && (!cp.isFile())) cp.mkdirs();
env.setConfig("proxyCache", newProxyCache);
// proxyCacheSize
oldProxyCacheSize = Integer.toString(Integer.parseInt(env.getConfig("proxyCacheSize", "64")));
newProxyCacheSize = Integer.toString(Integer.parseInt((String) post.get("proxyCacheSize", "64")));
env.setConfig("proxyCacheSize", newProxyCacheSize);
// implant these settings also into the crawling profile for the proxy
plasmaCrawlProfile.entry profile = switchboard.profiles.getEntry(switchboard.getConfig("defaultProxyProfile", ""));
if (profile == null) {
prop.put("info", 1);//delete DATA/PLASMADB/crawlProfiles0.db
prop.put("info", 1); //delete DATA/PLASMADB/crawlProfiles0.db
} else {
try {
profile.changeEntry("generalDepth", Integer.toString(newProxyPrefetchDepth));
@ -94,6 +112,29 @@ public class ProxyIndexingMonitor_p {
prop.put("info", 2);//new proxyPrefetchdepth
prop.put("info_message", newProxyPrefetchDepth);
prop.put("info_caching", (proxyStoreHTCache) ? 1 : 0);
// proxyCache - only display on change
if (oldProxyCache.compareTo(newProxyCache) == 0) {
prop.put("info_path", 0);
prop.put("info_path_return", oldProxyCache);
} else {
prop.put("info_path", 1);
prop.put("info_path_return", newProxyCache);
}
// proxyCacheSize - only display on change
if (oldProxyCacheSize.compareTo(newProxyCacheSize) == 0) {
prop.put("info_size", 0);
prop.put("info_size_return", oldProxyCacheSize);
} else {
prop.put("info_size", 1);
prop.put("info_size_return", newProxyCacheSize);
}
// a change of proxyCache or proxyCacheSize requires a restart
prop.put("info_restart", 0);
prop.put("info_restart_return", 0);
if (oldProxyCache.compareTo(newProxyCache) != 0) prop.put("info_restart", 1);
if (oldProxyCacheSize.compareTo(newProxyCacheSize) != 0) prop.put("info_restart", 1);
} catch (IOException e) {
prop.put("info", 3); //Error: errmsg
prop.put("info_error", e.getMessage());
@ -109,8 +150,9 @@ public class ProxyIndexingMonitor_p {
prop.put("proxyPrefetchDepth", env.getConfig("proxyPrefetchDepth", "0"));
prop.put("proxyStoreHTCacheChecked", env.getConfig("proxyStoreHTCache", "").equals("true") ? 1 : 0);
prop.put("proxyCache", env.getConfig("proxyCache", "DATA/HTCACHE"));
prop.put("proxyCacheSize", env.getConfig("proxyCacheSize", "64"));
// return rewrite properties
return prop;
}
}

@ -113,8 +113,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
int p;
if ((p = us.indexOf("#")) >= 0) us = us.substring(0, p);
if (us.endsWith(":80")) us = us.substring(0, us.length() - 3);
p = us.indexOf(":80/");
if (p >= 0) us = us.substring(0,p).concat(us.substring(p + 3));
if ((p = us.indexOf(":80/")) >= 0) us = us.substring(0,p).concat(us.substring(p + 3));
if (((us.endsWith("/")) && (us.lastIndexOf('/', us.length() - 2) < 8))) us = us.substring(0, us.length() - 1);
return us;
}

@ -59,9 +59,9 @@ import java.util.Date;
import java.util.LinkedList;
import java.util.Map;
import java.util.TreeMap;
import java.util.Calendar;
import java.util.GregorianCalendar;
import java.util.TimeZone;
//import java.util.Calendar;
//import java.util.GregorianCalendar;
//import java.util.TimeZone;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.http.httpHeader;
@ -89,13 +89,14 @@ public final class plasmaHTCache {
public static serverLog log;
public plasmaHTCache(File htCachePath, long maxCacheSize, int bufferkb) {
//this.switchboard = switchboard;
// this.switchboard = switchboard;
this.log = new serverLog("HTCACHE");
this.cachePath = htCachePath;
this.maxCacheSize = maxCacheSize;
// set cache path
// we don't need to check the path here, because that is already done in plasmaSwitchboard.java - Borg-0300
/* // set cache path
if (!(htCachePath.exists())) {
// make the cache path
htCachePath.mkdir();
@ -104,7 +105,7 @@ public final class plasmaHTCache {
// if the cache does not exists or is a file and not a directory, panic
System.out.println("the cache path " + htCachePath.toString() + " is not a directory or does not exists and cannot be created");
System.exit(0);
}
}*/
// open the response header database
File dbfile = new File(cachePath, "responseHeader.db");
@ -200,13 +201,13 @@ public final class plasmaHTCache {
while ((currCacheSize > maxCacheSize) && (cacheAge.size() > 0)) {
f = (File) cacheAge.remove(cacheAge.firstKey());
if ((f != null) && (f.exists())) {
currCacheSize -= f.length();
long size = f.length();
//currCacheSize -= f.length();
if (f.delete()) {
log.logInfo("DELETED OLD CACHE : " + f.toString());
currCacheSize -= size;
f = f.getParentFile();
if ((f.exists()) && (f.isDirectory())) {
// check size of directory
if (f.list().length == 0) {
if (f.isDirectory() && (f.list().length == 0)) {
// the directory has no files in it; delete it also
if (f.delete()) log.logInfo("DELETED EMPTY DIRECTORY : " + f.toString());
}
@ -214,7 +215,6 @@ public final class plasmaHTCache {
}
}
}
}
public void close() throws IOException {
responseHeaderDB.close();
@ -256,8 +256,7 @@ public final class plasmaHTCache {
}
log.logSystem("CACHE SCANNED, CONTAINS " + c +
" FILES = " + currCacheSize/1048576 + "MB, OLDEST IS " +
((ageHours < 24) ? (ageHours + " HOURS") : ((ageHours / 24) + " DAYS")) +
" OLD");
((ageHours < 24) ? (ageHours + " HOURS") : ((ageHours / 24) + " DAYS")) + " OLD");
cleanup();
// start to prefetch ip's from dns
@ -343,15 +342,18 @@ public final class plasmaHTCache {
return plasmaParser.mediaExtContains(urlString);
}
// this method creates from a given host and path a cache path
/**
* this method creates a cache path for a given url:
* from a given host (which may also be an IPv4 - number, but not IPv6 or
* a domain; all without leading 'http://') and a path (which must start
* with a leading '/', and may also end in an '/') a path to a file
* in the file system with root as given in cachePath is constructed
* it will also be ensured, that the complete path exists; if necessary
* that path will be generated
* @return the File below cachePath that corresponds to the given url
*/
public File getCachePath(URL url) {
// from a given host (which may also be an IPv4 - number, but not IPv6 or
// a domain; all without leading 'http://') and a path (which must start
// with a leading '/', and may also end in an '/') a path to a file
// in the file system with root as given in cachePath is constructed
// it will also be ensured, that the complete path exists; if necessary
// that path will be generated
//System.out.println("DEBUG: getCachedPath=" + url.toString());
// System.out.println("DEBUG: getCachePath: IN=" + url.toString());
String remotePath = url.getPath();
if (!(remotePath.startsWith("/"))) remotePath = "/" + remotePath;
if (remotePath.endsWith("/")) remotePath = remotePath + "ndx";
@ -361,31 +363,40 @@ public final class plasmaHTCache {
remotePath = remotePath.replace(':', '_'); // yes this is not reversible, but that is not needed
int port = url.getPort();
if (port < 0) port = 80;
// System.out.println("DEBUG: getCachePath: OUT=" + url.getHost() + ((port == 80) ? "" : ("+" + port)) + remotePath);
return new File(this.cachePath, url.getHost() + ((port == 80) ? "" : ("+" + port)) + remotePath);
}
/**
* this is the reverse function to getCachePath: it constructs the url as string
* from a given storage path
*/
public static URL getURL(File cachePath, File f) {
// this is the reverse function to getCachePath: it constructs the url as string
// from a given storage path
// System.out.println("DEBUG: getURL: IN: Path=[" + cachePath + "]");
// System.out.println("DEBUG: getURL: IN: File=[" + f + "]");
String s = f.toString().replace('\\', '/');
String c = cachePath.toString().replace('\\', '/');
//System.out.println("DEBUG: getURL for c=" + c + ", s=" + s);
int p = s.lastIndexOf(c);
if (p >= 0) {
s = s.substring(p + c.length());
while (s.startsWith("/")) s = s.substring(1);
if ((p = s.indexOf("+")) >= 0) {
s = s.substring(0, p) + ":" + s.substring(p + 1);
} else {
/* } else {
p = s.indexOf("/");
if (p < 0)
s = s + ":80/";
else
s = s.substring(0, p) + ":80" + s.substring(p);
s = s.substring(0, p) + ":80" + s.substring(p);*/
}
if (s.endsWith("ndx")) s = s.substring(0, s.length() - 3);
//System.out.println("DEBUG: getURL url=" + s);
// System.out.println("DEBUG: getURL: OUT=" + s);
try {
/* URL url = null;
url = new URL("http://" + s);
System.out.println("DEBUG: getURL: URL=" + url.toString());
return url;//new URL("http://" + s); */
return new URL("http://" + s);
} catch (Exception e) {
return null;
@ -449,14 +460,14 @@ public final class plasmaHTCache {
public plasmaCrawlProfile.entry profile;
private String initiator;
public Entry(Date initDate, int depth, URL url, String name,
httpHeader requestHeader,
String responseStatus, httpHeader responseHeader,
String initiator,
plasmaCrawlProfile.entry profile) {
// normalize url
// normalize url - Borg-0300
serverLog.logDebug("PLASMA", "Entry: URL=" + url.toString());
this.nomalizedURLString = htmlFilterContentScraper.urlNormalform(url);
try {
this.url = new URL(nomalizedURLString);

@ -100,25 +100,25 @@
package de.anomic.plasma;
import java.io.BufferedReader;
// import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
// import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
// import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Enumeration;
// import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.TreeMap;
import java.util.Vector;
// import java.util.TreeMap;
// import java.util.Vector;
import de.anomic.data.messageBoard;
import de.anomic.data.wikiBoard;
@ -130,24 +130,23 @@ import de.anomic.kelondro.kelondroTables;
import de.anomic.server.serverAbstractSwitch;
import de.anomic.server.serverCodings;
import de.anomic.server.serverCore;
import de.anomic.server.serverDate;
// import de.anomic.server.serverDate;
import de.anomic.server.serverInstantThread;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSemaphore;
import de.anomic.server.serverSwitch;
import de.anomic.server.logging.serverLog;
import de.anomic.server.serverFileUtils;
// import de.anomic.server.serverFileUtils;
import de.anomic.tools.bitfield;
import de.anomic.tools.crypt;
import de.anomic.yacy.yacyClient;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacySearch;
import de.anomic.yacy.yacySeed;
import de.anomic.yacy.yacySeedDB;
// import de.anomic.yacy.yacySeedDB;
public final class plasmaSwitchboard extends serverAbstractSwitch implements serverSwitch {
// load slots
public static int crawlSlots = 10;
public static int indexingSlots = 100;
@ -158,7 +157,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
public static plasmaURLPattern urlBlacklist;
// storage management
private File cachePath;
private File cachePath; // do we need that ?
private File plasmaPath;
public File listsPath;
public plasmaURLPool urlPool;
@ -270,7 +269,21 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// start a cache manager
log.logSystem("Starting HT Cache Manager");
File htCachePath = new File(getRootPath(), getConfig("proxyCache","HTCACHE"));
// create the cache directory - Borg-0300
String cp = getConfig("proxyCache", "DATA/HTCACHE");
cp = cp.replace('\\', '/');
if (cp.endsWith("/")) cp = cp.substring(0,cp.length() - 1);
File htCachePath = new File(cp);
if (!(htCachePath.exists())) htCachePath.mkdirs();
if (!(htCachePath.isDirectory())) {
// if the cache path does not exist or is a file and not a directory, panic
serverLog.logSystem("PLASMA", "the cache path " + htCachePath.toString() + " is not a directory or does not exists and cannot be created");
System.exit(0);
} else {
serverLog.logInfo("PLASMA", "proxyCache=" + cp);
}
long maxCacheSize = 1024 * 1024 * Long.parseLong(getConfig("proxyCacheSize", "2")); // this is megabyte
this.cacheManager = new plasmaHTCache(htCachePath, maxCacheSize, ramHTTP);

Loading…
Cancel
Save