*) proxyCache, proxyCacheSize can be changed under 'Proxy Indexing'

- paths are now absolute
*) move path check from plasmaHTCache to plasmaSwitchboard
   - only one path check when starting
*) other small changes

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@606 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
borg-0300 20 years ago
parent 3dfda1c9da
commit bf14e6def5

@ -50,7 +50,6 @@ import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Iterator;
import java.util.Map;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterOutputStream;
import de.anomic.http.httpHeader;
@ -67,18 +66,20 @@ public class CacheAdmin_p {
private static SimpleDateFormat SimpleFormatter = new SimpleDateFormat("yyyy/MM/dd HH:mm:ss");
public static String dateString(Date date) {
return SimpleFormatter.format(date);
return SimpleFormatter.format(date);
}
public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
plasmaSwitchboard switchboard = (plasmaSwitchboard) env;
serverObjects prop = new serverObjects();
String action = ((post == null) ? "info" : post.get("action", "info"));
String pathString = ((post == null) ? "" : post.get("path", "/"));
String pathString = ((post == null) ? "" : post.get("path", "/"));
String fileString = pathString;
File cache = new File(switchboard.getRootPath(), switchboard.getConfig("proxyCache", "DATA/HTCACHE"));
// we don't need to check the path, because we have already done that in plasmaSwitchboard.java - Borg-0300
File cache = new File(switchboard.getConfig("proxyCache", "DATA/HTCACHE").toString());
File file = new File(cache, pathString);
File dir;
URL url = plasmaHTCache.getURL(cache, file);
@ -89,6 +90,7 @@ public class CacheAdmin_p {
dir = file.getParentFile();
pathString = (new File(pathString)).getParent().replace('\\','/');
}
// generate dir listing
String[] list = dir.list();
File f; String tree = "Directory of<br>" + ((pathString.length() == 0) ? "domain list" : linkPathString(pathString)) + "<br><br>";
@ -97,10 +99,11 @@ public class CacheAdmin_p {
else {
for (int i = 0; i < list.length; i++) {
f = new File(dir, list[i]);
if (f.isDirectory())
tree += "<img src=\"/env/grafics/folderIconSmall.gif\" align=\"top\" alt=\"Folder\">&nbsp;<a href=\"CacheAdmin_p.html?action=info&path=" + pathString + "/" + list[i] + "\" class=\"tt\">" + list[i] + "</a><br>" + serverCore.crlfString;
else
tree += "<img src=\"/env/grafics/fileIconSmall.gif\" align=\"top\" alt=\"File\">&nbsp;<a href=\"CacheAdmin_p.html?action=info&path=" + pathString + "/" + list[i] + "\" class=\"tt\">" + list[i] + "</a><br>" + serverCore.crlfString;
if (!f.getName().equalsIgnoreCase("responseHeader.db"))
if (f.isDirectory())
tree += "<img src=\"/env/grafics/folderIconSmall.gif\" align=\"top\" alt=\"Folder\">&nbsp;<a href=\"CacheAdmin_p.html?action=info&path=" + pathString + "/" + list[i] + "\" class=\"tt\">" + list[i] + "</a><br>" + serverCore.crlfString;
else
tree += "<img src=\"/env/grafics/fileIconSmall.gif\" align=\"top\" alt=\"File\">&nbsp;<a href=\"CacheAdmin_p.html?action=info&path=" + pathString + "/" + list[i] + "\" class=\"tt\">" + list[i] + "</a><br>" + serverCore.crlfString;
}
}
@ -108,7 +111,7 @@ public class CacheAdmin_p {
if (action.equals("info")) {
if (!(file.isDirectory())) {
String urls = htmlFilterContentScraper.urlNormalform(url);
String urls = htmlFilterContentScraper.urlNormalform(url);
info += "<b>Info for URL <a href=\"" + urls + "\">" + urls + "</a>:</b><br><br>";
try {
httpHeader fileheader = switchboard.cacheManager.getCachedResponse(plasmaURL.urlHash(url));
@ -141,13 +144,12 @@ public class CacheAdmin_p {
}
}
//
prop.put("cachesize", Long.toString(switchboard.cacheManager.currCacheSize/1024));
prop.put("cachemax", Long.toString(switchboard.cacheManager.maxCacheSize/1024));
prop.put("cachesize", Long.toString(switchboard.cacheManager.currCacheSize/1024));
prop.put("cachemax", Long.toString(switchboard.cacheManager.maxCacheSize/1024));
prop.put("tree", tree);
prop.put("info", info);
// return rewrite properties
return prop;
return prop;
}
private static String formatHeader(httpHeader header) {
@ -193,5 +195,4 @@ public class CacheAdmin_p {
}
return result;
}
}

@ -41,7 +41,7 @@
// You must compile this file with
// javac -classpath .:../Classes Message.java
// javac -classpath .:../classes CacheResource_p.java
// if the shell's current path is HTROOT
import java.io.File;
@ -56,11 +56,14 @@ import de.anomic.server.serverSwitch;
public class CacheResource_p {
public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
plasmaSwitchboard switchboard = (plasmaSwitchboard) env;
serverObjects prop = new serverObjects();
plasmaSwitchboard switchboard = (plasmaSwitchboard) env;
serverObjects prop = new serverObjects();
String path = ((post == null) ? "" : post.get("path", ""));
// we don't need to check the path, because we have already done that in plasmaSwitchboard.java - Borg-0300
File cache = new File(switchboard.getConfig("proxyCache", "DATA/HTCACHE").toString());
String path = ((post == null) ? "" : post.get("path", ""));
File cache = new File(switchboard.getRootPath(), switchboard.getConfig("proxyCache", "DATA/HTCACHE"));
File f = new File(cache, path);
byte[] resource;
@ -70,7 +73,6 @@ public class CacheResource_p {
} catch (IOException e) {
prop.put("resource", new byte[0]);
}
return prop;
return prop;
}
}

@ -18,12 +18,16 @@ and automatically excluded from indexing.
</p>
<p><form action="ProxyIndexingMonitor_p.html" method="post" enctype="multipart/form-data">
<div class=small><b>Proxy pre-fetch setting:</b>
this is an automated html page loading procedure that takes actual proxy-requested
URLs as crawling start points for crawling.</div>
<table border="0" cellpadding="5" cellspacing="0" width="100%">
<tr class="TableCellLight">
<td colspan="3"><div class=small><b>Proxy pre-fetch setting:</b>
this is an automated html page loading procedure that takes actual proxy-requested
URLs as crawling start points for crawling.</div></td>
</tr>
<tr valign="top" class="TableCellDark">
<td class=small>Prefetch Depth:</td>
<td class=small>Prefetch Depth</td>
<td class=small><input name="proxyPrefetchDepth" type="text" size="2" maxlength="2" value="#[proxyPrefetchDepth]#"></td>
<td class=small>
A prefetch of 0 means no prefetch; a prefetch of 1 means to prefetch all
@ -31,29 +35,52 @@ URLs as crawling start points for crawling.</div>
this means that only embedded href-anchors are prefetched additionally.</td>
</tr>
<tr valign="top" class="TableCellDark">
<td class=small>Store to Cache:</td>
<td class=small>Store to Cache</td>
<td class=small><input type="checkbox" name="proxyStoreHTCache" align="top" #(proxyStoreHTCacheChecked)#::checked#(/proxyStoreHTCacheChecked)#></td>
<td class=small>It is almost always recommended to set this on. The only exception is that you have another caching proxy running as secondary proxy and YaCy is configured to used that proxy in proxy-proxy - mode.</td>
</tr>
<tr class="TableCellLight">
<td colspan="3"><div class=small><b>Proxy generally</b></div></td>
</tr>
<tr valign="top" class="TableCellDark">
<td class=small>Path</td>
<td class=small><input name="proxyCache" type="text" size="20" maxlength="80" value="#[proxyCache]#"></td>
<td class=small>The path where the pages are stored (max. length 80)</td>
</tr>
<tr valign="top" class="TableCellDark">
<td class=small>Size</td>
<td class=small><input name="proxyCacheSize" type="text" size="8" maxlength="24" value="#[proxyCacheSize]#"></td>
<td class=small>The size in MB of the cache.</td>
</tr>
<tr valign="top" class="TableCellLight">
<td class=small></td>
<td class=small></td>
<td class=small><input type="submit" name="proxyprofileset" value="set proxy profile"></td>
<td class=small colspan="1">&nbsp;</td>
<td class=small colspan="2"><input type="submit" name="proxyprofileset" value="set proxy profile"></td>
</tr>
</table>
</form></p>
<p>
#(info)#
<!-- info 0 -->
::
<br><b>The file DATA/PLASMADB/crawlProfiles0.db is missing or corrupted.
<!-- info 1 -->
<b>The file DATA/PLASMADB/crawlProfiles0.db is missing or corrupted.
Please delete that file and restart.</b><br>
::
<br>
<b>Proxy pre-fetch is now set to depth-#[message]#.</b><br>
<b>Proxy caching is now set #(caching)#off::on#(/caching)#.</b><br>
<!-- info 2 -->
<b>Pre-fetch is now set to depth-#[message]#.</b><br>
<b>Caching is now #(caching)#off::on#(/caching)#.</b><br>
#(path)#::<b>Cachepath is now set to '#[return]#'.</b> Please move the old data in the new directory.<br>#(/path)#
#(size)#::<b>Cachesize is now set to #[return]#MB.</b><br>#(/size)#
#(restart)#::<br><font color="red"><b>Changes will take effect after restart only.</b></font><br>#(/restart)#
::
<br><b>An error has occurred: #[error]#.</b><br>
<!-- info 3 -->
<b>An error has occurred: #[error]#.</b><br>
#(/info)#
</p>
<p>You can see a snapshot of recently indexed pages
on the <a href="/IndexMonitor.html?process=4">Proxy Index Monitor</a> Page.

@ -40,13 +40,14 @@
// Contributions and changes to the program code must be marked as such.
// You must compile this file with
// javac -classpath .:../Classes Settings_p.java
// javac -classpath .:../classes ProxyIndexingMonitor_p.java
// if the shell's current path is HTROOT
import java.io.File;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;
// import java.text.SimpleDateFormat;
// import java.util.Date;
// import java.util.Locale;
import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaCrawlProfile;
@ -54,22 +55,24 @@ import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacyCore;
public class ProxyIndexingMonitor_p {
private static SimpleDateFormat dayFormatter = new SimpleDateFormat("yyyy/MM/dd", Locale.US);
private static String daydate(Date date) {
if (date == null) return ""; else return dayFormatter.format(date);
}
// private static SimpleDateFormat dayFormatter = new SimpleDateFormat("yyyy/MM/dd", Locale.US);
// private static String daydate(Date date) {
// if (date == null) return ""; else return dayFormatter.format(date);
// }
public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
// return variable that accumulates replacements
// return variable that accumulates replacements
plasmaSwitchboard switchboard = (plasmaSwitchboard) env;
serverObjects prop = new serverObjects();
serverObjects prop = new serverObjects();
// int showIndexedCount = 20;
// boolean se = false;
int showIndexedCount = 20;
boolean se = false;
String oldProxyCache, newProxyCache;
String oldProxyCacheSize, newProxyCacheSize;
prop.put("info", 0);
prop.put("info_message", "");
@ -83,20 +86,58 @@ public class ProxyIndexingMonitor_p {
boolean proxyStoreHTCache = ((String) post.get("proxyStoreHTCache", "")).equals("on");
env.setConfig("proxyStoreHTCache", (proxyStoreHTCache) ? "true" : "false");
// added proxyCache, proxyCacheSize - Borg-0300
// proxyCache - check and create the directory
oldProxyCache = env.getConfig("proxyCache", "DATA/HTCACHE");
newProxyCache = ((String) post.get("proxyCache", "DATA/HTCACHE"));
newProxyCache = newProxyCache.replace("\\", "/");
if (newProxyCache.endsWith("/")) newProxyCache.substring(0, newProxyCache.length() - 1);
File cp = new File(newProxyCache);
if ((!cp.isDirectory()) && (!cp.isFile())) cp.mkdirs();
env.setConfig("proxyCache", newProxyCache);
// proxyCacheSize
oldProxyCacheSize = Integer.toString(Integer.parseInt(env.getConfig("proxyCacheSize", "64")));
newProxyCacheSize = Integer.toString(Integer.parseInt((String) post.get("proxyCacheSize", "64")));
env.setConfig("proxyCacheSize", newProxyCacheSize);
// implant these settings also into the crawling profile for the proxy
plasmaCrawlProfile.entry profile = switchboard.profiles.getEntry(switchboard.getConfig("defaultProxyProfile", ""));
if (profile == null) {
prop.put("info", 1);//delete DATA/PLASMADB/crawlProfiles0.db
prop.put("info", 1); //delete DATA/PLASMADB/crawlProfiles0.db
} else {
try {
profile.changeEntry("generalDepth", Integer.toString(newProxyPrefetchDepth));
profile.changeEntry("generalDepth", Integer.toString(newProxyPrefetchDepth));
profile.changeEntry("storeHTCache", (proxyStoreHTCache) ? "true": "false");
prop.put("info", 2);//new proxyPrefetchdepth
prop.put("info_message", newProxyPrefetchDepth);
prop.put("info", 2);//new proxyPrefetchdepth
prop.put("info_message", newProxyPrefetchDepth);
prop.put("info_caching", (proxyStoreHTCache) ? 1 : 0);
// proxyCache - only display on change
if (oldProxyCache.compareTo(newProxyCache) == 0) {
prop.put("info_path", 0);
prop.put("info_path_return", oldProxyCache);
} else {
prop.put("info_path", 1);
prop.put("info_path_return", newProxyCache);
}
// proxyCacheSize - only display on change
if (oldProxyCacheSize.compareTo(newProxyCacheSize) == 0) {
prop.put("info_size", 0);
prop.put("info_size_return", oldProxyCacheSize);
} else {
prop.put("info_size", 1);
prop.put("info_size_return", newProxyCacheSize);
}
// proxyCache, proxyCacheSize we need a restart
prop.put("info_restart", 0);
prop.put("info_restart_return", 0);
if (oldProxyCache.compareTo(newProxyCache) != 0) prop.put("info_restart", 1);
if (oldProxyCacheSize.compareTo(newProxyCacheSize) != 0) prop.put("info_restart", 1);
} catch (IOException e) {
prop.put("info", 3); //Error: errmsg
prop.put("info_error", e.getMessage());
prop.put("info", 3); //Error: errmsg
prop.put("info_error", e.getMessage());
}
}
@ -109,8 +150,9 @@ public class ProxyIndexingMonitor_p {
prop.put("proxyPrefetchDepth", env.getConfig("proxyPrefetchDepth", "0"));
prop.put("proxyStoreHTCacheChecked", env.getConfig("proxyStoreHTCache", "").equals("true") ? 1 : 0);
prop.put("proxyCache", env.getConfig("proxyCache", "DATA/HTCACHE"));
prop.put("proxyCacheSize", env.getConfig("proxyCacheSize", "64"));
// return rewrite properties
return prop;
return prop;
}
}

@ -113,8 +113,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
int p;
if ((p = us.indexOf("#")) >= 0) us = us.substring(0, p);
if (us.endsWith(":80")) us = us.substring(0, us.length() - 3);
p = us.indexOf(":80/");
if (p >= 0) us = us.substring(0,p).concat(us.substring(p + 3));
if ((p = us.indexOf(":80/")) >= 0) us = us.substring(0,p).concat(us.substring(p + 3));
if (((us.endsWith("/")) && (us.lastIndexOf('/', us.length() - 2) < 8))) us = us.substring(0, us.length() - 1);
return us;
}

@ -59,9 +59,9 @@ import java.util.Date;
import java.util.LinkedList;
import java.util.Map;
import java.util.TreeMap;
import java.util.Calendar;
import java.util.GregorianCalendar;
import java.util.TimeZone;
//import java.util.Calendar;
//import java.util.GregorianCalendar;
//import java.util.TimeZone;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.http.httpHeader;
@ -89,46 +89,47 @@ public final class plasmaHTCache {
public static serverLog log;
public plasmaHTCache(File htCachePath, long maxCacheSize, int bufferkb) {
//this.switchboard = switchboard;
// this.switchboard = switchboard;
this.log = new serverLog("HTCACHE");
this.cachePath = htCachePath;
this.maxCacheSize = maxCacheSize;
// set cache path
if (!(htCachePath.exists())) {
// make the cache path
htCachePath.mkdir();
}
if (!(htCachePath.isDirectory())) {
// if the cache does not exists or is a file and not a directory, panic
System.out.println("the cache path " + htCachePath.toString() + " is not a directory or does not exists and cannot be created");
System.exit(0);
}
// open the response header database
File dbfile = new File(cachePath, "responseHeader.db");
try {
// we don't need to check the path, because we have already done that in plasmaSwitchboard.java - Borg-0300
/* // set cache path
if (!(htCachePath.exists())) {
// make the cache path
htCachePath.mkdir();
}
if (!(htCachePath.isDirectory())) {
// if the cache does not exists or is a file and not a directory, panic
System.out.println("the cache path " + htCachePath.toString() + " is not a directory or does not exists and cannot be created");
System.exit(0);
}*/
// open the response header database
File dbfile = new File(cachePath, "responseHeader.db");
try {
if (dbfile.exists())
responseHeaderDB = new kelondroMap(new kelondroDyn(dbfile, bufferkb * 0x400));
else
responseHeaderDB = new kelondroMap(new kelondroDyn(dbfile, bufferkb * 0x400, plasmaCrawlLURL.urlHashLength, 150));
} catch (IOException e) {
System.out.println("the request header database could not be opened: " + e.getMessage());
System.exit(0);
}
responseHeaderDB = new kelondroMap(new kelondroDyn(dbfile, bufferkb * 0x400));
else
responseHeaderDB = new kelondroMap(new kelondroDyn(dbfile, bufferkb * 0x400, plasmaCrawlLURL.urlHashLength, 150));
} catch (IOException e) {
System.out.println("the request header database could not be opened: " + e.getMessage());
System.exit(0);
}
// init stack
cacheStack = new LinkedList();
// init stack
cacheStack = new LinkedList();
// init cache age and size management
cacheAge = new TreeMap();
currCacheSize = 0;
this.maxCacheSize = maxCacheSize;
cacheAge = new TreeMap();
currCacheSize = 0;
this.maxCacheSize = maxCacheSize;
// start the cache startup thread
// this will collect information about the current cache size and elements
serverInstantThread.oneTimeJob(this, "cacheScan", log, 5000);
// start the cache startup thread
// this will collect information about the current cache size and elements
serverInstantThread.oneTimeJob(this, "cacheScan", log, 5000);
}
public int size() {
@ -200,16 +201,15 @@ public final class plasmaHTCache {
while ((currCacheSize > maxCacheSize) && (cacheAge.size() > 0)) {
f = (File) cacheAge.remove(cacheAge.firstKey());
if ((f != null) && (f.exists())) {
currCacheSize -= f.length();
long size = f.length();
//currCacheSize -= f.length();
if (f.delete()) {
log.logInfo("DELETED OLD CACHE : " + f.toString());
currCacheSize -= size;
f = f.getParentFile();
if ((f.exists()) && (f.isDirectory())) {
// check size of directory
if (f.list().length == 0) {
// the directory has no files in it; delete it also
if (f.delete()) log.logInfo("DELETED EMPTY DIRECTORY : " + f.toString());
}
if (f.isDirectory() && (f.list().length == 0)) {
// the directory has no files in it; delete it also
if (f.delete()) log.logInfo("DELETED EMPTY DIRECTORY : " + f.toString());
}
}
}
@ -221,72 +221,71 @@ public final class plasmaHTCache {
}
private String ageString(long date, File f) {
StringBuffer sb = new StringBuffer(32);
String s = Long.toHexString(date);
for (int i = s.length(); i < 16; i++) sb.append('0');
sb.append(s);
s = Integer.toHexString(f.hashCode());
for (int i = s.length(); i < 8; i++) sb.append('0');
sb.append(s);
return sb.toString();
}
public void cacheScan() {
//log.logSystem("STARTING CACHE SCANNING");
kelondroMScoreCluster doms = new kelondroMScoreCluster();
int c = 0;
enumerateFiles ef = new enumerateFiles(cachePath, true, false, true, true);
File f;
while (ef.hasMoreElements()) {
c++;
f = (File) ef.nextElement();
long d = f.lastModified();
//System.out.println("Cache: " + dom(f));
doms.incScore(dom(f));
currCacheSize += f.length();
cacheAge.put(ageString(d, f), f);
}
//System.out.println("%" + (String) cacheAge.firstKey() + "=" + cacheAge.get(cacheAge.firstKey()));
long ageHours = 0;
try {
ageHours = (System.currentTimeMillis() -
Long.parseLong(((String) cacheAge.firstKey()).substring(0, 16), 16)) / 3600000;
} catch (NumberFormatException e) {
//e.printStackTrace();
}
log.logSystem("CACHE SCANNED, CONTAINS " + c +
" FILES = " + currCacheSize/1048576 + "MB, OLDEST IS " +
((ageHours < 24) ? (ageHours + " HOURS") : ((ageHours / 24) + " DAYS")) +
" OLD");
cleanup();
// start to prefetch ip's from dns
String dom;
long start = System.currentTimeMillis();
String ip, result = "";
c = 0;
while ((doms.size() > 0) && (c < 50) && ((System.currentTimeMillis() - start) < 60000)) {
dom = (String) doms.getMaxObject();
ip = httpc.dnsResolve(dom);
if (ip == null) break;
result += ", " + dom + "=" + ip;
log.logSystem("PRE-FILLED " + dom + "=" + ip);
c++;
doms.deleteScore(dom);
// wait a short while to prevent that this looks like a DoS
try {Thread.currentThread().sleep(100);} catch (InterruptedException e) {}
}
if (result.length() > 2) log.logSystem("PRE-FILLED DNS CACHE, FETCHED " + c +
" ADDRESSES: " + result.substring(2));
}
StringBuffer sb = new StringBuffer(32);
String s = Long.toHexString(date);
for (int i = s.length(); i < 16; i++) sb.append('0');
sb.append(s);
s = Integer.toHexString(f.hashCode());
for (int i = s.length(); i < 8; i++) sb.append('0');
sb.append(s);
return sb.toString();
}
private String dom(File f) {
String s = f.toString().substring(cachePath.toString().length() + 1);
int p = s.indexOf("/");
if (p < 0) p = s.indexOf("\\");
if (p < 0) return null;
return s.substring(0, p);
public void cacheScan() {
//log.logSystem("STARTING CACHE SCANNING");
kelondroMScoreCluster doms = new kelondroMScoreCluster();
int c = 0;
enumerateFiles ef = new enumerateFiles(cachePath, true, false, true, true);
File f;
while (ef.hasMoreElements()) {
c++;
f = (File) ef.nextElement();
long d = f.lastModified();
//System.out.println("Cache: " + dom(f));
doms.incScore(dom(f));
currCacheSize += f.length();
cacheAge.put(ageString(d, f), f);
}
//System.out.println("%" + (String) cacheAge.firstKey() + "=" + cacheAge.get(cacheAge.firstKey()));
long ageHours = 0;
try {
ageHours = (System.currentTimeMillis() -
Long.parseLong(((String) cacheAge.firstKey()).substring(0, 16), 16)) / 3600000;
} catch (NumberFormatException e) {
//e.printStackTrace();
}
log.logSystem("CACHE SCANNED, CONTAINS " + c +
" FILES = " + currCacheSize/1048576 + "MB, OLDEST IS " +
((ageHours < 24) ? (ageHours + " HOURS") : ((ageHours / 24) + " DAYS")) + " OLD");
cleanup();
// start to prefetch ip's from dns
String dom;
long start = System.currentTimeMillis();
String ip, result = "";
c = 0;
while ((doms.size() > 0) && (c < 50) && ((System.currentTimeMillis() - start) < 60000)) {
dom = (String) doms.getMaxObject();
ip = httpc.dnsResolve(dom);
if (ip == null) break;
result += ", " + dom + "=" + ip;
log.logSystem("PRE-FILLED " + dom + "=" + ip);
c++;
doms.deleteScore(dom);
// wait a short while to prevent that this looks like a DoS
try {Thread.currentThread().sleep(100);} catch (InterruptedException e) {}
}
if (result.length() > 2) log.logSystem("PRE-FILLED DNS CACHE, FETCHED " + c +
" ADDRESSES: " + result.substring(2));
}
private String dom(File f) {
String s = f.toString().substring(cachePath.toString().length() + 1);
int p = s.indexOf("/");
if (p < 0) p = s.indexOf("\\");
if (p < 0) return null;
return s.substring(0, p);
}
public httpHeader getCachedResponse(String urlHash) throws IOException {
Map hdb = responseHeaderDB.get(urlHash);
@ -295,11 +294,11 @@ public final class plasmaHTCache {
}
public boolean full() {
return (cacheStack.size() > stackLimit);
return (cacheStack.size() > stackLimit);
}
public boolean empty() {
return (cacheStack.size() == 0);
return (cacheStack.size() == 0);
}
public static boolean isPicture(httpHeader response) {
@ -343,55 +342,67 @@ public final class plasmaHTCache {
return plasmaParser.mediaExtContains(urlString);
}
// this method creates from a given host and path a cache path
/**
* this method creates from a given host and path a cache path
* from a given host (which may also be an IPv4 - number, but not IPv6 or
* a domain; all without leading 'http://') and a path (which must start
* with a leading '/', and may also end in an '/') a path to a file
* in the file system with root as given in cachePath is constructed
* it will also be ensured, that the complete path exists; if necessary
* that path will be generated
* @return URL
*/
public File getCachePath(URL url) {
// from a given host (which may also be an IPv4 - number, but not IPv6 or
// a domain; all without leading 'http://') and a path (which must start
// with a leading '/', and may also end in an '/') a path to a file
// in the file system with root as given in cachePath is constructed
// it will also be ensured, that the complete path exists; if necessary
// that path will be generated
//System.out.println("DEBUG: getCachedPath=" + url.toString());
String remotePath = url.getPath();
if (!(remotePath.startsWith("/"))) remotePath = "/" + remotePath;
if (remotePath.endsWith("/")) remotePath = remotePath + "ndx";
// System.out.println("DEBUG: getCachePath: IN=" + url.toString());
String remotePath = url.getPath();
if (!(remotePath.startsWith("/"))) remotePath = "/" + remotePath;
if (remotePath.endsWith("/")) remotePath = remotePath + "ndx";
if (remotePath.indexOf('#') > 0) remotePath.substring(0, remotePath.indexOf('#'));
remotePath = remotePath.replace('?', '_');
remotePath = remotePath.replace('&', '_'); // yes this is not reversible, but that is not needed
remotePath = remotePath.replace(':', '_'); // yes this is not reversible, but that is not needed
int port = url.getPort();
if (port < 0) port = 80;
return new File(this.cachePath, url.getHost() + ((port == 80) ? "" : ("+" + port)) + remotePath);
int port = url.getPort();
if (port < 0) port = 80;
// System.out.println("DEBUG: getCachePath: OUT=" + url.getHost() + ((port == 80) ? "" : ("+" + port)) + remotePath);
return new File(this.cachePath, url.getHost() + ((port == 80) ? "" : ("+" + port)) + remotePath);
}
/**
* this is the reverse function to getCachePath: it constructs the url as string
* from a given storage path
*/
public static URL getURL(File cachePath, File f) {
// this is the reverse function to getCachePath: it constructs the url as string
// from a given storage path
String s = f.toString().replace('\\', '/');
String c = cachePath.toString().replace('\\', '/');
//System.out.println("DEBUG: getURL for c=" + c + ", s=" + s);
int p = s.lastIndexOf(c);
if (p >= 0) {
s = s.substring(p + c.length());
while (s.startsWith("/")) s = s.substring(1);
if ((p = s.indexOf("+")) >= 0) {
// System.out.println("DEBUG: getURL: IN: Path=[" + cachePath + "]");
// System.out.println("DEBUG: getURL: IN: File=[" + f + "]");
String s = f.toString().replace('\\', '/');
String c = cachePath.toString().replace('\\', '/');
int p = s.lastIndexOf(c);
if (p >= 0) {
s = s.substring(p + c.length());
while (s.startsWith("/")) s = s.substring(1);
if ((p = s.indexOf("+")) >= 0) {
s = s.substring(0, p) + ":" + s.substring(p + 1);
} else {
/* } else {
p = s.indexOf("/");
if (p < 0)
s = s + ":80/";
else
s = s.substring(0, p) + ":80" + s.substring(p);
s = s.substring(0, p) + ":80" + s.substring(p);*/
}
if (s.endsWith("ndx")) s = s.substring(0, s.length() - 3);
//System.out.println("DEBUG: getURL url=" + s);
if (s.endsWith("ndx")) s = s.substring(0, s.length() - 3);
// System.out.println("DEBUG: getURL: OUT=" + s);
try {
/* URL url = null;
url = new URL("http://" + s);
System.out.println("DEBUG: getURL: URL=" + url.toString());
return url;//new URL("http://" + s); */
return new URL("http://" + s);
} catch (Exception e) {
return null;
}
}
return null;
}
return null;
}
public byte[] loadResource(URL url) {
@ -407,8 +418,8 @@ public final class plasmaHTCache {
}
public static boolean isPOST(String urlString) {
return ((urlString.indexOf("?") >= 0) ||
(urlString.indexOf("&") >= 0));
return ((urlString.indexOf("?") >= 0) ||
(urlString.indexOf("&") >= 0));
}
public static boolean isCGI(String urlString) {
@ -421,8 +432,8 @@ public final class plasmaHTCache {
}
public Entry newEntry(Date initDate, int depth, URL url, String name,
httpHeader requestHeader,
String responseStatus, httpHeader responseHeader,
httpHeader requestHeader,
String responseStatus, httpHeader responseHeader,
String initiator,
plasmaCrawlProfile.entry profile) {
return new Entry(initDate, depth, url, name, requestHeader, responseStatus, responseHeader, initiator, profile);
@ -430,108 +441,108 @@ public final class plasmaHTCache {
public final class Entry {
// the class objects
public Date initDate; // the date when the request happened; will be used as a key
public int depth; // the depth of prefetching
public httpHeader requestHeader; // we carry also the header to prevent too many file system access
public String responseStatus;
public httpHeader responseHeader; // we carry also the header to prevent too many file system access
public File cacheFile; // the cache file
public byte[] cacheArray; // or the cache as byte-array
public URL url;
public String name; // the name of the link, read as anchor from an <a>-tag
public String nomalizedURLHash;
public String nomalizedURLString;
public int status; // cache load/hit/stale etc status
public Date lastModified;
public char doctype;
public String language;
public plasmaCrawlProfile.entry profile;
private String initiator;
public Entry(Date initDate, int depth, URL url, String name,
httpHeader requestHeader,
String responseStatus, httpHeader responseHeader,
String initiator,
plasmaCrawlProfile.entry profile) {
// normalize url
this.nomalizedURLString = htmlFilterContentScraper.urlNormalform(url);
try {
this.url = new URL(nomalizedURLString);
} catch (MalformedURLException e) {
System.out.println("internal error at httpdProxyCache.Entry: " + e);
System.exit(-1);
}
this.name = name;
this.cacheFile = getCachePath(this.url);
this.nomalizedURLHash = plasmaCrawlLURL.urlHash(nomalizedURLString);
// assigned:
this.initDate = initDate;
this.depth = depth;
this.requestHeader = requestHeader;
this.responseStatus = responseStatus;
this.responseHeader = responseHeader;
this.profile = profile;
this.initiator = (initiator == null) ? null : ((initiator.length() == 0) ? null: initiator);
// calculated:
if (responseHeader == null) {
try {
throw new RuntimeException("RESPONSE HEADER = NULL");
} catch (Exception e) {
System.out.println("RESPONSE HEADER = NULL in " + url);
e.printStackTrace();
System.exit(0);
}
lastModified = serverDate.correctedGMTDate();
} else {
lastModified = responseHeader.lastModified();
if (lastModified == null) lastModified = serverDate.correctedGMTDate(); // does not exist in header
}
this.doctype = plasmaWordIndexEntry.docType(responseHeader.mime());
if (this.doctype == plasmaWordIndexEntry.DT_UNKNOWN) this.doctype = plasmaWordIndexEntry.docType(url);
this.language = plasmaWordIndexEntry.language(url);
// to be defined later:
this.cacheArray = null;
}
public String name() {
return name;
}
public String initiator() {
return initiator;
// the class objects
public Date initDate; // the date when the request happened; will be used as a key
public int depth; // the depth of prefetching
public httpHeader requestHeader; // we carry also the header to prevent too many file system access
public String responseStatus;
public httpHeader responseHeader; // we carry also the header to prevent too many file system access
public File cacheFile; // the cache file
public byte[] cacheArray; // or the cache as byte-array
public URL url;
public String name; // the name of the link, read as anchor from an <a>-tag
public String nomalizedURLHash;
public String nomalizedURLString;
public int status; // cache load/hit/stale etc status
public Date lastModified;
public char doctype;
public String language;
public plasmaCrawlProfile.entry profile;
private String initiator;
public Entry(Date initDate, int depth, URL url, String name,
httpHeader requestHeader,
String responseStatus, httpHeader responseHeader,
String initiator,
plasmaCrawlProfile.entry profile) {
// normalize url - Borg-0300
serverLog.logDebug("PLASMA", "Entry: URL=" + url.toString());
this.nomalizedURLString = htmlFilterContentScraper.urlNormalform(url);
try {
this.url = new URL(nomalizedURLString);
} catch (MalformedURLException e) {
System.out.println("internal error at httpdProxyCache.Entry: " + e);
System.exit(-1);
}
public boolean proxy() {
return initiator() == null;
this.name = name;
this.cacheFile = getCachePath(this.url);
this.nomalizedURLHash = plasmaCrawlLURL.urlHash(nomalizedURLString);
// assigned:
this.initDate = initDate;
this.depth = depth;
this.requestHeader = requestHeader;
this.responseStatus = responseStatus;
this.responseHeader = responseHeader;
this.profile = profile;
this.initiator = (initiator == null) ? null : ((initiator.length() == 0) ? null: initiator);
// calculated:
if (responseHeader == null) {
try {
throw new RuntimeException("RESPONSE HEADER = NULL");
} catch (Exception e) {
System.out.println("RESPONSE HEADER = NULL in " + url);
e.printStackTrace();
System.exit(0);
}
lastModified = serverDate.correctedGMTDate();
} else {
lastModified = responseHeader.lastModified();
if (lastModified == null) lastModified = serverDate.correctedGMTDate(); // does not exist in header
}
public long size() {
if (cacheArray == null) return 0; else return cacheArray.length;
}
this.doctype = plasmaWordIndexEntry.docType(responseHeader.mime());
if (this.doctype == plasmaWordIndexEntry.DT_UNKNOWN) this.doctype = plasmaWordIndexEntry.docType(url);
this.language = plasmaWordIndexEntry.language(url);
public URL referrerURL() {
if (requestHeader == null) return null;
try {
return new URL((String) requestHeader.get(httpHeader.REFERER, ""));
} catch (Exception e) {
return null;
}
// to be defined later:
this.cacheArray = null;
}
/**
 * Accessor for the anchor name of this entry: the text of the
 * &lt;a&gt;-tag the URL was linked with (see the {@code name} field).
 *
 * @return the stored link name; may be null
 */
public String name() {
    return this.name;
}
/**
 * Accessor for the crawl initiator of this entry.
 *
 * @return the initiator string, or null — the constructor normalizes
 *         both null and the empty string to null (see the assignment
 *         of {@code this.initiator}), and {@code proxy()} treats a
 *         null initiator as a proxy-originated entry
 */
public String initiator() {
    return this.initiator;
}
/**
 * Tells whether this entry originated from the proxy rather than a crawl:
 * by convention an entry without an initiator is a proxy request.
 *
 * @return true when {@code initiator()} yields null
 */
public boolean proxy() {
    return null == initiator();
}
/**
 * Size of the in-memory cache content.
 *
 * @return the length of {@code cacheArray} in bytes, or 0 when the
 *         content is not held in memory ({@code cacheArray == null})
 */
public long size() {
    return (cacheArray == null) ? 0L : cacheArray.length;
}
/**
 * Derives the referrer URL of this entry from the recorded request header.
 *
 * @return the parsed value of the HTTP "Referer" request header, or null
 *         when no request header was recorded, the header field is absent
 *         or empty, or its value cannot be parsed as a URL
 */
public URL referrerURL() {
    if (requestHeader == null) return null;
    // Explicitly short-circuit the common missing-referer case instead of
    // relying on new URL("") throwing MalformedURLException (the original
    // used exception-driven control flow here).
    String referer = (String) requestHeader.get(httpHeader.REFERER, "");
    if (referer.length() == 0) return null;
    try {
        return new URL(referer);
    } catch (Exception e) {
        // broad catch kept on purpose: preserve the original contract of
        // returning null for any failure while reading/parsing the header
        return null;
    }
}
/*
/*
public boolean update() {
return ((status == CACHE_FILL) || (status == CACHE_STALE_RELOAD_GOOD));
}
*/
// the following three methods for cache read/write granting shall be as loose as possible
// but also as strict as necessary to enable caching of most items
// the following three methods for cache read/write granting shall be as loose as possible
// but also as strict as necessary to enable caching of most items
public String shallStoreCacheForProxy() {
public String shallStoreCacheForProxy() {
// returns NULL if the answer is TRUE
// in case of FALSE, the reason as String is returned

@ -100,25 +100,25 @@
package de.anomic.plasma;
import java.io.BufferedReader;
// import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
// import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
// import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Enumeration;
// import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.TreeMap;
import java.util.Vector;
// import java.util.TreeMap;
// import java.util.Vector;
import de.anomic.data.messageBoard;
import de.anomic.data.wikiBoard;
@ -130,24 +130,23 @@ import de.anomic.kelondro.kelondroTables;
import de.anomic.server.serverAbstractSwitch;
import de.anomic.server.serverCodings;
import de.anomic.server.serverCore;
import de.anomic.server.serverDate;
// import de.anomic.server.serverDate;
import de.anomic.server.serverInstantThread;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSemaphore;
import de.anomic.server.serverSwitch;
import de.anomic.server.logging.serverLog;
import de.anomic.server.serverFileUtils;
// import de.anomic.server.serverFileUtils;
import de.anomic.tools.bitfield;
import de.anomic.tools.crypt;
import de.anomic.yacy.yacyClient;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacySearch;
import de.anomic.yacy.yacySeed;
import de.anomic.yacy.yacySeedDB;
// import de.anomic.yacy.yacySeedDB;
public final class plasmaSwitchboard extends serverAbstractSwitch implements serverSwitch {
// load slots
public static int crawlSlots = 10;
public static int indexingSlots = 100;
@ -158,7 +157,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
public static plasmaURLPattern urlBlacklist;
// storage management
private File cachePath;
private File cachePath; // do we need that ?
private File plasmaPath;
public File listsPath;
public plasmaURLPool urlPool;
@ -192,13 +191,13 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
private static plasmaSwitchboard sb;
public plasmaSwitchboard(String rootPath, String initPath, String configPath) throws IOException {
super(rootPath, initPath, configPath);
super(rootPath, initPath, configPath);
// set loglevel and log
setLog(new serverLog("PLASMA"));
setLog(new serverLog("PLASMA"));
// load values from configs
plasmaPath = new File(rootPath, getConfig("dbPath", "PLASMADB"));
// load values from configs
plasmaPath = new File(rootPath, getConfig("dbPath", "PLASMADB"));
listsPath = new File(rootPath, getConfig("listsPath", "LISTS"));
remoteProxyHost = getConfig("remoteProxyHost", "");
try {
@ -217,12 +216,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if (!(listsPath.exists())) listsPath.mkdirs();
// load coloured lists
if (blueList == null) {
// read only once upon first instantiation of this class
String f = getConfig("plasmaBlueList", null);
if (f != null) blueList = kelondroMSetTools.loadList(new File(f)); else blueList= new TreeSet();
}
// load coloured lists
if (blueList == null) {
// read only once upon first instantiation of this class
String f = getConfig("plasmaBlueList", null);
if (f != null) blueList = kelondroMSetTools.loadList(new File(f)); else blueList= new TreeSet();
}
// load the black-list / inspired by [AS]
urlBlacklist = new plasmaURLPattern(new File(getRootPath(), getConfig("listsPath", "DATA/LISTS")));
@ -238,7 +237,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
stopwords = kelondroMSetTools.loadList(new File(rootPath, "yacy.stopwords"));
}
// read memory amount
// read memory amount
int ramLURL = Integer.parseInt(getConfig("ramCacheLURL", "1024")) / 1024;
int ramNURL = Integer.parseInt(getConfig("ramCacheNURL", "1024")) / 1024;
int ramEURL = Integer.parseInt(getConfig("ramCacheEURL", "1024")) / 1024;
@ -254,7 +253,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
log.logSystem("Message Cache memory = " + ppRamString(ramMessage));
log.logSystem("Wiki Cache memory = " + ppRamString(ramWiki));
// make crawl profiles database and default profiles
// make crawl profiles database and default profiles
log.logSystem("Initializing Crawl Profiles");
profiles = new plasmaCrawlProfile(new File(plasmaPath, "crawlProfiles0.db"));
initProfiles();
@ -270,7 +269,21 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// start a cache manager
log.logSystem("Starting HT Cache Manager");
File htCachePath = new File(getRootPath(), getConfig("proxyCache","HTCACHE"));
// create the Cache directorie - Borg-0300
String cp = getConfig("proxyCache", "DATA/HTCACHE");
cp = cp.replace('\\', '/');
if (cp.endsWith("/")) cp = cp.substring(0,cp.length() - 1);
File htCachePath = new File(cp);
if (!(htCachePath.exists())) htCachePath.mkdirs();
if (!(htCachePath.isDirectory())) {
// if the cache does not exists or is a file and not a directory, panic
serverLog.logSystem("PLASMA", "the cache path " + htCachePath.toString() + " is not a directory or does not exists and cannot be created");
System.exit(0);
} else {
serverLog.logInfo("PLASMA", "proxyCache=" + cp);
}
long maxCacheSize = 1024 * 1024 * Long.parseLong(getConfig("proxyCacheSize", "2")); // this is megabyte
this.cacheManager = new plasmaHTCache(htCachePath, maxCacheSize, ramHTTP);
@ -309,7 +322,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
messageDB = new messageBoard(new File(getRootPath(), "DATA/SETTINGS/message.db"), ramMessage);
log.logSystem("Starting Wiki Board");
wikiDB = new wikiBoard(new File(getRootPath(), "DATA/SETTINGS/wiki.db"),
new File(getRootPath(), "DATA/SETTINGS/wiki-bkp.db"), ramWiki);
new File(getRootPath(), "DATA/SETTINGS/wiki-bkp.db"), ramWiki);
// init cookie-Monitor
log.logSystem("Starting Cookie Monitor");

Loading…
Cancel
Save