integrated crawl-profiles db in memory-performance monitor

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@788 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 20 years ago
parent 72ce36baba
commit 3fcc95a82c

@@ -108,7 +108,7 @@
<td class="small" align="right">#[slmedRWI]#</td>
<td class="small" align="right">#[sllowRWI]#</td>
<td class="small" align="right">#[usedRWI]#</td>
<td class="small" align="right"><input name="ramCacheRWI" type="text" align="right" size="5" maxlength="6" value="#[ramCacheRWI]#"></td>
<td class="small" align="right"><input name="ramCacheRWI" type="text" align="right" size="6" maxlength="7" value="#[ramCacheRWI]#"></td>
<td class="small" align="right">#[dfltRWI]#</td>
<td class="small" align="right">#[goodRWI]#</td>
<td class="small" align="right">#[bestRWI]#</td>
@@ -126,7 +126,7 @@ cache will speed up crawls with a depth > 3.</td>
<td class="small" align="right">#[slmedHTTP]#</td>
<td class="small" align="right">#[sllowHTTP]#</td>
<td class="small" align="right">#[usedHTTP]#</td>
<td class="small" align="right"><input name="ramCacheHTTP" type="text" align="right" size="5" maxlength="6" value="#[ramCacheHTTP]#"></td>
<td class="small" align="right"><input name="ramCacheHTTP" type="text" align="right" size="6" maxlength="7" value="#[ramCacheHTTP]#"></td>
<td class="small" align="right">#[dfltHTTP]#</td>
<td class="small" align="right">#[goodHTTP]#</td>
<td class="small" align="right">#[bestHTTP]#</td>
@@ -144,7 +144,7 @@ Increasing this cache will be most important for a fast proxy mode.</td>
<td class="small" align="right">#[slmedLURL]#</td>
<td class="small" align="right">#[sllowLURL]#</td>
<td class="small" align="right">#[usedLURL]#</td>
<td class="small" align="right"><input name="ramCacheLURL" type="text" align="right" size="5" maxlength="6" value="#[ramCacheLURL]#"></td>
<td class="small" align="right"><input name="ramCacheLURL" type="text" align="right" size="6" maxlength="7" value="#[ramCacheLURL]#"></td>
<td class="small" align="right">#[dfltLURL]#</td>
<td class="small" align="right">#[goodLURL]#</td>
<td class="small" align="right">#[bestLURL]#</td>
@@ -161,7 +161,7 @@ This cache is very important for a fast search process. Increasing the cache siz
<td class="small" align="right">#[slmedNURL]#</td>
<td class="small" align="right">#[sllowNURL]#</td>
<td class="small" align="right">#[usedNURL]#</td>
<td class="small" align="right"><input name="ramCacheNURL" type="text" align="right" size="5" maxlength="6" value="#[ramCacheNURL]#"></td>
<td class="small" align="right"><input name="ramCacheNURL" type="text" align="right" size="6" maxlength="7" value="#[ramCacheNURL]#"></td>
<td class="small" align="right">#[dfltNURL]#</td>
<td class="small" align="right">#[goodNURL]#</td>
<td class="small" align="right">#[bestNURL]#</td>
@@ -178,7 +178,7 @@ Increasing the cache size will result in faster double-check during URL recognit
<td class="small" align="right">#[slmedEURL]#</td>
<td class="small" align="right">#[sllowEURL]#</td>
<td class="small" align="right">#[usedEURL]#</td>
<td class="small" align="right"><input name="ramCacheEURL" type="text" align="right" size="5" maxlength="6" value="#[ramCacheEURL]#"></td>
<td class="small" align="right"><input name="ramCacheEURL" type="text" align="right" size="6" maxlength="7" value="#[ramCacheEURL]#"></td>
<td class="small" align="right">#[dfltEURL]#</td>
<td class="small" align="right">#[goodEURL]#</td>
<td class="small" align="right">#[bestEURL]#</td>
@@ -195,7 +195,7 @@ Increasing the cache size will most probably speed up crawling slightly, but not
<td class="small" align="right">#[slmedDHT]#</td>
<td class="small" align="right">#[sllowDHT]#</td>
<td class="small" align="right">#[usedDHT]#</td>
<td class="small" align="right"><input name="ramCacheDHT" type="text" align="right" size="5" maxlength="6" value="#[ramCacheDHT]#"></td>
<td class="small" align="right"><input name="ramCacheDHT" type="text" align="right" size="6" maxlength="7" value="#[ramCacheDHT]#"></td>
<td class="small" align="right">#[dfltDHT]#</td>
<td class="small" align="right">#[goodDHT]#</td>
<td class="small" align="right">#[bestDHT]#</td>
@@ -213,7 +213,7 @@ Increasing this cache may speed up many functions, but we need to test this to s
<td class="small" align="right">#[slmedMessage]#</td>
<td class="small" align="right">#[sllowMessage]#</td>
<td class="small" align="right">#[usedMessage]#</td>
<td class="small" align="right"><input name="ramCacheMessage" type="text" align="right" size="5" maxlength="6" value="#[ramCacheMessage]#"></td>
<td class="small" align="right"><input name="ramCacheMessage" type="text" align="right" size="6" maxlength="7" value="#[ramCacheMessage]#"></td>
<td class="small" align="right">#[dfltMessage]#</td>
<td class="small" align="right">#[goodMessage]#</td>
<td class="small" align="right">#[bestMessage]#</td>
@@ -229,7 +229,7 @@ Increasing this cache may speed up many functions, but we need to test this to s
<td class="small" align="right">#[slmedWiki]#</td>
<td class="small" align="right">#[sllowWiki]#</td>
<td class="small" align="right">#[usedWiki]#</td>
<td class="small" align="right"><input name="ramCacheWiki" type="text" align="right" size="5" maxlength="6" value="#[ramCacheWiki]#"></td>
<td class="small" align="right"><input name="ramCacheWiki" type="text" align="right" size="6" maxlength="7" value="#[ramCacheWiki]#"></td>
<td class="small" align="right">#[dfltWiki]#</td>
<td class="small" align="right">#[goodWiki]#</td>
<td class="small" align="right">#[bestWiki]#</td>
@@ -247,7 +247,7 @@ Increasing this cache may speed up access to the wiki pages.</td>
<td class="small" align="right">#[slmedNews]#</td>
<td class="small" align="right">#[sllowNews]#</td>
<td class="small" align="right">#[usedNews]#</td>
<td class="small" align="right"><input name="ramCacheNews" type="text" align="right" size="5" maxlength="6" value="#[ramCacheNews]#"></td>
<td class="small" align="right"><input name="ramCacheNews" type="text" align="right" size="6" maxlength="7" value="#[ramCacheNews]#"></td>
<td class="small" align="right">#[dfltNews]#</td>
<td class="small" align="right">#[goodNews]#</td>
<td class="small" align="right">#[bestNews]#</td>
@@ -256,7 +256,7 @@ Increasing this cache may speed up the peer-ping.</td>
</tr>
<tr class="TableCellDark">
<td class="small" align="left">Robots.txt DB</td>
<td class="small" align="left">robots.txt DB</td>
<td class="small" align="center">#[chunkRobots]#</td>
<td class="small" align="right">#[slreqRobots]#</td>
<td class="small" align="right">#[slempRobots]#</td>
@@ -264,14 +264,31 @@ Increasing this cache may speed up the peer-ping.</td>
<td class="small" align="right">#[slmedRobots]#</td>
<td class="small" align="right">#[sllowRobots]#</td>
<td class="small" align="right">#[usedRobots]#</td>
<td class="small" align="right"><input name="ramCacheRobots" type="text" align="right" size="5" maxlength="6" value="#[ramCacheRobots]#"></td>
<td class="small" align="right"><input name="ramCacheRobots" type="text" align="right" size="6" maxlength="7" value="#[ramCacheRobots]#"></td>
<td class="small" align="right">#[dfltRobots]#</td>
<td class="small" align="right">#[goodRobots]#</td>
<td class="small" align="right">#[bestRobots]#</td>
<td class="small" align="left">The Robots.txt DB stores downloaded records from robots.txt files.
<td class="small" align="left">The robots.txt DB stores downloaded records from robots.txt files.
Increasing this cache may speed up validation if crawling of the URL is allowed.</td>
</tr>
<tr class="TableCellDark">
<td class="small" align="left">Crawl Profiles</td>
<td class="small" align="center">#[chunkProfiles]#</td>
<td class="small" align="right">#[slreqProfiles]#</td>
<td class="small" align="right">#[slempProfiles]#</td>
<td class="small" align="right">#[slhigProfiles]#</td>
<td class="small" align="right">#[slmedProfiles]#</td>
<td class="small" align="right">#[sllowProfiles]#</td>
<td class="small" align="right">#[usedProfiles]#</td>
<td class="small" align="right"><input name="ramCacheProfiles" type="text" align="right" size="6" maxlength="7" value="#[ramCacheProfiles]#"></td>
<td class="small" align="right">#[dfltProfiles]#</td>
<td class="small" align="right">#[goodProfiles]#</td>
<td class="small" align="right">#[bestProfiles]#</td>
<td class="small" align="left">The profile database stores properties for each crawl that is started on the local peer.
Increasing this cache may speed up crawling, but not much space is needed.</td>
</tr>
<tr class="TableCellSummary">
<td class="small" align="left" colspan="7">Totals</td>
<td class="small" align="right">#[usedTotal]# MB</td>

@@ -85,6 +85,7 @@ public class PerformanceMemory_p {
env.setConfig("ramCacheWiki", Long.parseLong(post.get("ramCacheWiki", "0")) * KB);
env.setConfig("ramCacheNews", Long.parseLong(post.get("ramCacheNews", "0")) * KB);
env.setConfig("ramCacheRobots", Long.parseLong(post.get("ramCacheRobots", "0")) * KB);
env.setConfig("ramCacheProfiles", Long.parseLong(post.get("ramCacheProfiles", "0")) * KB);
}
if (post.containsKey("setDefault")) {
env.setConfig("ramCacheRWI", Long.parseLong((String) defaultSettings.get("ramCacheRWI")));
@@ -97,6 +98,7 @@ public class PerformanceMemory_p {
env.setConfig("ramCacheWiki", Long.parseLong((String) defaultSettings.get("ramCacheWiki")));
env.setConfig("ramCacheNews", Long.parseLong((String) defaultSettings.get("ramCacheNews")));
env.setConfig("ramCacheRobots", Long.parseLong((String) defaultSettings.get("ramCacheRobots")));
env.setConfig("ramCacheProfiles", Long.parseLong((String) defaultSettings.get("ramCacheProfiles")));
}
if (post.containsKey("setGood")) set = "setGood";
if (post.containsKey("setBest")) set = "setBest";
@@ -196,6 +198,11 @@ public class PerformanceMemory_p {
slt = sb.robots.dbCacheFillStatus();
putprop(prop, env, "Robots", set);
req = sb.profiles.size();
chk = sb.profiles.dbCacheChunkSize();
slt = sb.profiles.dbCacheFillStatus();
putprop(prop, env, "Profiles", set);
prop.put("usedTotal", usedTotal / MB);
prop.put("currTotal", currTotal / MB);
prop.put("dfltTotal", dfltTotal / MB);

@@ -58,17 +58,26 @@ public class plasmaCrawlProfile {
private kelondroMap profileTable;
private File profileTableFile;
private int bufferkb;
public plasmaCrawlProfile(File profileTableFile) throws IOException {
public plasmaCrawlProfile(File profileTableFile, int bufferkb) throws IOException {
this.profileTableFile = profileTableFile;
if (profileTableFile.exists()) {
profileTable = new kelondroMap(new kelondroDyn(profileTableFile, 32000));
profileTable = new kelondroMap(new kelondroDyn(profileTableFile, bufferkb * 1024));
} else {
profileTableFile.getParentFile().mkdirs();
profileTable = new kelondroMap(new kelondroDyn(profileTableFile, 32000, plasmaURL.urlCrawlProfileHandleLength, 2000));
profileTable = new kelondroMap(new kelondroDyn(profileTableFile, bufferkb * 1024, plasmaURL.urlCrawlProfileHandleLength, 2000));
}
}
public int[] dbCacheChunkSize() {
return profileTable.cacheChunkSize();
}
public int[] dbCacheFillStatus() {
return profileTable.cacheFillStatus();
}
private void resetDatabase() {
// deletes the profile database and creates a new one
if (profileTable != null) try {
@@ -77,7 +86,7 @@ public class plasmaCrawlProfile {
if (!(profileTableFile.delete())) throw new RuntimeException("cannot delete crawl profile database");
try {
profileTableFile.getParentFile().mkdirs();
profileTable = new kelondroMap(new kelondroDyn(profileTableFile, 32000, plasmaURL.urlCrawlProfileHandleLength, 2000));
profileTable = new kelondroMap(new kelondroDyn(profileTableFile, bufferkb * 1024, plasmaURL.urlCrawlProfileHandleLength, 2000));
} catch (IOException e){
serverLog.logSevere("PLASMA", "plasmaCrawlProfile.resetDatabase", e);
}
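
A short usage sketch of the API extended above: the constructor now takes the cache buffer size in KB, and the two new accessors expose the kelondroMap cache statistics. File location, package import and buffer size are illustrative; the switchboard hunk further below shows how YaCy itself wires them up.

// Sketch only: exercises plasmaCrawlProfile(File, int), dbCacheChunkSize() and dbCacheFillStatus().
import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import de.anomic.plasma.plasmaCrawlProfile; // package assumed

public class ProfileDbDemo {
    public static void main(String[] args) throws IOException {
        File profilesFile = new File("DATA/PLASMADB", "crawlProfiles0.db"); // illustrative path
        int bufferkb = 8; // e.g. ramCacheProfiles = 8192 bytes -> 8 KB (see the yacy.init hunk below)

        plasmaCrawlProfile profiles = new plasmaCrawlProfile(profilesFile, bufferkb);

        System.out.println("chunk size:  " + Arrays.toString(profiles.dbCacheChunkSize()));
        System.out.println("fill status: " + Arrays.toString(profiles.dbCacheFillStatus()));
    }
}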

@@ -63,9 +63,11 @@ import de.anomic.server.logging.serverLog;
public class plasmaCrawlRobotsTxt {
private kelondroMap robotsTable;
private File robotsTableFile;
private int bufferkb;
public plasmaCrawlRobotsTxt(File robotsTableFile, int bufferkb) throws IOException {
this.robotsTableFile = robotsTableFile;
this.bufferkb = bufferkb;
if (robotsTableFile.exists()) {
try {
robotsTable = new kelondroMap(new kelondroDyn(robotsTableFile, bufferkb * 1024));
@@ -96,7 +98,7 @@ public class plasmaCrawlRobotsTxt {
if (!(robotsTableFile.delete())) throw new RuntimeException("cannot delete robots.txt database");
try {
robotsTableFile.getParentFile().mkdirs();
robotsTable = new kelondroMap(new kelondroDyn(robotsTableFile, 1000000, 256, 512));
robotsTable = new kelondroMap(new kelondroDyn(robotsTableFile, this.bufferkb, 256, 512));
} catch (IOException e){
serverLog.logSevere("PLASMA", "robotsTxt.resetDatabase", e);
}

@@ -256,19 +256,21 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
int ramMessage = (int) getConfigLong("ramCacheMessage", 1024) / 1024;
int ramWiki = (int) getConfigLong("ramCacheWiki", 1024) / 1024;
int ramRobots = (int) getConfigLong("ramCacheRobots",1024) / 1024;
this.log.logConfig("LURL Cache memory = " + ppRamString(ramLURL));
this.log.logConfig("NURL Cache memory = " + ppRamString(ramNURL));
this.log.logConfig("EURL Cache memory = " + ppRamString(ramEURL));
this.log.logConfig("RWI Cache memory = " + ppRamString(ramRWI));
this.log.logConfig("HTTP Cache memory = " + ppRamString(ramHTTP));
this.log.logConfig("Message Cache memory = " + ppRamString(ramMessage));
this.log.logConfig("Wiki Cache memory = " + ppRamString(ramWiki));
this.log.logConfig("Robots Cache memory = " + ppRamString(ramRobots));
int ramProfiles= (int) getConfigLong("ramCacheProfiles",1024) / 1024;
this.log.logConfig("LURL Cache memory = " + ppRamString(ramLURL));
this.log.logConfig("NURL Cache memory = " + ppRamString(ramNURL));
this.log.logConfig("EURL Cache memory = " + ppRamString(ramEURL));
this.log.logConfig("RWI Cache memory = " + ppRamString(ramRWI));
this.log.logConfig("HTTP Cache memory = " + ppRamString(ramHTTP));
this.log.logConfig("Message Cache memory = " + ppRamString(ramMessage));
this.log.logConfig("Wiki Cache memory = " + ppRamString(ramWiki));
this.log.logConfig("Robots Cache memory = " + ppRamString(ramRobots));
this.log.logConfig("Profiles Cache memory = " + ppRamString(ramProfiles));
// make crawl profiles database and default profiles
this.log.logConfig("Initializing Crawl Profiles");
File profilesFile = new File(this.plasmaPath, "crawlProfiles0.db");
this.profiles = new plasmaCrawlProfile(profilesFile);
this.profiles = new plasmaCrawlProfile(profilesFile, ramProfiles);
initProfiles();
log.logConfig("Loaded profiles from file " + profilesFile + ", " + this.profiles.size() + " entries");
@@ -501,7 +503,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
final File pdb = new File(plasmaPath, "crawlProfiles0.db");
if (pdb.exists()) pdb.delete();
try {
profiles = new plasmaCrawlProfile(pdb);
int ramProfiles = (int) getConfigLong("ramCacheProfiles",1024) / 1024;
profiles = new plasmaCrawlProfile(pdb, ramProfiles);
initProfiles();
} catch (IOException e) {}
}
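
Taken together with the servlet hunk near the top, the two switchboard hunks above define a simple unit round trip for the new setting: the form value is entered in KB, stored in the config in bytes, and converted back to KB before being passed as bufferkb. A minimal sketch, assuming the KB constant in PerformanceMemory_p equals 1024:

// Simplified round trip of the ramCacheProfiles setting (sketch, not YaCy code).
public class RamCacheProfilesRoundTrip {
    static final long KB = 1024; // assumed value of the KB constant used by the servlet

    public static void main(String[] args) {
        long formValueKb = Long.parseLong("8");  // value submitted by the web form, in KB
        long storedBytes = formValueKb * KB;     // servlet: setConfig("ramCacheProfiles", value * KB) -> 8192
        int bufferkb = (int) storedBytes / 1024; // switchboard: getConfigLong("ramCacheProfiles", 1024) / 1024 -> 8
        System.out.println(storedBytes + " bytes in the config, " + bufferkb + " KB passed to plasmaCrawlProfile");
    }
}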

@@ -465,7 +465,10 @@ ramCacheWiki = 8192
ramCacheNews = 8192
# ram cache for robotsTxt.db
ramCacheRobots = 1048576
ramCacheRobots = 2097152
# ram cache for crawlProfile.db
ramCacheProfiles = 8192
# default memory settings for startup of yacy
# is only valid in unix/shell environments and
