From 3fcc95a82c8e596bee09d86bab107f322a984d4b Mon Sep 17 00:00:00 2001 From: orbiter Date: Sat, 24 Sep 2005 00:33:27 +0000 Subject: [PATCH] integrated crawl-profiles db in memory-performance monitor git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@788 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/PerformanceMemory_p.html | 41 +++++++++++++------ htroot/PerformanceMemory_p.java | 7 ++++ .../de/anomic/plasma/plasmaCrawlProfile.java | 17 ++++++-- .../anomic/plasma/plasmaCrawlRobotsTxt.java | 4 +- .../de/anomic/plasma/plasmaSwitchboard.java | 23 ++++++----- yacy.init | 5 ++- 6 files changed, 69 insertions(+), 28 deletions(-) diff --git a/htroot/PerformanceMemory_p.html b/htroot/PerformanceMemory_p.html index 4a96a6ed8..204d1875a 100644 --- a/htroot/PerformanceMemory_p.html +++ b/htroot/PerformanceMemory_p.html @@ -108,7 +108,7 @@ #[slmedRWI]# #[sllowRWI]# #[usedRWI]# - + #[dfltRWI]# #[goodRWI]# #[bestRWI]# @@ -126,7 +126,7 @@ cache will speed up crawls with a depth > 3. #[slmedHTTP]# #[sllowHTTP]# #[usedHTTP]# - + #[dfltHTTP]# #[goodHTTP]# #[bestHTTP]# @@ -144,7 +144,7 @@ Increasing this cache will be most important for a fast proxy mode. #[slmedLURL]# #[sllowLURL]# #[usedLURL]# - + #[dfltLURL]# #[goodLURL]# #[bestLURL]# @@ -161,7 +161,7 @@ This cache is very important for a fast search process. 
Increasing the cache siz #[slmedNURL]# #[sllowNURL]# #[usedNURL]# - + #[dfltNURL]# #[goodNURL]# #[bestNURL]# @@ -178,7 +178,7 @@ Increasing the cache size will result in faster double-check during URL recognit #[slmedEURL]# #[sllowEURL]# #[usedEURL]# - + #[dfltEURL]# #[goodEURL]# #[bestEURL]# @@ -195,7 +195,7 @@ Increasing the cache size will most probably speed up crawling slightly, but not #[slmedDHT]# #[sllowDHT]# #[usedDHT]# - + #[dfltDHT]# #[goodDHT]# #[bestDHT]# @@ -213,7 +213,7 @@ Increasing this cache may speed up many functions, but we need to test this to s #[slmedMessage]# #[sllowMessage]# #[usedMessage]# - + #[dfltMessage]# #[goodMessage]# #[bestMessage]# @@ -229,7 +229,7 @@ Increasing this cache may speed up many functions, but we need to test this to s #[slmedWiki]# #[sllowWiki]# #[usedWiki]# - + #[dfltWiki]# #[goodWiki]# #[bestWiki]# @@ -247,7 +247,7 @@ Increasing this cache may speed up access to the wiki pages. #[slmedNews]# #[sllowNews]# #[usedNews]# - + #[dfltNews]# #[goodNews]# #[bestNews]# @@ -256,7 +256,7 @@ Increasing this cache may speed up the peer-ping. -Robots.txt DB +robots.txt DB #[chunkRobots]# #[slreqRobots]# #[slempRobots]# @@ -264,14 +264,31 @@ Increasing this cache may speed up the peer-ping. #[slmedRobots]# #[sllowRobots]# #[usedRobots]# - + #[dfltRobots]# #[goodRobots]# #[bestRobots]# -The Robots.txt DB stores downloaded records from robots.txt files. +The robots.txt DB stores downloaded records from robots.txt files. Increasing this cache may speed up validation if crawling of the URL is allowed. + +Crawl Profiles +#[chunkProfiles]# +#[slreqProfiles]# +#[slempProfiles]# +#[slhigProfiles]# +#[slmedProfiles]# +#[sllowProfiles]# +#[usedProfiles]# + +#[dfltProfiles]# +#[goodProfiles]# +#[bestProfiles]# +The profile database stores properties for each crawl that is started on the local peer. +Increasing this cache may speed up crawling, but not much space is needed. 
+ + Totals #[usedTotal]# MB diff --git a/htroot/PerformanceMemory_p.java b/htroot/PerformanceMemory_p.java index a17f14577..394831ec7 100644 --- a/htroot/PerformanceMemory_p.java +++ b/htroot/PerformanceMemory_p.java @@ -85,6 +85,7 @@ public class PerformanceMemory_p { env.setConfig("ramCacheWiki", Long.parseLong(post.get("ramCacheWiki", "0")) * KB); env.setConfig("ramCacheNews", Long.parseLong(post.get("ramCacheNews", "0")) * KB); env.setConfig("ramCacheRobots", Long.parseLong(post.get("ramCacheRobots", "0")) * KB); + env.setConfig("ramCacheProfiles", Long.parseLong(post.get("ramCacheProfiles", "0")) * KB); } if (post.containsKey("setDefault")) { env.setConfig("ramCacheRWI", Long.parseLong((String) defaultSettings.get("ramCacheRWI"))); @@ -97,6 +98,7 @@ public class PerformanceMemory_p { env.setConfig("ramCacheWiki", Long.parseLong((String) defaultSettings.get("ramCacheWiki"))); env.setConfig("ramCacheNews", Long.parseLong((String) defaultSettings.get("ramCacheNews"))); env.setConfig("ramCacheRobots", Long.parseLong((String) defaultSettings.get("ramCacheRobots"))); + env.setConfig("ramCacheProfiles", Long.parseLong((String) defaultSettings.get("ramCacheProfiles"))); } if (post.containsKey("setGood")) set = "setGood"; if (post.containsKey("setBest")) set = "setBest"; @@ -196,6 +198,11 @@ public class PerformanceMemory_p { slt = sb.robots.dbCacheFillStatus(); putprop(prop, env, "Robots", set); + req = sb.profiles.size(); + chk = sb.profiles.dbCacheChunkSize(); + slt = sb.profiles.dbCacheFillStatus(); + putprop(prop, env, "Profiles", set); + prop.put("usedTotal", usedTotal / MB); prop.put("currTotal", currTotal / MB); prop.put("dfltTotal", dfltTotal / MB); diff --git a/source/de/anomic/plasma/plasmaCrawlProfile.java b/source/de/anomic/plasma/plasmaCrawlProfile.java index f0c41eef3..accc66d02 100644 --- a/source/de/anomic/plasma/plasmaCrawlProfile.java +++ b/source/de/anomic/plasma/plasmaCrawlProfile.java @@ -58,17 +58,26 @@ public class plasmaCrawlProfile { private 
kelondroMap profileTable; private File profileTableFile; + private int bufferkb; - public plasmaCrawlProfile(File profileTableFile) throws IOException { + public plasmaCrawlProfile(File profileTableFile, int bufferkb) throws IOException { this.profileTableFile = profileTableFile; if (profileTableFile.exists()) { - profileTable = new kelondroMap(new kelondroDyn(profileTableFile, 32000)); + profileTable = new kelondroMap(new kelondroDyn(profileTableFile, bufferkb * 1024)); } else { profileTableFile.getParentFile().mkdirs(); - profileTable = new kelondroMap(new kelondroDyn(profileTableFile, 32000, plasmaURL.urlCrawlProfileHandleLength, 2000)); + profileTable = new kelondroMap(new kelondroDyn(profileTableFile, bufferkb * 1024, plasmaURL.urlCrawlProfileHandleLength, 2000)); } } + public int[] dbCacheChunkSize() { + return profileTable.cacheChunkSize(); + } + + public int[] dbCacheFillStatus() { + return profileTable.cacheFillStatus(); + } + private void resetDatabase() { // deletes the profile database and creates a new one if (profileTable != null) try { @@ -77,7 +86,7 @@ public class plasmaCrawlProfile { if (!(profileTableFile.delete())) throw new RuntimeException("cannot delete crawl profile database"); try { profileTableFile.getParentFile().mkdirs(); - profileTable = new kelondroMap(new kelondroDyn(profileTableFile, 32000, plasmaURL.urlCrawlProfileHandleLength, 2000)); + profileTable = new kelondroMap(new kelondroDyn(profileTableFile, bufferkb * 1024, plasmaURL.urlCrawlProfileHandleLength, 2000)); } catch (IOException e){ serverLog.logSevere("PLASMA", "plasmaCrawlProfile.resetDatabase", e); } diff --git a/source/de/anomic/plasma/plasmaCrawlRobotsTxt.java b/source/de/anomic/plasma/plasmaCrawlRobotsTxt.java index 6134c1100..1cfc8ebd3 100644 --- a/source/de/anomic/plasma/plasmaCrawlRobotsTxt.java +++ b/source/de/anomic/plasma/plasmaCrawlRobotsTxt.java @@ -63,9 +63,11 @@ import de.anomic.server.logging.serverLog; public class plasmaCrawlRobotsTxt { private kelondroMap 
robotsTable; private File robotsTableFile; + private int bufferkb; public plasmaCrawlRobotsTxt(File robotsTableFile, int bufferkb) throws IOException { this.robotsTableFile = robotsTableFile; + this.bufferkb = bufferkb; if (robotsTableFile.exists()) { try { robotsTable = new kelondroMap(new kelondroDyn(robotsTableFile, bufferkb * 1024)); @@ -96,7 +98,7 @@ public class plasmaCrawlRobotsTxt { if (!(robotsTableFile.delete())) throw new RuntimeException("cannot delete robots.txt database"); try { robotsTableFile.getParentFile().mkdirs(); - robotsTable = new kelondroMap(new kelondroDyn(robotsTableFile, 1000000, 256, 512)); + robotsTable = new kelondroMap(new kelondroDyn(robotsTableFile, this.bufferkb, 256, 512)); } catch (IOException e){ serverLog.logSevere("PLASMA", "robotsTxt.resetDatabase", e); } diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 9aebd3135..36fbdf360 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -256,19 +256,21 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser int ramMessage = (int) getConfigLong("ramCacheMessage", 1024) / 1024; int ramWiki = (int) getConfigLong("ramCacheWiki", 1024) / 1024; int ramRobots = (int) getConfigLong("ramCacheRobots",1024) / 1024; - this.log.logConfig("LURL Cache memory = " + ppRamString(ramLURL)); - this.log.logConfig("NURL Cache memory = " + ppRamString(ramNURL)); - this.log.logConfig("EURL Cache memory = " + ppRamString(ramEURL)); - this.log.logConfig("RWI Cache memory = " + ppRamString(ramRWI)); - this.log.logConfig("HTTP Cache memory = " + ppRamString(ramHTTP)); - this.log.logConfig("Message Cache memory = " + ppRamString(ramMessage)); - this.log.logConfig("Wiki Cache memory = " + ppRamString(ramWiki)); - this.log.logConfig("Robots Cache memory = " + ppRamString(ramRobots)); + int ramProfiles= (int) getConfigLong("ramCacheProfiles",1024) / 1024; + 
this.log.logConfig("LURL Cache memory = " + ppRamString(ramLURL)); + this.log.logConfig("NURL Cache memory = " + ppRamString(ramNURL)); + this.log.logConfig("EURL Cache memory = " + ppRamString(ramEURL)); + this.log.logConfig("RWI Cache memory = " + ppRamString(ramRWI)); + this.log.logConfig("HTTP Cache memory = " + ppRamString(ramHTTP)); + this.log.logConfig("Message Cache memory = " + ppRamString(ramMessage)); + this.log.logConfig("Wiki Cache memory = " + ppRamString(ramWiki)); + this.log.logConfig("Robots Cache memory = " + ppRamString(ramRobots)); + this.log.logConfig("Profiles Cache memory = " + ppRamString(ramProfiles)); // make crawl profiles database and default profiles this.log.logConfig("Initializing Crawl Profiles"); File profilesFile = new File(this.plasmaPath, "crawlProfiles0.db"); - this.profiles = new plasmaCrawlProfile(profilesFile); + this.profiles = new plasmaCrawlProfile(profilesFile, ramProfiles); initProfiles(); log.logConfig("Loaded profiles from file " + profilesFile + ", " + this.profiles.size() + " entries"); @@ -501,7 +503,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser final File pdb = new File(plasmaPath, "crawlProfiles0.db"); if (pdb.exists()) pdb.delete(); try { - profiles = new plasmaCrawlProfile(pdb); + int ramProfiles = (int) getConfigLong("ramCacheProfiles",1024) / 1024; + profiles = new plasmaCrawlProfile(pdb, ramProfiles); initProfiles(); } catch (IOException e) {} } diff --git a/yacy.init b/yacy.init index b46805955..5fca7ffd5 100644 --- a/yacy.init +++ b/yacy.init @@ -465,7 +465,10 @@ ramCacheWiki = 8192 ramCacheNews = 8192 # ram cache for robotsTxt.db -ramCacheRobots = 1048576 +ramCacheRobots = 2097152 + +# ram cache for crawlProfile.db +ramCacheProfiles = 8192 # default memory settings for startup of yacy # is only valid in unix/shell environments and