From 995da28c73d604ac785add08b0ba4e6ca7891391 Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 2 Jul 2009 17:01:23 +0000 Subject: [PATCH] all stack/heap files that had been stored in DATA/PLASMA are now stored in the network-specific QUEUES path There is no migration. All crawls must be restarted. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6167 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- defaults/yacy.init | 4 - htroot/IndexImport_p.java | 6 +- source/de/anomic/crawler/CrawlQueues.java | 2 +- .../de/anomic/crawler/CrawlSwitchboard.java | 8 +- .../de/anomic/crawler/ResourceObserver.java | 4 +- .../de/anomic/plasma/plasmaSwitchboard.java | 104 +++++++++++------- .../plasma/plasmaSwitchboardConstants.java | 8 +- source/yacy.java | 4 +- 8 files changed, 77 insertions(+), 63 deletions(-) diff --git a/defaults/yacy.init b/defaults/yacy.init index 9e39d44b9..69d6e3263 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -278,14 +278,10 @@ promoteSearchPageGreeting.homepage = http://yacy.net promoteSearchPageGreeting.largeImage = /env/grafics/YaCyLogo_120ppi.png promoteSearchPageGreeting.smallImage = /env/grafics/YaCyLogo_60ppi.png -# the path to the PLASMA database of the web spider -dbPath=DATA/PLASMADB - # the path to the public reverse word index for text files (web pages) # the primary path is relative to the data root, the secondary path is an absolute path # when the secondary path should be equal to the primary, it must be declared empty indexPrimaryPath=DATA/INDEX -indexSecondaryPath= # the commons are words that appear in the index more than 64k times in references. # Since indexes with such size cannot be handled efficiently, they are sorted in such a way that references with high ranking diff --git a/htroot/IndexImport_p.java b/htroot/IndexImport_p.java index ad89437b2..530f90026 100644 --- a/htroot/IndexImport_p.java +++ b/htroot/IndexImport_p.java @@ -55,7 +55,11 @@ public final class IndexImport_p { try { final boolean startImport = true; if (startImport) { - final Importer importerThread = new NoticeURLImporter(switchboard.plasmaPath, switchboard.crawlQueues, switchboard.crawler.profilesActiveCrawls, switchboard.dbImportManager); + final Importer importerThread = new NoticeURLImporter( + switchboard.queuesRoot, + switchboard.crawlQueues, + switchboard.crawler.profilesActiveCrawls, + switchboard.dbImportManager); if (importerThread != null) { importerThread.setJobID(switchboard.dbImportManager.generateUniqueJobID()); diff --git a/source/de/anomic/crawler/CrawlQueues.java b/source/de/anomic/crawler/CrawlQueues.java index 656812bb1..610a3e6d6 100644 --- a/source/de/anomic/crawler/CrawlQueues.java +++ b/source/de/anomic/crawler/CrawlQueues.java @@ -242,7 +242,7 @@ public class CrawlQueues { // check if the protocol is supported final yacyURL url = urlEntry.url(); final String urlProtocol = url.getProtocol(); - if (this.sb.crawlQueues.isSupportedProtocol(urlProtocol)) { + if (this.isSupportedProtocol(urlProtocol)) { if (this.log.isFine()) log.logFine(stats + ": URL=" + urlEntry.url() diff --git a/source/de/anomic/crawler/CrawlSwitchboard.java b/source/de/anomic/crawler/CrawlSwitchboard.java index 1b3b6fcc8..4e74e5b95 100644 --- a/source/de/anomic/crawler/CrawlSwitchboard.java +++ b/source/de/anomic/crawler/CrawlSwitchboard.java @@ -134,13 +134,7 @@ public final class CrawlSwitchboard { ", " + profilesPassiveFile.length()/1024); // init queues - final File preStackFile = new File(queuesRoot, "urlNoticePreStack"); - if (!preStackFile.exists()) { - // migrate old file - final File oldFile = new File(new File(queuesRoot.getParentFile().getParentFile().getParentFile(), "PLASMADB"), "switchboardQueue.stack"); - if (oldFile.exists()) oldFile.renameTo(preStackFile); - } - this.queuePreStack = new IndexingStack(peers, preStackFile, this.profilesActiveCrawls); + this.queuePreStack = new IndexingStack(peers, new File(queuesRoot, "urlNoticePreStack.stack"), this.profilesActiveCrawls); } public void clear() { diff --git a/source/de/anomic/crawler/ResourceObserver.java b/source/de/anomic/crawler/ResourceObserver.java index 5dd2b981a..37965372a 100644 --- a/source/de/anomic/crawler/ResourceObserver.java +++ b/source/de/anomic/crawler/ResourceObserver.java @@ -67,10 +67,10 @@ public final class ResourceObserver { final ArrayList pathsToCheck = new ArrayList(); // FIXME whats about the secondary path??? // = (getConfig(plasmaSwitchboard.INDEX_SECONDARY_PATH, ""); - final String[] pathes = {plasmaSwitchboardConstants.HTDOCS_PATH, + final String[] pathes = { + plasmaSwitchboardConstants.HTDOCS_PATH, plasmaSwitchboardConstants.INDEX_PRIMARY_PATH, plasmaSwitchboardConstants.LISTS_PATH, - plasmaSwitchboardConstants.PLASMA_PATH, plasmaSwitchboardConstants.RANKING_PATH, plasmaSwitchboardConstants.WORK_PATH}; String path; diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 21df95f99..62fa8c141 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -218,12 +218,13 @@ public final class plasmaSwitchboard extends serverAbstractSwitch rankingPermissions; @@ -300,8 +301,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch= 0, "global.any".indexOf(getConfig("network.unit.domain", "global")) >= 0); + + // create new web structure + this.webStructure = new plasmaWebStructure(log, rankingPath, "LOCAL/010_cr/", getConfig("CRDist0Path", plasmaRankingDistribution.CR_OWN), new File(queuesRoot, "webStructure.map")); + + // load the robots.txt database + this.log.logConfig("Initializing robots.txt DB"); + final File robotsDBFile = new File(this.queuesRoot, "crawlRobotsTxt.heap"); + this.robots = new RobotsTxt(robotsDBFile); + this.log.logConfig("Loaded robots.txt DB from file " + robotsDBFile.getName() + + ", " + robots.size() + " entries" + + ", " + ppRamString(robotsDBFile.length()/1024)); + + // start a loader + log.logConfig("Starting Crawl Loader"); + this.crawlQueues = new CrawlQueues(this, this.queuesRoot); + this.crawlQueues.noticeURL.setMinimumDelta( + this.getConfigLong("minimumLocalDelta", this.crawlQueues.noticeURL.getMinimumLocalDelta()), + this.getConfigLong("minimumGlobalDelta", this.crawlQueues.noticeURL.getMinimumGlobalDelta())); + } // start up crawl jobs continueCrawlJob(plasmaSwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); diff --git a/source/de/anomic/plasma/plasmaSwitchboardConstants.java b/source/de/anomic/plasma/plasmaSwitchboardConstants.java index 3ad2c08b7..19f635275 100644 --- a/source/de/anomic/plasma/plasmaSwitchboardConstants.java +++ b/source/de/anomic/plasma/plasmaSwitchboardConstants.java @@ -321,13 +321,7 @@ public final class plasmaSwitchboardConstants { public static final String LIST_BLUE_DEFAULT = null; public static final String LIST_BADWORDS_DEFAULT = "yacy.badwords"; public static final String LIST_STOPWORDS_DEFAULT = "yacy.stopwords"; - /** - *

public static final String DBPATH = "dbPath"

- *

Name of the setting specifying the folder beginning from the YaCy-installation's top-folder, where all - * databases containing queues are stored

- */ - public static final String PLASMA_PATH = "dbPath"; - public static final String PLASMA_PATH_DEFAULT = "DATA/PLASMADB"; + /** *

public static final String HTCACHE_PATH = "proxyCache"

*

Name of the setting specifying the folder beginning from the YaCy-installation's top-folder, where all diff --git a/source/yacy.java b/source/yacy.java index f827a3353..ebb44fc3f 100644 --- a/source/yacy.java +++ b/source/yacy.java @@ -628,15 +628,13 @@ public final class yacy { System.out.println(copyright); System.out.println(hline); - final Properties config = configuration("GEN-WORDSTAT", homePath); - // load words Log.logInfo("GEN-WORDSTAT", "loading words..."); final TreeMap words = loadWordMap(new File(homePath, "yacy.words")); // find all hashes Log.logInfo("GEN-WORDSTAT", "searching all word-hash databases..."); - final File dbRoot = new File(homePath, config.getProperty("dbPath")); + final File dbRoot = new File(homePath, "DATA/INDEX/freeworld/"); final enumerateFiles ef = new enumerateFiles(new File(dbRoot, "WORDS"), true, false, true, true); File f; byte[] h;