All stack/heap files that had been stored in DATA/PLASMA are now stored in the network-specific QUEUES path.

There is no migration. All crawls must be restarted.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6167 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 16 years ago
parent aac89bf8ca
commit 995da28c73
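For orientation, a minimal sketch (not part of this commit) of how the new network-specific paths are derived. It assumes the default indexPrimaryPath of DATA/INDEX and the "freeworld" network name; the file names come from the diff below, everything else is illustrative only.

import java.io.File;

public class QueuesLayoutSketch {
    public static void main(String[] args) {
        // assumptions: default primary index path and the "freeworld" network
        final File indexPrimaryPath = new File("DATA/INDEX");
        final String networkName = "freeworld";

        // per-network roots that replace the old DATA/PLASMADB directory
        final File networkRoot = new File(new File(indexPrimaryPath, networkName), "NETWORK");
        final File queuesRoot = new File(new File(indexPrimaryPath, networkName), "QUEUES");
        networkRoot.mkdirs();
        queuesRoot.mkdirs();

        // stack/heap files now live under queuesRoot, e.g. the robots.txt cache
        // and the web structure map (names as used in the diff below)
        System.out.println(new File(queuesRoot, "crawlRobotsTxt.heap"));  // DATA/INDEX/freeworld/QUEUES/crawlRobotsTxt.heap
        System.out.println(new File(queuesRoot, "webStructure.map"));     // DATA/INDEX/freeworld/QUEUES/webStructure.map
    }
}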

@@ -278,14 +278,10 @@ promoteSearchPageGreeting.homepage = http://yacy.net
 promoteSearchPageGreeting.largeImage = /env/grafics/YaCyLogo_120ppi.png
 promoteSearchPageGreeting.smallImage = /env/grafics/YaCyLogo_60ppi.png
-# the path to the PLASMA database of the web spider
-dbPath=DATA/PLASMADB
 # the path to the public reverse word index for text files (web pages)
 # the primary path is relative to the data root, the secondary path is an absolute path
 # when the secondary path should be equal to the primary, it must be declared empty
 indexPrimaryPath=DATA/INDEX
 indexSecondaryPath=
 # the commons are words that appear in the index more than 64k times in references.
 # Since indexes with such size cannot be handled efficiently, they are sorted in such a way that references with high ranking

@@ -55,7 +55,11 @@ public final class IndexImport_p {
         try {
             final boolean startImport = true;
             if (startImport) {
-                final Importer importerThread = new NoticeURLImporter(switchboard.plasmaPath, switchboard.crawlQueues, switchboard.crawler.profilesActiveCrawls, switchboard.dbImportManager);
+                final Importer importerThread = new NoticeURLImporter(
+                        switchboard.queuesRoot,
+                        switchboard.crawlQueues,
+                        switchboard.crawler.profilesActiveCrawls,
+                        switchboard.dbImportManager);
                 if (importerThread != null) {
                     importerThread.setJobID(switchboard.dbImportManager.generateUniqueJobID());

@@ -242,7 +242,7 @@ public class CrawlQueues {
         // check if the protocol is supported
         final yacyURL url = urlEntry.url();
         final String urlProtocol = url.getProtocol();
-        if (this.sb.crawlQueues.isSupportedProtocol(urlProtocol)) {
+        if (this.isSupportedProtocol(urlProtocol)) {
             if (this.log.isFine())
                 log.logFine(stats + ": URL=" + urlEntry.url()

@@ -134,13 +134,7 @@ public final class CrawlSwitchboard {
                 ", " + profilesPassiveFile.length()/1024);
 
         // init queues
-        final File preStackFile = new File(queuesRoot, "urlNoticePreStack");
-        if (!preStackFile.exists()) {
-            // migrate old file
-            final File oldFile = new File(new File(queuesRoot.getParentFile().getParentFile().getParentFile(), "PLASMADB"), "switchboardQueue.stack");
-            if (oldFile.exists()) oldFile.renameTo(preStackFile);
-        }
-        this.queuePreStack = new IndexingStack(peers, preStackFile, this.profilesActiveCrawls);
+        this.queuePreStack = new IndexingStack(peers, new File(queuesRoot, "urlNoticePreStack.stack"), this.profilesActiveCrawls);
     }
 
     public void clear() {

@@ -67,10 +67,10 @@ public final class ResourceObserver {
         final ArrayList<String> pathsToCheck = new ArrayList<String>();
         // FIXME whats about the secondary path???
         // = (getConfig(plasmaSwitchboard.INDEX_SECONDARY_PATH, "");
-        final String[] pathes = {plasmaSwitchboardConstants.HTDOCS_PATH,
+        final String[] pathes = {
+                plasmaSwitchboardConstants.HTDOCS_PATH,
                 plasmaSwitchboardConstants.INDEX_PRIMARY_PATH,
                 plasmaSwitchboardConstants.LISTS_PATH,
-                plasmaSwitchboardConstants.PLASMA_PATH,
                 plasmaSwitchboardConstants.RANKING_PATH,
                 plasmaSwitchboardConstants.WORK_PATH};
         String path;

@@ -218,12 +218,13 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
     // storage management
     public File htCachePath;
-    public File plasmaPath;
     public File listsPath;
     public File htDocsPath;
     public File rankingPath;
     public File workPath;
     public File releasePath;
+    public File networkRoot;
+    public File queuesRoot;
     public File surrogatesInPath;
     public File surrogatesOutPath;
     public Map<String, String> rankingPermissions;
@@ -300,8 +301,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
         httpRemoteProxyConfig.init(this);
 
         // load values from configs
-        this.plasmaPath = getConfigPath(plasmaSwitchboardConstants.PLASMA_PATH, plasmaSwitchboardConstants.PLASMA_PATH_DEFAULT);
-        this.log.logConfig("Plasma DB Path: " + this.plasmaPath.toString());
         final File indexPrimaryPath = getConfigPath(plasmaSwitchboardConstants.INDEX_PRIMARY_PATH, plasmaSwitchboardConstants.INDEX_PATH_DEFAULT);
         this.log.logConfig("Index Primary Path: " + indexPrimaryPath.toString());
         this.listsPath = getConfigPath(plasmaSwitchboardConstants.LISTS_PATH, plasmaSwitchboardConstants.LISTS_PATH_DEFAULT);
@@ -332,10 +331,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
         final long fileSizeMax = (serverSystem.isWindows) ? sb.getConfigLong("filesize.max.win", (long) Integer.MAX_VALUE) : sb.getConfigLong("filesize.max.other", (long) Integer.MAX_VALUE);
         final int redundancy = (int) sb.getConfigLong("network.unit.dhtredundancy.senior", 1);
         final int partitionExponent = (int) sb.getConfigLong("network.unit.dht.partitionExponent", 0);
+        this.networkRoot = new File(new File(indexPrimaryPath, networkName), "NETWORK");
+        this.queuesRoot = new File(new File(indexPrimaryPath, networkName), "QUEUES");
+        this.networkRoot.mkdirs();
+        this.queuesRoot.mkdirs();
         try {
-            final File networkRoot = new File(new File(indexPrimaryPath, networkName), "NETWORK");
-            networkRoot.mkdirs();
-            final File mySeedFile = new File(networkRoot, yacySeedDB.DBFILE_OWN_SEED);
+            final File mySeedFile = new File(networkRoot, yacySeedDB.DBFILE_OWN_SEED);
             peers = new yacySeedDB(
                     networkRoot,
                     "seed.new.heap",
@@ -353,7 +354,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
                     peers,
                     networkName,
                     log,
-                    new File(new File(indexPrimaryPath, networkName), "QUEUES"));
+                    this.queuesRoot);
         } catch (IOException e1) {
             e1.printStackTrace();
             indexSegment = null;
@@ -381,7 +382,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
         this.proxyLastAccess = System.currentTimeMillis() - 10000;
         this.localSearchLastAccess = System.currentTimeMillis() - 10000;
         this.remoteSearchLastAccess = System.currentTimeMillis() - 10000;
-        this.webStructure = new plasmaWebStructure(log, rankingPath, "LOCAL/010_cr/", getConfig("CRDist0Path", plasmaRankingDistribution.CR_OWN), new File(plasmaPath, "webStructure.map"));
+        this.webStructure = new plasmaWebStructure(log, rankingPath, "LOCAL/010_cr/", getConfig("CRDist0Path", plasmaRankingDistribution.CR_OWN), new File(queuesRoot, "webStructure.map"));
 
         // configuring list path
         if (!(listsPath.exists())) listsPath.mkdirs();
@@ -459,7 +460,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
         // loading the robots.txt db
         this.log.logConfig("Initializing robots.txt DB");
-        final File robotsDBFile = new File(this.plasmaPath, "crawlRobotsTxt.heap");
+        final File robotsDBFile = new File(queuesRoot, "crawlRobotsTxt.heap");
         robots = new RobotsTxt(robotsDBFile);
         this.log.logConfig("Loaded robots.txt DB from file " + robotsDBFile.getName() +
                 ", " + robots.size() + " entries" +
@@ -527,11 +528,11 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
         // start a loader
         log.logConfig("Starting Crawl Loader");
-        this.crawlQueues = new CrawlQueues(this, plasmaPath);
+        this.crawlQueues = new CrawlQueues(this, queuesRoot);
         this.crawlQueues.noticeURL.setMinimumDelta(
                 this.getConfigLong("minimumLocalDelta", this.crawlQueues.noticeURL.getMinimumLocalDelta()),
                 this.getConfigLong("minimumGlobalDelta", this.crawlQueues.noticeURL.getMinimumGlobalDelta()));
 
         /*
          * Creating sync objects and loading status for the crawl jobs
          * a) local crawl
@@ -804,14 +805,20 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
         proxyLastAccess = System.currentTimeMillis() + 10000; // at least 10 seconds online caution to prevent unnecessary action on database meanwhile
 
         // clean search events which have cached relations to the old index
         plasmaSearchEvent.cleanupEvents(true);
 
+        // switch the networks
         synchronized (this) {
+            // shut down
             synchronized (this.indexSegment) {
                 this.indexSegment.close();
             }
             this.crawlStacker.announceClose();
             this.crawlStacker.close();
+            this.webStructure.close();
+            this.robots.close();
+            this.crawlQueues.close();
+
+            // start up
             setConfig("network.unit.definition", networkDefinition);
@@ -821,34 +828,36 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
             final long fileSizeMax = (serverSystem.isWindows) ? sb.getConfigLong("filesize.max.win", (long) Integer.MAX_VALUE) : sb.getConfigLong("filesize.max.other", (long) Integer.MAX_VALUE);
             final int redundancy = (int) sb.getConfigLong("network.unit.dhtredundancy.senior", 1);
             final int partitionExponent = (int) sb.getConfigLong("network.unit.dht.partitionExponent", 0);
+            final String networkName = getConfig(plasmaSwitchboardConstants.NETWORK_NAME, "");
+            this.networkRoot = new File(new File(indexPrimaryPath, networkName), "NETWORK");
+            this.queuesRoot = new File(new File(indexPrimaryPath, networkName), "QUEUES");
+            this.networkRoot.mkdirs();
+            this.queuesRoot.mkdirs();
+            final File mySeedFile = new File(this.networkRoot, yacySeedDB.DBFILE_OWN_SEED);
+            peers = new yacySeedDB(
+                    this.networkRoot,
+                    "seed.new.heap",
+                    "seed.old.heap",
+                    "seed.pot.heap",
+                    mySeedFile,
+                    redundancy,
+                    partitionExponent);
             try {
-                String networkName = getConfig(plasmaSwitchboardConstants.NETWORK_NAME, "");
-                final File networkRoot = new File(new File(indexPrimaryPath, networkName), "NETWORK");
-                networkRoot.mkdirs();
-                final File mySeedFile = new File(networkRoot, yacySeedDB.DBFILE_OWN_SEED);
-                peers = new yacySeedDB(
-                        networkRoot,
-                        "seed.new.heap",
-                        "seed.old.heap",
-                        "seed.pot.heap",
-                        mySeedFile,
-                        redundancy,
-                        partitionExponent);
                 indexSegment = new Segment(
                         log,
                         new File(new File(indexPrimaryPath, networkName), "TEXT"),
                         wordCacheMaxCount,
                         fileSizeMax);
-                crawler = new CrawlSwitchboard(
-                        peers,
-                        networkName,
-                        log,
-                        new File(new File(indexPrimaryPath, networkName), "QUEUES"));
             } catch (IOException e) {
                 e.printStackTrace();
-                this.indexSegment = null;
             }
+            crawler = new CrawlSwitchboard(
+                    peers,
+                    networkName,
+                    log,
+                    this.queuesRoot);
 
             // we need a new stacker, because this uses network-specific attributes to sort out urls (local, global)
             this.crawlStacker = new CrawlStacker(
                     this.crawlQueues,
                     this.crawler,
@@ -856,6 +865,25 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
                     this.peers,
                     "local.any".indexOf(getConfig("network.unit.domain", "global")) >= 0,
                     "global.any".indexOf(getConfig("network.unit.domain", "global")) >= 0);
+
+            // create new web structure
+            this.webStructure = new plasmaWebStructure(log, rankingPath, "LOCAL/010_cr/", getConfig("CRDist0Path", plasmaRankingDistribution.CR_OWN), new File(queuesRoot, "webStructure.map"));
+
+            // load the robots.txt database
+            this.log.logConfig("Initializing robots.txt DB");
+            final File robotsDBFile = new File(this.queuesRoot, "crawlRobotsTxt.heap");
+            this.robots = new RobotsTxt(robotsDBFile);
+            this.log.logConfig("Loaded robots.txt DB from file " + robotsDBFile.getName() +
+                    ", " + robots.size() + " entries" +
+                    ", " + ppRamString(robotsDBFile.length()/1024));
+
+            // start a loader
+            log.logConfig("Starting Crawl Loader");
+            this.crawlQueues = new CrawlQueues(this, this.queuesRoot);
+            this.crawlQueues.noticeURL.setMinimumDelta(
+                    this.getConfigLong("minimumLocalDelta", this.crawlQueues.noticeURL.getMinimumLocalDelta()),
+                    this.getConfigLong("minimumGlobalDelta", this.crawlQueues.noticeURL.getMinimumGlobalDelta()));
+
         }
         // start up crawl jobs
         continueCrawlJob(plasmaSwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);

@@ -321,13 +321,7 @@ public final class plasmaSwitchboardConstants {
     public static final String LIST_BLUE_DEFAULT = null;
     public static final String LIST_BADWORDS_DEFAULT = "yacy.badwords";
     public static final String LIST_STOPWORDS_DEFAULT = "yacy.stopwords";
-    /**
-     * <p><code>public static final String <strong>DBPATH</strong> = "dbPath"</code></p>
-     * <p>Name of the setting specifying the folder beginning from the YaCy-installation's top-folder, where all
-     * databases containing queues are stored</p>
-     */
-    public static final String PLASMA_PATH = "dbPath";
-    public static final String PLASMA_PATH_DEFAULT = "DATA/PLASMADB";
     /**
      * <p><code>public static final String <strong>HTCACHE_PATH</strong> = "proxyCache"</code></p>
      * <p>Name of the setting specifying the folder beginning from the YaCy-installation's top-folder, where all

@@ -628,15 +628,13 @@ public final class yacy {
         System.out.println(copyright);
         System.out.println(hline);
-        final Properties config = configuration("GEN-WORDSTAT", homePath);
 
         // load words
         Log.logInfo("GEN-WORDSTAT", "loading words...");
         final TreeMap<byte[], String> words = loadWordMap(new File(homePath, "yacy.words"));
 
         // find all hashes
         Log.logInfo("GEN-WORDSTAT", "searching all word-hash databases...");
-        final File dbRoot = new File(homePath, config.getProperty("dbPath"));
+        final File dbRoot = new File(homePath, "DATA/INDEX/freeworld/");
         final enumerateFiles ef = new enumerateFiles(new File(dbRoot, "WORDS"), true, false, true, true);
         File f;
         byte[] h;
