All stack/heap files that had been stored in DATA/PLASMADB are now stored in the network-specific QUEUES path.

There is no migration. All crawls must be restarted.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6167 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 16 years ago
parent aac89bf8ca
commit 995da28c73

@ -278,14 +278,10 @@ promoteSearchPageGreeting.homepage = http://yacy.net
promoteSearchPageGreeting.largeImage = /env/grafics/YaCyLogo_120ppi.png
promoteSearchPageGreeting.smallImage = /env/grafics/YaCyLogo_60ppi.png
# the path to the PLASMA database of the web spider
dbPath=DATA/PLASMADB
# the path to the public reverse word index for text files (web pages)
# the primary path is relative to the data root, the secondary path is an absolute path
# when the secondary path should be equal to the primary, it must be declared empty
indexPrimaryPath=DATA/INDEX
indexSecondaryPath=
# the commons are words that appear in the index more than 64k times in references.
# Since indexes with such size cannot be handled efficiently, they are sorted in such a way that references with high ranking

@ -55,7 +55,11 @@ public final class IndexImport_p {
try {
final boolean startImport = true;
if (startImport) {
final Importer importerThread = new NoticeURLImporter(switchboard.plasmaPath, switchboard.crawlQueues, switchboard.crawler.profilesActiveCrawls, switchboard.dbImportManager);
final Importer importerThread = new NoticeURLImporter(
switchboard.queuesRoot,
switchboard.crawlQueues,
switchboard.crawler.profilesActiveCrawls,
switchboard.dbImportManager);
if (importerThread != null) {
importerThread.setJobID(switchboard.dbImportManager.generateUniqueJobID());

@ -242,7 +242,7 @@ public class CrawlQueues {
// check if the protocol is supported
final yacyURL url = urlEntry.url();
final String urlProtocol = url.getProtocol();
if (this.sb.crawlQueues.isSupportedProtocol(urlProtocol)) {
if (this.isSupportedProtocol(urlProtocol)) {
if (this.log.isFine())
log.logFine(stats + ": URL=" + urlEntry.url()

@ -134,13 +134,7 @@ public final class CrawlSwitchboard {
", " + profilesPassiveFile.length()/1024);
// init queues
final File preStackFile = new File(queuesRoot, "urlNoticePreStack");
if (!preStackFile.exists()) {
// migrate old file
final File oldFile = new File(new File(queuesRoot.getParentFile().getParentFile().getParentFile(), "PLASMADB"), "switchboardQueue.stack");
if (oldFile.exists()) oldFile.renameTo(preStackFile);
}
this.queuePreStack = new IndexingStack(peers, preStackFile, this.profilesActiveCrawls);
this.queuePreStack = new IndexingStack(peers, new File(queuesRoot, "urlNoticePreStack.stack"), this.profilesActiveCrawls);
}
public void clear() {

@ -67,10 +67,10 @@ public final class ResourceObserver {
final ArrayList<String> pathsToCheck = new ArrayList<String>();
// FIXME: what about the secondary path?
// = (getConfig(plasmaSwitchboard.INDEX_SECONDARY_PATH, "");
final String[] pathes = {plasmaSwitchboardConstants.HTDOCS_PATH,
final String[] pathes = {
plasmaSwitchboardConstants.HTDOCS_PATH,
plasmaSwitchboardConstants.INDEX_PRIMARY_PATH,
plasmaSwitchboardConstants.LISTS_PATH,
plasmaSwitchboardConstants.PLASMA_PATH,
plasmaSwitchboardConstants.RANKING_PATH,
plasmaSwitchboardConstants.WORK_PATH};
String path;

@ -218,12 +218,13 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
// storage management
public File htCachePath;
public File plasmaPath;
public File listsPath;
public File htDocsPath;
public File rankingPath;
public File workPath;
public File releasePath;
public File networkRoot;
public File queuesRoot;
public File surrogatesInPath;
public File surrogatesOutPath;
public Map<String, String> rankingPermissions;
@ -300,8 +301,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
httpRemoteProxyConfig.init(this);
// load values from configs
this.plasmaPath = getConfigPath(plasmaSwitchboardConstants.PLASMA_PATH, plasmaSwitchboardConstants.PLASMA_PATH_DEFAULT);
this.log.logConfig("Plasma DB Path: " + this.plasmaPath.toString());
final File indexPrimaryPath = getConfigPath(plasmaSwitchboardConstants.INDEX_PRIMARY_PATH, plasmaSwitchboardConstants.INDEX_PATH_DEFAULT);
this.log.logConfig("Index Primary Path: " + indexPrimaryPath.toString());
this.listsPath = getConfigPath(plasmaSwitchboardConstants.LISTS_PATH, plasmaSwitchboardConstants.LISTS_PATH_DEFAULT);
@ -332,10 +331,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
final long fileSizeMax = (serverSystem.isWindows) ? sb.getConfigLong("filesize.max.win", (long) Integer.MAX_VALUE) : sb.getConfigLong("filesize.max.other", (long) Integer.MAX_VALUE);
final int redundancy = (int) sb.getConfigLong("network.unit.dhtredundancy.senior", 1);
final int partitionExponent = (int) sb.getConfigLong("network.unit.dht.partitionExponent", 0);
this.networkRoot = new File(new File(indexPrimaryPath, networkName), "NETWORK");
this.queuesRoot = new File(new File(indexPrimaryPath, networkName), "QUEUES");
this.networkRoot.mkdirs();
this.queuesRoot.mkdirs();
try {
final File networkRoot = new File(new File(indexPrimaryPath, networkName), "NETWORK");
networkRoot.mkdirs();
final File mySeedFile = new File(networkRoot, yacySeedDB.DBFILE_OWN_SEED);
final File mySeedFile = new File(networkRoot, yacySeedDB.DBFILE_OWN_SEED);
peers = new yacySeedDB(
networkRoot,
"seed.new.heap",
@ -353,7 +354,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
peers,
networkName,
log,
new File(new File(indexPrimaryPath, networkName), "QUEUES"));
this.queuesRoot);
} catch (IOException e1) {
e1.printStackTrace();
indexSegment = null;
@ -381,7 +382,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
this.proxyLastAccess = System.currentTimeMillis() - 10000;
this.localSearchLastAccess = System.currentTimeMillis() - 10000;
this.remoteSearchLastAccess = System.currentTimeMillis() - 10000;
this.webStructure = new plasmaWebStructure(log, rankingPath, "LOCAL/010_cr/", getConfig("CRDist0Path", plasmaRankingDistribution.CR_OWN), new File(plasmaPath, "webStructure.map"));
this.webStructure = new plasmaWebStructure(log, rankingPath, "LOCAL/010_cr/", getConfig("CRDist0Path", plasmaRankingDistribution.CR_OWN), new File(queuesRoot, "webStructure.map"));
// configuring list path
if (!(listsPath.exists())) listsPath.mkdirs();
@ -459,7 +460,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
// loading the robots.txt db
this.log.logConfig("Initializing robots.txt DB");
final File robotsDBFile = new File(this.plasmaPath, "crawlRobotsTxt.heap");
final File robotsDBFile = new File(queuesRoot, "crawlRobotsTxt.heap");
robots = new RobotsTxt(robotsDBFile);
this.log.logConfig("Loaded robots.txt DB from file " + robotsDBFile.getName() +
", " + robots.size() + " entries" +
@ -527,7 +528,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
// start a loader
log.logConfig("Starting Crawl Loader");
this.crawlQueues = new CrawlQueues(this, plasmaPath);
this.crawlQueues = new CrawlQueues(this, queuesRoot);
this.crawlQueues.noticeURL.setMinimumDelta(
this.getConfigLong("minimumLocalDelta", this.crawlQueues.noticeURL.getMinimumLocalDelta()),
this.getConfigLong("minimumGlobalDelta", this.crawlQueues.noticeURL.getMinimumGlobalDelta()));
@ -804,14 +805,20 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
proxyLastAccess = System.currentTimeMillis() + 10000; // at least 10 seconds online caution to prevent unnecessary action on database meanwhile
// clean search events which have cached relations to the old index
plasmaSearchEvent.cleanupEvents(true);
// switch the networks
synchronized (this) {
// shut down
synchronized (this.indexSegment) {
this.indexSegment.close();
}
this.crawlStacker.announceClose();
this.crawlStacker.close();
this.webStructure.close();
this.robots.close();
this.crawlQueues.close();
// start up
setConfig("network.unit.definition", networkDefinition);
@ -821,34 +828,36 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
final long fileSizeMax = (serverSystem.isWindows) ? sb.getConfigLong("filesize.max.win", (long) Integer.MAX_VALUE) : sb.getConfigLong("filesize.max.other", (long) Integer.MAX_VALUE);
final int redundancy = (int) sb.getConfigLong("network.unit.dhtredundancy.senior", 1);
final int partitionExponent = (int) sb.getConfigLong("network.unit.dht.partitionExponent", 0);
final String networkName = getConfig(plasmaSwitchboardConstants.NETWORK_NAME, "");
this.networkRoot = new File(new File(indexPrimaryPath, networkName), "NETWORK");
this.queuesRoot = new File(new File(indexPrimaryPath, networkName), "QUEUES");
this.networkRoot.mkdirs();
this.queuesRoot.mkdirs();
final File mySeedFile = new File(this.networkRoot, yacySeedDB.DBFILE_OWN_SEED);
peers = new yacySeedDB(
this.networkRoot,
"seed.new.heap",
"seed.old.heap",
"seed.pot.heap",
mySeedFile,
redundancy,
partitionExponent);
try {
String networkName = getConfig(plasmaSwitchboardConstants.NETWORK_NAME, "");
final File networkRoot = new File(new File(indexPrimaryPath, networkName), "NETWORK");
networkRoot.mkdirs();
final File mySeedFile = new File(networkRoot, yacySeedDB.DBFILE_OWN_SEED);
peers = new yacySeedDB(
networkRoot,
"seed.new.heap",
"seed.old.heap",
"seed.pot.heap",
mySeedFile,
redundancy,
partitionExponent);
indexSegment = new Segment(
log,
new File(new File(indexPrimaryPath, networkName), "TEXT"),
wordCacheMaxCount,
fileSizeMax);
crawler = new CrawlSwitchboard(
peers,
networkName,
log,
new File(new File(indexPrimaryPath, networkName), "QUEUES"));
} catch (IOException e) {
e.printStackTrace();
this.indexSegment = null;
}
// we need a new stacker, because this uses network-specific attributes to sort out urls (local, global)
indexSegment = new Segment(
log,
new File(new File(indexPrimaryPath, networkName), "TEXT"),
wordCacheMaxCount,
fileSizeMax);
} catch (IOException e) {
e.printStackTrace();
}
crawler = new CrawlSwitchboard(
peers,
networkName,
log,
this.queuesRoot);
// we need a new stacker, because this uses network-specific attributes to sort out urls (local, global)
this.crawlStacker = new CrawlStacker(
this.crawlQueues,
this.crawler,
@ -856,6 +865,25 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
this.peers,
"local.any".indexOf(getConfig("network.unit.domain", "global")) >= 0,
"global.any".indexOf(getConfig("network.unit.domain", "global")) >= 0);
// create new web structure
this.webStructure = new plasmaWebStructure(log, rankingPath, "LOCAL/010_cr/", getConfig("CRDist0Path", plasmaRankingDistribution.CR_OWN), new File(queuesRoot, "webStructure.map"));
// load the robots.txt database
this.log.logConfig("Initializing robots.txt DB");
final File robotsDBFile = new File(this.queuesRoot, "crawlRobotsTxt.heap");
this.robots = new RobotsTxt(robotsDBFile);
this.log.logConfig("Loaded robots.txt DB from file " + robotsDBFile.getName() +
", " + robots.size() + " entries" +
", " + ppRamString(robotsDBFile.length()/1024));
// start a loader
log.logConfig("Starting Crawl Loader");
this.crawlQueues = new CrawlQueues(this, this.queuesRoot);
this.crawlQueues.noticeURL.setMinimumDelta(
this.getConfigLong("minimumLocalDelta", this.crawlQueues.noticeURL.getMinimumLocalDelta()),
this.getConfigLong("minimumGlobalDelta", this.crawlQueues.noticeURL.getMinimumGlobalDelta()));
}
// start up crawl jobs
continueCrawlJob(plasmaSwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);

@ -321,13 +321,7 @@ public final class plasmaSwitchboardConstants {
public static final String LIST_BLUE_DEFAULT = null;
public static final String LIST_BADWORDS_DEFAULT = "yacy.badwords";
public static final String LIST_STOPWORDS_DEFAULT = "yacy.stopwords";
/**
* <p><code>public static final String <strong>DBPATH</strong> = "dbPath"</code></p>
* <p>Name of the setting specifying the folder beginning from the YaCy-installation's top-folder, where all
* databases containing queues are stored</p>
*/
public static final String PLASMA_PATH = "dbPath";
public static final String PLASMA_PATH_DEFAULT = "DATA/PLASMADB";
/**
* <p><code>public static final String <strong>HTCACHE_PATH</strong> = "proxyCache"</code></p>
* <p>Name of the setting specifying the folder beginning from the YaCy-installation's top-folder, where all

@ -628,15 +628,13 @@ public final class yacy {
System.out.println(copyright);
System.out.println(hline);
final Properties config = configuration("GEN-WORDSTAT", homePath);
// load words
Log.logInfo("GEN-WORDSTAT", "loading words...");
final TreeMap<byte[], String> words = loadWordMap(new File(homePath, "yacy.words"));
// find all hashes
Log.logInfo("GEN-WORDSTAT", "searching all word-hash databases...");
final File dbRoot = new File(homePath, config.getProperty("dbPath"));
final File dbRoot = new File(homePath, "DATA/INDEX/freeworld/");
final enumerateFiles ef = new enumerateFiles(new File(dbRoot, "WORDS"), true, false, true, true);
File f;
byte[] h;

Loading…
Cancel
Save