From 5c3afb32028ac6cfef0ffd5dc71dd565ca143684 Mon Sep 17 00:00:00 2001 From: orbiter Date: Sat, 24 Mar 2007 15:28:17 +0000 Subject: [PATCH] added option to configure a path to a secondary index location. this shall be used to store a fragment of the index on another physical device, to split IO load and enhance access speed. The index is split in such a way that the LURLs are stored to the secondary location, and the RWIs to the primary location. This is especially useful for environments where symbolic links are not possible and may cause IO access even if there is no write access to the device which hosts the symbolic link. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3519 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/IndexImport_p.html | 14 ++++++-- htroot/IndexImport_p.java | 6 ++-- .../plasma/dbImport/AbstractImporter.java | 17 ++++++---- .../de/anomic/plasma/dbImport/dbImporter.java | 5 +-- .../dbImport/plasmaCrawlNURLImporter.java | 28 ++++++++-------- .../plasma/dbImport/plasmaDbImporter.java | 32 ++++++++++++------- .../de/anomic/plasma/plasmaSwitchboard.java | 13 +++++--- source/de/anomic/plasma/plasmaWordIndex.java | 8 ++--- source/yacy.java | 12 ++++--- yacy.init | 5 ++- 10 files changed, 87 insertions(+), 53 deletions(-) diff --git a/htroot/IndexImport_p.html b/htroot/IndexImport_p.html index 00c82f040..d5fc5227d 100644 --- a/htroot/IndexImport_p.html +++ b/htroot/IndexImport_p.html @@ -48,8 +48,18 @@ Import Path: - - + + + + + Import Path: + + + + + Import Path: + +

Attention:
Always do a backup of your source and destination database before starting to use this import function.

diff --git a/htroot/IndexImport_p.java b/htroot/IndexImport_p.java index 1045675ce..28405ab3c 100644 --- a/htroot/IndexImport_p.java +++ b/htroot/IndexImport_p.java @@ -73,7 +73,9 @@ public final class IndexImport_p { if (post.containsKey("startIndexDbImport")) { try { // getting the import path - String importPath = (String) post.get("importPath"); + String importPlasmaPath = (String) post.get("importPlasmaPath"); + String importIndexPrimaryPath = (String) post.get("importIndexPrimaryPath"); + String importIndexSecondaryPath = (String) post.get("importIndexSecondaryPath"); String importType = (String) post.get("importType"); String cacheSizeStr = (String) post.get("cacheSize"); int cacheSize = 8*1024*1024; @@ -98,7 +100,7 @@ public final class IndexImport_p { if (startImport) { dbImporter importerThread = switchboard.dbImportManager.getNewImporter(importType); if (importerThread != null) { - importerThread.init(new File(importPath), cacheSize, 100); + importerThread.init(new File(importPlasmaPath), new File(importIndexPrimaryPath), new File(importIndexSecondaryPath), cacheSize, 100); importerThread.startIt(); } prop.put("LOCATION",""); diff --git a/source/de/anomic/plasma/dbImport/AbstractImporter.java b/source/de/anomic/plasma/dbImport/AbstractImporter.java index 4dcdd8798..c0a07bbe1 100644 --- a/source/de/anomic/plasma/dbImport/AbstractImporter.java +++ b/source/de/anomic/plasma/dbImport/AbstractImporter.java @@ -13,7 +13,7 @@ public abstract class AbstractImporter extends Thread implements dbImporter{ protected boolean stopped = false; protected boolean paused = false; - protected File importPath; + protected File importPrimaryPath, importSecondaryPath; protected int cacheSize; protected long preloadTime; @@ -33,9 +33,11 @@ public abstract class AbstractImporter extends Thread implements dbImporter{ return this.error; } - public void init(File theImportPath) { - if (theImportPath == null) throw new NullPointerException("The Import path must not be null."); - 
this.importPath = theImportPath; + public void init(File thePrimaryPath, File theSecondaryPath) { + if (thePrimaryPath == null) throw new NullPointerException("The Primary Import path must not be null."); + if (theSecondaryPath == null) throw new NullPointerException("The Secondary Import path must not be null."); + this.importPrimaryPath = thePrimaryPath; + this.importSecondaryPath = theSecondaryPath; // getting a job id from the import manager //this.jobID = this.sb.dbImportManager.getJobID(); @@ -115,8 +117,11 @@ public abstract class AbstractImporter extends Thread implements dbImporter{ return this.jobType; } - public File getImportPath() { - return this.importPath; + public File getPrimaryImportPath() { + return this.importPrimaryPath; + } + public File getSecondaryImportPath() { + return this.importSecondaryPath; } public abstract long getEstimatedTime(); diff --git a/source/de/anomic/plasma/dbImport/dbImporter.java b/source/de/anomic/plasma/dbImport/dbImporter.java index c141f68fc..fd551bffa 100644 --- a/source/de/anomic/plasma/dbImport/dbImporter.java +++ b/source/de/anomic/plasma/dbImport/dbImporter.java @@ -20,10 +20,11 @@ public interface dbImporter { public int getJobID(); public String getJobName(); public String getJobType(); - public File getImportPath(); + public File getPrimaryImportPath(); + public File getSecondaryImportPath(); public String getError(); public String getStatus(); - public void init(File indexPath, int cacheSize, long preloadTime); + public void init(File plasmaPath, File indexPrimaryPath, File indexSecondaryPath, int cacheSize, long preloadTime); public void startIt(); } diff --git a/source/de/anomic/plasma/dbImport/plasmaCrawlNURLImporter.java b/source/de/anomic/plasma/dbImport/plasmaCrawlNURLImporter.java index 998c2aa0d..57c403b60 100644 --- a/source/de/anomic/plasma/dbImport/plasmaCrawlNURLImporter.java +++ b/source/de/anomic/plasma/dbImport/plasmaCrawlNURLImporter.java @@ -31,7 +31,7 @@ public class plasmaCrawlNURLImporter 
extends AbstractImporter implements dbImpor } public String getJobName() { - return this.importPath.toString(); + return this.importPrimaryPath.toString(); } public int getProcessingStatusPercent() { @@ -47,23 +47,23 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor return theStatus.toString(); } - public void init(File theImportPath, int theCacheSize, long preloadTime) { - super.init(theImportPath); + public void init(File plasmaPath, File indexPrimary, File indexSecondary, int theCacheSize, long preloadTime) { + super.init(indexPrimary, indexSecondary); this.cacheSize = theCacheSize; this.preloadTime = preloadTime; - File noticeUrlDbFile = new File(this.importPath,"urlNotice1.db"); - File profileDbFile = new File(this.importPath, "crawlProfiles0.db"); + File noticeUrlDbFile = new File(plasmaPath,"urlNotice1.db"); + File profileDbFile = new File(plasmaPath, "crawlProfiles0.db"); String errorMsg = null; - if (!this.importPath.exists()) - errorMsg = "The import path '" + this.importPath + "' does not exist."; - else if (!this.importPath.isDirectory()) - errorMsg = "The import path '" + this.importPath + "' is not a directory."; - else if (!this.importPath.canRead()) - errorMsg = "The import path '" + this.importPath + "' is not readable."; - else if (!this.importPath.canWrite()) - errorMsg = "The import path '" + this.importPath + "' is not writeable."; + if (!plasmaPath.exists()) + errorMsg = "The import path '" + plasmaPath+ "' does not exist."; + else if (!plasmaPath.isDirectory()) + errorMsg = "The import path '" + plasmaPath + "' is not a directory."; + else if (!plasmaPath.canRead()) + errorMsg = "The import path '" + plasmaPath + "' is not readable."; + else if (!plasmaPath.canWrite()) + errorMsg = "The import path '" + plasmaPath + "' is not writeable."; else if (!noticeUrlDbFile.exists()) errorMsg = "The noticeUrlDB file '" + noticeUrlDbFile + "' does not exist."; @@ -90,7 +90,7 @@ public class plasmaCrawlNURLImporter 
extends AbstractImporter implements dbImpor // init noticeUrlDB this.log.logInfo("Initializing the source noticeUrlDB"); - this.importNurlDB = new plasmaCrawlNURL(this.importPath); + this.importNurlDB = new plasmaCrawlNURL(plasmaPath); this.importStartSize = this.importNurlDB.size(); //int stackSize = this.importNurlDB.stackSize(); diff --git a/source/de/anomic/plasma/dbImport/plasmaDbImporter.java b/source/de/anomic/plasma/dbImport/plasmaDbImporter.java index 64564b7e3..28fbdd8f4 100644 --- a/source/de/anomic/plasma/dbImport/plasmaDbImporter.java +++ b/source/de/anomic/plasma/dbImport/plasmaDbImporter.java @@ -31,7 +31,7 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter { } public String getJobName() { - return this.importPath.toString(); + return this.importPrimaryPath.toString(); } public String getStatus() { @@ -46,25 +46,33 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter { return theStatus.toString(); } - public void init(File theImportPath, int theCacheSize, long preloadTime) { - super.init(theImportPath); + public void init(File plasmaPath, File thePrimaryPath, File theSecondaryPath, int theCacheSize, long preloadTime) { + super.init(thePrimaryPath, theSecondaryPath); this.cacheSize = theCacheSize; if (this.cacheSize < 2*1024*1024) this.cacheSize = 8*1024*1024; // configure import DB String errorMsg = null; - if (!this.importPath.exists()) errorMsg = "Import directory does not exist."; - if (!this.importPath.canRead()) errorMsg = "Import directory is not readable."; - if (!this.importPath.canWrite()) errorMsg = "Import directory is not writeable"; - if (!this.importPath.isDirectory()) errorMsg = "ImportDirectory is not a directory."; + if (!this.importPrimaryPath.exists()) errorMsg = "Primary Import directory does not exist."; + if (!this.importPrimaryPath.canRead()) errorMsg = "Primary Import directory is not readable."; + if (!this.importPrimaryPath.canWrite()) errorMsg = "Primary Import 
directory is not writeable"; + if (!this.importPrimaryPath.isDirectory()) errorMsg = "Primary Import directory is not a directory."; if (errorMsg != null) { - this.log.logSevere(errorMsg + "\nName: " + this.importPath.getAbsolutePath()); + this.log.logSevere(errorMsg + "\nName: " + this.importPrimaryPath.getAbsolutePath()); throw new IllegalArgumentException(errorMsg); - } + } + if (!this.importSecondaryPath.exists()) errorMsg = "Secondary Import directory does not exist."; + if (!this.importSecondaryPath.canRead()) errorMsg = "Secondary Import directory is not readable."; + if (!this.importSecondaryPath.canWrite()) errorMsg = "Secondary Import directory is not writeable"; + if (!this.importSecondaryPath.isDirectory()) errorMsg = "Secondary Import directory is not a directory."; + if (errorMsg != null) { + this.log.logSevere(errorMsg + "\nName: " + this.importSecondaryPath.getAbsolutePath()); + throw new IllegalArgumentException(errorMsg); + } this.log.logFine("Initializing source word index db."); - this.importWordIndex = new plasmaWordIndex(this.importPath, preloadTime / 2, this.log); + this.importWordIndex = new plasmaWordIndex(this.importPrimaryPath, importSecondaryPath, preloadTime / 2, this.log); this.importStartSize = this.importWordIndex.size(); } @@ -93,8 +101,8 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter { public void importWordsDB() { this.log.logInfo("STARTING DB-IMPORT"); - try { - this.log.logInfo("Importing DB from '" + this.importPath.getAbsolutePath() + "'"); + try { + this.log.logInfo("Importing DB from '" + this.importPrimaryPath.getAbsolutePath() + "'/'" + this.importSecondaryPath.getAbsolutePath() + "'"); this.log.logInfo("Home word index contains " + wi.size() + " words and " + wi.loadedURL.size() + " URLs."); this.log.logInfo("Import word index contains " + this.importWordIndex.size() + " words and " + this.importWordIndex.loadedURL.size() + " URLs."); diff --git 
a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index a503425d3..9d815bdd1 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -201,7 +201,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // storage management public File htCachePath; private File plasmaPath; - public File indexPath; + public File indexPrimaryPath, indexSecondaryPath; public File listsPath; public File htDocsPath; public File rankingPath; @@ -728,7 +728,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser *

Name of the setting specifying the folder beginning from the YaCy-installation's top-folder, where the * whole database of known RWIs and URLs as well as dumps of the DHT-In and DHT-Out caches are stored

*/ - public static final String INDEX_PATH = "indexPath"; + public static final String INDEX_PRIMARY_PATH = "indexPrimaryPath"; // this is a relative path to the data root + public static final String INDEX_SECONDARY_PATH = "indexSecondaryPath"; // this is a absolute path to any location public static final String INDEX_PATH_DEFAULT = "DATA/INDEX"; /** *

public static final String LISTS_PATH = "listsPath"

@@ -868,8 +869,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // load values from configs this.plasmaPath = new File(rootPath, getConfig(DBPATH, DBPATH_DEFAULT)); this.log.logConfig("Plasma DB Path: " + this.plasmaPath.toString()); - this.indexPath = new File(rootPath, getConfig(INDEX_PATH, INDEX_PATH_DEFAULT)); - this.log.logConfig("Index Path: " + this.indexPath.toString()); + this.indexPrimaryPath = new File(rootPath, getConfig(INDEX_PRIMARY_PATH, INDEX_PATH_DEFAULT)); + this.log.logConfig("Index Primary Path: " + this.indexPrimaryPath.toString()); + this.indexSecondaryPath = (getConfig(INDEX_SECONDARY_PATH, "").length() == 0) ? indexPrimaryPath : new File(getConfig(INDEX_SECONDARY_PATH, "")); + this.log.logConfig("Index Secondary Path: " + this.indexSecondaryPath.toString()); this.listsPath = new File(rootPath, getConfig(LISTS_PATH, LISTS_PATH_DEFAULT)); this.log.logConfig("Lists Path: " + this.listsPath.toString()); this.htDocsPath = new File(rootPath, getConfig(HTDOCS_PATH, HTDOCS_PATH_DEFAULT)); @@ -1040,7 +1043,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser noticeURL = new plasmaCrawlNURL(plasmaPath); errorURL = new plasmaCrawlZURL(plasmaPath, "urlError.db"); delegatedURL = new plasmaCrawlZURL(plasmaPath, "urlDelegated.db"); - wordIndex = new plasmaWordIndex(indexPath, ramRWI_time, log); + wordIndex = new plasmaWordIndex(indexPrimaryPath, indexSecondaryPath, ramRWI_time, log); // set a high maximum cache size to current size; this is adopted later automatically int wordCacheMaxCount = Math.max((int) getConfigLong(WORDCACHE_INIT_COUNT, 30000), diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index 94fe8bb6f..75492950d 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -70,19 +70,19 @@ public final class plasmaWordIndex implements indexRI { private int flushsize; 
public final plasmaCrawlLURL loadedURL; - public plasmaWordIndex(File indexRoot, long preloadTime, serverLog log) { - File textindexcache = new File(indexRoot, "PUBLIC/TEXT/RICACHE"); + public plasmaWordIndex(File indexPrimaryRoot, File indexSecondaryRoot, long preloadTime, serverLog log) { + File textindexcache = new File(indexPrimaryRoot, "PUBLIC/TEXT/RICACHE"); if (!(textindexcache.exists())) textindexcache.mkdirs(); this.dhtOutCache = new indexRAMRI(textindexcache, indexRWIEntry.urlEntryRow, wCacheMaxChunk, wCacheMaxAge, "dump1.array", log); this.dhtInCache = new indexRAMRI(textindexcache, indexRWIEntry.urlEntryRow, wCacheMaxChunk, wCacheMaxAge, "dump2.array", log); // create collections storage path - File textindexcollections = new File(indexRoot, "PUBLIC/TEXT/RICOLLECTION"); + File textindexcollections = new File(indexPrimaryRoot, "PUBLIC/TEXT/RICOLLECTION"); if (!(textindexcollections.exists())) textindexcollections.mkdirs(); this.collections = new indexCollectionRI(textindexcollections, "collection", preloadTime, maxCollectionPartition, indexRWIEntry.urlEntryRow); // create LURL-db - loadedURL = new plasmaCrawlLURL(indexRoot, preloadTime); + loadedURL = new plasmaCrawlLURL(indexSecondaryRoot, preloadTime); // performance settings busyCacheFlush = false; diff --git a/source/yacy.java b/source/yacy.java index 19b3aaf58..e1b0f31ca 100644 --- a/source/yacy.java +++ b/source/yacy.java @@ -616,14 +616,15 @@ public final class yacy { public static void minimizeUrlDB(String homePath) { // run with "java -classpath classes yacy -minimizeUrlDB" try {serverLog.configureLogging(new File(homePath, "DATA/LOG/yacy.logging"));} catch (Exception e) {} - File indexRoot = new File(new File(homePath), "DATA/INDEX"); + File indexPrimaryRoot = new File(new File(homePath), "DATA/INDEX"); + File indexSecondaryRoot = new File(new File(homePath), "DATA/INDEX"); File indexRoot2 = new File(new File(homePath), "DATA/INDEX2"); serverLog log = new serverLog("URL-CLEANUP"); try { 
log.logInfo("STARTING URL CLEANUP"); // db containing all currently loades urls - plasmaCrawlLURL currentUrlDB = new plasmaCrawlLURL(indexRoot, 10000); + plasmaCrawlLURL currentUrlDB = new plasmaCrawlLURL(indexSecondaryRoot, 10000); // db used to hold all neede urls plasmaCrawlLURL minimizedUrlDB = new plasmaCrawlLURL(indexRoot2, 10000); @@ -632,7 +633,7 @@ public final class yacy { int cacheMem = (int)(serverMemory.max-rt.totalMemory()); if (cacheMem < 2048000) throw new OutOfMemoryError("Not enough memory available to start clean up."); - plasmaWordIndex wordIndex = new plasmaWordIndex(indexRoot, 10000, log); + plasmaWordIndex wordIndex = new plasmaWordIndex(indexPrimaryRoot, indexSecondaryRoot, 10000, log); Iterator indexContainerIterator = wordIndex.wordContainers("AAAAAAAAAAAA", false, false); long urlCounter = 0, wordCounter = 0; @@ -1000,7 +1001,8 @@ public final class yacy { private static void RWIHashList(String homePath, String targetName, String resource, String format) { plasmaWordIndex WordIndex = null; serverLog log = new serverLog("HASHLIST"); - File indexRoot = new File(new File(homePath), "DATA/INDEX"); + File indexPrimaryRoot = new File(new File(homePath), "DATA/INDEX"); + File indexSecondaryRoot = new File(new File(homePath), "DATA/INDEX"); String wordChunkStartHash = "AAAAAAAAAAAA"; try {serverLog.configureLogging(new File(homePath, "DATA/LOG/yacy.logging"));} catch (Exception e) {} log.logInfo("STARTING CREATION OF RWI-HASHLIST"); @@ -1008,7 +1010,7 @@ public final class yacy { try { Iterator indexContainerIterator = null; if (resource.equals("all")) { - WordIndex = new plasmaWordIndex(indexRoot, 3000, log); + WordIndex = new plasmaWordIndex(indexPrimaryRoot, indexSecondaryRoot, 3000, log); indexContainerIterator = WordIndex.wordContainers(wordChunkStartHash, false, false); } int counter = 0; diff --git a/yacy.init b/yacy.init index 1426e8fc8..8ab2b8b89 100644 --- a/yacy.init +++ b/yacy.init @@ -198,7 +198,10 @@ promoteSearchPageGreeting = 
dbPath=DATA/PLASMADB # the path to the public reverse word index for text files (web pages) -indexPath=DATA/INDEX +# the primary path is relative to the data root, the secondary path is an absolute path +# when the secondary path should be equal to the primary, it must be declared empty +indexPrimaryPath=DATA/INDEX +indexSecondaryPath= # the path to the LISTS files. Most lists are used to filter web content listsPath=DATA/LISTS