added option to configure a path to a secondary index location.

This shall be used to store a fragment of the index on another physical device, to split IO load and enhance access speed. The index is split in such a way that the LURLs are stored in the secondary location and the RWIs in the primary location. This is especially useful for environments where symbolic links are not possible, or where they may cause IO access even if there is no write access to the device which hosts the symbolic link. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3519 6c8d7289-2bf4-0310-a012-ef5d649a1542
18 years ago · 5c3afb3202
parent 07cd30cf9b
commit 5c3afb3202
10 changed files with 87 additions and 53 deletions
--- a/htroot/IndexImport_p.html
+++ b/htroot/IndexImport_p.html
@ -48,8 +48,18 @@
        </tr>
        <tr class="TableCellLight">
          <td title="Path to the PLASMADB directory of the foreign peer">Import&nbsp;Path:</td>
-          <td colspan="3"><input name="importPath" type="text" size="50" value="" /></td>
-          <td><input type="submit" name="startIndexDbImport" value="Start Import" /></td>
+          <td colspan="3"><input name="importPlasmaPath" type="text" size="50" value="" /></td>
+          <td></td>
+        </tr>
+        <tr class="TableCellLight">
+          <td title="Path to the primary Index directory of the foreign peer">Import&nbsp;Path:</td>
+          <td colspan="3"><input name="importIndexPrimaryPath" type="text" size="50" value="" /></td>
+          <td></td>
+        </tr>
+        <tr class="TableCellLight">
+          <td title="Path to the secondary Index directory of the foreign peer">Import&nbsp;Path:</td>
+          <td colspan="3"><input name="importIndexSecondaryPath" type="text" size="50" value="" /></td>
+          <td><input type="submit" name="startIndexDbImport" value="Start Import" /></td>
        </tr>
      </table>
      <p class="warning"><strong>Attention:</strong><br />Always do a backup of your source and destination database before starting to use this import function.</p>
--- a/htroot/IndexImport_p.java
+++ b/htroot/IndexImport_p.java
@ -73,7 +73,9 @@ public final class IndexImport_p {
            if (post.containsKey("startIndexDbImport")) {
                try {
                    // getting the import path
-                    String importPath = (String) post.get("importPath");
+                    String importPlasmaPath = (String) post.get("importPlasmaPath");
+                    String importIndexPrimaryPath = (String) post.get("importIndexPrimaryPath");
+                    String importIndexSecondaryPath = (String) post.get("importIndexSecondaryPath");
                    String importType = (String) post.get("importType");
                    String cacheSizeStr = (String) post.get("cacheSize");
                    int cacheSize = 8*1024*1024;
@ -98,7 +100,7 @@ public final class IndexImport_p {
                    if (startImport) {
                        dbImporter importerThread = switchboard.dbImportManager.getNewImporter(importType);
                        if (importerThread != null) {
-                            importerThread.init(new File(importPath), cacheSize, 100);
+                            importerThread.init(new File(importPlasmaPath), new File(importIndexPrimaryPath), new File(importIndexSecondaryPath), cacheSize, 100);
                            importerThread.startIt();                            
                        }
                        prop.put("LOCATION","");
--- a/source/de/anomic/plasma/dbImport/AbstractImporter.java
+++ b/source/de/anomic/plasma/dbImport/AbstractImporter.java
@ -13,7 +13,7 @@ public abstract class AbstractImporter extends Thread implements dbImporter{
    protected boolean stopped = false;
    protected boolean paused = false;
    
-    protected File importPath;
+    protected File importPrimaryPath, importSecondaryPath;
    protected int cacheSize;
    protected long preloadTime;
    
@ -33,9 +33,11 @@ public abstract class AbstractImporter extends Thread implements dbImporter{
        return this.error;
    }    
    
-    public void init(File theImportPath) {
-        if (theImportPath == null) throw new NullPointerException("The Import path must not be null.");
-        this.importPath = theImportPath;
+    public void init(File thePrimaryPath, File theSecondaryPath) {
+        if (thePrimaryPath == null) throw new NullPointerException("The Primary Import path must not be null.");
+        if (theSecondaryPath == null) throw new NullPointerException("The Secondary Import path must not be null.");
+        this.importPrimaryPath = thePrimaryPath;
+        this.importSecondaryPath = theSecondaryPath;
        
        // getting a job id from the import manager
        //this.jobID = this.sb.dbImportManager.getJobID();
@ -115,8 +117,11 @@ public abstract class AbstractImporter extends Thread implements dbImporter{
        return this.jobType;
    }
    
-    public File getImportPath() {
-        return this.importPath;
+    public File getPrimaryImportPath() {
+        return this.importPrimaryPath;
+    }
+    public File getSecondaryImportPath() {
+        return this.importSecondaryPath;
    }
    
    public abstract long getEstimatedTime();
--- a/source/de/anomic/plasma/dbImport/dbImporter.java
+++ b/source/de/anomic/plasma/dbImport/dbImporter.java
@ -20,10 +20,11 @@ public interface dbImporter {
    public int getJobID();
    public String getJobName();
    public String getJobType();
-    public File getImportPath();
+    public File getPrimaryImportPath();
+    public File getSecondaryImportPath();
    public String getError();
    public String getStatus();
    
-    public void init(File indexPath, int cacheSize, long preloadTime);
+    public void init(File plasmaPath, File indexPrimaryPath, File indexSecondaryPath, int cacheSize, long preloadTime);
    public void startIt();    
 }
--- a/source/de/anomic/plasma/dbImport/plasmaCrawlNURLImporter.java
+++ b/source/de/anomic/plasma/dbImport/plasmaCrawlNURLImporter.java
@ -31,7 +31,7 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor
    }

    public String getJobName() {
-        return this.importPath.toString();
+        return this.importPrimaryPath.toString();
    }

    public int getProcessingStatusPercent() {
@ -47,23 +47,23 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor
        return theStatus.toString();
    }

-    public void init(File theImportPath, int theCacheSize, long preloadTime) {
-        super.init(theImportPath);
+    public void init(File plasmaPath, File indexPrimary, File indexSecondary, int theCacheSize, long preloadTime) {
+        super.init(indexPrimary, indexSecondary);
        this.cacheSize = theCacheSize;
        this.preloadTime = preloadTime;
        
-        File noticeUrlDbFile = new File(this.importPath,"urlNotice1.db");
-        File profileDbFile = new File(this.importPath, "crawlProfiles0.db");
+        File noticeUrlDbFile = new File(plasmaPath,"urlNotice1.db");
+        File profileDbFile = new File(plasmaPath, "crawlProfiles0.db");
        
        String errorMsg = null;
-        if (!this.importPath.exists()) 
-            errorMsg = "The import path '" + this.importPath + "' does not exist.";
-        else if (!this.importPath.isDirectory()) 
-            errorMsg = "The import path '" + this.importPath + "' is not a directory.";
-        else if (!this.importPath.canRead()) 
-            errorMsg = "The import path '" + this.importPath + "' is not readable.";
-        else if (!this.importPath.canWrite()) 
-            errorMsg = "The import path '" + this.importPath + "' is not writeable.";
+        if (!plasmaPath.exists()) 
+            errorMsg = "The import path '" + plasmaPath+ "' does not exist.";
+        else if (!plasmaPath.isDirectory()) 
+            errorMsg = "The import path '" + plasmaPath + "' is not a directory.";
+        else if (!plasmaPath.canRead()) 
+            errorMsg = "The import path '" + plasmaPath + "' is not readable.";
+        else if (!plasmaPath.canWrite()) 
+            errorMsg = "The import path '" + plasmaPath + "' is not writeable.";
        
        else if (!noticeUrlDbFile.exists()) 
            errorMsg = "The noticeUrlDB file '" + noticeUrlDbFile + "' does not exist.";
@ -90,7 +90,7 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor
        
        // init noticeUrlDB
        this.log.logInfo("Initializing the source noticeUrlDB");
-        this.importNurlDB = new plasmaCrawlNURL(this.importPath);
+        this.importNurlDB = new plasmaCrawlNURL(plasmaPath);
        this.importStartSize = this.importNurlDB.size();
        //int stackSize = this.importNurlDB.stackSize();
        
--- a/source/de/anomic/plasma/dbImport/plasmaDbImporter.java
+++ b/source/de/anomic/plasma/dbImport/plasmaDbImporter.java
@ -31,7 +31,7 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
    }
    
    public String getJobName() {
-        return this.importPath.toString();
+        return this.importPrimaryPath.toString();
    }

    public String getStatus() {
@ -46,25 +46,33 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
        return theStatus.toString();
    }
    
-    public void init(File theImportPath, int theCacheSize, long preloadTime) {
-        super.init(theImportPath);
+    public void init(File plasmaPath, File thePrimaryPath, File theSecondaryPath, int theCacheSize, long preloadTime) {
+        super.init(thePrimaryPath, theSecondaryPath);

        this.cacheSize = theCacheSize;
        if (this.cacheSize < 2*1024*1024) this.cacheSize = 8*1024*1024;
        
        // configure import DB
        String errorMsg = null;
-        if (!this.importPath.exists()) errorMsg = "Import directory does not exist.";
-        if (!this.importPath.canRead()) errorMsg = "Import directory is not readable.";
-        if (!this.importPath.canWrite()) errorMsg = "Import directory is not writeable";
-        if (!this.importPath.isDirectory()) errorMsg = "ImportDirectory is not a directory.";
+        if (!this.importPrimaryPath.exists()) errorMsg = "Primary Import directory does not exist.";
+        if (!this.importPrimaryPath.canRead()) errorMsg = "Primary Import directory is not readable.";
+        if (!this.importPrimaryPath.canWrite()) errorMsg = "Primary Import directory is not writeable";
+        if (!this.importPrimaryPath.isDirectory()) errorMsg = "Primary Import directory is not a directory.";
        if (errorMsg != null) {
-            this.log.logSevere(errorMsg + "\nName: " + this.importPath.getAbsolutePath());
+            this.log.logSevere(errorMsg + "\nName: " + this.importPrimaryPath.getAbsolutePath());
            throw new IllegalArgumentException(errorMsg);
-        }         
+        }
+        if (!this.importSecondaryPath.exists()) errorMsg = "Secondary Import directory does not exist.";
+        if (!this.importSecondaryPath.canRead()) errorMsg = "Secondary Import directory is not readable.";
+        if (!this.importSecondaryPath.canWrite()) errorMsg = "Secondary Import directory is not writeable";
+        if (!this.importSecondaryPath.isDirectory()) errorMsg = "Secondary Import directory is not a directory.";
+        if (errorMsg != null) {
+            this.log.logSevere(errorMsg + "\nName: " + this.importSecondaryPath.getAbsolutePath());
+            throw new IllegalArgumentException(errorMsg);
+        }
        
        this.log.logFine("Initializing source word index db.");
-        this.importWordIndex = new plasmaWordIndex(this.importPath, preloadTime / 2, this.log);
+        this.importWordIndex = new plasmaWordIndex(this.importPrimaryPath, importSecondaryPath, preloadTime / 2, this.log);

        this.importStartSize = this.importWordIndex.size();
    }
@ -93,8 +101,8 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
    public void importWordsDB() {
        this.log.logInfo("STARTING DB-IMPORT");  
        
-        try {                                                
-            this.log.logInfo("Importing DB from '" + this.importPath.getAbsolutePath() + "'");
+        try {
+            this.log.logInfo("Importing DB from '" + this.importPrimaryPath.getAbsolutePath() + "'/'" + this.importSecondaryPath.getAbsolutePath() + "'");
            this.log.logInfo("Home word index contains " + wi.size() + " words and " + wi.loadedURL.size() + " URLs.");
            this.log.logInfo("Import word index contains " + this.importWordIndex.size() + " words and " + this.importWordIndex.loadedURL.size() + " URLs.");                        
            
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@ -201,7 +201,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
    // storage management
    public  File                        htCachePath;
    private File                        plasmaPath;
-    public  File                        indexPath;
+    public  File                        indexPrimaryPath, indexSecondaryPath;
    public  File                        listsPath;
    public  File                        htDocsPath;
    public  File                        rankingPath;
@ -728,7 +728,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
     * <p>Name of the setting specifying the folder beginning from the YaCy-installation's top-folder, where the
     * whole database of known RWIs and URLs as well as dumps of the DHT-In and DHT-Out caches are stored</p>
     */
-    public static final String INDEX_PATH               = "indexPath";
+    public static final String INDEX_PRIMARY_PATH       = "indexPrimaryPath"; // this is a relative path to the data root
+    public static final String INDEX_SECONDARY_PATH     = "indexSecondaryPath"; // this is an absolute path to any location
    public static final String INDEX_PATH_DEFAULT       = "DATA/INDEX";
    /**
     * <p><code>public static final String <strong>LISTS_PATH</strong> = "listsPath"</code></p>
@ -868,8 +869,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
        // load values from configs
        this.plasmaPath   = new File(rootPath, getConfig(DBPATH, DBPATH_DEFAULT));
        this.log.logConfig("Plasma DB Path: " + this.plasmaPath.toString());
-        this.indexPath = new File(rootPath, getConfig(INDEX_PATH, INDEX_PATH_DEFAULT));
-        this.log.logConfig("Index Path: " + this.indexPath.toString());
+        this.indexPrimaryPath = new File(rootPath, getConfig(INDEX_PRIMARY_PATH, INDEX_PATH_DEFAULT));
+        this.log.logConfig("Index Primary Path: " + this.indexPrimaryPath.toString());
+        this.indexSecondaryPath = (getConfig(INDEX_SECONDARY_PATH, "").length() == 0) ? indexPrimaryPath : new File(getConfig(INDEX_SECONDARY_PATH, ""));
+        this.log.logConfig("Index Secondary Path: " + this.indexSecondaryPath.toString());
        this.listsPath      = new File(rootPath, getConfig(LISTS_PATH, LISTS_PATH_DEFAULT));
        this.log.logConfig("Lists Path:     " + this.listsPath.toString());
        this.htDocsPath   = new File(rootPath, getConfig(HTDOCS_PATH, HTDOCS_PATH_DEFAULT));
@ -1040,7 +1043,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
        noticeURL = new plasmaCrawlNURL(plasmaPath);
        errorURL = new plasmaCrawlZURL(plasmaPath, "urlError.db");
        delegatedURL = new plasmaCrawlZURL(plasmaPath, "urlDelegated.db");
-        wordIndex = new plasmaWordIndex(indexPath, ramRWI_time, log);
+        wordIndex = new plasmaWordIndex(indexPrimaryPath, indexSecondaryPath, ramRWI_time, log);
        
        // set a high maximum cache size to current size; this is adopted later automatically
        int wordCacheMaxCount = Math.max((int) getConfigLong(WORDCACHE_INIT_COUNT, 30000),
--- a/source/de/anomic/plasma/plasmaWordIndex.java
+++ b/source/de/anomic/plasma/plasmaWordIndex.java
@ -70,19 +70,19 @@ public final class plasmaWordIndex implements indexRI {
    private       int                flushsize;
    public  final plasmaCrawlLURL    loadedURL;
    
-    public plasmaWordIndex(File indexRoot, long preloadTime, serverLog log) {
-        File textindexcache = new File(indexRoot, "PUBLIC/TEXT/RICACHE");
+    public plasmaWordIndex(File indexPrimaryRoot, File indexSecondaryRoot, long preloadTime, serverLog log) {
+        File textindexcache = new File(indexPrimaryRoot, "PUBLIC/TEXT/RICACHE");
        if (!(textindexcache.exists())) textindexcache.mkdirs();
        this.dhtOutCache = new indexRAMRI(textindexcache, indexRWIEntry.urlEntryRow, wCacheMaxChunk, wCacheMaxAge, "dump1.array", log);
        this.dhtInCache  = new indexRAMRI(textindexcache, indexRWIEntry.urlEntryRow, wCacheMaxChunk, wCacheMaxAge, "dump2.array", log);
        
        // create collections storage path
-        File textindexcollections = new File(indexRoot, "PUBLIC/TEXT/RICOLLECTION");
+        File textindexcollections = new File(indexPrimaryRoot, "PUBLIC/TEXT/RICOLLECTION");
        if (!(textindexcollections.exists())) textindexcollections.mkdirs();
        this.collections = new indexCollectionRI(textindexcollections, "collection", preloadTime, maxCollectionPartition, indexRWIEntry.urlEntryRow);

        // create LURL-db
-        loadedURL = new plasmaCrawlLURL(indexRoot, preloadTime);
+        loadedURL = new plasmaCrawlLURL(indexSecondaryRoot, preloadTime);
        
        // performance settings
        busyCacheFlush = false;
--- a/source/yacy.java
+++ b/source/yacy.java
@ -616,14 +616,15 @@ public final class yacy {
    public static void minimizeUrlDB(String homePath) {
        // run with "java -classpath classes yacy -minimizeUrlDB"
        try {serverLog.configureLogging(new File(homePath, "DATA/LOG/yacy.logging"));} catch (Exception e) {}
-        File indexRoot = new File(new File(homePath), "DATA/INDEX");
+        File indexPrimaryRoot = new File(new File(homePath), "DATA/INDEX");
+        File indexSecondaryRoot = new File(new File(homePath), "DATA/INDEX");
        File indexRoot2 = new File(new File(homePath), "DATA/INDEX2");
        serverLog log = new serverLog("URL-CLEANUP");
        try {
            log.logInfo("STARTING URL CLEANUP");
            
            // db containing all currently loades urls
-            plasmaCrawlLURL currentUrlDB = new plasmaCrawlLURL(indexRoot, 10000);
+            plasmaCrawlLURL currentUrlDB = new plasmaCrawlLURL(indexSecondaryRoot, 10000);
            
            // db used to hold all neede urls
            plasmaCrawlLURL minimizedUrlDB = new plasmaCrawlLURL(indexRoot2, 10000);
@ -632,7 +633,7 @@ public final class yacy {
            int cacheMem = (int)(serverMemory.max-rt.totalMemory());
            if (cacheMem < 2048000) throw new OutOfMemoryError("Not enough memory available to start clean up.");
                
-            plasmaWordIndex wordIndex = new plasmaWordIndex(indexRoot, 10000, log);
+            plasmaWordIndex wordIndex = new plasmaWordIndex(indexPrimaryRoot, indexSecondaryRoot, 10000, log);
            Iterator indexContainerIterator = wordIndex.wordContainers("AAAAAAAAAAAA", false, false);
            
            long urlCounter = 0, wordCounter = 0;
@ -1000,7 +1001,8 @@ public final class yacy {
    private static void RWIHashList(String homePath, String targetName, String resource, String format) {
        plasmaWordIndex WordIndex = null;
        serverLog log = new serverLog("HASHLIST");
-        File indexRoot = new File(new File(homePath), "DATA/INDEX");
+        File indexPrimaryRoot = new File(new File(homePath), "DATA/INDEX");
+        File indexSecondaryRoot = new File(new File(homePath), "DATA/INDEX");
        String wordChunkStartHash = "AAAAAAAAAAAA";
        try {serverLog.configureLogging(new File(homePath, "DATA/LOG/yacy.logging"));} catch (Exception e) {}
        log.logInfo("STARTING CREATION OF RWI-HASHLIST");
@ -1008,7 +1010,7 @@ public final class yacy {
        try {
            Iterator indexContainerIterator = null;
            if (resource.equals("all")) {
-                WordIndex = new plasmaWordIndex(indexRoot, 3000, log);
+                WordIndex = new plasmaWordIndex(indexPrimaryRoot, indexSecondaryRoot, 3000, log);
                indexContainerIterator = WordIndex.wordContainers(wordChunkStartHash, false, false);
            }
            int counter = 0;
--- a/yacy.init
+++ b/yacy.init
@ -198,7 +198,10 @@ promoteSearchPageGreeting =
 dbPath=DATA/PLASMADB

 # the path to the public reverse word index for text files (web pages)
-indexPath=DATA/INDEX
+# the primary path is relative to the data root, the secondary path is an absolute path
+# when the secondary path should be equal to the primary, it must be declared empty
+indexPrimaryPath=DATA/INDEX
+indexSecondaryPath=

 # the path to the LISTS files. Most lists are used to filter web content
 listsPath=DATA/LISTS