Integrated new indexing data structure 'collections' into the main class

for indexing, the plasmaWordIndex. The new data structure is ready to use, but currently disabled. It can be activated by setting the static plasmaWordIndex.useCollectionIndex to true. This shall be done for testing purposes. The new index is stored in DATA/INDEX/PUBLIC/TEXT. The directory PLASMA shall be used only for the crawler in the future. Attention: during testing the data structure in INDEX may change, and indexes created with the new data structure may become useless. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2348 6c8d7289-2bf4-0310-a012-ef5d649a1542
19 years ago · 279b1d969d
parent 4ff742e42d
commit 279b1d969d
13 changed files with 161 additions and 61 deletions
--- a/htroot/IndexImport_p.java
+++ b/htroot/IndexImport_p.java
@ -97,7 +97,7 @@ public final class IndexImport_p {
                    if (startImport) {
                        dbImporter importerThread = switchboard.dbImportManager.getNewImporter(importType);
                        if (importerThread != null) {
-                            importerThread.init(new File(importPath), cacheSize, 100);
+                            importerThread.init(new File(importPath), switchboard.indexPublicTextPath, cacheSize, 100);
                            importerThread.startIt();                            
                        }
                        prop.put("LOCATION","");
--- a/source/de/anomic/index/indexCollectionRI.java
+++ b/source/de/anomic/index/indexCollectionRI.java
@ -38,18 +38,27 @@ import de.anomic.kelondro.kelondroOutOfLimitsException;
 import de.anomic.kelondro.kelondroRow;
 import de.anomic.kelondro.kelondroRowCollection;
 import de.anomic.kelondro.kelondroRowSet;
+import de.anomic.server.logging.serverLog;

 public class indexCollectionRI extends indexAbstractRI implements indexRI {

    kelondroCollectionIndex collectionIndex;
    
-    public indexCollectionRI(File path, String filenameStub, long buffersize, long preloadTime) throws IOException {
-        kelondroRow rowdef = new kelondroRow(new int[]{});
-        
-        collectionIndex = new kelondroCollectionIndex(
-                path, filenameStub, 9 /*keyLength*/,
-                kelondroNaturalOrder.naturalOrder, buffersize, preloadTime,
-                4 /*loadfactor*/, rowdef);
+    public indexCollectionRI(File path, String filenameStub, long buffersize, long preloadTime) {
+        kelondroRow rowdef = indexURLEntry.urlEntryRow;
+        try {
+            collectionIndex = new kelondroCollectionIndex(
+                    path,
+                    filenameStub,
+                    12 /*keyLength*/,
+                    kelondroNaturalOrder.naturalOrder,
+                    buffersize,
+                    preloadTime,
+                    4 /*loadfactor*/,
+                    rowdef);
+        } catch (IOException e) {
+            serverLog.logSevere("PLASMA", "unable to open collection index at " + path.toString() + ":" + e.getMessage());
+        }
    }
    
    public int size() {
@ -133,7 +142,7 @@ public class indexCollectionRI extends indexAbstractRI implements indexRI {
        String wordHash = newEntries.getWordHash();
        try {
            collectionIndex.merge(wordHash.getBytes(), (kelondroRowCollection) newEntries);
-            return getContainer(wordHash, true, -1); // FIXME: this is not optimal
+            return null; // merge does allways 'eat' up all entries unlike the assortments; they may return an overflow container
        } catch (kelondroOutOfLimitsException e) {
            e.printStackTrace();
            return null;
--- a/source/de/anomic/kelondro/kelondroCollectionIndex.java
+++ b/source/de/anomic/kelondro/kelondroCollectionIndex.java
@ -55,7 +55,7 @@ public class kelondroCollectionIndex {
            "int chunksize-4 {b256}," +
            "int chunkcount-4 {b256}," +
            "int indexpos-4 {b256}," +
-            "short lastread-2 {b256}" +
+            "short lastread-2 {b256}, " +
            "short lastwrote-2 {b256}"
            );
    }
@ -157,7 +157,7 @@ public class kelondroCollectionIndex {
    private int putmergeremove(byte[] key, kelondroRowCollection collection, boolean merge, Set removekeys, boolean deletecomplete) throws IOException, kelondroOutOfLimitsException {
        //if (collection.size() > maxChunks) throw new kelondroOutOfLimitsException(maxChunks, collection.size());

-        if ((!merge) && (collection.size() == 0)) {
+        if ((!merge) && (removekeys != null) && (collection != null) && (collection.size() == 0)) {
            // this is not a replacement, it is a deletion
            delete(key);
            return 0;
--- a/source/de/anomic/plasma/dbImport/AbstractImporter.java
+++ b/source/de/anomic/plasma/dbImport/AbstractImporter.java
@ -14,7 +14,7 @@ public abstract class AbstractImporter extends Thread implements dbImporter{
    protected boolean paused = false;
    
    protected plasmaSwitchboard sb;
-    protected File importPath;
+    protected File importPath, indexPath;
    protected int cacheSize;
    protected long preloadTime;
    
@ -33,9 +33,10 @@ public abstract class AbstractImporter extends Thread implements dbImporter{
        return this.error;
    }    
    
-    public void init(File theImportPath) {
+    public void init(File theImportPath, File theIndexPath) {
        if (theImportPath == null) throw new NullPointerException("The Import path must not be null.");
-        this.importPath = theImportPath;      
+        this.importPath = theImportPath;
+        this.indexPath = theIndexPath;
        
        // getting a job id from the import manager
        this.jobID = this.sb.dbImportManager.getJobID();
--- a/source/de/anomic/plasma/dbImport/AssortmentImporter.java
+++ b/source/de/anomic/plasma/dbImport/AssortmentImporter.java
@ -21,8 +21,8 @@ public class AssortmentImporter extends AbstractImporter implements dbImporter{
        this.jobType = "ASSORTMENT";
    }
    
-    public void init(File theImportAssortmentFile, int theCacheSize, long preloadTime) {
-        super.init(theImportAssortmentFile);
+    public void init(File theImportAssortmentFile, File theIndexFile, int theCacheSize, long preloadTime) {
+        super.init(theImportAssortmentFile, theIndexFile);
        this.importAssortmentFile = theImportAssortmentFile;
        this.cacheSize = theCacheSize;
        if (this.cacheSize < 2*1024*1024) this.cacheSize = 2*1024*1024;
--- a/source/de/anomic/plasma/dbImport/dbImporter.java
+++ b/source/de/anomic/plasma/dbImport/dbImporter.java
@ -24,6 +24,6 @@ public interface dbImporter {
    public String getError();
    public String getStatus();
    
-    public void init(File importPath, int cacheSize, long preloadTime);
+    public void init(File importPath, File indexPath, int cacheSize, long preloadTime);
    public void startIt();    
 }
--- a/source/de/anomic/plasma/dbImport/plasmaCrawlNURLImporter.java
+++ b/source/de/anomic/plasma/dbImport/plasmaCrawlNURLImporter.java
@ -45,8 +45,8 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor
        return theStatus.toString();
    }

-    public void init(File theImportPath, int theCacheSize, long preloadTime) {
-        super.init(theImportPath);
+    public void init(File theImportPath, File theIndexPath, int theCacheSize, long preloadTime) {
+        super.init(theImportPath, theIndexPath);
        this.cacheSize = theCacheSize;
        this.preloadTime = preloadTime;
        
--- a/source/de/anomic/plasma/dbImport/plasmaDbImporter.java
+++ b/source/de/anomic/plasma/dbImport/plasmaDbImporter.java
@ -51,9 +51,9 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
        return theStatus.toString();
    }
    
-    public void init(File theImportPath, int theCacheSize, long preloadTime) {
-        super.init(theImportPath);
-            
+    public void init(File theImportPath, File theIndexPath, int theCacheSize, long preloadTime) {
+        super.init(theImportPath, theIndexPath);
+
        this.homeWordIndex = this.sb.wordIndex;
        this.homeUrlDB = this.sb.urlPool.loadedURL;
        this.cacheSize = theCacheSize;
@ -75,7 +75,7 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
        }         
        
        this.log.logFine("Initializing source word index db.");
-        this.importWordIndex = new plasmaWordIndex(this.importPath, (this.cacheSize/2)/1024, preloadTime / 2, this.log);
+        this.importWordIndex = new plasmaWordIndex(this.importPath, this.indexPath, (this.cacheSize/2)/1024, preloadTime / 2, this.log);
        this.log.logFine("Initializing import URL db.");
        this.importUrlDB = new plasmaCrawlLURL(new File(this.importPath, "urlHash.db"), (this.cacheSize/2)/1024, preloadTime / 2);
        this.importStartSize = this.importWordIndex.size();
--- a/source/de/anomic/plasma/plasmaDHTChunk.java
+++ b/source/de/anomic/plasma/plasmaDHTChunk.java
@ -243,7 +243,7 @@ public class plasmaDHTChunk {
            }
            // create result
            indexContainers = (indexContainer[]) tmpContainers.toArray(new indexContainer[tmpContainers.size()]);
-
+//[C[16GwGuFzwffp] has 1 entries, C[16hGKMAl0w97] has 9 entries, C[17A8cDPF6SfG] has 9 entries, C[17Kdj__WWnUy] has 1 entries, C[1
            if ((indexContainers == null) || (indexContainers.length == 0)) {
                log.logFine("No index available for index transfer, hash start-point " + startPointHash);
                this.status = chunkStatus_FAILED;
@ -269,7 +269,7 @@ public class plasmaDHTChunk {
    }
    
    
-    public int deleteTransferIndexes() {
+    public synchronized int deleteTransferIndexes() {
        Iterator urlIter;
        indexEntry iEntry;
        HashSet urlHashes;
@ -277,6 +277,10 @@ public class plasmaDHTChunk {
        
        for (int i = 0; i < this.indexContainers.length; i++) {
            // delete entries separately
+            if (this.indexContainers[i] == null) {
+                log.logFine("Deletion of partial index #" + i + " not possible, entry is null");
+                continue;
+            }
            int c = this.indexContainers[i].size();
            urlHashes = new HashSet(this.indexContainers[i].size());
            urlIter = this.indexContainers[i].entries();
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@ -180,6 +180,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
    // storage management
    public  File                        htCachePath;
    private File                        plasmaPath;
+    public  File                        indexPublicTextPath;
    public  File                        listsPath;
    public  File                        htDocsPath;
    public  File                        rankingPath;
@ -260,6 +261,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
        // load values from configs
        this.plasmaPath   = new File(rootPath, getConfig("dbPath", "DATA/PLASMADB"));
        this.log.logConfig("Plasma DB Path: " + this.plasmaPath.toString());
+        this.indexPublicTextPath = new File(rootPath, getConfig("indexPublicTextPath", "DATA/INDEX/PUBLIC/TEXT"));
+        this.log.logConfig("Index Path: " + this.indexPublicTextPath.toString());
        this.listsPath      = new File(rootPath, getConfig("listsPath", "DATA/LISTS"));
        this.log.logConfig("Lists Path:     " + this.listsPath.toString());
        this.htDocsPath   = new File(rootPath, getConfig("htDocsPath", "DATA/HTDOCS"));
@ -386,7 +389,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
        log.logConfig("Starting Indexing Management");
        urlPool = new plasmaURLPool(plasmaPath, ramLURL, ramNURL, ramEURL, ramLURL_time);
        
-        wordIndex = new plasmaWordIndex(plasmaPath, ramRWI, ramRWI_time, log);
+        wordIndex = new plasmaWordIndex(plasmaPath, indexPublicTextPath, ramRWI, ramRWI_time, log);
        int wordCacheMaxCount = (int) getConfigLong("wordCacheMaxCount", 10000);
        wordIndex.setMaxWordCount(wordCacheMaxCount);
        
--- a/source/de/anomic/plasma/plasmaWordIndex.java
+++ b/source/de/anomic/plasma/plasmaWordIndex.java
@ -58,6 +58,7 @@ import java.util.TreeSet;
 import de.anomic.net.URL;

 import de.anomic.htmlFilter.htmlFilterContentScraper;
+import de.anomic.index.indexCollectionRI;
 import de.anomic.index.indexContainer;
 import de.anomic.index.indexContainerOrder;
 import de.anomic.index.indexEntry;
@ -78,28 +79,37 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI {

    private static final String indexAssortmentClusterPath = "ACLUSTER";
    private static final int assortmentCount = 64;
+    private static final boolean useCollectionIndex = false;
    
-    private final File databaseRoot;
-    private final indexRAMCacheRI ramCache;
-    private final plasmaWordIndexAssortmentCluster assortmentCluster;
-    private int assortmentBufferSize; //kb
-    private final plasmaWordIndexFileCluster backend;    
-    private final kelondroOrder indexOrder = new kelondroNaturalOrder(true);
+    private final File                             oldDatabaseRoot;
+    private final kelondroOrder                    indexOrder = new kelondroNaturalOrder(true);
+    private final indexRAMCacheRI                  ramCache;
+    private final indexCollectionRI                collections;          // new database structure to replace AssortmentCluster and FileCluster
+    private int                                    assortmentBufferSize; // kb
+    private final plasmaWordIndexAssortmentCluster assortmentCluster;    // old database structure, to be replaced by CollectionRI
+    private final plasmaWordIndexFileCluster       backend;              // old database structure, to be replaced by CollectionRI
    
-    public plasmaWordIndex(File databaseRoot, int bufferkb, long preloadTime, serverLog log) {
-        this.databaseRoot = databaseRoot;
-        this.backend = new plasmaWordIndexFileCluster(databaseRoot, log);
-        this.ramCache = new indexRAMCacheRI(databaseRoot, log);
+    public plasmaWordIndex(File oldDatabaseRoot, File newIndexRoot, int bufferkb, long preloadTime, serverLog log) {
+        this.oldDatabaseRoot = oldDatabaseRoot;
+        this.backend = new plasmaWordIndexFileCluster(oldDatabaseRoot, log);
+        this.ramCache = new indexRAMCacheRI(oldDatabaseRoot, log);

-        // create new assortment cluster path
-        File assortmentClusterPath = new File(databaseRoot, indexAssortmentClusterPath);
+        // create assortment cluster path
+        File assortmentClusterPath = new File(oldDatabaseRoot, indexAssortmentClusterPath);
        if (!(assortmentClusterPath.exists())) assortmentClusterPath.mkdirs();
        this.assortmentBufferSize = bufferkb;
        this.assortmentCluster = new plasmaWordIndexAssortmentCluster(assortmentClusterPath, assortmentCount, assortmentBufferSize, preloadTime, log);
+        
+        // create collections storage path
+        if (!(newIndexRoot.exists())) newIndexRoot.mkdirs();
+        if (useCollectionIndex)
+            collections = new indexCollectionRI(newIndexRoot, "test_generation0", bufferkb * 1024, preloadTime);
+        else
+            collections = null;
    }

    public File getRoot() {
-        return databaseRoot;
+        return oldDatabaseRoot;
    }

    public int maxURLinWCache() {
@ -203,9 +213,16 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI {
    private synchronized void flushCache(String wordHash) {
        indexContainer c = ramCache.deleteContainer(wordHash);
        if (c != null) {
-            indexContainer feedback = assortmentCluster.addEntries(c, c.updated(), false);
-            if (feedback != null) {
-                backend.addEntries(feedback, System.currentTimeMillis(), true);
+            if (useCollectionIndex) {
+                indexContainer feedback = collections.addEntries(c, c.updated(), false);
+                if (feedback != null) {
+                    throw new RuntimeException("indexCollectionRI shall not return feedback entries; feedback = " + feedback.toString());
+                }
+            } else {
+                indexContainer feedback = assortmentCluster.addEntries(c, c.updated(), false);
+                if (feedback != null) {
+                    backend.addEntries(feedback, System.currentTimeMillis(), true);
+                }
            }
        }
    }
@ -292,15 +309,25 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI {
        // get from cache
        indexContainer container = ramCache.getContainer(wordHash, true, -1);

+        // We must not use the container from cache to store everything we find,
+        // as that container remains linked to in the cache and might be changed later
+        // while the returned container is still in use.
+        // create a clone from the container
+        if (container != null) container = container.topLevelClone();
+        
+        // get from collection index
+        if (useCollectionIndex) {
+            if (container == null) {
+                container = collections.getContainer(wordHash, true, (maxTime < 0) ? -1 : maxTime);
+            } else {
+                container.add(collections.getContainer(wordHash, true, (maxTime < 0) ? -1 : maxTime), -1);
+            }
+        }
+        
        // get from assortments
        if (container == null) {
            container = assortmentCluster.getContainer(wordHash, true, (maxTime < 0) ? -1 : maxTime);
        } else {
-            // We must not use the container from cache to store everything we find,
-            // as that container remains linked to in the cache and might be changed later
-            // while the returned container is still in use.
-            // create a clone from the container
-            container = container.topLevelClone();
            // add containers from assortment cluster
            container.add(assortmentCluster.getContainer(wordHash, true, (maxTime < 0) ? -1 : maxTime), -1);
        }
@ -357,6 +384,7 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI {
                entity.close();
            }
        } catch (IOException e) {}
+        if (useCollectionIndex) size += collections.size();
        size += assortmentCluster.indexSize(wordHash);
        size += ramCache.indexSize(wordHash);
        return size;
@ -364,6 +392,7 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI {

    public synchronized void close(int waitingBoundSeconds) {
        ramCache.close(waitingBoundSeconds);
+        if (useCollectionIndex) collections.close(-1);
        assortmentCluster.close(-1);
        backend.close(10);
    }
@ -371,7 +400,8 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI {
    public synchronized indexContainer deleteContainer(String wordHash) {
        indexContainer c = ramCache.deleteContainer(wordHash);
        if (c == null) c = new indexRowSetContainer(wordHash);
-        c.add(assortmentCluster.deleteContainer(wordHash, -1), -1);
+        if (useCollectionIndex) c.add(collections.deleteContainer(wordHash), -1);
+        c.add(assortmentCluster.deleteContainer(wordHash), -1);
        c.add(backend.deleteContainer(wordHash), -1);
        return c;
    }
@ -379,6 +409,7 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI {
    public boolean removeEntry(String wordHash, String urlHash, boolean deleteComplete) {
        synchronized (this) {
            if (ramCache.removeEntry(wordHash, urlHash, deleteComplete)) return true;
+            if (useCollectionIndex) {if (collections.removeEntry(wordHash, urlHash, deleteComplete)) return true;}
            if (assortmentCluster.removeEntry(wordHash, urlHash, deleteComplete)) return true;
            return backend.removeEntry(wordHash, urlHash, deleteComplete);
        }
@ -389,6 +420,10 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI {
        synchronized (this) {
            removed += ramCache.removeEntries(wordHash, urlHashes, deleteComplete);
            if (removed == urlHashes.size()) return removed;
+            if (useCollectionIndex) {
+                removed += collections.removeEntries(wordHash, urlHashes, deleteComplete);
+                if (removed == urlHashes.size()) return removed;
+            }
            removed += assortmentCluster.removeEntries(wordHash, urlHashes, deleteComplete);
            if (removed == urlHashes.size()) return removed;
            removed += backend.removeEntries(wordHash, urlHashes, deleteComplete);
@ -405,9 +440,9 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI {
    }
    
    public static final int RL_RAMCACHE    = 0;
-    public static final int RL_COLLECTIONS = 1; // the 'new' index structure
-    public static final int RL_ASSORTMENTS = 2;
-    public static final int RL_WORDFILES   = 3;
+    public static final int RL_COLLECTIONS = 1; // the new index structure
+    public static final int RL_ASSORTMENTS = 2; // (to be) outdated structure
+    public static final int RL_WORDFILES   = 3; // (to be) outdated structure
    

    public synchronized TreeSet indexContainerSet(String startHash, int resourceLevel, boolean rot, int count) throws IOException {
@ -446,16 +481,56 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI {
        if (resourceLevel == plasmaWordIndex.RL_RAMCACHE) {
            return ramCache.wordContainers(startWordHash, false);
        }
-        if (resourceLevel == plasmaWordIndex.RL_ASSORTMENTS) {
+        if ((resourceLevel == plasmaWordIndex.RL_COLLECTIONS) && (useCollectionIndex)) {
            return new kelondroMergeIterator(
+                            ramCache.wordContainers(startWordHash, false),
+                            collections.wordContainers(startWordHash, false),
+                            new indexContainerOrder(kelondroNaturalOrder.naturalOrder),
+                            indexRowSetContainer.containerMergeMethod,
+                            true);
+        }
+        if (resourceLevel == plasmaWordIndex.RL_ASSORTMENTS) {
+            if (useCollectionIndex) {
+                return new kelondroMergeIterator(
+                        new kelondroMergeIterator(
+                                 ramCache.wordContainers(startWordHash, false),
+                                 collections.wordContainers(startWordHash, false),
+                                 new indexContainerOrder(kelondroNaturalOrder.naturalOrder),
+                                 indexRowSetContainer.containerMergeMethod,
+                                 true),
+                        assortmentCluster.wordContainers(startWordHash, true, false),
+                        new indexContainerOrder(kelondroNaturalOrder.naturalOrder),
+                        indexRowSetContainer.containerMergeMethod,
+                        true);
+            } else {
+                return new kelondroMergeIterator(
                            ramCache.wordContainers(startWordHash, false),
                            assortmentCluster.wordContainers(startWordHash, true, false),
                            new indexContainerOrder(kelondroNaturalOrder.naturalOrder),
                            indexRowSetContainer.containerMergeMethod,
                            true);
+            }
        }
        if (resourceLevel == plasmaWordIndex.RL_WORDFILES) {
-            return new kelondroMergeIterator(
+            if (useCollectionIndex) {
+                return new kelondroMergeIterator(
+                        new kelondroMergeIterator(
+                         new kelondroMergeIterator(
+                                 ramCache.wordContainers(startWordHash, false),
+                                 collections.wordContainers(startWordHash, false),
+                                 new indexContainerOrder(kelondroNaturalOrder.naturalOrder),
+                                 indexRowSetContainer.containerMergeMethod,
+                                 true),
+                         assortmentCluster.wordContainers(startWordHash, true, false),
+                         new indexContainerOrder(kelondroNaturalOrder.naturalOrder),
+                         indexRowSetContainer.containerMergeMethod,
+                         true),
+                        backend.wordContainers(startWordHash, false),
+                        new indexContainerOrder(kelondroNaturalOrder.naturalOrder),
+                        indexRowSetContainer.containerMergeMethod,
+                        true);
+            } else {
+                return new kelondroMergeIterator(
                            new kelondroMergeIterator(
                                     ramCache.wordContainers(startWordHash, false),
                                     assortmentCluster.wordContainers(startWordHash, true, false),
@ -466,6 +541,7 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI {
                            new indexContainerOrder(kelondroNaturalOrder.naturalOrder),
                            indexRowSetContainer.containerMergeMethod,
                            true);
+            }
        }
        return null;
    }
@ -505,11 +581,11 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI {
    public Object migrateWords2Assortment(String wordhash) throws IOException {
        // returns the number of entries that had been added to the assortments
        // can be negative if some assortments have been moved to the backend
-        File db = plasmaWordIndexFile.wordHash2path(databaseRoot, wordhash);
+        File db = plasmaWordIndexFile.wordHash2path(oldDatabaseRoot, wordhash);
        if (!(db.exists())) return "not available";
        plasmaWordIndexFile entity = null;
        try {
-            entity =  new plasmaWordIndexFile(databaseRoot, wordhash, true);
+            entity =  new plasmaWordIndexFile(oldDatabaseRoot, wordhash, true);
            int size = entity.size();
            if (size > assortmentCluster.clusterCapacity) {
                // this will be too big to integrate it
@ -671,8 +747,9 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI {
    public static void main(String[] args) {
        // System.out.println(kelondroMSetTools.fastStringComparator(true).compare("RwGeoUdyDQ0Y", "rwGeoUdyDQ0Y"));
        // System.out.println(new Date(reverseMicroDateDays(microDateDays(System.currentTimeMillis()))));
-        
-        plasmaWordIndex index = new plasmaWordIndex(new File("D:\\dev\\proxy\\DATA\\PLASMADB"), 555, 1000, new serverLog("TESTAPP"));
+        File plasmadb = new File("D:\\dev\\proxy\\DATA\\PLASMADB");
+        File indexdb = new File("D:\\dev\\proxy\\DATA\\INDEX\\PRIVATE\\TEXT");
+        plasmaWordIndex index = new plasmaWordIndex(plasmadb, indexdb, 555, 1000, new serverLog("TESTAPP"));
        try {
            Iterator containerIter = index.wordContainers("5A8yhZMh_Kmv", plasmaWordIndex.RL_WORDFILES, true);
            while (containerIter.hasNext()) {
--- a/source/yacy.java
+++ b/source/yacy.java
@ -646,9 +646,10 @@ public final class yacy {
        // run with "java -classpath classes yacy -migratewords"
        try {serverLog.configureLogging(new File(homePath, "DATA/LOG/yacy.logging"));} catch (Exception e) {}
        File dbroot = new File(new File(homePath), "DATA/PLASMADB");
+        File indexRoot = new File(new File(homePath), "DATA/INDEX/PUBLIC/TEXT");
        serverLog log = new serverLog("WORDMIGRATION");
        log.logInfo("STARTING MIGRATION");
-        plasmaWordIndex wordIndexCache = new plasmaWordIndex(dbroot, 20000, 10000, log);
+        plasmaWordIndex wordIndexCache = new plasmaWordIndex(dbroot, indexRoot, 20000, 10000, log);
        enumerateFiles words = new enumerateFiles(new File(dbroot, "WORDS"), true, false, true, true);
        String wordhash;
        File wordfile;
@ -686,6 +687,7 @@ public final class yacy {
        // run with "java -classpath classes yacy -minimizeUrlDB"
        try {serverLog.configureLogging(new File(homePath, "DATA/LOG/yacy.logging"));} catch (Exception e) {}
        File dbroot = new File(new File(homePath), "DATA/PLASMADB");
+        File indexRoot = new File(new File(homePath), "DATA/INDEX/PUBLIC/TEXT");
        serverLog log = new serverLog("URL-CLEANUP");
        try {
            log.logInfo("STARTING URL CLEANUP");
@ -702,7 +704,7 @@ public final class yacy {
            int cacheMem = (int)((rt.maxMemory()-rt.totalMemory())/1024)-(2*cache + 8*1024);
            if (cacheMem < 2048) throw new OutOfMemoryError("Not enough memory available to start clean up.");
                
-            plasmaWordIndex wordIndex = new plasmaWordIndex(dbroot, cacheMem, 10000, log);
+            plasmaWordIndex wordIndex = new plasmaWordIndex(dbroot, indexRoot, cacheMem, 10000, log);
            Iterator indexContainerIterator = wordIndex.wordContainers("------------", plasmaWordIndex.RL_WORDFILES, false);
            
            long urlCounter = 0, wordCounter = 0;
@ -1137,6 +1139,7 @@ public final class yacy {
        plasmaWordIndex WordIndex = null;
        serverLog log = new serverLog("HASHLIST");
        File homeDBroot = new File(new File(homePath), "DATA/PLASMADB");
+        File indexRoot = new File(new File(homePath), "DATA/INDEX/PUBLIC/TEXT");
        String wordChunkStartHash = "------------";
        try {serverLog.configureLogging(new File(homePath, "DATA/LOG/yacy.logging"));} catch (Exception e) {}
        log.logInfo("STARTING CREATION OF RWI-HASHLIST");
@ -1144,7 +1147,7 @@ public final class yacy {
        try {
            Iterator indexContainerIterator = null;
            if (resource.equals("all")) {
-                WordIndex = new plasmaWordIndex(homeDBroot, 8*1024*1024, 3000, log);
+                WordIndex = new plasmaWordIndex(homeDBroot, indexRoot, 8*1024*1024, 3000, log);
                indexContainerIterator = WordIndex.wordContainers(wordChunkStartHash, plasmaWordIndex.RL_WORDFILES, false);
            } else if (resource.equals("assortments")) {
                plasmaWordIndexAssortmentCluster assortmentCluster = new plasmaWordIndexAssortmentCluster(new File(homeDBroot, "ACLUSTER"), 64, 16*1024*1024, 3000, log);
--- a/yacy.init
+++ b/yacy.init
@ -185,9 +185,12 @@ parseableExt=html,htm,txt,php,shtml,asp,aspx,jsp
 # other peer users
 promoteSearchPageGreeting =

-# the path to the PLASMA database, especially the reverse word index
+# the path to the PLASMA database of the web spider
 dbPath=DATA/PLASMADB

+# the path to the public reverse word index for text files (web pages)
+indexPublicTextPath=DATA/INDEX/PUBLIC/TEXT
+
 # the path to the LISTS files. Most lists are used to filter web content
 listsPath=DATA/LISTS