added property index.storeCommons to switch commons storage on or off

With index.storeCommons=false, all currently stored commons are deleted! The default is now 'true', but in future full releases it will be switched to 'false'. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5315 6c8d7289-2bf4-0310-a012-ef5d649a1542
17 years ago · 22989d0d8a
parent 4b4ce75396
commit 22989d0d8a
9 changed files with 70 additions and 48 deletions
--- a/defaults/yacy.init
+++ b/defaults/yacy.init
@ -260,6 +260,13 @@ dbPath=DATA/PLASMADB
 indexPrimaryPath=DATA/INDEX
 indexSecondaryPath=
 # the commons are words that appear in the index more than 64k times in references.
 # Since indexes with such size cannot be handled efficiently, they are sorted in such a way that references with high ranking
 # are stored back into the index, and references with bad ranking are sorted out. Such sorted-out references can be stored
 # for later use (but there is no use for them at this time). If the sorted-out references should be stored, the following property should be
 # set to true. If set to false, they are abandoned (deleted), and previously stored commons are removed.
 index.storeCommons=true
 # the path to the LISTS files. Most lists are used to filter web content
 listsPath=DATA/LISTS
--- a/source/de/anomic/index/indexCollectionRI.java
+++ b/source/de/anomic/index/indexCollectionRI.java
@ -44,7 +44,7 @@ public class indexCollectionRI implements indexRI {
    kelondroCollectionIndex collectionIndex;
-    public indexCollectionRI(final File path, final String filenameStub, final int maxpartition, final kelondroRow payloadrow) {
+    public indexCollectionRI(final File path, final String filenameStub, final int maxpartition, final kelondroRow payloadrow, boolean useCommons) {
        try {
            collectionIndex = new kelondroCollectionIndex(
                    path,
@ -53,7 +53,8 @@ public class indexCollectionRI implements indexRI {
                    kelondroBase64Order.enhancedCoder,
                    4 /*loadfactor*/,
                    maxpartition,
-                    payloadrow);
+                    payloadrow,
                    useCommons);
        } catch (final IOException e) {
            serverLog.logSevere("PLASMA", "unable to open collection index at " + path.toString() + ":" + e.getMessage());
        }
--- a/source/de/anomic/kelondro/kelondroCollectionIndex.java
+++ b/source/de/anomic/kelondro/kelondroCollectionIndex.java
@ -59,7 +59,7 @@ public class kelondroCollectionIndex {
    private final int           keylength;
    private final File          path;
    private final String        filenameStub;
-    private final File          commonsPath;
+    private final File          commonsPath1;
    private final int           loadfactor;
    private Map<String, kelondroFixedWidthArray> arrays; // Map of (partitionNumber"-"chunksize)/kelondroFixedWidthArray - Objects
    private final kelondroRow   payloadrow; // definition of the payload (chunks inside the collections)
@ -113,7 +113,7 @@ public class kelondroCollectionIndex {
    }
    public kelondroCollectionIndex(final File path, final String filenameStub, final int keyLength, final kelondroByteOrder indexOrder,
-                                   final int loadfactor, final int maxpartitions, final kelondroRow rowdef) throws IOException {
+                                   final int loadfactor, final int maxpartitions, final kelondroRow rowdef, boolean useCommons) throws IOException {
        // the buffersize is number of bytes that are only used if the kelondroFlexTable is backed up with a kelondroTree
        indexErrors = 0;
        this.path = path;
@ -122,8 +122,13 @@ public class kelondroCollectionIndex {
        this.payloadrow = rowdef;
        this.loadfactor = loadfactor;
        this.maxPartitions = maxpartitions;
-        this.commonsPath = new File(path, filenameStub + "." + fillZ(Integer.toHexString(rowdef.objectsize).toUpperCase(), 4) + ".commons");
+        File cop = new File(path, filenameStub + "." + fillZ(Integer.toHexString(rowdef.objectsize).toUpperCase(), 4) + ".commons");
-        this.commonsPath.mkdirs();
+        this.commonsPath1 = (useCommons) ? cop : null;
        if (this.commonsPath1 == null) {
            serverFileUtils.deleteDirectory(cop);
        } else {
            this.commonsPath1.mkdirs();
        }
        final File f = new File(path, filenameStub + ".index");
        if (f.exists()) {
@ -640,21 +645,22 @@ public class kelondroCollectionIndex {
        serverLog.logInfo("kelondroCollectionIndex", "shrinked common word " + new String(key) + "; old size = " + oldsize + ", new size = " + collection.size() + ", maximum size = " + targetSize + ", newcommon size = " + newcommon.size() + ", first newcommon = " + firstnewcommon);
        // finally dump the removed entries to a file
-        newcommon.sort();
+        if (commonsPath1 != null) {
-        final SimpleDateFormat formatter = new SimpleDateFormat("yyyyMMddHHmmss");
+            newcommon.sort();
-        formatter.setTimeZone(TimeZone.getTimeZone("GMT"));
+            final SimpleDateFormat formatter = new SimpleDateFormat("yyyyMMddHHmmss");
-        final String filename = serverCodings.encodeHex(kelondroBase64Order.enhancedCoder.decode(new String(key), "de.anomic.kelondro.kelondroCollectionIndex.shrinkCollection(...)")) + "_" + formatter.format(new Date()) + ".collection";
+            formatter.setTimeZone(TimeZone.getTimeZone("GMT"));
-        final File storagePath = new File(commonsPath, filename.substring(0, 2)); // make a subpath
+            final String filename = serverCodings.encodeHex(kelondroBase64Order.enhancedCoder.decode(new String(key), "de.anomic.kelondro.kelondroCollectionIndex.shrinkCollection(...)")) + "_" + formatter.format(new Date()) + ".collection";
-        storagePath.mkdirs();
+            final File storagePath = new File(commonsPath1, filename.substring(0, 2)); // make a subpath
-        final File file = new File(storagePath, filename);
+            storagePath.mkdirs();
-        try {
+            final File file = new File(storagePath, filename);
-            newcommon.saveCollection(file);
+            try {
-            serverLog.logInfo("kelondroCollectionIndex", "dumped common word " + new String(key) + " to " + file.toString() + "; size = " + newcommon.size());
+                newcommon.saveCollection(file);
-        } catch (final IOException e) {
+                serverLog.logInfo("kelondroCollectionIndex", "dumped common word " + new String(key) + " to " + file.toString() + "; size = " + newcommon.size());
-            e.printStackTrace();
+            } catch (final IOException e) {
-            serverLog.logWarning("kelondroCollectionIndex", "failed to dump common word " + new String(key) + " to " + file.toString() + "; size = " + newcommon.size());
+                e.printStackTrace();
-        }
+                serverLog.logWarning("kelondroCollectionIndex", "failed to dump common word " + new String(key) + " to " + file.toString() + "; size = " + newcommon.size());
-        
+            }
        }        
    }
    public synchronized int remove(final byte[] key, final Set<String> removekeys) throws IOException, kelondroOutOfLimitsException {
@ -885,7 +891,7 @@ public class kelondroCollectionIndex {
            final kelondroCollectionIndex collectionIndex  = new kelondroCollectionIndex(
                        path, filenameStub, 9 /*keyLength*/,
                        kelondroNaturalOrder.naturalOrder,
-                        4 /*loadfactor*/, 7, rowdef);
+                        4 /*loadfactor*/, 7, rowdef, false);
            // fill index with values
            kelondroRowSet collection = new kelondroRowSet(rowdef, 0);
--- a/source/de/anomic/plasma/plasmaHTCache.java
+++ b/source/de/anomic/plasma/plasmaHTCache.java
@ -49,6 +49,7 @@ import de.anomic.kelondro.kelondroBLOBBuffer;
 import de.anomic.kelondro.kelondroBLOBHeap;
 import de.anomic.kelondro.kelondroBase64Order;
 import de.anomic.kelondro.kelondroMap;
 import de.anomic.server.serverFileUtils;
 import de.anomic.server.logging.serverLog;
 import de.anomic.yacy.yacySeedDB;
 import de.anomic.yacy.yacyURL;
@ -95,7 +96,7 @@ public final class plasmaHTCache {
                    object.getName().equals("yacy") ||
                    object.getName().equals("https") ||
                    object.getName().equals("ftp")) {
-                    deleteOldHTCache(cachePath);
+                    serverFileUtils.deleteDirectory(cachePath);
                }
            }
        }
@ -140,22 +141,6 @@ public final class plasmaHTCache {
            e.printStackTrace();
        }
    }
    // Recursively removes everything below the given path and then the path itself.
    // Entries are visited from the last listed name to the first; results of
    // File.delete() are not checked, so removal is best-effort.
    private static void deleteOldHTCache(final File directory) {
        final String[] names = directory.list();
        // list() yields null when there is nothing to enumerate; treat as empty
        final int count = (names == null) ? 0 : names.length;
        for (int pos = count - 1; pos >= 0; pos--) {
            final File entry = new File(directory, names[pos]);
            if (entry.isFile()) {
                entry.delete();
                continue;
            }
            // anything that is not a plain file is descended into
            deleteOldHTCache(entry);
        }
        directory.delete();
    }
    public static int responseHeaderDBSize() {
        return responseHeaderDB.size();
--- a/source/de/anomic/plasma/plasmaRankingCRProcess.java
+++ b/source/de/anomic/plasma/plasmaRankingCRProcess.java
@ -243,7 +243,7 @@ public class plasmaRankingCRProcess {
        if (newdb) {
            final File path = to_file.getParentFile(); // path to storage place
            newacc = new kelondroFlexTable(path, CRG_accname, CRG_accrow, 0, false);
-            newseq = new kelondroCollectionIndex(path, CRG_seqname, 12, kelondroBase64Order.enhancedCoder, 2, 9, CRG_colrow);
+            newseq = new kelondroCollectionIndex(path, CRG_seqname, 12, kelondroBase64Order.enhancedCoder, 2, 9, CRG_colrow, false);
        } else {
            if (!(to_file.exists())) {
                acc = new kelondroAttrSeq("Global Ranking Accumulator File",
@ -372,8 +372,8 @@ public class plasmaRankingCRProcess {
    public static int genrcix(final File cr_path_in, final File rci_path_out) throws IOException {
        //kelondroFlexTable       acc = new kelondroFlexTable(cr_path_in, CRG_accname, kelondroBase64Order.enhancedCoder, 128 * 1024 * 1024, -1, CRG_accrow, true);
-        final kelondroCollectionIndex seq = new kelondroCollectionIndex(cr_path_in, CRG_seqname, 12, kelondroBase64Order.enhancedCoder, 2, 9, CRG_colrow);
+        final kelondroCollectionIndex seq = new kelondroCollectionIndex(cr_path_in, CRG_seqname, 12, kelondroBase64Order.enhancedCoder, 2, 9, CRG_colrow, false);
-        final kelondroCollectionIndex rci = new kelondroCollectionIndex(rci_path_out, RCI_colname, 6, kelondroBase64Order.enhancedCoder, 2, 9, RCI_coli);
+        final kelondroCollectionIndex rci = new kelondroCollectionIndex(rci_path_out, RCI_colname, 6, kelondroBase64Order.enhancedCoder, 2, 9, RCI_coli, false);
        // loop over all referees
        int count = 0;
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@ -304,7 +304,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
        // start indexing management
        log.logConfig("Starting Indexing Management");
        final String networkName = getConfig(plasmaSwitchboardConstants.NETWORK_NAME, "");
-        webIndex = new plasmaWordIndex(networkName, log, indexPrimaryPath, indexSecondaryPath, wordCacheMaxCount);
+        final boolean useCommons = getConfigBool("index.storeCommons", false);
        webIndex = new plasmaWordIndex(networkName, log, indexPrimaryPath, indexSecondaryPath, wordCacheMaxCount, useCommons);
        crawlResults = new ResultURLs();
        // start yacy core
@ -738,7 +739,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
            final File indexPrimaryPath = getConfigPath(plasmaSwitchboardConstants.INDEX_PRIMARY_PATH, plasmaSwitchboardConstants.INDEX_PATH_DEFAULT);
            final File indexSecondaryPath = (getConfig(plasmaSwitchboardConstants.INDEX_SECONDARY_PATH, "").length() == 0) ? indexPrimaryPath : new File(getConfig(plasmaSwitchboardConstants.INDEX_SECONDARY_PATH, ""));
            final int wordCacheMaxCount = (int) getConfigLong(plasmaSwitchboardConstants.WORDCACHE_MAX_COUNT, 20000);
-            this.webIndex = new plasmaWordIndex(getConfig(plasmaSwitchboardConstants.NETWORK_NAME, ""), getLog(), indexPrimaryPath, indexSecondaryPath, wordCacheMaxCount);
+            final boolean useCommons = getConfigBool("index.storeCommons", false);
            this.webIndex = new plasmaWordIndex(getConfig(plasmaSwitchboardConstants.NETWORK_NAME, ""), getLog(), indexPrimaryPath, indexSecondaryPath, wordCacheMaxCount, useCommons);
        }
        // start up crawl jobs
        continueCrawlJob(plasmaSwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
--- a/source/de/anomic/plasma/plasmaWordIndex.java
+++ b/source/de/anomic/plasma/plasmaWordIndex.java
@ -115,7 +115,7 @@ public final class plasmaWordIndex implements indexRI {
    private final File                     queuesRoot;
    public  yacyPeerActions                peerActions;
-    public plasmaWordIndex(final String networkName, final serverLog log, final File indexPrimaryRoot, final File indexSecondaryRoot, final int entityCacheMaxSize) {
+    public plasmaWordIndex(final String networkName, final serverLog log, final File indexPrimaryRoot, final File indexSecondaryRoot, final int entityCacheMaxSize, boolean useCommons) {
        if (networkName == null || networkName.length() == 0) {
            log.logSevere("no network name given - shutting down");
            System.exit(0);
@ -148,7 +148,7 @@ public final class plasmaWordIndex implements indexRI {
        // create collections storage path
        final File textindexcollections = new File(indexPrimaryTextLocation, "RICOLLECTION");
        if (!(textindexcollections.exists())) textindexcollections.mkdirs();
-        this.collections = new indexCollectionRI(textindexcollections, "collection", maxCollectionPartition, indexRWIRowEntry.urlEntryRow);
+        this.collections = new indexCollectionRI(textindexcollections, "collection", maxCollectionPartition, indexRWIRowEntry.urlEntryRow, useCommons);
        // create LURL-db
        referenceURL = new indexRepositoryReference(this.secondaryRoot);
--- a/source/de/anomic/server/serverFileUtils.java
+++ b/source/de/anomic/server/serverFileUtils.java
@ -620,4 +620,25 @@ public final class serverFileUtils {
        writer1.flush();
        return count;
    }
    /**
     * Removes a directory together with all of its content.
     * Every entry below the directory is removed first (sub-directories
     * recursively), afterwards the directory itself is deleted.
     * The results of the individual delete operations are not evaluated,
     * so the removal is best-effort.
     * @param directory the directory that shall be removed
     */
    public static void deleteDirectory(final File directory) {
        final File[] children = directory.listFiles();
        if (children != null) {
            // walk from the last entry to the first
            int idx = children.length;
            while (--idx >= 0) {
                final File child = children[idx];
                if (!child.isFile()) {
                    // not a plain file: descend and remove it recursively
                    deleteDirectory(child);
                    continue;
                }
                child.delete();
            }
        }
        directory.delete();
    }
 }
--- a/source/yacy.java
+++ b/source/yacy.java
@ -674,7 +674,7 @@ public final class yacy {
            final int cacheMem = (int)(serverMemory.max() - serverMemory.total());
            if (cacheMem < 2048000) throw new OutOfMemoryError("Not enough memory available to start clean up.");
-            final plasmaWordIndex wordIndex = new plasmaWordIndex(networkName, log, indexPrimaryRoot, indexSecondaryRoot, 10000);
+            final plasmaWordIndex wordIndex = new plasmaWordIndex(networkName, log, indexPrimaryRoot, indexSecondaryRoot, 10000, false);
            final Iterator<indexContainer> indexContainerIterator = wordIndex.wordContainers("AAAAAAAAAAAA", false, false);
            long urlCounter = 0, wordCounter = 0;
@ -865,7 +865,7 @@ public final class yacy {
        try {
            Iterator<indexContainer> indexContainerIterator = null;
            if (resource.equals("all")) {
-                WordIndex = new plasmaWordIndex("freeworld", log, indexPrimaryRoot, indexSecondaryRoot, 10000);
+                WordIndex = new plasmaWordIndex("freeworld", log, indexPrimaryRoot, indexSecondaryRoot, 10000, false);
                indexContainerIterator = WordIndex.wordContainers(wordChunkStartHash, false, false);
            }
            int counter = 0;