added property index.storeCommons to switch commons storage on or off

with index.storeCommons=false all currently stored commons are deleted!
Default is now 'true', but in future full releases it will be switched to 'false'

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5315 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 16 years ago
parent 4b4ce75396
commit 22989d0d8a

@ -260,6 +260,13 @@ dbPath=DATA/PLASMADB
indexPrimaryPath=DATA/INDEX
indexSecondaryPath=
# the commons are words that appear in the index more than 64k times in references.
# Since indexes of such size cannot be handled efficiently, they are sorted in such a way that references with high ranking
# are stored back into the index, and references with bad ranking are sorted out. Such sorted-out references can be stored
# for later use (but there is no use for them at this time). If the sorted-out references should be stored, the following property should be
# set to true. If set to false, they are abandoned (deleted), and previously stored commons are removed.
index.storeCommons=true
# the path to the LISTS files. Most lists are used to filter web content
listsPath=DATA/LISTS

@ -44,7 +44,7 @@ public class indexCollectionRI implements indexRI {
kelondroCollectionIndex collectionIndex;
public indexCollectionRI(final File path, final String filenameStub, final int maxpartition, final kelondroRow payloadrow) {
public indexCollectionRI(final File path, final String filenameStub, final int maxpartition, final kelondroRow payloadrow, boolean useCommons) {
try {
collectionIndex = new kelondroCollectionIndex(
path,
@ -53,7 +53,8 @@ public class indexCollectionRI implements indexRI {
kelondroBase64Order.enhancedCoder,
4 /*loadfactor*/,
maxpartition,
payloadrow);
payloadrow,
useCommons);
} catch (final IOException e) {
serverLog.logSevere("PLASMA", "unable to open collection index at " + path.toString() + ":" + e.getMessage());
}

@ -59,7 +59,7 @@ public class kelondroCollectionIndex {
private final int keylength;
private final File path;
private final String filenameStub;
private final File commonsPath;
private final File commonsPath1;
private final int loadfactor;
private Map<String, kelondroFixedWidthArray> arrays; // Map of (partitionNumber"-"chunksize)/kelondroFixedWidthArray - Objects
private final kelondroRow payloadrow; // definition of the payload (chunks inside the collections)
@ -113,7 +113,7 @@ public class kelondroCollectionIndex {
}
public kelondroCollectionIndex(final File path, final String filenameStub, final int keyLength, final kelondroByteOrder indexOrder,
final int loadfactor, final int maxpartitions, final kelondroRow rowdef) throws IOException {
final int loadfactor, final int maxpartitions, final kelondroRow rowdef, boolean useCommons) throws IOException {
// the buffersize is number of bytes that are only used if the kelondroFlexTable is backed up with a kelondroTree
indexErrors = 0;
this.path = path;
@ -122,8 +122,13 @@ public class kelondroCollectionIndex {
this.payloadrow = rowdef;
this.loadfactor = loadfactor;
this.maxPartitions = maxpartitions;
this.commonsPath = new File(path, filenameStub + "." + fillZ(Integer.toHexString(rowdef.objectsize).toUpperCase(), 4) + ".commons");
this.commonsPath.mkdirs();
File cop = new File(path, filenameStub + "." + fillZ(Integer.toHexString(rowdef.objectsize).toUpperCase(), 4) + ".commons");
this.commonsPath1 = (useCommons) ? cop : null;
if (this.commonsPath1 == null) {
serverFileUtils.deleteDirectory(cop);
} else {
this.commonsPath1.mkdirs();
}
final File f = new File(path, filenameStub + ".index");
if (f.exists()) {
@ -640,21 +645,22 @@ public class kelondroCollectionIndex {
serverLog.logInfo("kelondroCollectionIndex", "shrinked common word " + new String(key) + "; old size = " + oldsize + ", new size = " + collection.size() + ", maximum size = " + targetSize + ", newcommon size = " + newcommon.size() + ", first newcommon = " + firstnewcommon);
// finally dump the removed entries to a file
newcommon.sort();
final SimpleDateFormat formatter = new SimpleDateFormat("yyyyMMddHHmmss");
formatter.setTimeZone(TimeZone.getTimeZone("GMT"));
final String filename = serverCodings.encodeHex(kelondroBase64Order.enhancedCoder.decode(new String(key), "de.anomic.kelondro.kelondroCollectionIndex.shrinkCollection(...)")) + "_" + formatter.format(new Date()) + ".collection";
final File storagePath = new File(commonsPath, filename.substring(0, 2)); // make a subpath
storagePath.mkdirs();
final File file = new File(storagePath, filename);
try {
newcommon.saveCollection(file);
serverLog.logInfo("kelondroCollectionIndex", "dumped common word " + new String(key) + " to " + file.toString() + "; size = " + newcommon.size());
} catch (final IOException e) {
e.printStackTrace();
serverLog.logWarning("kelondroCollectionIndex", "failed to dump common word " + new String(key) + " to " + file.toString() + "; size = " + newcommon.size());
}
if (commonsPath1 != null) {
newcommon.sort();
final SimpleDateFormat formatter = new SimpleDateFormat("yyyyMMddHHmmss");
formatter.setTimeZone(TimeZone.getTimeZone("GMT"));
final String filename = serverCodings.encodeHex(kelondroBase64Order.enhancedCoder.decode(new String(key), "de.anomic.kelondro.kelondroCollectionIndex.shrinkCollection(...)")) + "_" + formatter.format(new Date()) + ".collection";
final File storagePath = new File(commonsPath1, filename.substring(0, 2)); // make a subpath
storagePath.mkdirs();
final File file = new File(storagePath, filename);
try {
newcommon.saveCollection(file);
serverLog.logInfo("kelondroCollectionIndex", "dumped common word " + new String(key) + " to " + file.toString() + "; size = " + newcommon.size());
} catch (final IOException e) {
e.printStackTrace();
serverLog.logWarning("kelondroCollectionIndex", "failed to dump common word " + new String(key) + " to " + file.toString() + "; size = " + newcommon.size());
}
}
}
public synchronized int remove(final byte[] key, final Set<String> removekeys) throws IOException, kelondroOutOfLimitsException {
@ -885,7 +891,7 @@ public class kelondroCollectionIndex {
final kelondroCollectionIndex collectionIndex = new kelondroCollectionIndex(
path, filenameStub, 9 /*keyLength*/,
kelondroNaturalOrder.naturalOrder,
4 /*loadfactor*/, 7, rowdef);
4 /*loadfactor*/, 7, rowdef, false);
// fill index with values
kelondroRowSet collection = new kelondroRowSet(rowdef, 0);

@ -49,6 +49,7 @@ import de.anomic.kelondro.kelondroBLOBBuffer;
import de.anomic.kelondro.kelondroBLOBHeap;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroMap;
import de.anomic.server.serverFileUtils;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacySeedDB;
import de.anomic.yacy.yacyURL;
@ -95,7 +96,7 @@ public final class plasmaHTCache {
object.getName().equals("yacy") ||
object.getName().equals("https") ||
object.getName().equals("ftp")) {
deleteOldHTCache(cachePath);
serverFileUtils.deleteDirectory(cachePath);
}
}
}
@ -140,22 +141,6 @@ public final class plasmaHTCache {
e.printStackTrace();
}
}
/**
 * Deletes the given directory together with everything inside it.
 * Plain files are deleted directly, every other entry is handled by a
 * recursive call; the directory itself is deleted last.
 *
 * @param directory the directory to remove
 */
private static void deleteOldHTCache(final File directory) {
    final String[] names = directory.list();
    // File.list() yields null when the path cannot be listed as a directory
    int pos = (names == null) ? 0 : names.length;
    // walk the entries from the last one down to the first one
    while (--pos >= 0) {
        final File entry = new File(directory, names[pos]);
        if (!entry.isFile()) {
            deleteOldHTCache(entry);
            continue;
        }
        entry.delete();
    }
    directory.delete();
}
public static int responseHeaderDBSize() {
return responseHeaderDB.size();

@ -243,7 +243,7 @@ public class plasmaRankingCRProcess {
if (newdb) {
final File path = to_file.getParentFile(); // path to storage place
newacc = new kelondroFlexTable(path, CRG_accname, CRG_accrow, 0, false);
newseq = new kelondroCollectionIndex(path, CRG_seqname, 12, kelondroBase64Order.enhancedCoder, 2, 9, CRG_colrow);
newseq = new kelondroCollectionIndex(path, CRG_seqname, 12, kelondroBase64Order.enhancedCoder, 2, 9, CRG_colrow, false);
} else {
if (!(to_file.exists())) {
acc = new kelondroAttrSeq("Global Ranking Accumulator File",
@ -372,8 +372,8 @@ public class plasmaRankingCRProcess {
public static int genrcix(final File cr_path_in, final File rci_path_out) throws IOException {
//kelondroFlexTable acc = new kelondroFlexTable(cr_path_in, CRG_accname, kelondroBase64Order.enhancedCoder, 128 * 1024 * 1024, -1, CRG_accrow, true);
final kelondroCollectionIndex seq = new kelondroCollectionIndex(cr_path_in, CRG_seqname, 12, kelondroBase64Order.enhancedCoder, 2, 9, CRG_colrow);
final kelondroCollectionIndex rci = new kelondroCollectionIndex(rci_path_out, RCI_colname, 6, kelondroBase64Order.enhancedCoder, 2, 9, RCI_coli);
final kelondroCollectionIndex seq = new kelondroCollectionIndex(cr_path_in, CRG_seqname, 12, kelondroBase64Order.enhancedCoder, 2, 9, CRG_colrow, false);
final kelondroCollectionIndex rci = new kelondroCollectionIndex(rci_path_out, RCI_colname, 6, kelondroBase64Order.enhancedCoder, 2, 9, RCI_coli, false);
// loop over all referees
int count = 0;

@ -304,7 +304,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
// start indexing management
log.logConfig("Starting Indexing Management");
final String networkName = getConfig(plasmaSwitchboardConstants.NETWORK_NAME, "");
webIndex = new plasmaWordIndex(networkName, log, indexPrimaryPath, indexSecondaryPath, wordCacheMaxCount);
final boolean useCommons = getConfigBool("index.storeCommons", false);
webIndex = new plasmaWordIndex(networkName, log, indexPrimaryPath, indexSecondaryPath, wordCacheMaxCount, useCommons);
crawlResults = new ResultURLs();
// start yacy core
@ -738,7 +739,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
final File indexPrimaryPath = getConfigPath(plasmaSwitchboardConstants.INDEX_PRIMARY_PATH, plasmaSwitchboardConstants.INDEX_PATH_DEFAULT);
final File indexSecondaryPath = (getConfig(plasmaSwitchboardConstants.INDEX_SECONDARY_PATH, "").length() == 0) ? indexPrimaryPath : new File(getConfig(plasmaSwitchboardConstants.INDEX_SECONDARY_PATH, ""));
final int wordCacheMaxCount = (int) getConfigLong(plasmaSwitchboardConstants.WORDCACHE_MAX_COUNT, 20000);
this.webIndex = new plasmaWordIndex(getConfig(plasmaSwitchboardConstants.NETWORK_NAME, ""), getLog(), indexPrimaryPath, indexSecondaryPath, wordCacheMaxCount);
final boolean useCommons = getConfigBool("index.storeCommons", false);
this.webIndex = new plasmaWordIndex(getConfig(plasmaSwitchboardConstants.NETWORK_NAME, ""), getLog(), indexPrimaryPath, indexSecondaryPath, wordCacheMaxCount, useCommons);
}
// start up crawl jobs
continueCrawlJob(plasmaSwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);

@ -115,7 +115,7 @@ public final class plasmaWordIndex implements indexRI {
private final File queuesRoot;
public yacyPeerActions peerActions;
public plasmaWordIndex(final String networkName, final serverLog log, final File indexPrimaryRoot, final File indexSecondaryRoot, final int entityCacheMaxSize) {
public plasmaWordIndex(final String networkName, final serverLog log, final File indexPrimaryRoot, final File indexSecondaryRoot, final int entityCacheMaxSize, boolean useCommons) {
if (networkName == null || networkName.length() == 0) {
log.logSevere("no network name given - shutting down");
System.exit(0);
@ -148,7 +148,7 @@ public final class plasmaWordIndex implements indexRI {
// create collections storage path
final File textindexcollections = new File(indexPrimaryTextLocation, "RICOLLECTION");
if (!(textindexcollections.exists())) textindexcollections.mkdirs();
this.collections = new indexCollectionRI(textindexcollections, "collection", maxCollectionPartition, indexRWIRowEntry.urlEntryRow);
this.collections = new indexCollectionRI(textindexcollections, "collection", maxCollectionPartition, indexRWIRowEntry.urlEntryRow, useCommons);
// create LURL-db
referenceURL = new indexRepositoryReference(this.secondaryRoot);

@ -620,4 +620,25 @@ public final class serverFileUtils {
writer1.flush();
return count;
}
/**
 * Deletes a directory; if the directory is not empty, everything inside
 * is deleted as well.
 * <p>
 * Entries that are symbolic links are removed as links only: the location a
 * link points to is never entered, so nothing outside of the given directory
 * tree is deleted. The directory passed by the caller is always processed as
 * given. Failures of individual delete operations are ignored (best effort).
 *
 * @param directory the directory to delete; <code>null</code> is ignored
 */
public static void deleteDirectory(final File directory) {
    if (directory == null) return;
    final String[] list = directory.list();
    // list is null if the path is not a directory or cannot be read
    if (list != null) {
        // canonical form of the parent, needed to recognize links among the entries
        File canonicalParent;
        try {
            canonicalParent = directory.getCanonicalFile();
        } catch (final java.io.IOException e) {
            canonicalParent = null;
        }
        File object;
        for (int i = list.length - 1; i >= 0; i--) {
            object = new File(directory, list[i]);
            if (object.isFile() || isLinkedEntry(canonicalParent, list[i])) {
                // for a link this removes the link itself, not its target
                object.delete();
            } else {
                deleteDirectory(object);
            }
        }
    }
    directory.delete();
}

/**
 * Checks whether the entry with the given name inside of an already
 * canonical parent directory resolves to a different location, which is the
 * case for a symbolic link.
 *
 * @param canonicalParent canonical form of the parent directory, or <code>null</code> if it could not be computed
 * @param name name of the entry inside the parent directory
 * @return true if the entry is a link or if this cannot be decided
 */
private static boolean isLinkedEntry(final File canonicalParent, final String name) {
    // when the location cannot be resolved, be conservative and do not descend
    if (canonicalParent == null) return true;
    final File located = new File(canonicalParent, name);
    try {
        return !located.getCanonicalFile().equals(located);
    } catch (final java.io.IOException e) {
        return true;
    }
}
}

@ -674,7 +674,7 @@ public final class yacy {
final int cacheMem = (int)(serverMemory.max() - serverMemory.total());
if (cacheMem < 2048000) throw new OutOfMemoryError("Not enough memory available to start clean up.");
final plasmaWordIndex wordIndex = new plasmaWordIndex(networkName, log, indexPrimaryRoot, indexSecondaryRoot, 10000);
final plasmaWordIndex wordIndex = new plasmaWordIndex(networkName, log, indexPrimaryRoot, indexSecondaryRoot, 10000, false);
final Iterator<indexContainer> indexContainerIterator = wordIndex.wordContainers("AAAAAAAAAAAA", false, false);
long urlCounter = 0, wordCounter = 0;
@ -865,7 +865,7 @@ public final class yacy {
try {
Iterator<indexContainer> indexContainerIterator = null;
if (resource.equals("all")) {
WordIndex = new plasmaWordIndex("freeworld", log, indexPrimaryRoot, indexSecondaryRoot, 10000);
WordIndex = new plasmaWordIndex("freeworld", log, indexPrimaryRoot, indexSecondaryRoot, 10000, false);
indexContainerIterator = WordIndex.wordContainers(wordChunkStartHash, false, false);
}
int counter = 0;

Loading…
Cancel
Save