added property index.storeCommons to switch commons storage on or off

With index.storeCommons=false, all currently stored commons are deleted!
The default is now 'true', but in future full releases it will be switched to 'false'.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5315 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 17 years ago
parent 4b4ce75396
commit 22989d0d8a

@ -260,6 +260,13 @@ dbPath=DATA/PLASMADB
indexPrimaryPath=DATA/INDEX indexPrimaryPath=DATA/INDEX
indexSecondaryPath= indexSecondaryPath=
# the commons are words that appear in the index more than 64k times in references.
# Since indexes with such size cannot be handled efficiently, they are sorted in such a way that references with high ranking
# are stored back into the index, and references with bad ranking are sorted out. Such sorted-out references can be stored
# for later use (but there is no such use at this time). If the sorted-out references should be stored, the following property should be
# set to true. If set to false, they are abandoned (deleted), and previously stored commons are removed.
index.storeCommons=true
# the path to the LISTS files. Most lists are used to filter web content # the path to the LISTS files. Most lists are used to filter web content
listsPath=DATA/LISTS listsPath=DATA/LISTS

@ -44,7 +44,7 @@ public class indexCollectionRI implements indexRI {
kelondroCollectionIndex collectionIndex; kelondroCollectionIndex collectionIndex;
public indexCollectionRI(final File path, final String filenameStub, final int maxpartition, final kelondroRow payloadrow) { public indexCollectionRI(final File path, final String filenameStub, final int maxpartition, final kelondroRow payloadrow, boolean useCommons) {
try { try {
collectionIndex = new kelondroCollectionIndex( collectionIndex = new kelondroCollectionIndex(
path, path,
@ -53,7 +53,8 @@ public class indexCollectionRI implements indexRI {
kelondroBase64Order.enhancedCoder, kelondroBase64Order.enhancedCoder,
4 /*loadfactor*/, 4 /*loadfactor*/,
maxpartition, maxpartition,
payloadrow); payloadrow,
useCommons);
} catch (final IOException e) { } catch (final IOException e) {
serverLog.logSevere("PLASMA", "unable to open collection index at " + path.toString() + ":" + e.getMessage()); serverLog.logSevere("PLASMA", "unable to open collection index at " + path.toString() + ":" + e.getMessage());
} }

@ -59,7 +59,7 @@ public class kelondroCollectionIndex {
private final int keylength; private final int keylength;
private final File path; private final File path;
private final String filenameStub; private final String filenameStub;
private final File commonsPath; private final File commonsPath1;
private final int loadfactor; private final int loadfactor;
private Map<String, kelondroFixedWidthArray> arrays; // Map of (partitionNumber"-"chunksize)/kelondroFixedWidthArray - Objects private Map<String, kelondroFixedWidthArray> arrays; // Map of (partitionNumber"-"chunksize)/kelondroFixedWidthArray - Objects
private final kelondroRow payloadrow; // definition of the payload (chunks inside the collections) private final kelondroRow payloadrow; // definition of the payload (chunks inside the collections)
@ -113,7 +113,7 @@ public class kelondroCollectionIndex {
} }
public kelondroCollectionIndex(final File path, final String filenameStub, final int keyLength, final kelondroByteOrder indexOrder, public kelondroCollectionIndex(final File path, final String filenameStub, final int keyLength, final kelondroByteOrder indexOrder,
final int loadfactor, final int maxpartitions, final kelondroRow rowdef) throws IOException { final int loadfactor, final int maxpartitions, final kelondroRow rowdef, boolean useCommons) throws IOException {
// the buffersize is number of bytes that are only used if the kelondroFlexTable is backed up with a kelondroTree // the buffersize is number of bytes that are only used if the kelondroFlexTable is backed up with a kelondroTree
indexErrors = 0; indexErrors = 0;
this.path = path; this.path = path;
@ -122,8 +122,13 @@ public class kelondroCollectionIndex {
this.payloadrow = rowdef; this.payloadrow = rowdef;
this.loadfactor = loadfactor; this.loadfactor = loadfactor;
this.maxPartitions = maxpartitions; this.maxPartitions = maxpartitions;
this.commonsPath = new File(path, filenameStub + "." + fillZ(Integer.toHexString(rowdef.objectsize).toUpperCase(), 4) + ".commons"); File cop = new File(path, filenameStub + "." + fillZ(Integer.toHexString(rowdef.objectsize).toUpperCase(), 4) + ".commons");
this.commonsPath.mkdirs(); this.commonsPath1 = (useCommons) ? cop : null;
if (this.commonsPath1 == null) {
serverFileUtils.deleteDirectory(cop);
} else {
this.commonsPath1.mkdirs();
}
final File f = new File(path, filenameStub + ".index"); final File f = new File(path, filenameStub + ".index");
if (f.exists()) { if (f.exists()) {
@ -640,21 +645,22 @@ public class kelondroCollectionIndex {
serverLog.logInfo("kelondroCollectionIndex", "shrinked common word " + new String(key) + "; old size = " + oldsize + ", new size = " + collection.size() + ", maximum size = " + targetSize + ", newcommon size = " + newcommon.size() + ", first newcommon = " + firstnewcommon); serverLog.logInfo("kelondroCollectionIndex", "shrinked common word " + new String(key) + "; old size = " + oldsize + ", new size = " + collection.size() + ", maximum size = " + targetSize + ", newcommon size = " + newcommon.size() + ", first newcommon = " + firstnewcommon);
// finally dump the removed entries to a file // finally dump the removed entries to a file
newcommon.sort(); if (commonsPath1 != null) {
final SimpleDateFormat formatter = new SimpleDateFormat("yyyyMMddHHmmss"); newcommon.sort();
formatter.setTimeZone(TimeZone.getTimeZone("GMT")); final SimpleDateFormat formatter = new SimpleDateFormat("yyyyMMddHHmmss");
final String filename = serverCodings.encodeHex(kelondroBase64Order.enhancedCoder.decode(new String(key), "de.anomic.kelondro.kelondroCollectionIndex.shrinkCollection(...)")) + "_" + formatter.format(new Date()) + ".collection"; formatter.setTimeZone(TimeZone.getTimeZone("GMT"));
final File storagePath = new File(commonsPath, filename.substring(0, 2)); // make a subpath final String filename = serverCodings.encodeHex(kelondroBase64Order.enhancedCoder.decode(new String(key), "de.anomic.kelondro.kelondroCollectionIndex.shrinkCollection(...)")) + "_" + formatter.format(new Date()) + ".collection";
storagePath.mkdirs(); final File storagePath = new File(commonsPath1, filename.substring(0, 2)); // make a subpath
final File file = new File(storagePath, filename); storagePath.mkdirs();
try { final File file = new File(storagePath, filename);
newcommon.saveCollection(file); try {
serverLog.logInfo("kelondroCollectionIndex", "dumped common word " + new String(key) + " to " + file.toString() + "; size = " + newcommon.size()); newcommon.saveCollection(file);
} catch (final IOException e) { serverLog.logInfo("kelondroCollectionIndex", "dumped common word " + new String(key) + " to " + file.toString() + "; size = " + newcommon.size());
e.printStackTrace(); } catch (final IOException e) {
serverLog.logWarning("kelondroCollectionIndex", "failed to dump common word " + new String(key) + " to " + file.toString() + "; size = " + newcommon.size()); e.printStackTrace();
} serverLog.logWarning("kelondroCollectionIndex", "failed to dump common word " + new String(key) + " to " + file.toString() + "; size = " + newcommon.size());
}
}
} }
public synchronized int remove(final byte[] key, final Set<String> removekeys) throws IOException, kelondroOutOfLimitsException { public synchronized int remove(final byte[] key, final Set<String> removekeys) throws IOException, kelondroOutOfLimitsException {
@ -885,7 +891,7 @@ public class kelondroCollectionIndex {
final kelondroCollectionIndex collectionIndex = new kelondroCollectionIndex( final kelondroCollectionIndex collectionIndex = new kelondroCollectionIndex(
path, filenameStub, 9 /*keyLength*/, path, filenameStub, 9 /*keyLength*/,
kelondroNaturalOrder.naturalOrder, kelondroNaturalOrder.naturalOrder,
4 /*loadfactor*/, 7, rowdef); 4 /*loadfactor*/, 7, rowdef, false);
// fill index with values // fill index with values
kelondroRowSet collection = new kelondroRowSet(rowdef, 0); kelondroRowSet collection = new kelondroRowSet(rowdef, 0);

@ -49,6 +49,7 @@ import de.anomic.kelondro.kelondroBLOBBuffer;
import de.anomic.kelondro.kelondroBLOBHeap; import de.anomic.kelondro.kelondroBLOBHeap;
import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroMap; import de.anomic.kelondro.kelondroMap;
import de.anomic.server.serverFileUtils;
import de.anomic.server.logging.serverLog; import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacySeedDB; import de.anomic.yacy.yacySeedDB;
import de.anomic.yacy.yacyURL; import de.anomic.yacy.yacyURL;
@ -95,7 +96,7 @@ public final class plasmaHTCache {
object.getName().equals("yacy") || object.getName().equals("yacy") ||
object.getName().equals("https") || object.getName().equals("https") ||
object.getName().equals("ftp")) { object.getName().equals("ftp")) {
deleteOldHTCache(cachePath); serverFileUtils.deleteDirectory(cachePath);
} }
} }
} }
@ -140,22 +141,6 @@ public final class plasmaHTCache {
e.printStackTrace(); e.printStackTrace();
} }
} }
/**
 * Recursively removes the given path: every entry listed below it is deleted
 * first (plain files directly, everything else by recursion), then the path
 * itself is deleted. The results of the individual delete() calls are ignored.
 *
 * @param directory the path to remove together with its content
 */
private static void deleteOldHTCache(final File directory) {
    final String[] names = directory.list();
    // list() may yield null; in that case there is nothing to descend into
    final int count = (names == null) ? 0 : names.length;
    for (int pos = count - 1; pos >= 0; pos--) {
        final File entry = new File(directory, names[pos]);
        if (!entry.isFile()) {
            // not a plain file: clear it out recursively
            deleteOldHTCache(entry);
            continue;
        }
        entry.delete();
    }
    // the content is gone (as far as it could be deleted), now drop the path itself
    directory.delete();
}
public static int responseHeaderDBSize() { public static int responseHeaderDBSize() {
return responseHeaderDB.size(); return responseHeaderDB.size();

@ -243,7 +243,7 @@ public class plasmaRankingCRProcess {
if (newdb) { if (newdb) {
final File path = to_file.getParentFile(); // path to storage place final File path = to_file.getParentFile(); // path to storage place
newacc = new kelondroFlexTable(path, CRG_accname, CRG_accrow, 0, false); newacc = new kelondroFlexTable(path, CRG_accname, CRG_accrow, 0, false);
newseq = new kelondroCollectionIndex(path, CRG_seqname, 12, kelondroBase64Order.enhancedCoder, 2, 9, CRG_colrow); newseq = new kelondroCollectionIndex(path, CRG_seqname, 12, kelondroBase64Order.enhancedCoder, 2, 9, CRG_colrow, false);
} else { } else {
if (!(to_file.exists())) { if (!(to_file.exists())) {
acc = new kelondroAttrSeq("Global Ranking Accumulator File", acc = new kelondroAttrSeq("Global Ranking Accumulator File",
@ -372,8 +372,8 @@ public class plasmaRankingCRProcess {
public static int genrcix(final File cr_path_in, final File rci_path_out) throws IOException { public static int genrcix(final File cr_path_in, final File rci_path_out) throws IOException {
//kelondroFlexTable acc = new kelondroFlexTable(cr_path_in, CRG_accname, kelondroBase64Order.enhancedCoder, 128 * 1024 * 1024, -1, CRG_accrow, true); //kelondroFlexTable acc = new kelondroFlexTable(cr_path_in, CRG_accname, kelondroBase64Order.enhancedCoder, 128 * 1024 * 1024, -1, CRG_accrow, true);
final kelondroCollectionIndex seq = new kelondroCollectionIndex(cr_path_in, CRG_seqname, 12, kelondroBase64Order.enhancedCoder, 2, 9, CRG_colrow); final kelondroCollectionIndex seq = new kelondroCollectionIndex(cr_path_in, CRG_seqname, 12, kelondroBase64Order.enhancedCoder, 2, 9, CRG_colrow, false);
final kelondroCollectionIndex rci = new kelondroCollectionIndex(rci_path_out, RCI_colname, 6, kelondroBase64Order.enhancedCoder, 2, 9, RCI_coli); final kelondroCollectionIndex rci = new kelondroCollectionIndex(rci_path_out, RCI_colname, 6, kelondroBase64Order.enhancedCoder, 2, 9, RCI_coli, false);
// loop over all referees // loop over all referees
int count = 0; int count = 0;

@ -304,7 +304,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
// start indexing management // start indexing management
log.logConfig("Starting Indexing Management"); log.logConfig("Starting Indexing Management");
final String networkName = getConfig(plasmaSwitchboardConstants.NETWORK_NAME, ""); final String networkName = getConfig(plasmaSwitchboardConstants.NETWORK_NAME, "");
webIndex = new plasmaWordIndex(networkName, log, indexPrimaryPath, indexSecondaryPath, wordCacheMaxCount); final boolean useCommons = getConfigBool("index.storeCommons", false);
webIndex = new plasmaWordIndex(networkName, log, indexPrimaryPath, indexSecondaryPath, wordCacheMaxCount, useCommons);
crawlResults = new ResultURLs(); crawlResults = new ResultURLs();
// start yacy core // start yacy core
@ -738,7 +739,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
final File indexPrimaryPath = getConfigPath(plasmaSwitchboardConstants.INDEX_PRIMARY_PATH, plasmaSwitchboardConstants.INDEX_PATH_DEFAULT); final File indexPrimaryPath = getConfigPath(plasmaSwitchboardConstants.INDEX_PRIMARY_PATH, plasmaSwitchboardConstants.INDEX_PATH_DEFAULT);
final File indexSecondaryPath = (getConfig(plasmaSwitchboardConstants.INDEX_SECONDARY_PATH, "").length() == 0) ? indexPrimaryPath : new File(getConfig(plasmaSwitchboardConstants.INDEX_SECONDARY_PATH, "")); final File indexSecondaryPath = (getConfig(plasmaSwitchboardConstants.INDEX_SECONDARY_PATH, "").length() == 0) ? indexPrimaryPath : new File(getConfig(plasmaSwitchboardConstants.INDEX_SECONDARY_PATH, ""));
final int wordCacheMaxCount = (int) getConfigLong(plasmaSwitchboardConstants.WORDCACHE_MAX_COUNT, 20000); final int wordCacheMaxCount = (int) getConfigLong(plasmaSwitchboardConstants.WORDCACHE_MAX_COUNT, 20000);
this.webIndex = new plasmaWordIndex(getConfig(plasmaSwitchboardConstants.NETWORK_NAME, ""), getLog(), indexPrimaryPath, indexSecondaryPath, wordCacheMaxCount); final boolean useCommons = getConfigBool("index.storeCommons", false);
this.webIndex = new plasmaWordIndex(getConfig(plasmaSwitchboardConstants.NETWORK_NAME, ""), getLog(), indexPrimaryPath, indexSecondaryPath, wordCacheMaxCount, useCommons);
} }
// start up crawl jobs // start up crawl jobs
continueCrawlJob(plasmaSwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); continueCrawlJob(plasmaSwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);

@ -115,7 +115,7 @@ public final class plasmaWordIndex implements indexRI {
private final File queuesRoot; private final File queuesRoot;
public yacyPeerActions peerActions; public yacyPeerActions peerActions;
public plasmaWordIndex(final String networkName, final serverLog log, final File indexPrimaryRoot, final File indexSecondaryRoot, final int entityCacheMaxSize) { public plasmaWordIndex(final String networkName, final serverLog log, final File indexPrimaryRoot, final File indexSecondaryRoot, final int entityCacheMaxSize, boolean useCommons) {
if (networkName == null || networkName.length() == 0) { if (networkName == null || networkName.length() == 0) {
log.logSevere("no network name given - shutting down"); log.logSevere("no network name given - shutting down");
System.exit(0); System.exit(0);
@ -148,7 +148,7 @@ public final class plasmaWordIndex implements indexRI {
// create collections storage path // create collections storage path
final File textindexcollections = new File(indexPrimaryTextLocation, "RICOLLECTION"); final File textindexcollections = new File(indexPrimaryTextLocation, "RICOLLECTION");
if (!(textindexcollections.exists())) textindexcollections.mkdirs(); if (!(textindexcollections.exists())) textindexcollections.mkdirs();
this.collections = new indexCollectionRI(textindexcollections, "collection", maxCollectionPartition, indexRWIRowEntry.urlEntryRow); this.collections = new indexCollectionRI(textindexcollections, "collection", maxCollectionPartition, indexRWIRowEntry.urlEntryRow, useCommons);
// create LURL-db // create LURL-db
referenceURL = new indexRepositoryReference(this.secondaryRoot); referenceURL = new indexRepositoryReference(this.secondaryRoot);

@ -620,4 +620,25 @@ public final class serverFileUtils {
writer1.flush(); writer1.flush();
return count; return count;
} }
/**
 * Removes a directory together with everything below it.
 * Entries that are plain files are deleted directly; every other entry is
 * handled by a recursive call. Afterwards the given path itself is deleted.
 * The results of the individual delete() calls are not checked.
 *
 * @param directory the directory to remove
 */
public static void deleteDirectory(final File directory) {
    final String[] children = directory.list();
    if (children != null) {
        // walk the listing from the last entry down to the first
        int idx = children.length;
        while (--idx >= 0) {
            final File child = new File(directory, children[idx]);
            if (child.isFile()) {
                child.delete();
                continue;
            }
            // anything that is not a plain file is cleared recursively
            deleteDirectory(child);
        }
    }
    // finally remove the (now hopefully empty) path itself
    directory.delete();
}
} }

@ -674,7 +674,7 @@ public final class yacy {
final int cacheMem = (int)(serverMemory.max() - serverMemory.total()); final int cacheMem = (int)(serverMemory.max() - serverMemory.total());
if (cacheMem < 2048000) throw new OutOfMemoryError("Not enough memory available to start clean up."); if (cacheMem < 2048000) throw new OutOfMemoryError("Not enough memory available to start clean up.");
final plasmaWordIndex wordIndex = new plasmaWordIndex(networkName, log, indexPrimaryRoot, indexSecondaryRoot, 10000); final plasmaWordIndex wordIndex = new plasmaWordIndex(networkName, log, indexPrimaryRoot, indexSecondaryRoot, 10000, false);
final Iterator<indexContainer> indexContainerIterator = wordIndex.wordContainers("AAAAAAAAAAAA", false, false); final Iterator<indexContainer> indexContainerIterator = wordIndex.wordContainers("AAAAAAAAAAAA", false, false);
long urlCounter = 0, wordCounter = 0; long urlCounter = 0, wordCounter = 0;
@ -865,7 +865,7 @@ public final class yacy {
try { try {
Iterator<indexContainer> indexContainerIterator = null; Iterator<indexContainer> indexContainerIterator = null;
if (resource.equals("all")) { if (resource.equals("all")) {
WordIndex = new plasmaWordIndex("freeworld", log, indexPrimaryRoot, indexSecondaryRoot, 10000); WordIndex = new plasmaWordIndex("freeworld", log, indexPrimaryRoot, indexSecondaryRoot, 10000, false);
indexContainerIterator = WordIndex.wordContainers(wordChunkStartHash, false, false); indexContainerIterator = WordIndex.wordContainers(wordChunkStartHash, false, false);
} }
int counter = 0; int counter = 0;

Loading…
Cancel
Save