added option to configure a path to a secondary index location.

This shall be used to store a fragment of the index on another physical device,
to split IO load and enhance access speed. The index is split in such a way
that the LURLs are stored in the secondary location, and the RWIs in the primary
location. This is especially useful for environments where symbolic links are
not possible, or where they may cause IO access even if there is no write access
to the device which hosts the symbolic link.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3519 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 18 years ago
parent 07cd30cf9b
commit 5c3afb3202

@ -48,8 +48,18 @@
</tr>
<tr class="TableCellLight">
<td title="Path to the PLASMADB directory of the foreign peer">Import&nbsp;Path:</td>
<td colspan="3"><input name="importPath" type="text" size="50" value="" /></td>
<td><input type="submit" name="startIndexDbImport" value="Start Import" /></td>
<td colspan="3"><input name="importPlasmaPath" type="text" size="50" value="" /></td>
<td></td>
</tr>
<tr class="TableCellLight">
<td title="Path to the primary Index directory of the foreign peer">Import&nbsp;Path:</td>
<td colspan="3"><input name="importIndexPrimaryPath" type="text" size="50" value="" /></td>
<td></td>
</tr>
<tr class="TableCellLight">
<td title="Path to the secondary Index directory of the foreign peer">Import&nbsp;Path:</td>
<td colspan="3"><input name="importIndexSecondaryPath" type="text" size="50" value="" /></td>
<td><input type="submit" name="startIndexDbImport" value="Start Import" /></td>
</tr>
</table>
<p class="warning"><strong>Attention:</strong><br />Always do a backup of your source and destination database before starting to use this import function.</p>

@ -73,7 +73,9 @@ public final class IndexImport_p {
if (post.containsKey("startIndexDbImport")) {
try {
// getting the import path
String importPath = (String) post.get("importPath");
String importPlasmaPath = (String) post.get("importPlasmaPath");
String importIndexPrimaryPath = (String) post.get("importIndexPrimaryPath");
String importIndexSecondaryPath = (String) post.get("importIndexSecondaryPath");
String importType = (String) post.get("importType");
String cacheSizeStr = (String) post.get("cacheSize");
int cacheSize = 8*1024*1024;
@ -98,7 +100,7 @@ public final class IndexImport_p {
if (startImport) {
dbImporter importerThread = switchboard.dbImportManager.getNewImporter(importType);
if (importerThread != null) {
importerThread.init(new File(importPath), cacheSize, 100);
importerThread.init(new File(importPlasmaPath), new File(importIndexPrimaryPath), new File(importIndexSecondaryPath), cacheSize, 100);
importerThread.startIt();
}
prop.put("LOCATION","");

@ -13,7 +13,7 @@ public abstract class AbstractImporter extends Thread implements dbImporter{
protected boolean stopped = false;
protected boolean paused = false;
protected File importPath;
protected File importPrimaryPath, importSecondaryPath;
protected int cacheSize;
protected long preloadTime;
@ -33,9 +33,11 @@ public abstract class AbstractImporter extends Thread implements dbImporter{
return this.error;
}
public void init(File theImportPath) {
if (theImportPath == null) throw new NullPointerException("The Import path must not be null.");
this.importPath = theImportPath;
public void init(File thePrimaryPath, File theSecondaryPath) {
if (thePrimaryPath == null) throw new NullPointerException("The Primary Import path must not be null.");
if (theSecondaryPath == null) throw new NullPointerException("The Secondary Import path must not be null.");
this.importPrimaryPath = thePrimaryPath;
this.importSecondaryPath = theSecondaryPath;
// getting a job id from the import manager
//this.jobID = this.sb.dbImportManager.getJobID();
@ -115,8 +117,11 @@ public abstract class AbstractImporter extends Thread implements dbImporter{
return this.jobType;
}
public File getImportPath() {
return this.importPath;
/**
 * Returns the primary import path of this importer.
 *
 * @return the value of the <code>importPrimaryPath</code> field
 */
public File getPrimaryImportPath() {
return this.importPrimaryPath;
}
/**
 * Returns the secondary import path of this importer.
 *
 * @return the value of the <code>importSecondaryPath</code> field
 */
public File getSecondaryImportPath() {
return this.importSecondaryPath;
}
public abstract long getEstimatedTime();

@ -20,10 +20,11 @@ public interface dbImporter {
public int getJobID();
public String getJobName();
public String getJobType();
public File getImportPath();
public File getPrimaryImportPath();
public File getSecondaryImportPath();
public String getError();
public String getStatus();
public void init(File indexPath, int cacheSize, long preloadTime);
public void init(File plasmaPath, File indexPrimaryPath, File indexSecondaryPath, int cacheSize, long preloadTime);
public void startIt();
}

@ -31,7 +31,7 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor
}
public String getJobName() {
return this.importPath.toString();
return this.importPrimaryPath.toString();
}
public int getProcessingStatusPercent() {
@ -47,23 +47,23 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor
return theStatus.toString();
}
public void init(File theImportPath, int theCacheSize, long preloadTime) {
super.init(theImportPath);
public void init(File plasmaPath, File indexPrimary, File indexSecondary, int theCacheSize, long preloadTime) {
super.init(indexPrimary, indexSecondary);
this.cacheSize = theCacheSize;
this.preloadTime = preloadTime;
File noticeUrlDbFile = new File(this.importPath,"urlNotice1.db");
File profileDbFile = new File(this.importPath, "crawlProfiles0.db");
File noticeUrlDbFile = new File(plasmaPath,"urlNotice1.db");
File profileDbFile = new File(plasmaPath, "crawlProfiles0.db");
String errorMsg = null;
if (!this.importPath.exists())
errorMsg = "The import path '" + this.importPath + "' does not exist.";
else if (!this.importPath.isDirectory())
errorMsg = "The import path '" + this.importPath + "' is not a directory.";
else if (!this.importPath.canRead())
errorMsg = "The import path '" + this.importPath + "' is not readable.";
else if (!this.importPath.canWrite())
errorMsg = "The import path '" + this.importPath + "' is not writeable.";
if (!plasmaPath.exists())
errorMsg = "The import path '" + plasmaPath+ "' does not exist.";
else if (!plasmaPath.isDirectory())
errorMsg = "The import path '" + plasmaPath + "' is not a directory.";
else if (!plasmaPath.canRead())
errorMsg = "The import path '" + plasmaPath + "' is not readable.";
else if (!plasmaPath.canWrite())
errorMsg = "The import path '" + plasmaPath + "' is not writeable.";
else if (!noticeUrlDbFile.exists())
errorMsg = "The noticeUrlDB file '" + noticeUrlDbFile + "' does not exist.";
@ -90,7 +90,7 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor
// init noticeUrlDB
this.log.logInfo("Initializing the source noticeUrlDB");
this.importNurlDB = new plasmaCrawlNURL(this.importPath);
this.importNurlDB = new plasmaCrawlNURL(plasmaPath);
this.importStartSize = this.importNurlDB.size();
//int stackSize = this.importNurlDB.stackSize();

@ -31,7 +31,7 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
}
public String getJobName() {
return this.importPath.toString();
return this.importPrimaryPath.toString();
}
public String getStatus() {
@ -46,25 +46,33 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
return theStatus.toString();
}
public void init(File theImportPath, int theCacheSize, long preloadTime) {
super.init(theImportPath);
public void init(File plasmaPath, File thePrimaryPath, File theSecondaryPath, int theCacheSize, long preloadTime) {
super.init(thePrimaryPath, theSecondaryPath);
this.cacheSize = theCacheSize;
if (this.cacheSize < 2*1024*1024) this.cacheSize = 8*1024*1024;
// configure import DB
String errorMsg = null;
if (!this.importPath.exists()) errorMsg = "Import directory does not exist.";
if (!this.importPath.canRead()) errorMsg = "Import directory is not readable.";
if (!this.importPath.canWrite()) errorMsg = "Import directory is not writeable";
if (!this.importPath.isDirectory()) errorMsg = "ImportDirectory is not a directory.";
if (!this.importPrimaryPath.exists()) errorMsg = "Primary Import directory does not exist.";
if (!this.importPrimaryPath.canRead()) errorMsg = "Primary Import directory is not readable.";
if (!this.importPrimaryPath.canWrite()) errorMsg = "Primary Import directory is not writeable";
if (!this.importPrimaryPath.isDirectory()) errorMsg = "Primary Import directory is not a directory.";
if (errorMsg != null) {
this.log.logSevere(errorMsg + "\nName: " + this.importPath.getAbsolutePath());
this.log.logSevere(errorMsg + "\nName: " + this.importPrimaryPath.getAbsolutePath());
throw new IllegalArgumentException(errorMsg);
}
}
if (!this.importSecondaryPath.exists()) errorMsg = "Secondary Import directory does not exist.";
if (!this.importSecondaryPath.canRead()) errorMsg = "Secondary Import directory is not readable.";
if (!this.importSecondaryPath.canWrite()) errorMsg = "Secondary Import directory is not writeable";
if (!this.importSecondaryPath.isDirectory()) errorMsg = "Secondary Import directory is not a directory.";
if (errorMsg != null) {
this.log.logSevere(errorMsg + "\nName: " + this.importSecondaryPath.getAbsolutePath());
throw new IllegalArgumentException(errorMsg);
}
this.log.logFine("Initializing source word index db.");
this.importWordIndex = new plasmaWordIndex(this.importPath, preloadTime / 2, this.log);
this.importWordIndex = new plasmaWordIndex(this.importPrimaryPath, importSecondaryPath, preloadTime / 2, this.log);
this.importStartSize = this.importWordIndex.size();
}
@ -93,8 +101,8 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
public void importWordsDB() {
this.log.logInfo("STARTING DB-IMPORT");
try {
this.log.logInfo("Importing DB from '" + this.importPath.getAbsolutePath() + "'");
try {
this.log.logInfo("Importing DB from '" + this.importPrimaryPath.getAbsolutePath() + "'/'" + this.importSecondaryPath.getAbsolutePath() + "'");
this.log.logInfo("Home word index contains " + wi.size() + " words and " + wi.loadedURL.size() + " URLs.");
this.log.logInfo("Import word index contains " + this.importWordIndex.size() + " words and " + this.importWordIndex.loadedURL.size() + " URLs.");

@ -201,7 +201,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// storage management
public File htCachePath;
private File plasmaPath;
public File indexPath;
public File indexPrimaryPath, indexSecondaryPath;
public File listsPath;
public File htDocsPath;
public File rankingPath;
@ -728,7 +728,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
* <p>Name of the setting specifying the folder beginning from the YaCy-installation's top-folder, where the
* whole database of known RWIs and URLs as well as dumps of the DHT-In and DHT-Out caches are stored</p>
*/
public static final String INDEX_PATH = "indexPath";
public static final String INDEX_PRIMARY_PATH = "indexPrimaryPath"; // this is a relative path to the data root
public static final String INDEX_SECONDARY_PATH = "indexSecondaryPath"; // this is an absolute path to any location
public static final String INDEX_PATH_DEFAULT = "DATA/INDEX";
/**
* <p><code>public static final String <strong>LISTS_PATH</strong> = "listsPath"</code></p>
@ -868,8 +869,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// load values from configs
this.plasmaPath = new File(rootPath, getConfig(DBPATH, DBPATH_DEFAULT));
this.log.logConfig("Plasma DB Path: " + this.plasmaPath.toString());
this.indexPath = new File(rootPath, getConfig(INDEX_PATH, INDEX_PATH_DEFAULT));
this.log.logConfig("Index Path: " + this.indexPath.toString());
this.indexPrimaryPath = new File(rootPath, getConfig(INDEX_PRIMARY_PATH, INDEX_PATH_DEFAULT));
this.log.logConfig("Index Primary Path: " + this.indexPrimaryPath.toString());
this.indexSecondaryPath = (getConfig(INDEX_SECONDARY_PATH, "").length() == 0) ? indexPrimaryPath : new File(getConfig(INDEX_SECONDARY_PATH, ""));
this.log.logConfig("Index Secondary Path: " + this.indexSecondaryPath.toString());
this.listsPath = new File(rootPath, getConfig(LISTS_PATH, LISTS_PATH_DEFAULT));
this.log.logConfig("Lists Path: " + this.listsPath.toString());
this.htDocsPath = new File(rootPath, getConfig(HTDOCS_PATH, HTDOCS_PATH_DEFAULT));
@ -1040,7 +1043,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
noticeURL = new plasmaCrawlNURL(plasmaPath);
errorURL = new plasmaCrawlZURL(plasmaPath, "urlError.db");
delegatedURL = new plasmaCrawlZURL(plasmaPath, "urlDelegated.db");
wordIndex = new plasmaWordIndex(indexPath, ramRWI_time, log);
wordIndex = new plasmaWordIndex(indexPrimaryPath, indexSecondaryPath, ramRWI_time, log);
// set a high maximum cache size to current size; this is adopted later automatically
int wordCacheMaxCount = Math.max((int) getConfigLong(WORDCACHE_INIT_COUNT, 30000),

@ -70,19 +70,19 @@ public final class plasmaWordIndex implements indexRI {
private int flushsize;
public final plasmaCrawlLURL loadedURL;
public plasmaWordIndex(File indexRoot, long preloadTime, serverLog log) {
File textindexcache = new File(indexRoot, "PUBLIC/TEXT/RICACHE");
public plasmaWordIndex(File indexPrimaryRoot, File indexSecondaryRoot, long preloadTime, serverLog log) {
File textindexcache = new File(indexPrimaryRoot, "PUBLIC/TEXT/RICACHE");
if (!(textindexcache.exists())) textindexcache.mkdirs();
this.dhtOutCache = new indexRAMRI(textindexcache, indexRWIEntry.urlEntryRow, wCacheMaxChunk, wCacheMaxAge, "dump1.array", log);
this.dhtInCache = new indexRAMRI(textindexcache, indexRWIEntry.urlEntryRow, wCacheMaxChunk, wCacheMaxAge, "dump2.array", log);
// create collections storage path
File textindexcollections = new File(indexRoot, "PUBLIC/TEXT/RICOLLECTION");
File textindexcollections = new File(indexPrimaryRoot, "PUBLIC/TEXT/RICOLLECTION");
if (!(textindexcollections.exists())) textindexcollections.mkdirs();
this.collections = new indexCollectionRI(textindexcollections, "collection", preloadTime, maxCollectionPartition, indexRWIEntry.urlEntryRow);
// create LURL-db
loadedURL = new plasmaCrawlLURL(indexRoot, preloadTime);
loadedURL = new plasmaCrawlLURL(indexSecondaryRoot, preloadTime);
// performance settings
busyCacheFlush = false;

@ -616,14 +616,15 @@ public final class yacy {
public static void minimizeUrlDB(String homePath) {
// run with "java -classpath classes yacy -minimizeUrlDB"
try {serverLog.configureLogging(new File(homePath, "DATA/LOG/yacy.logging"));} catch (Exception e) {}
File indexRoot = new File(new File(homePath), "DATA/INDEX");
File indexPrimaryRoot = new File(new File(homePath), "DATA/INDEX");
File indexSecondaryRoot = new File(new File(homePath), "DATA/INDEX");
File indexRoot2 = new File(new File(homePath), "DATA/INDEX2");
serverLog log = new serverLog("URL-CLEANUP");
try {
log.logInfo("STARTING URL CLEANUP");
// db containing all currently loaded urls
plasmaCrawlLURL currentUrlDB = new plasmaCrawlLURL(indexRoot, 10000);
plasmaCrawlLURL currentUrlDB = new plasmaCrawlLURL(indexSecondaryRoot, 10000);
// db used to hold all needed urls
plasmaCrawlLURL minimizedUrlDB = new plasmaCrawlLURL(indexRoot2, 10000);
@ -632,7 +633,7 @@ public final class yacy {
int cacheMem = (int)(serverMemory.max-rt.totalMemory());
if (cacheMem < 2048000) throw new OutOfMemoryError("Not enough memory available to start clean up.");
plasmaWordIndex wordIndex = new plasmaWordIndex(indexRoot, 10000, log);
plasmaWordIndex wordIndex = new plasmaWordIndex(indexPrimaryRoot, indexSecondaryRoot, 10000, log);
Iterator indexContainerIterator = wordIndex.wordContainers("AAAAAAAAAAAA", false, false);
long urlCounter = 0, wordCounter = 0;
@ -1000,7 +1001,8 @@ public final class yacy {
private static void RWIHashList(String homePath, String targetName, String resource, String format) {
plasmaWordIndex WordIndex = null;
serverLog log = new serverLog("HASHLIST");
File indexRoot = new File(new File(homePath), "DATA/INDEX");
File indexPrimaryRoot = new File(new File(homePath), "DATA/INDEX");
File indexSecondaryRoot = new File(new File(homePath), "DATA/INDEX");
String wordChunkStartHash = "AAAAAAAAAAAA";
try {serverLog.configureLogging(new File(homePath, "DATA/LOG/yacy.logging"));} catch (Exception e) {}
log.logInfo("STARTING CREATION OF RWI-HASHLIST");
@ -1008,7 +1010,7 @@ public final class yacy {
try {
Iterator indexContainerIterator = null;
if (resource.equals("all")) {
WordIndex = new plasmaWordIndex(indexRoot, 3000, log);
WordIndex = new plasmaWordIndex(indexPrimaryRoot, indexSecondaryRoot, 3000, log);
indexContainerIterator = WordIndex.wordContainers(wordChunkStartHash, false, false);
}
int counter = 0;

@ -198,7 +198,10 @@ promoteSearchPageGreeting =
dbPath=DATA/PLASMADB
# the path to the public reverse word index for text files (web pages)
indexPath=DATA/INDEX
# the primary path is relative to the data root; the secondary path is an absolute path.
# to make the secondary path the same as the primary path, leave it empty
indexPrimaryPath=DATA/INDEX
indexSecondaryPath=
# the path to the LISTS files. Most lists are used to filter web content
listsPath=DATA/LISTS

Loading…
Cancel
Save