*) Adding a DB import function that allows importing a foreign YaCy DB (from the PLASMADB directory)

into the DB of another peer.
   ATTENTION: this is not well tested. Please use it with care and always make a DB backup first.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@932 6c8d7289-2bf4-0310-a012-ef5d649a1542
theli 20 years ago
parent 7ee03acce0
commit b7e21ec107

@ -46,10 +46,10 @@ import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;
public class kelondroMScoreCluster {
public final class kelondroMScoreCluster {
private TreeMap refkeyDB; // a mapping from a reference to the cluster key
private TreeMap keyrefDB; // a mapping from the cluster key to the reference
private final TreeMap refkeyDB; // a mapping from a reference to the cluster key
private final TreeMap keyrefDB; // a mapping from the cluster key to the reference
private long gcount;
private int encnt;

@ -75,6 +75,7 @@ import de.anomic.plasma.plasmaWordIndexEntity;
import de.anomic.plasma.plasmaWordIndexEntry;
import de.anomic.plasma.plasmaWordIndexClassicDB;
import de.anomic.plasma.plasmaWordIndexCache;
import de.anomic.plasma.plasmaWordIndexEntryContainer;
import de.anomic.server.serverCore;
import de.anomic.server.serverDate;
import de.anomic.server.serverCodings;
@ -678,6 +679,161 @@ public final class yacy {
/**
 * Imports the word index and URL entries of a foreign DB directory into the
 * DB located below the given home path.
 *
 * The destination DB is opened at {@code <homePath>/DATA/PLASMADB}; the
 * source DB is opened directly at {@code importPath}. The method walks all
 * word hashes of the source word index, collects their entries in a
 * container and adds that container to the home word index. Progress and
 * statistics are written to the "DB-IMPORT" log. The loops poll
 * {@code Thread.interrupted()} so the run can be aborted. All four opened
 * databases are closed in the final {@code finally} block, and failures
 * while closing are ignored.
 *
 * @param homePath   root path of the peer that receives the data
 * @param importPath directory of the DB that is read from
 * @throws NullPointerException     if {@code homePath} or {@code importPath} is null
 * @throws IllegalArgumentException if both paths are equal
 */
public static void importDB(String homePath, String importPath) {
if (homePath == null) throw new NullPointerException();
if (importPath == null) throw new NullPointerException();
if (homePath.equals(importPath)) throw new IllegalArgumentException("Import and home DB directory must not be equal");
// configure logging
// (best effort: a failure to read "yacy.logging" is silently ignored)
try {serverLog.configureLogging(new File(homePath, "yacy.logging"));} catch (Exception e) {}
serverLog log = new serverLog("DB-IMPORT");
log.logInfo("STARTING DB-IMPORT");
log.logInfo("Import can be aborted using <ctrl>+<c>");
// declared outside the try block so that the finally block can close them
plasmaWordIndex homeWordIndex = null, importWordIndex = null;
plasmaCrawlLURL homeUrlDB = null, importUrlDB = null;
try {
// runtime handle, used below for the memory statistics in the progress log
Runtime rt = Runtime.getRuntime();
// configure destination DB
File homeDBroot = new File(new File(homePath), "DATA/PLASMADB");
// NOTE(review): the three negated checks are joined with &&, so this only
// fires when all of them fail; an existing path that is unreadable or not a
// directory passes. Confirm whether || was intended.
if ((!homeDBroot.exists())&&(!homeDBroot.canRead())&&(!homeDBroot.isDirectory())) {
log.logSevere("DB home directory can not be opened.");
log.logFine("Initializing destination word index db.");
// presumably the numeric argument is a cache/buffer size in bytes - TODO confirm
homeWordIndex = new plasmaWordIndex(homeDBroot, 8*1024*1024, log);
log.logFine("Initializing destination URL db.");
homeUrlDB = new plasmaCrawlLURL(new File(homeDBroot, "urlHash.db"), 4*1024*1024);
// configure import DB
// (unlike the home DB, the import path is used as the DB root directly)
File importDBroot = new File(importPath);
// NOTE(review): same &&-joined check as for the home directory above
if ((!importDBroot.exists())&&(!importDBroot.canRead())&&(!importDBroot.isDirectory())) {
log.logSevere("DB import directory can not be opened.");
log.logFine("Initializing source word index db.");
importWordIndex = new plasmaWordIndex(importDBroot, 8*1024*1024, log);
log.logFine("Initializing source URL db.");
importUrlDB = new plasmaCrawlLURL(new File(importDBroot, "urlHash.db"), 4*1024*1024);
// size of the import index at start; used for the percentage in the progress log
int startSize = importWordIndex.size();
log.logInfo("Importing DB from '" + importDBroot.getAbsolutePath() + "' to '" + homeDBroot.getAbsolutePath() + "'.");
log.logInfo("Home word index contains " + homeWordIndex.size() + " words and " + homeUrlDB.size() + " URLs.");
log.logInfo("Import word index contains " + importWordIndex.size() + " words and " + importUrlDB.size() + " URLs.");
// iterate over all words from import db
String wordHash = "";
long urlCounter = 0, wordCounter = 0, entryCounter = 0;
long globalStart = System.currentTimeMillis(), wordChunkStart = System.currentTimeMillis(), wordChunkEnd = 0;
// iteration starts at the hash "------------"; presumably the lowest possible
// word hash - TODO confirm (the meaning of the two boolean flags is not shown here)
String wordChunkStartHash = "------------", wordChunkEndHash;
Iterator importWordHashIterator = importWordIndex.wordHashes(wordChunkStartHash, true, true);
while (importWordHashIterator.hasNext()) {
// testing if import process was aborted
if (Thread.interrupted()) break;
plasmaWordIndexEntity importWordIdxEntity = null;
try {
wordHash = (String) importWordHashIterator.next();
importWordIdxEntity = importWordIndex.getEntity(wordHash, true);
// NOTE(review): the body of this empty-entity branch is not visible in this view
if (importWordIdxEntity.size() == 0) {
// creating a container used to hold the imported entries
plasmaWordIndexEntryContainer newContainer = new plasmaWordIndexEntryContainer(wordHash,importWordIdxEntity.size());
// the combined container will fit, read the container
Enumeration importWordIdxEntries = importWordIdxEntity.elements(true);
plasmaWordIndexEntry importWordIdxEntry;
while (importWordIdxEntries.hasMoreElements()) {
// testing if import process was aborted
if (Thread.interrupted()) break;
// getting next word index entry
importWordIdxEntry = (plasmaWordIndexEntry) importWordIdxEntries.nextElement();
String urlHash = importWordIdxEntry.getUrlHash();
// only URLs known to the import DB and still unknown to the home DB are handled here
if ((importUrlDB.exists(urlHash)) && (!homeUrlDB.exists(urlHash))) {
// importing the new url
plasmaCrawlLURL.Entry urlEntry = importUrlDB.getEntry(urlHash);
if (urlCounter % 500 == 0) {
log.logFine(urlCounter + " URLs processed so far.");
// adding word index entity to container
// (the entry is rebuilt from the external form of the imported entry)
plasmaWordIndexEntry newEntry = new plasmaWordIndexEntry(importWordIdxEntry.toExternalForm());
if (entryCounter % 500 == 0) {
// NOTE(review): the message says "word entries" twice; the second value is
// wordCounter, so "words" was presumably meant - confirm
log.logFine(entryCounter + " word entries and " + wordCounter + " word entries processed so far.");
// testing if import process was aborted
if (Thread.interrupted()) break;
// importing entity container to home db
homeWordIndex.addEntries(newContainer, true);
// delete complete index entity file
// print out some statistical information
if (wordCounter%500 == 0) {
wordChunkEndHash = wordHash;
wordChunkEnd = System.currentTimeMillis();
long duration = wordChunkEnd - wordChunkStart;
// NOTE(review): the statistics below use integer division. The divisors
// importWordIndex.size()/100 (0 for fewer than 100 words), duration (0 if the
// chunk finished within the same millisecond) and wordCounter (0 satisfies the
// modulo test above) can be zero - verify that this cannot throw here.
log.logInfo(wordCounter + " word entities imported " +
"[" + wordChunkStartHash + " .. " + wordChunkEndHash + "] " +
((startSize-importWordIndex.size())/(importWordIndex.size()/100)) +
"%\n" +
"Speed: "+ 500*1000/duration + " word entities/s" +
" | Elapsed time: " + serverDate.intervalToString(wordChunkEnd-globalStart) +
" | Estimated time: " + serverDate.intervalToString(importWordIndex.size()*((wordChunkEnd-globalStart)/wordCounter)) + "\n" +
"Free memory: " + rt.freeMemory() +
" | Total memory: " + rt.totalMemory() + "\n" +
"Home Words = " + homeWordIndex.size() +
" | Import Words = " + importWordIndex.size());
// the end of this chunk becomes the start of the next one
wordChunkStart = wordChunkEnd;
wordChunkStartHash = wordChunkEndHash;
} catch (Exception e) {
// a failure for one word is logged and does not abort the whole import
log.logSevere("Import of word entity '" + wordHash + "' failed.",e);
} finally {
// always release the current entity; errors while closing are ignored
if (importWordIdxEntity != null) try { importWordIdxEntity.close(); } catch (Exception e) {}
log.logInfo("Home word index contains " + homeWordIndex.size() + " words and " + homeUrlDB.size() + " URLs.");
log.logInfo("Import word index contains " + importWordIndex.size() + " words and " + importUrlDB.size() + " URLs.");
log.logInfo("DB-IMPORT FINISHED");
} catch (Exception e) {
log.logSevere("Database import failed.",e);
} finally {
// close every DB that was opened; errors while closing are ignored
// (the argument 5000 is presumably a wait time in milliseconds - TODO confirm)
if (homeUrlDB != null) try { homeUrlDB.close(); } catch (Exception e){}
if (importUrlDB != null) try { importUrlDB.close(); } catch (Exception e){}
if (homeWordIndex != null) try { homeWordIndex.close(5000); } catch (Exception e){}
if (importWordIndex != null) try { importWordIndex.close(5000); } catch (Exception e){}
public static void minimizeUrlDB(String homePath) {
// run with "java -classpath classes yacy -migratewords"
try {serverLog.configureLogging(new File(homePath, "yacy.logging"));} catch (Exception e) {}
@ -935,6 +1091,18 @@ public final class yacy {
// attention: this may run long and should not be interrupted!
if (args.length == 2) applicationRoot= args[1];
} else if ((args.length >= 1) && (args[0].equals("-importDB"))) {
// attention: this may run long and should not be interrupted!
String importRoot = null;
if (args.length == 3) {
applicationRoot= args[1];
importRoot = args[2];
} else if (args.length == 2) {
importRoot = args[1];
} else {
System.err.println("Usage: -importDB [homeDbRoot] importDbRoot");
importDB(applicationRoot, importRoot);
} else if ((args.length >= 1) && (args[0].equals("-deletestopwords"))) {
// delete those words in the index that are listed in the stopwords file
if (args.length == 2) applicationRoot= args[1];
