@@ -75,6 +75,7 @@ import de.anomic.plasma.plasmaWordIndexEntity;
import de.anomic.plasma.plasmaWordIndexEntry;
import de.anomic.plasma.plasmaWordIndexClassicDB;
import de.anomic.plasma.plasmaWordIndexCache;
import de.anomic.plasma.plasmaWordIndexEntryContainer;
import de.anomic.server.serverCore;
import de.anomic.server.serverDate;
import de.anomic.server.serverCodings;
@@ -678,6 +679,161 @@ public final class yacy {
        }
    }
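    /**
     * Imports the word index and URL database found under importPath into the
     * databases located under homePath/DATA/PLASMADB. Imported word entities are
     * merged into the home word index and removed from the import index afterwards;
     * URLs referenced by the imported entries are copied into the home URL database
     * if they are not already present there.
     *
     * Run with "java -classpath classes yacy -importDB [homeDbRoot] importDbRoot".
     */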
    public static void importDB(String homePath, String importPath) {
        if (homePath == null) throw new NullPointerException();
        if (importPath == null) throw new NullPointerException();
        if (homePath.equals(importPath)) throw new IllegalArgumentException("Import and home DB directory must not be equal");

        // configure logging
        try { serverLog.configureLogging(new File(homePath, "yacy.logging")); } catch (Exception e) {}
        serverLog log = new serverLog("DB-IMPORT");
        log.logInfo("STARTING DB-IMPORT");
        log.logInfo("Import can be aborted using <ctrl>+<c>");

        plasmaWordIndex homeWordIndex = null, importWordIndex = null;
        plasmaCrawlLURL homeUrlDB = null, importUrlDB = null;
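        // the database handles are declared outside of the try block so that the
        // finally block below can close them in any case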
        try {
            Runtime rt = Runtime.getRuntime();

            // configure destination DB
            File homeDBroot = new File(new File(homePath), "DATA/PLASMADB");
            if ((!homeDBroot.exists()) || (!homeDBroot.canRead()) || (!homeDBroot.isDirectory())) {
                log.logSevere("DB home directory can not be opened.");
                return;
            }
            log.logFine("Initializing destination word index db.");
            homeWordIndex = new plasmaWordIndex(homeDBroot, 8 * 1024 * 1024, log);
            log.logFine("Initializing destination URL db.");
            homeUrlDB = new plasmaCrawlLURL(new File(homeDBroot, "urlHash.db"), 4 * 1024 * 1024);
            // configure import DB
            File importDBroot = new File(importPath);
            if ((!importDBroot.exists()) || (!importDBroot.canRead()) || (!importDBroot.isDirectory())) {
                log.logSevere("DB import directory can not be opened.");
                return;
            }
            log.logFine("Initializing source word index db.");
            importWordIndex = new plasmaWordIndex(importDBroot, 8 * 1024 * 1024, log);
            log.logFine("Initializing source URL db.");
            importUrlDB = new plasmaCrawlLURL(new File(importDBroot, "urlHash.db"), 4 * 1024 * 1024);

            int startSize = importWordIndex.size();
            log.logInfo("Importing DB from '" + importDBroot.getAbsolutePath() + "' to '" + homeDBroot.getAbsolutePath() + "'.");
            log.logInfo("Home word index contains " + homeWordIndex.size() + " words and " + homeUrlDB.size() + " URLs.");
            log.logInfo("Import word index contains " + importWordIndex.size() + " words and " + importUrlDB.size() + " URLs.");
            // iterate over all words from the import db
            String wordHash = "";
            long urlCounter = 0, wordCounter = 0, entryCounter = 0;
            long globalStart = System.currentTimeMillis(), wordChunkStart = System.currentTimeMillis(), wordChunkEnd = 0;
            String wordChunkStartHash = "------------", wordChunkEndHash;
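            // progress is reported in chunks of 500 word entities; wordChunkStartHash and
            // wordChunkEndHash bracket the hashes handled in the current chunk, and the
            // iteration starts at hash "------------" (presumably the smallest possible word hash)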
            Iterator importWordHashIterator = importWordIndex.wordHashes(wordChunkStartHash, true, true);
            while (importWordHashIterator.hasNext()) {
                // testing if import process was aborted
                if (Thread.interrupted()) break;
                plasmaWordIndexEntity importWordIdxEntity = null;
                try {
                    wordCounter++;
                    wordHash = (String) importWordHashIterator.next();
                    importWordIdxEntity = importWordIndex.getEntity(wordHash, true);

                    if (importWordIdxEntity.size() == 0) {
                        importWordIdxEntity.deleteComplete();
                        continue;
                    }

                    // creating a container used to hold the imported entries
                    plasmaWordIndexEntryContainer newContainer = new plasmaWordIndexEntryContainer(wordHash, importWordIdxEntity.size());

                    // the combined container will fit, read the container
                    Enumeration importWordIdxEntries = importWordIdxEntity.elements(true);
                    plasmaWordIndexEntry importWordIdxEntry;
                    while (importWordIdxEntries.hasMoreElements()) {
                        // testing if import process was aborted
                        if (Thread.interrupted()) break;

                        // getting next word index entry
                        entryCounter++;
                        importWordIdxEntry = (plasmaWordIndexEntry) importWordIdxEntries.nextElement();
                        String urlHash = importWordIdxEntry.getUrlHash();
                        if ((importUrlDB.exists(urlHash)) && (!homeUrlDB.exists(urlHash))) {
                            urlCounter++;
                            // importing the new url
                            plasmaCrawlLURL.Entry urlEntry = importUrlDB.getEntry(urlHash);
                            homeUrlDB.newEntry(urlEntry);
                            if (urlCounter % 500 == 0) {
                                log.logFine(urlCounter + " URLs processed so far.");
                            }
                        }

                        // adding word index entry to the container
                        plasmaWordIndexEntry newEntry = new plasmaWordIndexEntry(importWordIdxEntry.toExternalForm());
                        newContainer.add(newEntry, System.currentTimeMillis());
                        if (entryCounter % 500 == 0) {
                            log.logFine(entryCounter + " word entries and " + wordCounter + " words processed so far.");
                        }
                    }
                    // testing if import process was aborted
                    if (Thread.interrupted()) break;

                    // importing entry container into the home db
                    homeWordIndex.addEntries(newContainer, true);

                    // delete the complete index entity file from the import db
                    importWordIdxEntity.close();
                    importWordIndex.deleteIndex(wordHash);
                    // print out some statistical information
                    if (wordCounter % 500 == 0) {
                        wordChunkEndHash = wordHash;
                        wordChunkEnd = System.currentTimeMillis();
                        long duration = wordChunkEnd - wordChunkStart;
                        log.logInfo(wordCounter + " word entities imported " +
                                "[" + wordChunkStartHash + " .. " + wordChunkEndHash + "] " +
                                (((startSize - importWordIndex.size()) * 100L) / Math.max(startSize, 1)) + "%\n" +
                                "Speed: " + 500 * 1000 / Math.max(duration, 1) + " word entities/s" +
                                " | Elapsed time: " + serverDate.intervalToString(wordChunkEnd - globalStart) +
                                " | Estimated time: " + serverDate.intervalToString(importWordIndex.size() * ((wordChunkEnd - globalStart) / wordCounter)) + "\n" +
                                "Free memory: " + rt.freeMemory() +
                                " | Total memory: " + rt.totalMemory() + "\n" +
                                "Home Words = " + homeWordIndex.size() +
                                " | Import Words = " + importWordIndex.size());
                        wordChunkStart = wordChunkEnd;
                        wordChunkStartHash = wordChunkEndHash;
                    }
                } catch (Exception e) {
                    log.logSevere("Import of word entity '" + wordHash + "' failed.", e);
                } finally {
                    if (importWordIdxEntity != null) try { importWordIdxEntity.close(); } catch (Exception e) {}
                }
            }
log . logInfo ( "Home word index contains " + homeWordIndex . size ( ) + " words and " + homeUrlDB . size ( ) + " URLs." ) ;
log . logInfo ( "Import word index contains " + importWordIndex . size ( ) + " words and " + importUrlDB . size ( ) + " URLs." ) ;
log . logInfo ( "DB-IMPORT FINISHED" ) ;
} catch ( Exception e ) {
log . logSevere ( "Database import failed." , e ) ;
e . printStackTrace ( ) ;
} finally {
if ( homeUrlDB ! = null ) try { homeUrlDB . close ( ) ; } catch ( Exception e ) { }
if ( importUrlDB ! = null ) try { importUrlDB . close ( ) ; } catch ( Exception e ) { }
if ( homeWordIndex ! = null ) try { homeWordIndex . close ( 5000 ) ; } catch ( Exception e ) { }
if ( importWordIndex ! = null ) try { importWordIndex . close ( 5000 ) ; } catch ( Exception e ) { }
}
}
    public static void minimizeUrlDB(String homePath) {
        // run with "java -classpath classes yacy -minimizeUrlDB"
        try { serverLog.configureLogging(new File(homePath, "yacy.logging")); } catch (Exception e) {}
@@ -935,6 +1091,18 @@ public final class yacy {
            // attention: this may run long and should not be interrupted!
            if (args.length == 2) applicationRoot = args[1];
            minimizeUrlDB(applicationRoot);
        } else if ((args.length >= 1) && (args[0].equals("-importDB"))) {
            // attention: this may run long and should not be interrupted!
            String importRoot = null;
            if (args.length == 3) {
                applicationRoot = args[1];
                importRoot = args[2];
            } else if (args.length == 2) {
                importRoot = args[1];
            } else {
                System.err.println("Usage: -importDB [homeDbRoot] importDbRoot");
                return;
            }
            importDB(applicationRoot, importRoot);
        } else if ((args.length >= 1) && (args[0].equals("-deletestopwords"))) {
            // delete those words in the index that are listed in the stopwords file
            if (args.length == 2) applicationRoot = args[1];