From f01bd2548990e318335acedbb1776b81e41c5735 Mon Sep 17 00:00:00 2001 From: theli Date: Fri, 9 Jun 2006 05:38:59 +0000 Subject: [PATCH] *) Bugfix for OutOfMemory problem during minimizeUrlDB See: http://www.yacy-forum.de/viewtopic.php?t=2498 *) out of date import functions removed (can be done via web gui) git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2189 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- source/yacy.java | 291 +++-------------------------------------------- 1 file changed, 16 insertions(+), 275 deletions(-) diff --git a/source/yacy.java b/source/yacy.java index fcc36ab4e..fd7b6e7f0 100644 --- a/source/yacy.java +++ b/source/yacy.java @@ -691,251 +691,12 @@ public final class yacy { log.logInfo("FINISHED MIGRATION JOB, WAIT FOR DUMP"); wordIndexCache.close(60); log.logInfo("TERMINATED MIGRATION"); - } - - public static void importAssortment(String homePath, String importAssortmentFileName) { - if (homePath == null) throw new NullPointerException(); - if (importAssortmentFileName == null) throw new NullPointerException(); - - // initialize logging - try {serverLog.configureLogging(new File(homePath, "yacy.logging"));} catch (Exception e) {} - serverLog log = new serverLog("ASSORTMENT-IMPORT"); - log.logInfo("STARTING ASSORTMENT-IMPORT"); - - // initializing importAssortmentFile - String errorMsg = null; - File importAssortmentFile = new File(importAssortmentFileName); - if (!importAssortmentFile.exists()) errorMsg = "AssortmentFile '" + importAssortmentFile + "' does not exist."; - else if (importAssortmentFile.isDirectory()) errorMsg = "AssortmentFile '" + importAssortmentFile + "' is a directory."; - else if (!importAssortmentFile.canRead()) errorMsg = "AssortmentFile '" + importAssortmentFile + "' is not readable."; - else if (!importAssortmentFile.canWrite()) errorMsg = "AssortmentFile '" + importAssortmentFile + "' is not writeable."; - if (errorMsg != null) { - log.logSevere(errorMsg); - throw new IllegalStateException(errorMsg); - } - - File importAssortmentPath = null; - int assortmentNr = -1; - try { - importAssortmentPath = new File(importAssortmentFile.getParent()); - assortmentNr = Integer.valueOf(importAssortmentFile.getName().substring("indexAssortment".length(),importAssortmentFile.getName().length()-3)).intValue(); - } catch (NumberFormatException e) { - errorMsg = "Unable to parse the assortment file number."; - log.logSevere(errorMsg,e); - throw new IllegalStateException(errorMsg); - } - - plasmaWordIndex homeWordIndex = null; - try { - // initializing assortment source file - log.logInfo("Initializing source assortment file"); - plasmaWordIndexAssortment assortmentFile = new plasmaWordIndexAssortment(importAssortmentPath,assortmentNr,16*1024*1024, log); - - // configure destination DB - log.logInfo("Initializing destination word index db."); - File homeDBroot = new File(new File(homePath), "DATA/PLASMADB"); - if (!homeDBroot.exists()) errorMsg = "DB Directory '" + homeDBroot + "' does not exist."; - else if (!homeDBroot.isDirectory()) errorMsg = "DB Directory '" + homeDBroot + "' is not directory."; - else if (!homeDBroot.canRead()) errorMsg = "DB Directory '" + homeDBroot + "' is not readable."; - else if (!homeDBroot.canWrite()) errorMsg = "DB Directory '" + homeDBroot + "' is not writeable."; - if (errorMsg != null) { - log.logSevere(errorMsg); - throw new IllegalStateException(errorMsg); - } - - // opening the destination database - homeWordIndex = new plasmaWordIndex(homeDBroot, 16*1024*1024, log); - - // iterating through the content - log.logInfo("Importing assortment file containing '" + assortmentFile.size() + "' entities."); - - int wordEntityCount = 0, wordEntryCount = 0; - Iterator contentIter = assortmentFile.content(); - while (contentIter.hasNext()) { - wordEntityCount++; - - kelondroRow.Entry row = (kelondroRow.Entry) contentIter.next(); - String hash = row.getColString(0, null); - indexContainer container = assortmentFile.row2container(hash, row); - wordEntryCount += container.size(); - - // importing entity container to home db - homeWordIndex.addEntries(container, System.currentTimeMillis(), true); - - if (wordEntityCount % 500 == 0) { - log.logFine(wordEntityCount + " word entities processed so far."); - } - if (wordEntryCount % 2000 == 0) { - log.logFine(wordEntryCount + " word entries processed so far."); - } - } - } catch (Error e) { - log.logWarning("Error", e); - } catch (Exception e) { - log.logWarning("Exception", e); - } finally { - log.logInfo("ASSORTMENT-IMPORT FINISHED"); - if (homeWordIndex != null) try { homeWordIndex.close(5000); } catch (Exception e){/* nothing todo here */} - } - } - - public static void importDB(String homePath, String importPath) { - if (homePath == null) throw new NullPointerException(); - if (importPath == null) throw new NullPointerException(); - if (homePath.equals(importPath)) throw new IllegalArgumentException("Import and home DB directory must not be equal"); - - // configure logging - try {serverLog.configureLogging(new File(homePath, "yacy.logging"));} catch (Exception e) {} - serverLog log = new serverLog("DB-IMPORT"); - log.logInfo("STARTING DB-IMPORT"); - - plasmaWordIndex homeWordIndex = null, importWordIndex = null; - plasmaCrawlLURL homeUrlDB = null, importUrlDB = null; - try { - // - Runtime rt = Runtime.getRuntime(); - String errorMsg = null; - - // configure destination DB - File homeDBroot = new File(new File(homePath), "DATA/PLASMADB"); - if (!homeDBroot.exists()) errorMsg = "Home DB directory does not exist."; - if (!homeDBroot.canRead()) errorMsg = "Home DB directory is not readable."; - if (!homeDBroot.canWrite()) errorMsg = "Home DB directory is not writeable"; - if (!homeDBroot.isDirectory()) errorMsg = "Home DB Directory is not a directory."; - if (errorMsg != null) { - log.logSevere(errorMsg + "\nName: " + homeDBroot.getAbsolutePath()); - return; - } - - if ((!homeDBroot.exists())&&(!homeDBroot.canRead())&&(!homeDBroot.isDirectory())) { - log.logSevere("DB home directory can not be opened."); - return; - } - log.logFine("Initializing destination word index db."); - homeWordIndex = new plasmaWordIndex(homeDBroot, 8*1024*1024, log); - log.logFine("Initializing destination URL db."); - homeUrlDB = new plasmaCrawlLURL(new File(homeDBroot, "urlHash.db"), 4*1024*1024); - - // configure import DB - errorMsg = null; - File importDBroot = new File(importPath); - if (!importDBroot.exists()) errorMsg = "Import directory does not exist."; - if (!importDBroot.canRead()) errorMsg = "Import directory is not readable."; - if (!importDBroot.canWrite()) errorMsg = "Import directory is not writeable"; - if (!importDBroot.isDirectory()) errorMsg = "ImportDirectory is not a directory."; - if (errorMsg != null) { - log.logSevere(errorMsg + "\nName: " + homeDBroot.getAbsolutePath()); - return; - } - - log.logFine("Initializing source word index db."); - importWordIndex = new plasmaWordIndex(importDBroot, 8*1024*1024, log); - log.logFine("Initializing source URL db."); - importUrlDB = new plasmaCrawlLURL(new File(importDBroot, "urlHash.db"), 4*1024*1024); - int startSize = importWordIndex.size(); - - log.logInfo("Importing DB from '" + importDBroot.getAbsolutePath() + "' to '" + homeDBroot.getAbsolutePath() + "'."); - log.logInfo("Home word index contains " + homeWordIndex.size() + " words and " + homeUrlDB.size() + " URLs."); - log.logInfo("Import word index contains " + importWordIndex.size() + " words and " + importUrlDB.size() + " URLs."); - - // iterate over all words from import db - String wordHash = ""; - long urlCounter = 0, wordCounter = 0, entryCounter = 0; - long globalStart = System.currentTimeMillis(), wordChunkStart = System.currentTimeMillis(), wordChunkEnd = 0; - String wordChunkStartHash = "------------", wordChunkEndHash; - - Iterator importWordHashIterator = importWordIndex.wordHashes(wordChunkStartHash, plasmaWordIndex.RL_WORDFILES, true); - while (importWordHashIterator.hasNext()) { - - // testing if import process was aborted - if (Thread.interrupted()) break; - - indexContainer newContainer; - try { - wordCounter++; - wordHash = (String) importWordHashIterator.next(); - newContainer = importWordIndex.getContainer(wordHash, true, -1); - - if (newContainer.size() == 0) continue; - - // the combined container will fit, read the container - Iterator importWordIdxEntries = newContainer.entries(); - indexURLEntry importWordIdxEntry; - while (importWordIdxEntries.hasNext()) { - - // testing if import process was aborted - if (Thread.interrupted()) break; - - // getting next word index entry - entryCounter++; - importWordIdxEntry = (indexURLEntry) importWordIdxEntries.next(); - String urlHash = importWordIdxEntry.getUrlHash(); - if ((importUrlDB.exists(urlHash)) && (!homeUrlDB.exists(urlHash))) try { - // importing the new url - plasmaCrawlLURL.Entry urlEntry = importUrlDB.getEntry(urlHash, null); - urlCounter++; - plasmaCrawlLURL.Entry homeEntry = homeUrlDB.newEntry(urlEntry); - homeEntry.store(); - - if (urlCounter % 500 == 0) { - log.logFine(urlCounter + " URLs processed so far."); - } - } catch (IOException e) {} - - if (entryCounter % 500 == 0) { - log.logFine(entryCounter + " word entries and " + wordCounter + " word entries processed so far."); - } - } - - // testing if import process was aborted - if (Thread.interrupted()) break; - - // importing entity container to home db - homeWordIndex.addEntries(newContainer, System.currentTimeMillis(), true); - - // delete complete index entity file - importWordIndex.deleteContainer(wordHash); - - // print out some statistical information - if (wordCounter%500 == 0) { - wordChunkEndHash = wordHash; - wordChunkEnd = System.currentTimeMillis(); - long duration = wordChunkEnd - wordChunkStart; - log.logInfo(wordCounter + " word entities imported " + - "[" + wordChunkStartHash + " .. " + wordChunkEndHash + "] " + - ((startSize-importWordIndex.size())/(importWordIndex.size()/100)) + - "%\n" + - "Speed: "+ 500*1000/duration + " word entities/s" + - " | Elapsed time: " + serverDate.intervalToString(wordChunkEnd-globalStart) + - " | Estimated time: " + serverDate.intervalToString(importWordIndex.size()*((wordChunkEnd-globalStart)/wordCounter)) + "\n" + - "Free memory: " + rt.freeMemory() + - " | Total memory: " + rt.totalMemory() + "\n" + - "Home Words = " + homeWordIndex.size() + - " | Import Words = " + importWordIndex.size()); - wordChunkStart = wordChunkEnd; - wordChunkStartHash = wordChunkEndHash; - } - - } catch (Exception e) { - log.logSevere("Import of word entity '" + wordHash + "' failed.",e); - } finally { - } - } - - log.logInfo("Home word index contains " + homeWordIndex.size() + " words and " + homeUrlDB.size() + " URLs."); - log.logInfo("Import word index contains " + importWordIndex.size() + " words and " + importUrlDB.size() + " URLs."); - - log.logInfo("DB-IMPORT FINISHED"); - } catch (Exception e) { - log.logSevere("Database import failed.",e); - } finally { - if (homeUrlDB != null) try { homeUrlDB.close(); } catch (Exception e){} - if (importUrlDB != null) try { importUrlDB.close(); } catch (Exception e){} - if (homeWordIndex != null) try { homeWordIndex.close(5000); } catch (Exception e){} - if (importWordIndex != null) try { importWordIndex.close(5000); } catch (Exception e){} - } - } + } + /** + * @param homePath path to the YaCy directory + * @param dbcache cache size in MB + */ public static void minimizeUrlDB(String homePath, int dbcache) { // run with "java -classpath classes yacy -minimizeUrlDB" try {serverLog.configureLogging(new File(homePath, "yacy.logging"));} catch (Exception e) {} @@ -945,7 +706,7 @@ public final class yacy { log.logInfo("STARTING URL CLEANUP"); // db containing all currently loades urls - int cache = dbcache * 1024 * 1024; + int cache = dbcache * 1024; // in KB log.logFine("URLDB-Caches: "+cache+" bytes"); plasmaCrawlLURL currentUrlDB = new plasmaCrawlLURL(new File(dbroot, "urlHash.db"), cache); @@ -953,7 +714,9 @@ public final class yacy { plasmaCrawlLURL minimizedUrlDB = new plasmaCrawlLURL(new File(dbroot, "urlHash.temp.db"), cache); Runtime rt = Runtime.getRuntime(); - int cacheMem = (int)(rt.maxMemory()-rt.totalMemory())-5*1024*1024; + int cacheMem = (int)((rt.maxMemory()-rt.totalMemory())/1024)-(2*cache + 8*1024); + if (cacheMem < 2048) throw new OutOfMemoryError("Not enough memory available to start clean up."); + plasmaWordIndex wordIndex = new plasmaWordIndex(dbroot, cacheMem, log); Iterator wordHashIterator = wordIndex.wordHashes("------------", plasmaWordIndex.RL_WORDFILES, false); @@ -1015,12 +778,16 @@ public final class yacy { log.logInfo("current LURL DB contains " + currentUrlDB.size() + " entries."); log.logInfo("mimimized LURL DB contains " + minimizedUrlDB.size() + " entries."); - // TODO: rename the mimimized UrlDB to the name of the previous UrlDB + // TODO: rename the mimimized UrlDB to the name of the previous UrlDB log.logInfo("FINISHED URL CLEANUP, WAIT FOR DUMP"); + log.logInfo("You can now backup your old URL DB and rename urlHash.temp.db to urlHash.db"); + log.logInfo("TERMINATED URL CLEANUP"); - } catch (IOException e) { - log.logSevere("IOException", e); + } catch (Exception e) { + log.logSevere("Exception: " + e.getMessage(), e); + } catch (Error e) { + log.logSevere("Error: " + e.getMessage(), e); } } @@ -1460,32 +1227,6 @@ public final class yacy { } if (args.length == 2) applicationRoot= args[1]; minimizeUrlDB(applicationRoot, dbcache); - } else if ((args.length >= 1) && (args[0].equals("-importDB"))) { - // attention: this may run long and should not be interrupted! - String importRoot = null; - if (args.length == 3) { - applicationRoot= args[1]; - importRoot = args[2]; - } else if (args.length == 2) { - importRoot = args[1]; - } else { - System.err.println("Usage: -importDB [homeDbRoot] importDbRoot"); - return; - } - importDB(applicationRoot, importRoot); - } else if ((args.length >= 1) && (args[0].equals("-importAssortment"))) { - // attention: this may run long and should not be interrupted! - String assortmentFileName = null; - if (args.length == 3) { - applicationRoot= args[1]; - assortmentFileName = args[2]; - } else if (args.length == 2) { - assortmentFileName = args[1]; - } else { - System.err.println("Usage: -importAssortment [homeDbRoot] [AssortmentFileName]"); - return; - } - importAssortment(applicationRoot, assortmentFileName); } else if ((args.length >= 1) && (args[0].equals("-testPeerDB"))) { if (args.length == 2) { applicationRoot= args[1];