From 4e9a8f41fdcd7e3b5fa06ece8b4d897d05c5de33 Mon Sep 17 00:00:00 2001
From: hermens
Date: Fri, 17 Mar 2006 23:39:10 +0000
Subject: [PATCH] rwiDBCleaner + dbImporter: Iterate over small excerpts of
 word hashes instead of the whole DB especially while changing the DB in the
 process.
 see http://www.yacy-forum.de/viewtopic.php?p=19136#19136

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1917 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 .../plasma/dbImport/plasmaDbImporter.java     | 17 +++++++++++++----
 source/de/anomic/plasma/plasmaWordIndex.java  | 11 ++++++++++-
 2 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/source/de/anomic/plasma/dbImport/plasmaDbImporter.java b/source/de/anomic/plasma/dbImport/plasmaDbImporter.java
index 339db8b47..092d51807 100644
--- a/source/de/anomic/plasma/dbImport/plasmaDbImporter.java
+++ b/source/de/anomic/plasma/dbImport/plasmaDbImporter.java
@@ -114,7 +114,8 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
         HashSet importedUrlBuffer = new HashSet();

         // iterate over all words from import db
-        Iterator importWordHashIterator = this.importWordIndex.wordHashes(this.wordChunkStartHash, plasmaWordIndex.RL_WORDFILES, false);
+        //Iterator importWordHashIterator = this.importWordIndex.wordHashes(this.wordChunkStartHash, plasmaWordIndex.RL_WORDFILES, false);
+        Iterator importWordHashIterator = this.importWordIndex.wordHashes(this.wordChunkStartHash, plasmaWordIndex.RL_WORDFILES, false, 100).iterator();
         while (!isAborted() && importWordHashIterator.hasNext()) {

             TreeSet entityUrls = new TreeSet(new kelondroNaturalOrder(true));
@@ -124,8 +125,6 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
                 this.wordHash = (String) importWordHashIterator.next();
                 newContainer = this.importWordIndex.getContainer(this.wordHash, true, -1);

-                if (newContainer.size() == 0) continue;
-
                 // loop throug the entities of the container and get the
                 // urlhash
                 Iterator importWordIdxEntries = newContainer.entries();
@@ -175,7 +174,7 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
                         continue;
                     }
                 }
-                this.entryCounter++;
+                this.entryCounter++;
                 }

                 // testing if import process was aborted
@@ -213,6 +212,16 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
             } finally {
                 if (newContainer != null) newContainer.clear();
             }
+
+            if (!importWordHashIterator.hasNext()) {
+                // We may not be finished yet, try to get the next chunk of wordHashes
+                TreeSet wordHashes = this.importWordIndex.wordHashes(this.wordHash, plasmaWordIndex.RL_WORDFILES, false, 100);
+                importWordHashIterator = wordHashes.iterator();
+                // Make sure we don't get the same wordhash twice, but don't skip a word
+                if ((importWordHashIterator.hasNext())&&(!this.wordHash.equals(importWordHashIterator.next()))) {
+                    importWordHashIterator = wordHashes.iterator();
+                }
+            }
         }

         this.log.logInfo("Home word index contains " + this.homeWordIndex.size() + " words and " + this.homeUrlDB.size() + " URLs.");
diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java
index 11759a3ab..c18b2b563 100644
--- a/source/de/anomic/plasma/plasmaWordIndex.java
+++ b/source/de/anomic/plasma/plasmaWordIndex.java
@@ -591,7 +591,7 @@ public final class plasmaWordIndex {
             URL url = null;
             HashSet urlHashs = new HashSet();
             try {
-                Iterator wordHashIterator = wordHashes(startHash, plasmaWordIndex.RL_WORDFILES, false);
+                Iterator wordHashIterator = wordHashes(startHash, plasmaWordIndex.RL_WORDFILES, false, 100).iterator();
                 while (wordHashIterator.hasNext() && run) {
                     waiter();
                     wordHash = (String) wordHashIterator.next();
@@ -621,6 +621,15 @@
                         lastDeletionCounter = urlHashs.size();
                         urlHashs.clear();
                     }
+                    if (!wordHashIterator.hasNext()) {
+                        // We may not be finished yet, try to get the next chunk of wordHashes
+                        TreeSet wordHashes = wordHashes(wordHash, plasmaWordIndex.RL_WORDFILES, false, 100);
+                        wordHashIterator = wordHashes.iterator();
+                        // Make sure we don't get the same wordhash twice, but don't skip a word
+                        if ((wordHashIterator.hasNext())&&(!wordHash.equals(wordHashIterator.next()))) {
+                            wordHashIterator = wordHashes.iterator();
+                        }
+                    }
                 }
             } catch (IOException e) {
                 serverLog.logSevere("INDEXCLEANER",
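
Both hunks above apply the same pattern: instead of opening one iterator over the whole word-hash index, the loop fetches the hashes as a TreeSet of at most 100 entries, and when that chunk is exhausted it re-queries the index starting from the last hash it processed, discarding the first element of the new chunk only if it is the hash just handled. This keeps the traversal valid while entries are deleted or imported underneath it. The following stand-alone sketch shows the idea against a plain java.util.TreeMap rather than YaCy's plasmaWordIndex; the names ChunkedHashWalker, fetchChunk and CHUNK_SIZE are invented for the illustration and are not part of the patch.

    // Stand-alone sketch of the chunked iteration used in the patch above.
    // ChunkedHashWalker, fetchChunk and CHUNK_SIZE are hypothetical names; a
    // plain TreeMap stands in for YaCy's word index.
    import java.util.ArrayList;
    import java.util.Iterator;
    import java.util.List;
    import java.util.TreeMap;

    public class ChunkedHashWalker {

        static final int CHUNK_SIZE = 100; // mirrors the "100" passed to wordHashes(..., 100)

        // Return up to CHUNK_SIZE keys, starting at (and including) startHash;
        // null means "start at the beginning of the index".
        static List<String> fetchChunk(TreeMap<String, Object> index, String startHash) {
            List<String> chunk = new ArrayList<String>();
            Iterable<String> keys = (startHash == null)
                    ? index.keySet()
                    : index.tailMap(startHash).keySet();
            for (String key : keys) {
                chunk.add(key);
                if (chunk.size() >= CHUNK_SIZE) break;
            }
            return chunk;
        }

        public static void main(String[] args) {
            TreeMap<String, Object> index = new TreeMap<String, Object>();
            for (int i = 0; i < 250; i++) index.put(String.format("hash%04d", i), Boolean.TRUE);

            String lastHash = null;
            Iterator<String> it = fetchChunk(index, null).iterator();
            int visited = 0;

            while (it.hasNext()) {
                lastHash = it.next();
                visited++; // process the hash here (delete, import, ...)

                if (!it.hasNext()) {
                    // Chunk exhausted: re-query starting from the last processed hash.
                    List<String> next = fetchChunk(index, lastHash);
                    Iterator<String> peek = next.iterator();
                    if (peek.hasNext() && peek.next().equals(lastHash)) {
                        it = peek;            // first element was already handled: skip it
                    } else {
                        it = next.iterator(); // first element is new: keep it
                    }
                }
            }
            System.out.println("visited " + visited + " hashes"); // prints: visited 250 hashes
        }
    }

Re-checking the first element of each freshly fetched chunk is what prevents the walk from processing a word twice (when the last hash still exists) or skipping one (when it was deleted between chunk queries).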