From 4e9a8f41fdcd7e3b5fa06ece8b4d897d05c5de33 Mon Sep 17 00:00:00 2001
From: hermens
Date: Fri, 17 Mar 2006 23:39:10 +0000
Subject: [PATCH] rwiDBCleaner + dbImporter: Iterate over small excerpts of
 word hashes instead of the whole DB especially while changing the DB in the
 process.
 see http://www.yacy-forum.de/viewtopic.php?p=19136#19136

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1917 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 .../plasma/dbImport/plasmaDbImporter.java     | 17 +++++++++++++----
 source/de/anomic/plasma/plasmaWordIndex.java  | 11 ++++++++++-
 2 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/source/de/anomic/plasma/dbImport/plasmaDbImporter.java b/source/de/anomic/plasma/dbImport/plasmaDbImporter.java
index 339db8b47..092d51807 100644
--- a/source/de/anomic/plasma/dbImport/plasmaDbImporter.java
+++ b/source/de/anomic/plasma/dbImport/plasmaDbImporter.java
@@ -114,7 +114,8 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
         HashSet importedUrlBuffer = new HashSet();

         // iterate over all words from import db
-        Iterator importWordHashIterator = this.importWordIndex.wordHashes(this.wordChunkStartHash, plasmaWordIndex.RL_WORDFILES, false);
+        //Iterator importWordHashIterator = this.importWordIndex.wordHashes(this.wordChunkStartHash, plasmaWordIndex.RL_WORDFILES, false);
+        Iterator importWordHashIterator = this.importWordIndex.wordHashes(this.wordChunkStartHash, plasmaWordIndex.RL_WORDFILES, false, 100).iterator();
         while (!isAborted() && importWordHashIterator.hasNext()) {

             TreeSet entityUrls = new TreeSet(new kelondroNaturalOrder(true));
@@ -124,8 +125,6 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
                 this.wordHash = (String) importWordHashIterator.next();
                 newContainer = this.importWordIndex.getContainer(this.wordHash, true, -1);

-                if (newContainer.size() == 0) continue;
-
                 // loop throug the entities of the container and get the
                 // urlhash
                 Iterator importWordIdxEntries = newContainer.entries();
@@ -175,7 +174,7 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
                         continue;
                     }
                 }
-                this.entryCounter++;
+                this.entryCounter++;
                 }

                 // testing if import process was aborted
@@ -213,6 +212,16 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
             } finally {
                 if (newContainer != null) newContainer.clear();
             }
+
+            if (!importWordHashIterator.hasNext()) {
+                // We may not be finished yet, try to get the next chunk of wordHashes
+                TreeSet wordHashes = this.importWordIndex.wordHashes(this.wordHash, plasmaWordIndex.RL_WORDFILES, false, 100);
+                importWordHashIterator = wordHashes.iterator();
+                // Make sure we don't get the same wordhash twice, but don't skip a word
+                if ((importWordHashIterator.hasNext())&&(!this.wordHash.equals(importWordHashIterator.next()))) {
+                    importWordHashIterator = wordHashes.iterator();
+                }
+            }
         }

         this.log.logInfo("Home word index contains " + this.homeWordIndex.size() + " words and " + this.homeUrlDB.size() + " URLs.");
diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java
index 11759a3ab..c18b2b563 100644
--- a/source/de/anomic/plasma/plasmaWordIndex.java
+++ b/source/de/anomic/plasma/plasmaWordIndex.java
@@ -591,7 +591,7 @@ public final class plasmaWordIndex {
             URL url = null;
             HashSet urlHashs = new HashSet();
             try {
-                Iterator wordHashIterator = wordHashes(startHash, plasmaWordIndex.RL_WORDFILES, false);
+                Iterator wordHashIterator = wordHashes(startHash, plasmaWordIndex.RL_WORDFILES, false, 100).iterator();
                 while (wordHashIterator.hasNext() && run) {
                     waiter();
                     wordHash = (String) wordHashIterator.next();
@@ -621,6 +621,15 @@
                         lastDeletionCounter = urlHashs.size();
                         urlHashs.clear();
                     }
+                    if (!wordHashIterator.hasNext()) {
+                        // We may not be finished yet, try to get the next chunk of wordHashes
+                        TreeSet wordHashes = wordHashes(wordHash, plasmaWordIndex.RL_WORDFILES, false, 100);
+                        wordHashIterator = wordHashes.iterator();
+                        // Make sure we don't get the same wordhash twice, but don't skip a word
+                        if ((wordHashIterator.hasNext())&&(!wordHash.equals(wordHashIterator.next()))) {
+                            wordHashIterator = wordHashes.iterator();
+                        }
+                    }
                 }
             } catch (IOException e) {
                 serverLog.logSevere("INDEXCLEANER",
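
Both hunks above apply the same pattern: instead of opening one iterator over the whole word-hash index, the loop fetches the hashes as a TreeSet of at most 100 entries, and when that chunk is exhausted it re-queries the index starting from the last hash it processed, discarding the first element of the new chunk only if it is the hash just handled. This keeps the traversal valid while entries are deleted or imported underneath it. The following stand-alone sketch shows the idea against a plain java.util.TreeMap rather than YaCy's plasmaWordIndex; the names ChunkedHashWalker, fetchChunk and CHUNK_SIZE are invented for the illustration and are not part of the patch.

    // Stand-alone sketch of the chunked iteration used in the patch above.
    // ChunkedHashWalker, fetchChunk and CHUNK_SIZE are hypothetical names; a
    // plain TreeMap stands in for YaCy's word index.
    import java.util.ArrayList;
    import java.util.Iterator;
    import java.util.List;
    import java.util.TreeMap;

    public class ChunkedHashWalker {

        static final int CHUNK_SIZE = 100; // mirrors the "100" passed to wordHashes(..., 100)

        // Return up to CHUNK_SIZE keys, starting at (and including) startHash;
        // null means "start at the beginning of the index".
        static List<String> fetchChunk(TreeMap<String, Object> index, String startHash) {
            List<String> chunk = new ArrayList<String>();
            Iterable<String> keys = (startHash == null)
                    ? index.keySet()
                    : index.tailMap(startHash).keySet();
            for (String key : keys) {
                chunk.add(key);
                if (chunk.size() >= CHUNK_SIZE) break;
            }
            return chunk;
        }

        public static void main(String[] args) {
            TreeMap<String, Object> index = new TreeMap<String, Object>();
            for (int i = 0; i < 250; i++) index.put(String.format("hash%04d", i), Boolean.TRUE);

            String lastHash = null;
            Iterator<String> it = fetchChunk(index, null).iterator();
            int visited = 0;

            while (it.hasNext()) {
                lastHash = it.next();
                visited++; // process the hash here (delete, import, ...)

                if (!it.hasNext()) {
                    // Chunk exhausted: re-query starting from the last processed hash.
                    List<String> next = fetchChunk(index, lastHash);
                    Iterator<String> peek = next.iterator();
                    if (peek.hasNext() && peek.next().equals(lastHash)) {
                        it = peek;            // first element was already handled: skip it
                    } else {
                        it = next.iterator(); // first element is new: keep it
                    }
                }
            }
            System.out.println("visited " + visited + " hashes"); // prints: visited 250 hashes
        }
    }

Re-checking the first element of each freshly fetched chunk is what prevents the walk from processing a word twice (when the last hash still exists) or skipping one (when it was deleted between chunk queries).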