rwiDBCleaner + dbImporter: Iterate over small excerpts of

word hashes instead of the whole DB, especially while changing
the DB in the process.
see http://www.yacy-forum.de/viewtopic.php?p=19136#19136



git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1917 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
hermens 19 years ago
parent 474379ae63
commit 4e9a8f41fd

@ -114,7 +114,8 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
HashSet importedUrlBuffer = new HashSet();
// iterate over all words from import db
Iterator importWordHashIterator = this.importWordIndex.wordHashes(this.wordChunkStartHash, plasmaWordIndex.RL_WORDFILES, false);
//Iterator importWordHashIterator = this.importWordIndex.wordHashes(this.wordChunkStartHash, plasmaWordIndex.RL_WORDFILES, false);
Iterator importWordHashIterator = this.importWordIndex.wordHashes(this.wordChunkStartHash, plasmaWordIndex.RL_WORDFILES, false, 100).iterator();
while (!isAborted() && importWordHashIterator.hasNext()) {
TreeSet entityUrls = new TreeSet(new kelondroNaturalOrder(true));
@ -124,8 +125,6 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
this.wordHash = (String) importWordHashIterator.next();
newContainer = this.importWordIndex.getContainer(this.wordHash, true, -1);
if (newContainer.size() == 0) continue;
// loop through the entities of the container and get the
// urlhash
Iterator importWordIdxEntries = newContainer.entries();
@ -175,7 +174,7 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
continue;
}
}
this.entryCounter++;
this.entryCounter++;
}
// testing if import process was aborted
@ -213,6 +212,16 @@ public class plasmaDbImporter extends AbstractImporter implements dbImporter {
} finally {
if (newContainer != null) newContainer.clear();
}
if (!importWordHashIterator.hasNext()) {
// We may not be finished yet, try to get the next chunk of wordHashes
TreeSet wordHashes = this.importWordIndex.wordHashes(this.wordHash, plasmaWordIndex.RL_WORDFILES, false, 100);
importWordHashIterator = wordHashes.iterator();
// Make sure we don't get the same wordhash twice, but don't skip a word
if ((importWordHashIterator.hasNext())&&(!this.wordHash.equals(importWordHashIterator.next()))) {
importWordHashIterator = wordHashes.iterator();
}
}
}
this.log.logInfo("Home word index contains " + this.homeWordIndex.size() + " words and " + this.homeUrlDB.size() + " URLs.");

@ -591,7 +591,7 @@ public final class plasmaWordIndex {
URL url = null;
HashSet urlHashs = new HashSet();
try {
Iterator wordHashIterator = wordHashes(startHash, plasmaWordIndex.RL_WORDFILES, false);
Iterator wordHashIterator = wordHashes(startHash, plasmaWordIndex.RL_WORDFILES, false, 100).iterator();
while (wordHashIterator.hasNext() && run) {
waiter();
wordHash = (String) wordHashIterator.next();
@ -621,6 +621,15 @@ public final class plasmaWordIndex {
lastDeletionCounter = urlHashs.size();
urlHashs.clear();
}
if (!wordHashIterator.hasNext()) {
// We may not be finished yet, try to get the next chunk of wordHashes
TreeSet wordHashes = wordHashes(wordHash, plasmaWordIndex.RL_WORDFILES, false, 100);
wordHashIterator = wordHashes.iterator();
// Make sure we don't get the same wordhash twice, but don't skip a word
if ((wordHashIterator.hasNext())&&(!wordHash.equals(wordHashIterator.next()))) {
wordHashIterator = wordHashes.iterator();
}
}
}
} catch (IOException e) {
serverLog.logSevere("INDEXCLEANER",

Loading…
Cancel
Save