From a6a3f4b694d3720baab057b103add705de09b727 Mon Sep 17 00:00:00 2001 From: orbiter Date: Wed, 15 Mar 2006 16:01:42 +0000 Subject: [PATCH] fix for svn 1888 this is a redesign of the no-iterator solution git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1892 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/IndexControl_p.java | 2 +- source/de/anomic/plasma/plasmaDHTChunk.java | 82 +------------------ source/de/anomic/plasma/plasmaWordIndex.java | 20 ++--- .../anomic/plasma/plasmaWordIndexCache.java | 6 +- 4 files changed, 17 insertions(+), 93 deletions(-) diff --git a/htroot/IndexControl_p.java b/htroot/IndexControl_p.java index 5d400b009..6d99487e1 100644 --- a/htroot/IndexControl_p.java +++ b/htroot/IndexControl_p.java @@ -284,7 +284,7 @@ public class IndexControl_p { // generate list if (post.containsKey("keyhashsimilar")) { - final Iterator hashIt = switchboard.wordIndex.wordHashes(keyhash, plasmaWordIndex.RL_WORDFILES, true); + final Iterator hashIt = switchboard.wordIndex.wordHashes(keyhash, plasmaWordIndex.RL_WORDFILES, true, 256).iterator(); StringBuffer result = new StringBuffer("Sequential List of Word-Hashes:
"); String hash; int i = 0; diff --git a/source/de/anomic/plasma/plasmaDHTChunk.java b/source/de/anomic/plasma/plasmaDHTChunk.java index 253b73adc..31407ffc9 100644 --- a/source/de/anomic/plasma/plasmaDHTChunk.java +++ b/source/de/anomic/plasma/plasmaDHTChunk.java @@ -169,88 +169,12 @@ public class plasmaDHTChunk { return; } - private int selectTransferContainersResource(String hash, int resourceLevel, int maxcount) { - // the hash is a start hash from where the indexes are picked - ArrayList tmpContainers = new ArrayList(maxcount); - try { - String[] wordHashes = wordIndex.wordHashes(hash, resourceLevel, true, maxcount); - plasmaWordIndexEntryContainer indexContainer; - Iterator urlIter; - plasmaWordIndexEntry indexEntry; - plasmaCrawlLURL.Entry lurl; - int refcount = 0; - - urlCache = new HashMap(); - while ((maxcount > refcount) && ((tmpContainers.size() == 0) || (yacyDHTAction.dhtDistance(wordHashes[refcount], ((plasmaWordIndexEntryContainer) tmpContainers.get(0)).wordHash()) < 0.2))) { - // make an on-the-fly entity and insert values - indexContainer = wordIndex.getContainer(wordHashes[refcount], true, 10000); - int notBoundCounter = 0; - try { - urlIter = indexContainer.entries(); - // iterate over indexes to fetch url entries and store them in the urlCache - while ((urlIter.hasNext()) && (maxcount > refcount)) { - indexEntry = (plasmaWordIndexEntry) urlIter.next(); - try { - lurl = lurls.getEntry(indexEntry.getUrlHash(), indexEntry); - if ((lurl == null) || (lurl.url() == null)) { - notBoundCounter++; - urlIter.remove(); - wordIndex.removeEntries(wordHashes[refcount], new String[] { indexEntry.getUrlHash() }, true); - } else { - urlCache.put(indexEntry.getUrlHash(), lurl); - refcount++; - } - } catch (IOException e) { - notBoundCounter++; - urlIter.remove(); - wordIndex.removeEntries(wordHashes[refcount], new String[] { indexEntry.getUrlHash() }, true); - } - } - - // remove all remaining; we have enough - while (urlIter.hasNext()) { - indexEntry = (plasmaWordIndexEntry) urlIter.next(); - urlIter.remove(); - } - - // use whats left - log.logFine("Selected partial index (" + indexContainer.size() + " from " + wordIndex.indexSize(wordHashes[refcount-1]) + " URLs, " + notBoundCounter + " not bound) for word " + indexContainer.wordHash()); - tmpContainers.add(indexContainer); - } catch (kelondroException e) { - log.logSevere("plasmaWordIndexDistribution/2: deleted DB for word " + wordHashes[refcount], e); - wordIndex.deleteIndex(wordHashes[refcount]); - } - } - // create result - indexContainers = (plasmaWordIndexEntryContainer[]) tmpContainers.toArray(new plasmaWordIndexEntryContainer[tmpContainers.size()]); - - if ((indexContainers == null) || (indexContainers.length == 0)) { - log.logFine("No index available for index transfer, hash start-point " + startPointHash); - this.status = chunkStatus_FAILED; - return 0; - } - - this.status = chunkStatus_FILLED; - - return refcount; - } catch (kelondroException e) { - log.logSevere("selectTransferIndexes database corrupted: " + e.getMessage(), e); - indexContainers = new plasmaWordIndexEntryContainer[0]; - urlCache = new HashMap(); - - this.status = chunkStatus_FAILED; - - return 0; - } - } - - /* - private int selectTransferContainersResource(String hash, int resourceLevel, int maxcount) { + private int selectTransferContainersResource(String hash, int resourceLevel, int maxcount) { // the hash is a start hash from where the indexes are picked ArrayList tmpContainers = new ArrayList(maxcount); String nexthash = ""; try { - Iterator wordHashIterator = wordIndex.wordHashes(hash, resourceLevel, true); + Iterator wordHashIterator = wordIndex.wordHashes(hash, resourceLevel, true, maxcount).iterator(); plasmaWordIndexEntryContainer indexContainer; Iterator urlIter; plasmaWordIndexEntry indexEntry; @@ -321,7 +245,7 @@ public class plasmaDHTChunk { return 0; } } - */ + public int deleteTransferIndexes() { Iterator urlIter; diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index ba738c65e..f190b32d8 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -54,6 +54,7 @@ import java.util.Map; import java.util.HashSet; import java.util.Set; import java.util.Date; +import java.util.TreeSet; import java.net.URL; import de.anomic.htmlFilter.htmlFilterContentScraper; @@ -380,20 +381,15 @@ public final class plasmaWordIndex { public static final int RL_ASSORTMENTS = 2; public static final int RL_WORDFILES = 3; - public synchronized String[] wordHashes(String startHash, int resourceLevel, boolean rot, int count) { - String[] hashes = new String[count]; + public synchronized TreeSet wordHashes(String startHash, int resourceLevel, boolean rot, int count) { + TreeSet hashes = new TreeSet(); Iterator i = wordHashes(startHash, resourceLevel, rot); - int j = 0; - while ((count-- > 0) && (i.hasNext())) { - hashes[j++] = (String) i.next(); - } - if (count > 0) { - String[] s = new String[j]; - System.arraycopy(hashes, 0, s, 0, j); - return s; - } else { - return hashes; + String hash; + while ((hashes.size() < count) && (i.hasNext())) { + hash = (String) i.next(); + if ((hash != null) && (hash.length() > 0)) hashes.add(hash); } + return hashes; } public Iterator wordHashes(String startHash, int resourceLevel, boolean rot) { diff --git a/source/de/anomic/plasma/plasmaWordIndexCache.java b/source/de/anomic/plasma/plasmaWordIndexCache.java index c0fab2c2c..84e6eb3de 100644 --- a/source/de/anomic/plasma/plasmaWordIndexCache.java +++ b/source/de/anomic/plasma/plasmaWordIndexCache.java @@ -56,7 +56,7 @@ import de.anomic.kelondro.kelondroRecords; import de.anomic.server.logging.serverLog; import de.anomic.yacy.yacySeedDB; -public final class plasmaWordIndexCache /*implements plasmaWordIndexInterface*/ { +public final class plasmaWordIndexCache implements plasmaWordIndexInterface { // environment constants private static final String indexArrayFileName = "indexDump1.array"; @@ -277,6 +277,10 @@ public final class plasmaWordIndexCache /*implements plasmaWordIndexInterface*/ return kCache.size(); } + public int size() { + return wCache.size() + kCache.size(); + } + public int indexSize(String wordHash) { int size = 0; plasmaWordIndexEntryContainer cacheIndex = (plasmaWordIndexEntryContainer) wCache.get(wordHash);