fix for svn 1888

This is a redesign of the no-iterator solution.
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1892 6c8d7289-2bf4-0310-a012-ef5d649a1542
19 years ago · a6a3f4b694
parent 1fc494858d
commit a6a3f4b694
4 changed files with 17 additions and 93 deletions
--- a/htroot/IndexControl_p.java
+++ b/htroot/IndexControl_p.java
@ -284,7 +284,7 @@ public class IndexControl_p {
        // generate list
        if (post.containsKey("keyhashsimilar")) {
-            final Iterator hashIt = switchboard.wordIndex.wordHashes(keyhash, plasmaWordIndex.RL_WORDFILES, true);
+            final Iterator hashIt = switchboard.wordIndex.wordHashes(keyhash, plasmaWordIndex.RL_WORDFILES, true, 256).iterator();
            StringBuffer result = new StringBuffer("Sequential List of Word-Hashes:<br>");
            String hash;
            int i = 0;
--- a/source/de/anomic/plasma/plasmaDHTChunk.java
+++ b/source/de/anomic/plasma/plasmaDHTChunk.java
@ -169,88 +169,12 @@ public class plasmaDHTChunk {
        return;
    }
-    private int  selectTransferContainersResource(String hash, int resourceLevel, int maxcount) {
+    private int selectTransferContainersResource(String hash, int resourceLevel, int maxcount) {
        /*
         * Collects word index containers for this chunk, beginning at a start hash.
         *
         * hash          - start hash handed to wordIndex.wordHashes(...) to pick the word hashes
         * resourceLevel - passed through unchanged to wordIndex.wordHashes(...)
         * maxcount      - number of word hashes requested, and upper bound for refcount
         *
         * Side effects: replaces urlCache and indexContainers, sets this.status, and removes
         * index entries whose URL entry cannot be obtained from lurls.
         *
         * Returns refcount (the number of URL entries put into urlCache), or 0 when no
         * container was collected or a kelondroException reached the outer handler.
         */
        // the hash is a start hash from where the indexes are picked
        ArrayList tmpContainers = new ArrayList(maxcount);
        try {
            String[] wordHashes = wordIndex.wordHashes(hash, resourceLevel, true, maxcount);
            plasmaWordIndexEntryContainer indexContainer;
            Iterator urlIter;
            plasmaWordIndexEntry indexEntry;
            plasmaCrawlLURL.Entry lurl;
            // refcount is incremented once per URL entry stored in urlCache (see below)
            int refcount = 0;
            urlCache = new HashMap();
            // Continue while fewer than maxcount URL entries were collected and either nothing
            // has been collected yet or the dhtDistance between the candidate hash and the hash
            // of the first collected container is below 0.2.
            // NOTE(review): refcount counts URL entries, yet it is also the index into
            // wordHashes; after one word contributes several URLs, the next word examined is
            // wordHashes[refcount] rather than the next array element - confirm this is intended.
            // NOTE(review): nothing here compares refcount with wordHashes.length - confirm the
            // array always holds maxcount non-null entries.
            while ((maxcount > refcount) && ((tmpContainers.size() == 0) || (yacyDHTAction.dhtDistance(wordHashes[refcount], ((plasmaWordIndexEntryContainer) tmpContainers.get(0)).wordHash()) < 0.2))) {
                // make an on-the-fly entity and insert values
                indexContainer = wordIndex.getContainer(wordHashes[refcount], true, 10000);
                // counts entries dropped in this container because no URL entry could be obtained
                int notBoundCounter = 0;
                try {
                    urlIter = indexContainer.entries();
                    // iterate over indexes to fetch url entries and store them in the urlCache
                    while ((urlIter.hasNext()) && (maxcount > refcount)) {
                        indexEntry = (plasmaWordIndexEntry) urlIter.next();
                        try {
                            lurl = lurls.getEntry(indexEntry.getUrlHash(), indexEntry);
                            if ((lurl == null) || (lurl.url() == null)) {
                                // no usable URL entry: drop the reference from the container
                                // and from the word index
                                notBoundCounter++;
                                urlIter.remove();
                                wordIndex.removeEntries(wordHashes[refcount], new String[] { indexEntry.getUrlHash() }, true);
                            } else {
                                // usable URL entry: remember it under its URL hash
                                urlCache.put(indexEntry.getUrlHash(), lurl);
                                refcount++;
                            }
                        } catch (IOException e) {
                            // the lookup failed; handled exactly like a missing URL entry
                            notBoundCounter++;
                            urlIter.remove();
                            wordIndex.removeEntries(wordHashes[refcount], new String[] { indexEntry.getUrlHash() }, true);
                        }
                    }
                    // remove all remaining; we have enough
                    // (these entries are only removed from the container, not from wordIndex)
                    while (urlIter.hasNext()) {
                        indexEntry = (plasmaWordIndexEntry) urlIter.next();
                        urlIter.remove();
                    }
                    // use whats left
                    // NOTE(review): if no entry of the first container was accepted, refcount
                    // is still 0 here and wordHashes[refcount-1] addresses index -1 - verify.
                    log.logFine("Selected partial index (" + indexContainer.size() + " from " + wordIndex.indexSize(wordHashes[refcount-1]) + " URLs, " + notBoundCounter + " not bound) for word " + indexContainer.wordHash());
                    tmpContainers.add(indexContainer);
                } catch (kelondroException e) {
                    // failure while reading this word's entries: log it and delete that index
                    log.logSevere("plasmaWordIndexDistribution/2: deleted DB for word " + wordHashes[refcount], e);
                    wordIndex.deleteIndex(wordHashes[refcount]);
                }
            }
            // create result
            indexContainers = (plasmaWordIndexEntryContainer[]) tmpContainers.toArray(new plasmaWordIndexEntryContainer[tmpContainers.size()]);
            if ((indexContainers == null) || (indexContainers.length == 0)) {
                // nothing was collected: mark the chunk as failed
                log.logFine("No index available for index transfer, hash start-point " + startPointHash);
                this.status = chunkStatus_FAILED;
                return 0;
            }
            this.status = chunkStatus_FILLED;
            return refcount;
        } catch (kelondroException e) {
            // failure outside the per-word handler: reset the results and mark the chunk as failed
            log.logSevere("selectTransferIndexes database corrupted: " + e.getMessage(), e);
            indexContainers = new plasmaWordIndexEntryContainer[0];
            urlCache = new HashMap();
            this.status = chunkStatus_FAILED;
            return 0;
        }
    }
    /*
    private int  selectTransferContainersResource(String hash, int resourceLevel, int maxcount) {
        // the hash is a start hash from where the indexes are picked
        ArrayList tmpContainers = new ArrayList(maxcount);
        String nexthash = "";
        try {
-            Iterator wordHashIterator = wordIndex.wordHashes(hash, resourceLevel, true);
+            Iterator wordHashIterator = wordIndex.wordHashes(hash, resourceLevel, true, maxcount).iterator();
            plasmaWordIndexEntryContainer indexContainer;
            Iterator urlIter;
            plasmaWordIndexEntry indexEntry;
@ -321,7 +245,7 @@ public class plasmaDHTChunk {
            return 0;
        }
    }
-    */
+    
    public int deleteTransferIndexes() {
        Iterator urlIter;
--- a/source/de/anomic/plasma/plasmaWordIndex.java
+++ b/source/de/anomic/plasma/plasmaWordIndex.java
@ -54,6 +54,7 @@ import java.util.Map;
 import java.util.HashSet;
 import java.util.Set;
 import java.util.Date;
 import java.util.TreeSet;
 import java.net.URL;
 import de.anomic.htmlFilter.htmlFilterContentScraper;
@ -380,20 +381,15 @@ public final class plasmaWordIndex {
    public static final int RL_ASSORTMENTS = 2;
    public static final int RL_WORDFILES   = 3;
-    public synchronized String[] wordHashes(String startHash, int resourceLevel, boolean rot, int count) {
+    public synchronized TreeSet wordHashes(String startHash, int resourceLevel, boolean rot, int count) {
-        String[] hashes = new String[count];
+        TreeSet hashes = new TreeSet();
        Iterator i = wordHashes(startHash, resourceLevel, rot);
-        int j = 0;
+        String hash;
-        while ((count-- > 0) && (i.hasNext())) {
+        while ((hashes.size() < count) && (i.hasNext())) {
-            hashes[j++] = (String) i.next();
+            hash = (String) i.next();
-        }
+            if ((hash != null) && (hash.length() > 0)) hashes.add(hash);
        if (count > 0) {
            String[] s = new String[j];
            System.arraycopy(hashes, 0, s, 0, j);
            return s;
        } else {
            return hashes;
        }
        return hashes;
    }
    public Iterator wordHashes(String startHash, int resourceLevel, boolean rot) {
--- a/source/de/anomic/plasma/plasmaWordIndexCache.java
+++ b/source/de/anomic/plasma/plasmaWordIndexCache.java
@ -56,7 +56,7 @@ import de.anomic.kelondro.kelondroRecords;
 import de.anomic.server.logging.serverLog;
 import de.anomic.yacy.yacySeedDB;
-public final class plasmaWordIndexCache /*implements plasmaWordIndexInterface*/ {
+public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
    // environment constants
    private static final String indexArrayFileName = "indexDump1.array";
@ -277,6 +277,10 @@ public final class plasmaWordIndexCache /*implements plasmaWordIndexInterface*/
        return kCache.size();
    }
    // Returns the sum of wCache.size() and kCache.size().
    public int size() {
        return wCache.size() + kCache.size();
    }
    public int indexSize(String wordHash) {
        int size = 0;
        plasmaWordIndexEntryContainer cacheIndex = (plasmaWordIndexEntryContainer) wCache.get(wordHash);