@@ -169,88 +169,12 @@ public class plasmaDHTChunk {
        return;
    }
    private int selectTransferContainersResource(String hash, int resourceLevel, int maxcount) {
        // the hash is a start hash from where the indexes are picked
        ArrayList tmpContainers = new ArrayList(maxcount);
        try {
            String[] wordHashes = wordIndex.wordHashes(hash, resourceLevel, true, maxcount);
            plasmaWordIndexEntryContainer indexContainer;
            Iterator urlIter;
            plasmaWordIndexEntry indexEntry;
            plasmaCrawlLURL.Entry lurl;
            int refcount = 0;
            urlCache = new HashMap();
            // accept further word hashes only while the URL reference budget is not
            // exhausted and the next hash stays close (DHT distance < 0.2) to the first
            while ((maxcount > refcount) &&
                   ((tmpContainers.size() == 0) ||
                    (yacyDHTAction.dhtDistance(wordHashes[refcount], ((plasmaWordIndexEntryContainer) tmpContainers.get(0)).wordHash()) < 0.2))) {
                // make an on-the-fly entity and insert values
                indexContainer = wordIndex.getContainer(wordHashes[refcount], true, 10000);
                int notBoundCounter = 0;
                try {
                    urlIter = indexContainer.entries();
                    // iterate over indexes to fetch url entries and store them in the urlCache
                    while ((urlIter.hasNext()) && (maxcount > refcount)) {
                        indexEntry = (plasmaWordIndexEntry) urlIter.next();
                        try {
                            lurl = lurls.getEntry(indexEntry.getUrlHash(), indexEntry);
                            if ((lurl == null) || (lurl.url() == null)) {
                                // stale reference: URL is not bound to a loaded entry, so drop it
                                notBoundCounter++;
                                urlIter.remove();
                                wordIndex.removeEntries(wordHashes[refcount], new String[] { indexEntry.getUrlHash() }, true);
                            } else {
                                urlCache.put(indexEntry.getUrlHash(), lurl);
                                refcount++;
                            }
                        } catch (IOException e) {
                            notBoundCounter++;
                            urlIter.remove();
                            wordIndex.removeEntries(wordHashes[refcount], new String[] { indexEntry.getUrlHash() }, true);
                        }
                    }
                    // remove all remaining; we have enough
                    while (urlIter.hasNext()) {
                        indexEntry = (plasmaWordIndexEntry) urlIter.next();
                        urlIter.remove();
                    }
                    // use what's left
                    log.logFine("Selected partial index (" + indexContainer.size() + " from " + wordIndex.indexSize(wordHashes[refcount - 1]) + " URLs, " + notBoundCounter + " not bound) for word " + indexContainer.wordHash());
                    tmpContainers.add(indexContainer);
                } catch (kelondroException e) {
                    log.logSevere("plasmaWordIndexDistribution/2: deleted DB for word " + wordHashes[refcount], e);
                    wordIndex.deleteIndex(wordHashes[refcount]);
                }
            }
            // create result
            indexContainers = (plasmaWordIndexEntryContainer[]) tmpContainers.toArray(new plasmaWordIndexEntryContainer[tmpContainers.size()]);
            if ((indexContainers == null) || (indexContainers.length == 0)) {
                log.logFine("No index available for index transfer, hash start-point " + startPointHash);
                this.status = chunkStatus_FAILED;
                return 0;
            }
            this.status = chunkStatus_FILLED;
            return refcount;
        } catch (kelondroException e) {
            log.logSevere("selectTransferIndexes database corrupted: " + e.getMessage(), e);
            indexContainers = new plasmaWordIndexEntryContainer[0];
            urlCache = new HashMap();
            this.status = chunkStatus_FAILED;
            return 0;
        }
    }
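    // ------------------------------------------------------------------------
    // Note (illustrative, not part of the original patch): the selection loop in
    // selectTransferContainersResource() only accepts a further word hash while
    // yacyDHTAction.dhtDistance(next, first) < 0.2, so each transfer chunk covers
    // a narrow, contiguous arc of the DHT hash ring. A minimal sketch of such a
    // ring distance, assuming hash positions are already normalized to [0,1):
    //
    //     double ringDistance(double posA, double posB) {
    //         double d = posA - posB;         // signed offset on the ring
    //         return (d >= 0) ? d : d + 1.0;  // wrap around at 1.0
    //     }
    //
    // The real yacyDHTAction.dhtDistance operates on base64-encoded word hashes;
    // the hypothetical ringDistance above only shows the wrap-around ordering
    // that the 0.2 cutoff relies on.
    // ------------------------------------------------------------------------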
    /*
    private int selectTransferContainersResource(String hash, int resourceLevel, int maxcount) {
        // the hash is a start hash from where the indexes are picked
        ArrayList tmpContainers = new ArrayList(maxcount);
        String nexthash = "";
        try {
            Iterator wordHashIterator = wordIndex.wordHashes(hash, resourceLevel, true, maxcount).iterator();
            plasmaWordIndexEntryContainer indexContainer;
            Iterator urlIter;
            plasmaWordIndexEntry indexEntry;
@@ -321,7 +245,7 @@ public class plasmaDHTChunk {
            return 0;
        }
    }
    */
    public int deleteTransferIndexes() {
        Iterator urlIter;