try to fix new 100% CPU bug, possibly caused by iterator method

see http://www.yacy-forum.de/viewtopic.php?p=18900#18900

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1888 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 19 years ago
parent 5ddc32f49a
commit 283a7181c6
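The suspected failure mode is a word-hash iterator that wraps around to the start of the index instead of reporting exhaustion, so a caller that loops on hasNext() alone never terminates and pins a core at 100%. Below is a minimal, self-contained sketch of that pattern together with the count-bounded consumer loop used by the new wordHashes(startHash, resourceLevel, rot, count) helper in this commit; the RotatingIterator class here is illustrative only, not YaCy's actual rotatingWordIterator.

// Illustrative sketch (not YaCy code): an iterator over a sorted set of word
// hashes that rotates back to the smallest element when it reaches the end.
// Its hasNext() never returns false for a non-empty set, so an unbounded
// "while (iter.hasNext())" loop spins at 100% CPU.
import java.util.Iterator;
import java.util.SortedSet;
import java.util.TreeSet;

public class RotatingIteratorDemo {

    static class RotatingIterator implements Iterator {
        private final SortedSet hashes;
        private Iterator inner;

        RotatingIterator(SortedSet hashes, String startHash) {
            this.hashes = hashes;
            this.inner = hashes.tailSet(startHash).iterator();
        }

        public boolean hasNext() {
            return !hashes.isEmpty(); // never false while the set has elements
        }

        public Object next() {
            if (!inner.hasNext()) inner = hashes.iterator(); // rotate to the start
            return inner.next();
        }

        public void remove() {
            throw new UnsupportedOperationException();
        }
    }

    public static void main(String[] args) {
        SortedSet hashes = new TreeSet();
        hashes.add("aaaa"); hashes.add("bbbb"); hashes.add("cccc");
        Iterator iter = new RotatingIterator(hashes, "bbbb");

        // Safe pattern: bound the loop with an explicit count, as the new
        // wordHashes(startHash, resourceLevel, rot, count) method does.
        int count = 5;
        while ((count-- > 0) && (iter.hasNext())) {
            System.out.println((String) iter.next());
        }
        // A plain "while (iter.hasNext())" loop here would never terminate.
    }
}

The bounded loop mirrors the approach taken in selectTransferContainersResource below, which caps its work with maxcount instead of relying on the iterator to run dry.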

@@ -169,6 +169,82 @@ public class plasmaDHTChunk {
        return;
    }

    private int selectTransferContainersResource(String hash, int resourceLevel, int maxcount) {
        // the hash is a start hash from where the indexes are picked
        ArrayList tmpContainers = new ArrayList(maxcount);
        try {
            String[] wordHashes = wordIndex.wordHashes(hash, resourceLevel, true, maxcount);
            plasmaWordIndexEntryContainer indexContainer;
            Iterator urlIter;
            plasmaWordIndexEntry indexEntry;
            plasmaCrawlLURL.Entry lurl;
            int refcount = 0;
            urlCache = new HashMap();
            while ((maxcount > refcount) && ((tmpContainers.size() == 0) || (yacyDHTAction.dhtDistance(wordHashes[refcount], ((plasmaWordIndexEntryContainer) tmpContainers.get(0)).wordHash()) < 0.2))) {
                // make an on-the-fly entity and insert values
                indexContainer = wordIndex.getContainer(wordHashes[refcount], true, 10000);
                int notBoundCounter = 0;
                try {
                    urlIter = indexContainer.entries();
                    // iterate over indexes to fetch url entries and store them in the urlCache
                    while ((urlIter.hasNext()) && (maxcount > refcount)) {
                        indexEntry = (plasmaWordIndexEntry) urlIter.next();
                        try {
                            lurl = lurls.getEntry(indexEntry.getUrlHash(), indexEntry);
                            if ((lurl == null) || (lurl.url() == null)) {
                                notBoundCounter++;
                                urlIter.remove();
                                wordIndex.removeEntries(wordHashes[refcount], new String[] { indexEntry.getUrlHash() }, true);
                            } else {
                                urlCache.put(indexEntry.getUrlHash(), lurl);
                                refcount++;
                            }
                        } catch (IOException e) {
                            notBoundCounter++;
                            urlIter.remove();
                            wordIndex.removeEntries(wordHashes[refcount], new String[] { indexEntry.getUrlHash() }, true);
                        }
                    }
                    // remove all remaining; we have enough
                    while (urlIter.hasNext()) {
                        indexEntry = (plasmaWordIndexEntry) urlIter.next();
                        urlIter.remove();
                    }
                    // use what's left
                    log.logFine("Selected partial index (" + indexContainer.size() + " from " + wordIndex.indexSize(wordHashes[refcount-1]) + " URLs, " + notBoundCounter + " not bound) for word " + indexContainer.wordHash());
                    tmpContainers.add(indexContainer);
                } catch (kelondroException e) {
                    log.logSevere("plasmaWordIndexDistribution/2: deleted DB for word " + wordHashes[refcount], e);
                    wordIndex.deleteIndex(wordHashes[refcount]);
                }
            }
            // create result
            indexContainers = (plasmaWordIndexEntryContainer[]) tmpContainers.toArray(new plasmaWordIndexEntryContainer[tmpContainers.size()]);
            if ((indexContainers == null) || (indexContainers.length == 0)) {
                log.logFine("No index available for index transfer, hash start-point " + startPointHash);
                this.status = chunkStatus_FAILED;
                return 0;
            }
            this.status = chunkStatus_FILLED;
            return refcount;
        } catch (kelondroException e) {
            log.logSevere("selectTransferIndexes database corrupted: " + e.getMessage(), e);
            indexContainers = new plasmaWordIndexEntryContainer[0];
            urlCache = new HashMap();
            this.status = chunkStatus_FAILED;
            return 0;
        }
    }
    /*
    private int selectTransferContainersResource(String hash, int resourceLevel, int maxcount) {
        // the hash is a start hash from where the indexes are picked
        ArrayList tmpContainers = new ArrayList(maxcount);
@@ -245,7 +321,8 @@ public class plasmaDHTChunk {
            return 0;
        }
    }
    */

    public int deleteTransferIndexes() {
        Iterator urlIter;
        plasmaWordIndexEntry indexEntry;

@@ -380,6 +380,22 @@ public final class plasmaWordIndex {
    public static final int RL_ASSORTMENTS = 2;
    public static final int RL_WORDFILES = 3;

    public synchronized String[] wordHashes(String startHash, int resourceLevel, boolean rot, int count) {
        String[] hashes = new String[count];
        Iterator i = wordHashes(startHash, resourceLevel, rot);
        int j = 0;
        while ((count-- > 0) && (i.hasNext())) {
            hashes[j++] = (String) i.next();
        }
        if (j < hashes.length) {
            // the iterator ran dry before the array was filled; truncate to avoid trailing null entries
            String[] s = new String[j];
            System.arraycopy(hashes, 0, s, 0, j);
            return s;
        } else {
            return hashes;
        }
    }

    public Iterator wordHashes(String startHash, int resourceLevel, boolean rot) {
        if (rot) return new rotatingWordIterator(startHash, resourceLevel);
        else return new correctedWordIterator(startHash, resourceLevel, rot); // use correction until bug is found
