Fix for SVN 1888.

This is a redesign of the no-iterator solution.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1892 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent 1fc494858d
commit a6a3f4b694

@ -284,7 +284,7 @@ public class IndexControl_p {
// generate list // generate list
if (post.containsKey("keyhashsimilar")) { if (post.containsKey("keyhashsimilar")) {
final Iterator hashIt = switchboard.wordIndex.wordHashes(keyhash, plasmaWordIndex.RL_WORDFILES, true); final Iterator hashIt = switchboard.wordIndex.wordHashes(keyhash, plasmaWordIndex.RL_WORDFILES, true, 256).iterator();
StringBuffer result = new StringBuffer("Sequential List of Word-Hashes:<br>"); StringBuffer result = new StringBuffer("Sequential List of Word-Hashes:<br>");
String hash; String hash;
int i = 0; int i = 0;

@ -169,88 +169,12 @@ public class plasmaDHTChunk {
return; return;
} }
private int selectTransferContainersResource(String hash, int resourceLevel, int maxcount) { private int selectTransferContainersResource(String hash, int resourceLevel, int maxcount) {
// the hash is a start hash from where the indexes are picked
ArrayList tmpContainers = new ArrayList(maxcount);
try {
String[] wordHashes = wordIndex.wordHashes(hash, resourceLevel, true, maxcount);
plasmaWordIndexEntryContainer indexContainer;
Iterator urlIter;
plasmaWordIndexEntry indexEntry;
plasmaCrawlLURL.Entry lurl;
int refcount = 0;
urlCache = new HashMap();
while ((maxcount > refcount) && ((tmpContainers.size() == 0) || (yacyDHTAction.dhtDistance(wordHashes[refcount], ((plasmaWordIndexEntryContainer) tmpContainers.get(0)).wordHash()) < 0.2))) {
// make an on-the-fly entity and insert values
indexContainer = wordIndex.getContainer(wordHashes[refcount], true, 10000);
int notBoundCounter = 0;
try {
urlIter = indexContainer.entries();
// iterate over indexes to fetch url entries and store them in the urlCache
while ((urlIter.hasNext()) && (maxcount > refcount)) {
indexEntry = (plasmaWordIndexEntry) urlIter.next();
try {
lurl = lurls.getEntry(indexEntry.getUrlHash(), indexEntry);
if ((lurl == null) || (lurl.url() == null)) {
notBoundCounter++;
urlIter.remove();
wordIndex.removeEntries(wordHashes[refcount], new String[] { indexEntry.getUrlHash() }, true);
} else {
urlCache.put(indexEntry.getUrlHash(), lurl);
refcount++;
}
} catch (IOException e) {
notBoundCounter++;
urlIter.remove();
wordIndex.removeEntries(wordHashes[refcount], new String[] { indexEntry.getUrlHash() }, true);
}
}
// remove all remaining; we have enough
while (urlIter.hasNext()) {
indexEntry = (plasmaWordIndexEntry) urlIter.next();
urlIter.remove();
}
// use whats left
log.logFine("Selected partial index (" + indexContainer.size() + " from " + wordIndex.indexSize(wordHashes[refcount-1]) + " URLs, " + notBoundCounter + " not bound) for word " + indexContainer.wordHash());
tmpContainers.add(indexContainer);
} catch (kelondroException e) {
log.logSevere("plasmaWordIndexDistribution/2: deleted DB for word " + wordHashes[refcount], e);
wordIndex.deleteIndex(wordHashes[refcount]);
}
}
// create result
indexContainers = (plasmaWordIndexEntryContainer[]) tmpContainers.toArray(new plasmaWordIndexEntryContainer[tmpContainers.size()]);
if ((indexContainers == null) || (indexContainers.length == 0)) {
log.logFine("No index available for index transfer, hash start-point " + startPointHash);
this.status = chunkStatus_FAILED;
return 0;
}
this.status = chunkStatus_FILLED;
return refcount;
} catch (kelondroException e) {
log.logSevere("selectTransferIndexes database corrupted: " + e.getMessage(), e);
indexContainers = new plasmaWordIndexEntryContainer[0];
urlCache = new HashMap();
this.status = chunkStatus_FAILED;
return 0;
}
}
/*
private int selectTransferContainersResource(String hash, int resourceLevel, int maxcount) {
// the hash is a start hash from where the indexes are picked // the hash is a start hash from where the indexes are picked
ArrayList tmpContainers = new ArrayList(maxcount); ArrayList tmpContainers = new ArrayList(maxcount);
String nexthash = ""; String nexthash = "";
try { try {
Iterator wordHashIterator = wordIndex.wordHashes(hash, resourceLevel, true); Iterator wordHashIterator = wordIndex.wordHashes(hash, resourceLevel, true, maxcount).iterator();
plasmaWordIndexEntryContainer indexContainer; plasmaWordIndexEntryContainer indexContainer;
Iterator urlIter; Iterator urlIter;
plasmaWordIndexEntry indexEntry; plasmaWordIndexEntry indexEntry;
@ -321,7 +245,7 @@ public class plasmaDHTChunk {
return 0; return 0;
} }
} }
*/
public int deleteTransferIndexes() { public int deleteTransferIndexes() {
Iterator urlIter; Iterator urlIter;

@ -54,6 +54,7 @@ import java.util.Map;
import java.util.HashSet; import java.util.HashSet;
import java.util.Set; import java.util.Set;
import java.util.Date; import java.util.Date;
import java.util.TreeSet;
import java.net.URL; import java.net.URL;
import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.htmlFilter.htmlFilterContentScraper;
@ -380,20 +381,15 @@ public final class plasmaWordIndex {
public static final int RL_ASSORTMENTS = 2; public static final int RL_ASSORTMENTS = 2;
public static final int RL_WORDFILES = 3; public static final int RL_WORDFILES = 3;
public synchronized String[] wordHashes(String startHash, int resourceLevel, boolean rot, int count) { public synchronized TreeSet wordHashes(String startHash, int resourceLevel, boolean rot, int count) {
String[] hashes = new String[count]; TreeSet hashes = new TreeSet();
Iterator i = wordHashes(startHash, resourceLevel, rot); Iterator i = wordHashes(startHash, resourceLevel, rot);
int j = 0; String hash;
while ((count-- > 0) && (i.hasNext())) { while ((hashes.size() < count) && (i.hasNext())) {
hashes[j++] = (String) i.next(); hash = (String) i.next();
} if ((hash != null) && (hash.length() > 0)) hashes.add(hash);
if (count > 0) {
String[] s = new String[j];
System.arraycopy(hashes, 0, s, 0, j);
return s;
} else {
return hashes;
} }
return hashes;
} }
public Iterator wordHashes(String startHash, int resourceLevel, boolean rot) { public Iterator wordHashes(String startHash, int resourceLevel, boolean rot) {

@ -56,7 +56,7 @@ import de.anomic.kelondro.kelondroRecords;
import de.anomic.server.logging.serverLog; import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacySeedDB; import de.anomic.yacy.yacySeedDB;
public final class plasmaWordIndexCache /*implements plasmaWordIndexInterface*/ { public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
// environment constants // environment constants
private static final String indexArrayFileName = "indexDump1.array"; private static final String indexArrayFileName = "indexDump1.array";
@ -277,6 +277,10 @@ public final class plasmaWordIndexCache /*implements plasmaWordIndexInterface*/
return kCache.size(); return kCache.size();
} }
public int size() {
return wCache.size() + kCache.size();
}
public int indexSize(String wordHash) { public int indexSize(String wordHash) {
int size = 0; int size = 0;
plasmaWordIndexEntryContainer cacheIndex = (plasmaWordIndexEntryContainer) wCache.get(wordHash); plasmaWordIndexEntryContainer cacheIndex = (plasmaWordIndexEntryContainer) wCache.get(wordHash);

Loading…
Cancel
Save