- removed some usages of indexEntity

- changed the index collection process: indexes are no longer flushed to indexEntity
  first, but are now collected directly from the RAM cache and the assortments
  (see the sketch below)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1489 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 19 years ago
parent 09dc7bbcd7
commit fa90c3ca7a
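
For illustration, a minimal sketch of the new access pattern, using the class and method names that appear in the diff below (the surrounding wiring is assumed):

    // Instead of opening an on-disk plasmaWordIndexEntity, copying its
    // entries and closing it in a finally block, callers now ask the word
    // index for a merged in-memory container and simply iterate it.
    plasmaWordIndexEntryContainer container =
        switchboard.wordIndex.getContainer(wordHash, true, -1);
    Iterator en = container.entries();
    while (en.hasNext()) {
        plasmaWordIndexEntry entry = (plasmaWordIndexEntry) en.next();
        // ... use entry.getUrlHash() etc.; no close() is needed afterwards
    }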

@@ -3,7 +3,7 @@ javacSource=1.4
javacTarget=1.4
# Release Configuration
releaseVersion=0.423
releaseVersion=0.424
releaseFile=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
#releaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
releaseDir=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr}

@@ -60,7 +60,6 @@ import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaURL;
import de.anomic.plasma.plasmaWordIndexEntity;
import de.anomic.plasma.plasmaWordIndexEntry;
import de.anomic.plasma.plasmaWordIndexEntryContainer;
import de.anomic.server.serverObjects;
@@ -150,22 +149,15 @@ public class IndexControl_p {
if (post.containsKey("keyhashdeleteall")) {
if (delurl || delurlref) {
// generate an urlx array
plasmaWordIndexEntity index = null;
try {
index = switchboard.wordIndex.getEntity(keyhash, true, -1);
Iterator en = index.elements(true);
plasmaWordIndexEntryContainer index = null;
index = switchboard.wordIndex.getContainer(keyhash, true, -1);
Iterator en = index.entries();
int i = 0;
urlx = new String[index.size()];
while (en.hasNext()) {
urlx[i++] = ((plasmaWordIndexEntry) en.next()).getUrlHash();
}
index.close();
index = null;
} catch (IOException e) {
urlx = new String[0];
} finally {
if (index != null) try { index.close(); } catch (Exception e) {}
}
}
if (delurlref) {
for (int i = 0; i < urlx.length; i++) switchboard.removeAllUrlReferences(urlx[i], true);
@@ -256,12 +248,12 @@ public class IndexControl_p {
}
prop.put("urlstring", "");
prop.put("urlhash", "");
plasmaWordIndexEntryContainer[] indexes = new plasmaWordIndexEntryContainer[1];
plasmaWordIndexEntryContainer index;
String result;
long starttime = System.currentTimeMillis();
indexes[0] = switchboard.wordIndex.getContainer(keyhash, true, -1);
index = switchboard.wordIndex.getContainer(keyhash, true, -1);
// built urlCache
Iterator urlIter = indexes[0].entries();
Iterator urlIter = index.entries();
HashMap knownURLs = new HashMap();
HashSet unknownURLEntries = new HashSet();
plasmaWordIndexEntry indexEntry;
@@ -271,8 +263,8 @@ public class IndexControl_p {
try {
lurl = switchboard.urlPool.loadedURL.getEntry(indexEntry.getUrlHash(), null);
if (lurl.toString() == null) {
switchboard.urlPool.loadedURL.remove(indexEntry.getUrlHash());
unknownURLEntries.add(indexEntry.getUrlHash());
urlIter.remove();
} else {
knownURLs.put(indexEntry.getUrlHash(), lurl);
}
@@ -280,23 +272,17 @@ public class IndexControl_p {
unknownURLEntries.add(indexEntry.getUrlHash());
}
}
// now delete all entries that have no url entry
Iterator hashIter = unknownURLEntries.iterator();
while (hashIter.hasNext()) {
indexes[0].remove((String) hashIter.next());
}
// use what's remaining
String gzipBody = switchboard.getConfig("indexControl.gzipBody","false");
int timeout = (int) switchboard.getConfigLong("indexControl.timeout",60000);
result = yacyClient.transferIndex(
yacyCore.seedDB.getConnected(post.get("hostHash", "")),
indexes,
new plasmaWordIndexEntryContainer[]{index},
knownURLs,
"true".equalsIgnoreCase(gzipBody),
timeout);
prop.put("result", (result == null) ? ("Successfully transferred " + indexes[0].size() + " words in " + ((System.currentTimeMillis() - starttime) / 1000) + " seconds") : result);
indexes[0] = null;
indexes = null;
prop.put("result", (result == null) ? ("Successfully transferred " + index.size() + " words in " + ((System.currentTimeMillis() - starttime) / 1000) + " seconds") : result);
index = null;
}
// generate list

@@ -47,7 +47,6 @@
// if the shell's current path is HTROOT
import java.util.Date;
import java.io.IOException;
import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects;
@@ -86,16 +85,7 @@ public final class query {
if (obj.equals("rwiurlcount")) {
// the total number of different urls in the rwi is returned
// <env> shall contain a word hash, the number of assigned lurls to this hash is returned
de.anomic.plasma.plasmaWordIndexEntity entity = null;
try {
entity = sb.wordIndex.getEntity(env, true, -1);
prop.put("response", entity.size());
entity.close();
} catch (IOException e) {
prop.put("response", -1);
} finally {
if (entity != null) try { entity.close(); } catch (Exception e) {}
}
prop.put("response", sb.wordIndex.indexSize(env));
return prop;
}
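
The handler above shrinks from an entity open/close with IOException handling to a single aggregate call; a condensed sketch of the resulting core (prop and sb as in the diff):

    // env carries the word hash; indexSize() sums the counts from the RAM
    // cache, the assortments and the file back-end, so there is no file
    // handle left to leak and no -1 error path for IOExceptions
    prop.put("response", sb.wordIndex.indexSize(env));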

@@ -225,22 +225,16 @@ public class plasmaDbImporter extends Thread {
Iterator importWordHashIterator = this.importWordIndex.wordHashes(wordChunkStartHash, true, true);
while (!isAborted() && importWordHashIterator.hasNext()) {
plasmaWordIndexEntity importWordIdxEntity = null;
plasmaWordIndexEntryContainer newContainer;
try {
wordCounter++;
wordHash = (String) importWordHashIterator.next();
importWordIdxEntity = importWordIndex.getEntity(wordHash, true, -1);
newContainer = importWordIndex.getContainer(wordHash, true, -1);
if (importWordIdxEntity.size() == 0) {
importWordIdxEntity.deleteComplete();
continue;
}
// creating a container used to hold the imported entries
plasmaWordIndexEntryContainer newContainer = new plasmaWordIndexEntryContainer(wordHash,importWordIdxEntity.size());
if (newContainer.size() == 0) continue;
// the combined container will fit, read the container
Iterator importWordIdxEntries = importWordIdxEntity.elements(true);
Iterator importWordIdxEntries = newContainer.entries();
plasmaWordIndexEntry importWordIdxEntry;
while (importWordIdxEntries.hasNext()) {
@@ -262,9 +256,6 @@ public class plasmaDbImporter extends Thread {
}
} catch (IOException e) {}
// adding word index entity to container
newContainer.add(importWordIdxEntry,System.currentTimeMillis());
if (entryCounter % 500 == 0) {
this.log.logFine(entryCounter + " word entries and " + wordCounter + " word entities processed so far.");
}
@@ -277,7 +268,6 @@ public class plasmaDbImporter extends Thread {
homeWordIndex.addEntries(newContainer, true);
// delete complete index entity file
importWordIdxEntity.close();
importWordIndex.deleteIndex(wordHash);
// print out some statistical information
@@ -300,7 +290,6 @@ public class plasmaDbImporter extends Thread {
} catch (Exception e) {
log.logSevere("Import of word entity '" + wordHash + "' failed.",e);
} finally {
if (importWordIdxEntity != null) try { importWordIdxEntity.close(); } catch (Exception e) {}
}
}

@@ -551,8 +551,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
getConfig("allowDistributeIndex", "false").equalsIgnoreCase("true"),
getConfig("allowDistributeIndexWhileCrawling","false").equalsIgnoreCase("true"),
getConfig("indexDistribution.gzipBody","false").equalsIgnoreCase("true"),
(int)getConfigLong("indexDistribution.timeout",60000),
(int)getConfigLong("indexDistribution.maxOpenFiles",800)
(int)getConfigLong("indexDistribution.timeout",60000) /*,
(int)getConfigLong("indexDistribution.maxOpenFiles",800)*/
);
indexDistribution.setCounts(150, 1, 3, 10000);
deployThread("20_dhtdistribution", "DHT Distribution", "selection, transfer and deletion of index entries that are not searched on your peer, but on others", null,
@@ -1353,7 +1353,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
words = condenser.RESULT_SIMI_WORDS;
// transfering the index to the storage peer
String error = yacyClient.transferIndex(seed,(plasmaWordIndexEntryContainer[])tmpContainers.toArray(new plasmaWordIndexEntity[tmpContainers.size()]),urlCache,true,120000);
String error = yacyClient.transferIndex(
seed,
(plasmaWordIndexEntryContainer[])tmpContainers.toArray(new plasmaWordIndexEntryContainer[tmpContainers.size()]),
urlCache,
true,
120000);
if (error != null) {
words = wordIndex.addPageIndex(entry.url(), urlHash, docDate, (int) entry.size(), condenser, plasmaWordIndexEntry.language(entry.url()), plasmaWordIndexEntry.docType(document.getMimeType()));

@@ -178,6 +178,10 @@ public final class plasmaWordIndex {
return condenser.RESULT_SIMI_WORDS;
}
public int indexSize(String wordHash) {
return ramCache.indexSize(wordHash);
}
public plasmaWordIndexEntryContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxTime) {
return ramCache.getContainer(wordHash, deleteIfEmpty, maxTime);
}
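
plasmaWordIndex thus stays a thin facade: both new methods delegate to the RAM cache layer, which merges the assortments and the file back-end underneath. A hedged usage sketch (switchboard wiring assumed):

    int urls = switchboard.wordIndex.indexSize(wordHash);       // aggregated count
    plasmaWordIndexEntryContainer c =
        switchboard.wordIndex.getContainer(wordHash, true, -1); // merged entries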

@@ -173,6 +173,23 @@ public final class plasmaWordIndexAssortment {
return row2container(wordHash, row);
}
public boolean contains(String wordHash) {
// gets a word index from assortment database
// and returns the content record
byte[][] row = null;
try {
row = assortments.get(wordHash.getBytes());
return (row != null);
} catch (IOException e) {
return false;
} catch (kelondroException e) {
log.logSevere("removeAssortment/kelondro-error: " + e.getMessage()
+ " - reset assortment-DB " + assortments.file(), e);
resetDatabase();
return false;
}
}
public plasmaWordIndexEntryContainer get(String wordHash) {
// gets a word index from assortment database
// and returns the content record
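
The new contains() answers the membership question with a single raw row lookup, without materializing a plasmaWordIndexEntryContainer; the cluster's indexSize() in the next hunk relies on exactly this cheap test. A small usage sketch (the assortment instance name is an assumption):

    // true iff the word hash has a row in this assortment file
    if (assortment.contains(wordHash)) {
        // count it without reading or decoding the entry payload
    }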

@@ -226,6 +226,14 @@ public final class plasmaWordIndexAssortmentCluster {
return record;
}
public int indexSize(String wordHash) {
int size = 0;
for (int i = 0; i < clusterCount; i++) {
if (assortments[i].contains(wordHash)) size += i + 1;
}
return size;
}
public Iterator hashConjunction(String startWordHash, boolean up, boolean rot) {
HashSet iterators = new HashSet();
//if (rot) System.out.println("WARNING: kelondroMergeIterator does not work correctly when individual iterators rotate on their own!");
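
The arithmetic leans on the cluster layout implied by the code above: assortment i stores, for each word, a row of exactly i + 1 entries, so a hit in assortment i contributes i + 1 to the total. A worked example under that assumption (cluster is a hypothetical instance):

    // word hash present in assortment 0 (1 entry) and assortment 2 (3 entries):
    int size = cluster.indexSize(wordHash); // (0 + 1) + (2 + 1) = 4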

@@ -258,6 +258,21 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
return java.lang.Math.max(assortmentCluster.sizeTotal(), java.lang.Math.max(backend.size(), cache.size()));
}
public int indexSize(String wordHash) {
int size = 0;
try {
plasmaWordIndexEntity entity = backend.getEntity(wordHash, true, -1);
if (entity != null) {
size += entity.size();
entity.close();
}
} catch (IOException e) {}
size += assortmentCluster.indexSize(wordHash);
TreeMap cacheIndex = (TreeMap) cache.get(wordHash);
if (cacheIndex != null) size += cacheIndex.size();
return size;
}
public Iterator wordHashes(String startWordHash, boolean up) {
// Old convention implies rot = true
//return new rotatingWordHashes(startWordHash, up);
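
This method is the collection change from the commit message in one place: the per-word count is the sum over the file back-end, the assortment cluster and the in-memory cache, with no temporary entity in between. A worked example with assumed layer counts (wordIndexCache is a hypothetical instance):

    // back-end entity: 10 entries, assortment cluster: 4, RAM cache: 2
    int size = wordIndexCache.indexSize(wordHash); // 10 + 4 + 2 = 16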

@@ -47,7 +47,6 @@ import java.io.IOException;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.Iterator;
import java.util.HashSet;
import java.util.HashMap;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacySeed;
@@ -77,8 +76,6 @@ public final class plasmaWordIndexDistribution {
private boolean closed;
private boolean gzipBody4Distribution;
private int timeout4Distribution;
private int maxOpenFiles4Distribution;
public transferIndexThread transferIdxThread = null;
public plasmaWordIndexDistribution(
@@ -88,8 +85,7 @@ public final class plasmaWordIndexDistribution {
boolean enable,
boolean enabledWhileCrawling,
boolean gzipBody,
int timeout,
int maxOpenFiles
int timeout
) {
this.urlPool = urlPool;
this.wordIndex = wordIndex;
@@ -100,7 +96,6 @@ public final class plasmaWordIndexDistribution {
setCounts(100 /*indexCount*/, 1 /*juniorPeerCount*/, 3 /*seniorPeerCount*/, 8000);
this.gzipBody4Distribution = gzipBody;
this.timeout4Distribution = timeout;
this.maxOpenFiles4Distribution = maxOpenFiles;
}
public void enable() {
@@ -201,9 +196,8 @@ public final class plasmaWordIndexDistribution {
// collect index
String startPointHash = selectTransferStart();
log.logFine("Selected hash " + startPointHash + " as start point for index distribution, distance = " + yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, startPointHash));
Object[] selectResult = selectTransferContainers(startPointHash, indexCount, this.maxOpenFiles4Distribution);
Object[] selectResult = selectTransferContainers(startPointHash, indexCount);
plasmaWordIndexEntryContainer[] indexContainers = (plasmaWordIndexEntryContainer[]) selectResult[0];
//Integer openedFiles = (Integer) selectResult[2];
HashMap urlCache = (HashMap) selectResult[1]; // String (url-hash) / plasmaCrawlLURL.Entry
if ((indexContainers == null) || (indexContainers.length == 0)) {
log.logFine("No index available for index transfer, hash start-point " + startPointHash);
@@ -267,7 +261,12 @@ public final class plasmaWordIndexDistribution {
return -1; // interrupted
}
start = System.currentTimeMillis();
error = yacyClient.transferIndex(seeds[i], indexContainers, urlCache, this.gzipBody4Distribution, this.timeout4Distribution);
error = yacyClient.transferIndex(
seeds[i],
indexContainers,
urlCache,
this.gzipBody4Distribution,
this.timeout4Distribution);
if (error == null) {
log.logInfo("Index transfer of " + indexCount + " words [" + indexContainers[0].wordHash() + " .. " + indexContainers[indexContainers.length - 1].wordHash() + "] to peer " + seeds[i].getName() + ":" + seeds[i].hash + " in " + ((System.currentTimeMillis() - start) / 1000)
+ " seconds successfull (" + (1000 * indexCount / (System.currentTimeMillis() - start + 1)) + " words/s)");
@@ -285,18 +284,9 @@ public final class plasmaWordIndexDistribution {
if (hc1 >= peerCount) {
// success
if (delete) {
try {
if (deleteTransferIndexes(indexContainers)) {
log.logFine("Deleted all " + indexContainers.length + " transferred whole-word indexes locally");
int deletedURLs = deleteTransferIndexes(indexContainers);
log.logFine("Deleted from " + indexContainers.length + " transferred RWIs locally, removed " + deletedURLs + " URL references");
return indexCount;
} else {
log.logSevere("Deleted not all transferred whole-word indexes");
return -1;
}
} catch (IOException ee) {
log.logSevere("Deletion of indexes not possible:" + ee.getMessage(), ee);
return -1;
}
} else {
// simply close the indexEntities
closeTransferIndexes(indexContainers);
@@ -323,86 +313,67 @@ public final class plasmaWordIndexDistribution {
}
Object[] /* of {plasmaWordIndexEntryContainer[], HashMap(String, plasmaCrawlLURL.Entry)}*/
selectTransferContainers(String hash, int count, int maxOpenFiles) {
selectTransferContainers(String hash, int count) {
// the hash is a start hash from where the indexes are picked
ArrayList tmpContainers = new ArrayList(count);
String nexthash = "";
try {
int currOpenFiles = 0;
Iterator wordHashIterator = this.wordIndex.wordHashes(hash, true, true);
plasmaWordIndexEntity indexEntity;
plasmaWordIndexEntryContainer indexContainer;
Iterator urlIter;
Iterator hashIter;
plasmaWordIndexEntry indexEntry;
plasmaCrawlLURL.Entry lurl;
final HashSet unknownURLEntries = new HashSet();
int notBoundCounter = 0;
final HashMap knownURLs = new HashMap();
while (
(count > 0) &&
(currOpenFiles < maxOpenFiles) &&
(wordHashIterator.hasNext()) &&
((nexthash = (String) wordHashIterator.next()) != null) &&
(nexthash.trim().length() > 0) &&
((currOpenFiles == 0) ||
(yacyDHTAction.dhtDistance(nexthash, ((plasmaWordIndexEntity)tmpContainers.get(0)).wordHash()) < 0.2))
((tmpContainers.size() == 0) ||
(yacyDHTAction.dhtDistance(nexthash, ((plasmaWordIndexEntryContainer)tmpContainers.get(0)).wordHash()) < 0.2))
) {
indexEntity = this.wordIndex.getEntity(nexthash, true, -1);
if (indexEntity.size() == 0) {
indexEntity.deleteComplete();
} else {
// make an on-the-fly entity and insert values
indexContainer = new plasmaWordIndexEntryContainer(indexEntity.wordHash());
indexContainer = this.wordIndex.getContainer(nexthash, true, 10000);
try {
urlIter = indexEntity.elements(true);
unknownURLEntries.clear();
urlIter = indexContainer.entries();
// iterate over indexes to fetch url entries and store them in the urlCache
while ((urlIter.hasNext()) && (count > 0)) {
indexEntry = (plasmaWordIndexEntry) urlIter.next();
try {
lurl = this.urlPool.loadedURL.getEntry(indexEntry.getUrlHash(), indexEntry);
if ((lurl == null) || (lurl.url() == null)) {
unknownURLEntries.add(indexEntry.getUrlHash());
notBoundCounter++;
urlIter.remove();
this.wordIndex.removeEntries(nexthash, new String[]{indexEntry.getUrlHash()}, true);
} else {
knownURLs.put(indexEntry.getUrlHash(), lurl);
indexContainer.add(indexEntry);
count--;
}
} catch (IOException e) {
unknownURLEntries.add(indexEntry.getUrlHash());
notBoundCounter++;
urlIter.remove();
this.wordIndex.removeEntries(nexthash, new String[]{indexEntry.getUrlHash()}, true);
}
}
// now delete all entries that have no url entry
hashIter = unknownURLEntries.iterator();
while (hashIter.hasNext()) {
String nextUrlHash = (String) hashIter.next();
indexEntity.removeEntry(nextUrlHash, true);
this.urlPool.loadedURL.remove(nextUrlHash);
}
// deleting entity if there are no more entries left
// This could occur if there are unknownURLs in the entity
if (indexEntity.size() == 0) {
indexEntity.deleteComplete();
// remove all remaining; we have enough
while (urlIter.hasNext()) {
indexEntry = (plasmaWordIndexEntry) urlIter.next();
urlIter.remove();
}
// use what's remaining
this.log.logFine("Selected partial index (" + indexContainer.size() + " from " + indexEntity.size() +" URLs, " + unknownURLEntries.size() + " not bound) for word " + indexContainer.wordHash());
// use what's left
this.log.logFine("Selected partial index (" + indexContainer.size() + " from " + this.wordIndex.indexSize(nexthash) +" URLs, " + notBoundCounter + " not bound) for word " + indexContainer.wordHash());
tmpContainers.add(indexContainer);
} catch (kelondroException e) {
this.log.logSevere("plasmaWordIndexDistribution/2: deleted DB for word " + indexEntity.wordHash(), e);
indexEntity.deleteComplete();
}
indexEntity.close(); // important: is not closed elswhere and cannot be deleted afterwards
indexEntity = null;
this.log.logSevere("plasmaWordIndexDistribution/2: deleted DB for word " + nexthash, e);
this.wordIndex.deleteIndex(nexthash);
}
}
// transfer to array
plasmaWordIndexEntryContainer[] entryContainers = (plasmaWordIndexEntryContainer[]) tmpContainers.toArray(new plasmaWordIndexEntryContainer[tmpContainers.size()]);
return new Object[]{entryContainers, knownURLs, new Integer(currOpenFiles)};
} catch (IOException e) {
this.log.logSevere("selectTransferIndexes IO-Error (hash=" + nexthash + "): " + e.getMessage(), e);
return new Object[]{new plasmaWordIndexEntity[0], new HashMap(0)};
return new Object[]{entryContainers, knownURLs};
} catch (kelondroException e) {
this.log.logSevere("selectTransferIndexes database corrupted: " + e.getMessage(), e);
return new Object[]{new plasmaWordIndexEntity[0], new HashMap(0)};
@@ -443,13 +414,11 @@ public final class plasmaWordIndexDistribution {
}
}
boolean deleteTransferIndexes(plasmaWordIndexEntryContainer[] indexContainers) throws IOException {
int deleteTransferIndexes(plasmaWordIndexEntryContainer[] indexContainers) {
Iterator urlIter;
plasmaWordIndexEntry indexEntry;
plasmaWordIndexEntity indexEntity;
String[] urlHashes;
int sz;
boolean success = true;
int count = 0;
for (int i = 0; i < indexContainers.length; i++) {
// delete entries separately
int c = 0;
@@ -459,15 +428,11 @@ public final class plasmaWordIndexDistribution {
indexEntry = (plasmaWordIndexEntry) urlIter.next();
urlHashes[c++] = indexEntry.getUrlHash();
}
wordIndex.removeEntries(indexContainers[i].wordHash(), urlHashes, true);
indexEntity = wordIndex.getEntity(indexContainers[i].wordHash(), true, -1);
sz = indexEntity.size();
// indexEntity.close();
closeTransferIndex(indexEntity);
log.logFine("Deleted partial index (" + c + " URLs) for word " + indexContainers[i].wordHash() + "; " + sz + " entries left");
count += wordIndex.removeEntries(indexContainers[i].wordHash(), urlHashes, true);
log.logFine("Deleted partial index (" + c + " URLs) for word " + indexContainers[i].wordHash() + "; " + this.wordIndex.indexSize(indexContainers[i].wordHash()) + " entries left");
indexContainers[i] = null;
}
return success;
return count;
}
/*
@@ -706,7 +671,6 @@ public final class plasmaWordIndexDistribution {
}
}
}
}
public class transferIndexThread extends Thread {
@@ -715,7 +679,6 @@ public final class plasmaWordIndexDistribution {
private boolean finished = false;
private boolean gzipBody4Transfer = false;
private int timeout4Transfer = 60000;
private int maxOpenFiles4Transfer = 800;
private int transferedEntryCount = 0;
private int transferedEntityCount = 0;
private String status = "Running";
@@ -734,7 +697,7 @@ public final class plasmaWordIndexDistribution {
this.initialWordsDBSize = sb.wordIndex.size();
this.gzipBody4Transfer = "true".equalsIgnoreCase(sb.getConfig("indexTransfer.gzipBody","false"));
this.timeout4Transfer = (int) sb.getConfigLong("indexTransfer.timeout",60000);
this.maxOpenFiles4Transfer = (int) sb.getConfigLong("indexTransfer.maxOpenFiles",800);
//this.maxOpenFiles4Transfer = (int) sb.getConfigLong("indexTransfer.maxOpenFiles",800);
}
public void run() {
@@ -821,7 +784,6 @@ public final class plasmaWordIndexDistribution {
*/
long selectionStart = System.currentTimeMillis(), selectionEnd = 0, selectionTime = 0, iteration = 0;
Integer openedFiles = new Integer(0);
while (!finished && !Thread.currentThread().isInterrupted()) {
iteration++;
int idxCount = 0;
@@ -830,10 +792,9 @@ public final class plasmaWordIndexDistribution {
// selecting 500 words to transfer
this.status = "Running: Selecting chunk " + iteration;
Object[] selectResult = selectTransferContainers(this.startPointHash, this.chunkSize, this.maxOpenFiles4Transfer - openedFiles.intValue());
Object[] selectResult = selectTransferContainers(this.startPointHash, this.chunkSize);
newIndexContainers = (plasmaWordIndexEntryContainer[]) selectResult[0];
HashMap urlCache = (HashMap) selectResult[1]; // String (url-hash) / plasmaCrawlLURL.Entry
openedFiles = (Integer) selectResult[2];
/* If we havn't selected a word chunk this could be because of
* a) no words are left in the index
@@ -909,17 +870,10 @@ public final class plasmaWordIndexDistribution {
// deleting transfered words from index
if (delete) {
this.status = "Running: Deleting chunk " + iteration;
try {
if (deleteTransferIndexes(oldIndexContainers)) {
plasmaWordIndexDistribution.this.log.logFine("Deleted all " + oldIndexContainers.length + " transferred whole-word indexes locally");
int urlReferences = deleteTransferIndexes(oldIndexContainers);
plasmaWordIndexDistribution.this.log.logFine("Deleted from " + oldIndexContainers.length + " transferred RWIs locally " + urlReferences + " URL references");
transferedEntryCount += idxCount;
transferedEntityCount += oldIndexContainers.length;
} else {
plasmaWordIndexDistribution.this.log.logSevere("Deleted not all transferred whole-word indexes");
}
} catch (IOException ee) {
plasmaWordIndexDistribution.this.log.logSevere("Deletion of indexes not possible:" + ee.getMessage(), ee);
}
} else {
this.closeContainers(oldIndexContainers);
transferedEntryCount += idxCount;

@@ -112,6 +112,7 @@ public final class plasmaWordIndexEntity {
}
public int size() {
if (theIndex == null) return 0;
int size = theIndex.size();
if ((size == 0) && (delete)) {
deleteComplete();

@@ -842,14 +842,23 @@ public final class yacyClient {
return null;
}
}
/*
public static byte[] singleGET(String host, int port, String path, int timeout,
String user, String password,
httpHeader requestHeader) throws IOException {
*/
public static String transferIndex(yacySeed targetSeed, plasmaWordIndexEntryContainer[] indexes, HashMap urlCache, boolean gzipBody, int timeout) {
// check if we got all necessary urls in the urlCache (only for debugging)
Iterator eenum;
plasmaWordIndexEntry entry;
for (int i = 0; i < indexes.length; i++) {
eenum = indexes[i].entries();
while (eenum.hasNext()) {
entry = (plasmaWordIndexEntry) eenum.next();
if (urlCache.get(entry.getUrlHash()) == null) {
System.out.println("DEBUG transferIndex: to-send url hash '" + entry.getUrlHash() + "' is not contained in urlCache");
}
}
}
// transfer the RWI without the URLs
HashMap in = transferRWI(targetSeed, indexes, gzipBody, timeout);
if (in == null) { return "no_connection_1"; }
String result = (String) in.get("result");
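
The debug loop above verifies, before transferRWI() is called, that every entry scheduled for transfer has its URL metadata in urlCache; missing references are thus reported on the sending side rather than coming back later as unknownURL entries from the peer. The essence of the check (names as in the diff):

    // one pass over all containers: every url hash must resolve in urlCache
    if (urlCache.get(entry.getUrlHash()) == null) {
        // log the unresolved url hash for debugging
    }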
@@ -868,7 +877,9 @@
plasmaCrawlLURL.Entry[] urls = new plasmaCrawlLURL.Entry[uhs.length];
for (int i = 0; i < uhs.length; i++) {
urls[i] = (plasmaCrawlLURL.Entry) urlCache.get(uhs[i]);
if (urls[i] == null) System.out.println("DEBUG transferIndex: error with requested url hash '" + uhs[i] + "', unknownURL='" + uhss + "'");
if (urls[i] == null) {
System.out.println("DEBUG transferIndex: requested url hash '" + uhs[i] + "', unknownURL='" + uhss + "'");
}
}
in = transferURL(targetSeed, urls, gzipBody, timeout);

@@ -53,6 +53,7 @@ import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.net.URL;
import java.util.ConcurrentModificationException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
@@ -432,11 +433,13 @@ public final class yacy {
run.addShutdownHook(new shutdownHookThread(Thread.currentThread(), sb));
// save information about available memory after all initializations
try {
sb.setConfig("memoryFreeAfterInitBGC", Runtime.getRuntime().freeMemory());
sb.setConfig("memoryTotalAfterInitBGC", Runtime.getRuntime().totalMemory());
System.gc();
sb.setConfig("memoryFreeAfterInitAGC", Runtime.getRuntime().freeMemory());
sb.setConfig("memoryTotalAfterInitAGC", Runtime.getRuntime().totalMemory());
} catch (ConcurrentModificationException e) {}
// wait for server shutdown
try {
@@ -834,22 +837,16 @@ public final class yacy {
// testing if import process was aborted
if (Thread.interrupted()) break;
plasmaWordIndexEntity importWordIdxEntity = null;
plasmaWordIndexEntryContainer newContainer;
try {
wordCounter++;
wordHash = (String) importWordHashIterator.next();
importWordIdxEntity = importWordIndex.getEntity(wordHash, true, -1);
if (importWordIdxEntity.size() == 0) {
importWordIdxEntity.deleteComplete();
continue;
}
newContainer = importWordIndex.getContainer(wordHash, true, -1);
// creating a container used to hold the imported entries
plasmaWordIndexEntryContainer newContainer = new plasmaWordIndexEntryContainer(wordHash,importWordIdxEntity.size());
if (newContainer.size() == 0) continue;
// the combined container will fit, read the container
Iterator importWordIdxEntries = importWordIdxEntity.elements(true);
Iterator importWordIdxEntries = newContainer.entries();
plasmaWordIndexEntry importWordIdxEntry;
while (importWordIdxEntries.hasNext()) {
@@ -871,9 +868,6 @@ public final class yacy {
}
} catch (IOException e) {}
// adding word index entity to container
newContainer.add(importWordIdxEntry,System.currentTimeMillis());
if (entryCounter % 500 == 0) {
log.logFine(entryCounter + " word entries and " + wordCounter + " word entities processed so far.");
}
@@ -886,7 +880,6 @@ public final class yacy {
homeWordIndex.addEntries(newContainer, true);
// delete complete index entity file
importWordIdxEntity.close();
importWordIndex.deleteIndex(wordHash);
// print out some statistical information
@@ -912,7 +905,6 @@ public final class yacy {
} catch (Exception e) {
log.logSevere("Import of word entity '" + wordHash + "' failed.",e);
} finally {
if (importWordIdxEntity != null) try { importWordIdxEntity.close(); } catch (Exception e) {}
}
}
