- removed some usage of indexEntity

- changed the index collection process: indexes are no longer flushed to an
  indexEntity first, but are now collected directly from the RAM cache and the assortments

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1489 6c8d7289-2bf4-0310-a012-ef5d649a1542
Author: orbiter
parent 09dc7bbcd7
commit fa90c3ca7a
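
To make the change concrete, here is a minimal, self-contained sketch of the new collection path. It is illustrative only: LayeredIndexSketch, RamCache and AssortmentCluster are hypothetical stand-ins, not the YaCy classes touched below, and it uses modern Java collections for brevity while the real code targets Java 1.4. The idea it demonstrates is the one this commit introduces: a word's index size is obtained by asking each storage layer directly (the RAM cache reports its cached entry count; assortment i holds containers of exactly i + 1 entries, so membership there contributes i + 1), so no file-backed plasmaWordIndexEntity has to be opened just to count or collect entries.

import java.util.HashMap;
import java.util.Map;

public final class LayeredIndexSketch {

    // layer 1: the RAM cache, a simple map of wordHash -> cached entry count
    static final class RamCache {
        private final Map<String, Integer> counts = new HashMap<>();
        void put(String wordHash, int entries) { counts.merge(wordHash, entries, Integer::sum); }
        int indexSize(String wordHash) { return counts.getOrDefault(wordHash, 0); }
    }

    // layer 2: an assortment cluster; assortment i stores containers of exactly
    // i + 1 entries, so membership in assortment i contributes i + 1 to the total
    static final class AssortmentCluster {
        private final Map<String, boolean[]> membership = new HashMap<>();
        private final int clusterCount;
        AssortmentCluster(int clusterCount) { this.clusterCount = clusterCount; }
        void store(String wordHash, int assortmentIndex) {
            membership.computeIfAbsent(wordHash, k -> new boolean[clusterCount])[assortmentIndex] = true;
        }
        int indexSize(String wordHash) {
            boolean[] m = membership.get(wordHash);
            if (m == null) return 0;
            int size = 0;
            for (int i = 0; i < clusterCount; i++) if (m[i]) size += i + 1;
            return size;
        }
    }

    public static void main(String[] args) {
        RamCache ram = new RamCache();
        AssortmentCluster assortments = new AssortmentCluster(8);
        ram.put("hash1", 3);           // three entries still held in the RAM cache
        assortments.store("hash1", 4); // plus a container of 5 entries in assortment 4
        // combined size, computed without opening any entity file:
        int total = ram.indexSize("hash1") + assortments.indexSize("hash1");
        System.out.println("indexSize(hash1) = " + total); // prints 8
    }
}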

@@ -3,7 +3,7 @@ javacSource=1.4
 javacTarget=1.4
 # Release Configuration
-releaseVersion=0.423
+releaseVersion=0.424
 releaseFile=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
 #releaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
 releaseDir=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr}

@@ -60,7 +60,6 @@ import de.anomic.http.httpHeader;
 import de.anomic.plasma.plasmaCrawlLURL;
 import de.anomic.plasma.plasmaSwitchboard;
 import de.anomic.plasma.plasmaURL;
-import de.anomic.plasma.plasmaWordIndexEntity;
 import de.anomic.plasma.plasmaWordIndexEntry;
 import de.anomic.plasma.plasmaWordIndexEntryContainer;
 import de.anomic.server.serverObjects;
@@ -150,22 +149,15 @@ public class IndexControl_p {
         if (post.containsKey("keyhashdeleteall")) {
             if (delurl || delurlref) {
                 // generate an urlx array
-                plasmaWordIndexEntity index = null;
-                try {
-                    index = switchboard.wordIndex.getEntity(keyhash, true, -1);
-                    Iterator en = index.elements(true);
-                    int i = 0;
-                    urlx = new String[index.size()];
-                    while (en.hasNext()) {
-                        urlx[i++] = ((plasmaWordIndexEntry) en.next()).getUrlHash();
-                    }
-                    index.close();
-                    index = null;
-                } catch (IOException e) {
-                    urlx = new String[0];
-                } finally {
-                    if (index != null) try { index.close(); } catch (Exception e) {}
+                plasmaWordIndexEntryContainer index = null;
+                index = switchboard.wordIndex.getContainer(keyhash, true, -1);
+                Iterator en = index.entries();
+                int i = 0;
+                urlx = new String[index.size()];
+                while (en.hasNext()) {
+                    urlx[i++] = ((plasmaWordIndexEntry) en.next()).getUrlHash();
                 }
+                index = null;
             }
             if (delurlref) {
                 for (int i = 0; i < urlx.length; i++) switchboard.removeAllUrlReferences(urlx[i], true);
@@ -256,12 +248,12 @@ public class IndexControl_p {
             }
             prop.put("urlstring", "");
             prop.put("urlhash", "");
-            plasmaWordIndexEntryContainer[] indexes = new plasmaWordIndexEntryContainer[1];
+            plasmaWordIndexEntryContainer index;
             String result;
             long starttime = System.currentTimeMillis();
-            indexes[0] = switchboard.wordIndex.getContainer(keyhash, true, -1);
+            index = switchboard.wordIndex.getContainer(keyhash, true, -1);
             // built urlCache
-            Iterator urlIter = indexes[0].entries();
+            Iterator urlIter = index.entries();
             HashMap knownURLs = new HashMap();
             HashSet unknownURLEntries = new HashSet();
             plasmaWordIndexEntry indexEntry;
@@ -271,8 +263,8 @@ public class IndexControl_p {
                 try {
                     lurl = switchboard.urlPool.loadedURL.getEntry(indexEntry.getUrlHash(), null);
                     if (lurl.toString() == null) {
-                        switchboard.urlPool.loadedURL.remove(indexEntry.getUrlHash());
                         unknownURLEntries.add(indexEntry.getUrlHash());
+                        urlIter.remove();
                     } else {
                         knownURLs.put(indexEntry.getUrlHash(), lurl);
                     }
@@ -280,23 +272,17 @@ public class IndexControl_p {
                     unknownURLEntries.add(indexEntry.getUrlHash());
                 }
             }
-            // now delete all entries that have no url entry
-            Iterator hashIter = unknownURLEntries.iterator();
-            while (hashIter.hasNext()) {
-                indexes[0].remove((String) hashIter.next());
-            }
             // use whats remaining
             String gzipBody = switchboard.getConfig("indexControl.gzipBody","false");
             int timeout = (int) switchboard.getConfigLong("indexControl.timeout",60000);
-            result = yacyClient.transferIndex (
+            result = yacyClient.transferIndex(
                         yacyCore.seedDB.getConnected(post.get("hostHash", "")),
-                        indexes,
+                        new plasmaWordIndexEntryContainer[]{index},
                         knownURLs,
                         "true".equalsIgnoreCase(gzipBody),
                         timeout);
-            prop.put("result", (result == null) ? ("Successfully transferred " + indexes[0].size() + " words in " + ((System.currentTimeMillis() - starttime) / 1000) + " seconds") : result);
-            indexes[0] = null;
-            indexes = null;
+            prop.put("result", (result == null) ? ("Successfully transferred " + index.size() + " words in " + ((System.currentTimeMillis() - starttime) / 1000) + " seconds") : result);
+            index = null;
         }
 
         // generate list

@@ -47,7 +47,6 @@
 // if the shell's current path is HTROOT
 
 import java.util.Date;
-import java.io.IOException;
 import de.anomic.http.httpHeader;
 import de.anomic.plasma.plasmaSwitchboard;
 import de.anomic.server.serverObjects;
@@ -86,16 +85,7 @@ public final class query {
         if (obj.equals("rwiurlcount")) {
             // the total number of different urls in the rwi is returned
             // <env> shall contain a word hash, the number of assigned lurls to this hash is returned
-            de.anomic.plasma.plasmaWordIndexEntity entity = null;
-            try {
-                entity = sb.wordIndex.getEntity(env, true, -1);
-                prop.put("response", entity.size());
-                entity.close();
-            } catch (IOException e) {
-                prop.put("response", -1);
-            } finally {
-                if (entity != null) try { entity.close(); } catch (Exception e) {}
-            }
+            prop.put("response", sb.wordIndex.indexSize(env));
             return prop;
         }

@@ -225,22 +225,16 @@ public class plasmaDbImporter extends Thread {
             Iterator importWordHashIterator = this.importWordIndex.wordHashes(wordChunkStartHash, true, true);
             while (!isAborted() && importWordHashIterator.hasNext()) {
 
-                plasmaWordIndexEntity importWordIdxEntity = null;
+                plasmaWordIndexEntryContainer newContainer;
                 try {
                     wordCounter++;
                     wordHash = (String) importWordHashIterator.next();
-                    importWordIdxEntity = importWordIndex.getEntity(wordHash, true, -1);
-
-                    if (importWordIdxEntity.size() == 0) {
-                        importWordIdxEntity.deleteComplete();
-                        continue;
-                    }
-
-                    // creating a container used to hold the imported entries
-                    plasmaWordIndexEntryContainer newContainer = new plasmaWordIndexEntryContainer(wordHash,importWordIdxEntity.size());
+                    newContainer = importWordIndex.getContainer(wordHash, true, -1);
+                    if (newContainer.size() == 0) continue;
 
                     // the combined container will fit, read the container
-                    Iterator importWordIdxEntries = importWordIdxEntity.elements(true);
+                    Iterator importWordIdxEntries = newContainer.entries();
                     plasmaWordIndexEntry importWordIdxEntry;
                     while (importWordIdxEntries.hasNext()) {
@@ -262,9 +256,6 @@ public class plasmaDbImporter extends Thread {
                         }
                     } catch (IOException e) {}
 
-                    // adding word index entity to container
-                    newContainer.add(importWordIdxEntry,System.currentTimeMillis());
-
                     if (entryCounter % 500 == 0) {
                         this.log.logFine(entryCounter + " word entries and " + wordCounter + " word entities processed so far.");
                     }
@@ -277,7 +268,6 @@ public class plasmaDbImporter extends Thread {
                     homeWordIndex.addEntries(newContainer, true);
 
                     // delete complete index entity file
-                    importWordIdxEntity.close();
                     importWordIndex.deleteIndex(wordHash);
 
                     // print out some statistical information
@@ -300,7 +290,6 @@ public class plasmaDbImporter extends Thread {
                 } catch (Exception e) {
                     log.logSevere("Import of word entity '" + wordHash + "' failed.",e);
                 } finally {
-                    if (importWordIdxEntity != null) try { importWordIdxEntity.close(); } catch (Exception e) {}
                 }
             }

@@ -551,8 +551,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
                 getConfig("allowDistributeIndex", "false").equalsIgnoreCase("true"),
                 getConfig("allowDistributeIndexWhileCrawling","false").equalsIgnoreCase("true"),
                 getConfig("indexDistribution.gzipBody","false").equalsIgnoreCase("true"),
-                (int)getConfigLong("indexDistribution.timeout",60000),
-                (int)getConfigLong("indexDistribution.maxOpenFiles",800)
+                (int)getConfigLong("indexDistribution.timeout",60000) /*,
+                (int)getConfigLong("indexDistribution.maxOpenFiles",800)*/
         );
         indexDistribution.setCounts(150, 1, 3, 10000);
         deployThread("20_dhtdistribution", "DHT Distribution", "selection, transfer and deletion of index entries that are not searched on your peer, but on others", null,
@@ -1353,7 +1353,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
                 words = condenser.RESULT_SIMI_WORDS;
 
                 // transfering the index to the storage peer
-                String error = yacyClient.transferIndex(seed,(plasmaWordIndexEntryContainer[])tmpContainers.toArray(new plasmaWordIndexEntity[tmpContainers.size()]),urlCache,true,120000);
+                String error = yacyClient.transferIndex(
+                            seed,
+                            (plasmaWordIndexEntryContainer[])tmpContainers.toArray(new plasmaWordIndexEntryContainer[tmpContainers.size()]),
+                            urlCache,
+                            true,
+                            120000);
 
                 if (error != null) {
                     words = wordIndex.addPageIndex(entry.url(), urlHash, docDate, (int) entry.size(), condenser, plasmaWordIndexEntry.language(entry.url()), plasmaWordIndexEntry.docType(document.getMimeType()));

@@ -178,6 +178,10 @@ public final class plasmaWordIndex {
         return condenser.RESULT_SIMI_WORDS;
     }
 
+    public int indexSize(String wordHash) {
+        return ramCache.indexSize(wordHash);
+    }
+
     public plasmaWordIndexEntryContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxTime) {
         return ramCache.getContainer(wordHash, deleteIfEmpty, maxTime);
     }

@@ -173,6 +173,23 @@ public final class plasmaWordIndexAssortment {
         return row2container(wordHash, row);
     }
 
+    public boolean contains(String wordHash) {
+        // tests if the assortment database holds
+        // an entry for the given word hash
+        byte[][] row = null;
+        try {
+            row = assortments.get(wordHash.getBytes());
+            return (row != null);
+        } catch (IOException e) {
+            return false;
+        } catch (kelondroException e) {
+            log.logSevere("contains/kelondro-error: " + e.getMessage()
+                    + " - reset assortment-DB " + assortments.file(), e);
+            resetDatabase();
+            return false;
+        }
+    }
+
     public plasmaWordIndexEntryContainer get(String wordHash) {
         // gets a word index from assortment database
         // and returns the content record

@@ -226,6 +226,14 @@ public final class plasmaWordIndexAssortmentCluster {
         return record;
     }
 
+    public int indexSize(String wordHash) {
+        int size = 0;
+        for (int i = 0; i < clusterCount; i++) {
+            if (assortments[i].contains(wordHash)) size += i + 1;
+        }
+        return size;
+    }
+
     public Iterator hashConjunction(String startWordHash, boolean up, boolean rot) {
         HashSet iterators = new HashSet();
         //if (rot) System.out.println("WARNING: kelondroMergeIterator does not work correctly when individual iterators rotate on their own!");

@@ -258,6 +258,21 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
         return java.lang.Math.max(assortmentCluster.sizeTotal(), java.lang.Math.max(backend.size(), cache.size()));
     }
 
+    public int indexSize(String wordHash) {
+        int size = 0;
+        try {
+            plasmaWordIndexEntity entity = backend.getEntity(wordHash, true, -1);
+            if (entity != null) {
+                size += entity.size();
+                entity.close();
+            }
+        } catch (IOException e) {}
+        size += assortmentCluster.indexSize(wordHash);
+        TreeMap cacheIndex = (TreeMap) cache.get(wordHash);
+        if (cacheIndex != null) size += cacheIndex.size();
+        return size;
+    }
+
     public Iterator wordHashes(String startWordHash, boolean up) {
         // Old convention implies rot = true
         //return new rotatingWordHashes(startWordHash, up);

@@ -47,7 +47,6 @@ import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Enumeration;
 import java.util.Iterator;
-import java.util.HashSet;
 import java.util.HashMap;
 import de.anomic.yacy.yacyCore;
 import de.anomic.yacy.yacySeed;
@@ -77,8 +76,6 @@ public final class plasmaWordIndexDistribution {
     private boolean closed;
     private boolean gzipBody4Distribution;
     private int timeout4Distribution;
-    private int maxOpenFiles4Distribution;
-
     public transferIndexThread transferIdxThread = null;
 
     public plasmaWordIndexDistribution(
@@ -88,8 +85,7 @@
             boolean enable,
             boolean enabledWhileCrawling,
             boolean gzipBody,
-            int timeout,
-            int maxOpenFiles
+            int timeout
     ) {
         this.urlPool = urlPool;
         this.wordIndex = wordIndex;
@@ -100,7 +96,6 @@ public final class plasmaWordIndexDistribution {
         setCounts(100 /*indexCount*/, 1 /*juniorPeerCount*/, 3 /*seniorPeerCount*/, 8000);
         this.gzipBody4Distribution = gzipBody;
         this.timeout4Distribution = timeout;
-        this.maxOpenFiles4Distribution = maxOpenFiles;
     }
 
     public void enable() {
@@ -201,9 +196,8 @@
         // collect index
         String startPointHash = selectTransferStart();
         log.logFine("Selected hash " + startPointHash + " as start point for index distribution, distance = " + yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, startPointHash));
-        Object[] selectResult = selectTransferContainers(startPointHash, indexCount, this.maxOpenFiles4Distribution);
+        Object[] selectResult = selectTransferContainers(startPointHash, indexCount);
         plasmaWordIndexEntryContainer[] indexContainers = (plasmaWordIndexEntryContainer[]) selectResult[0];
-        //Integer openedFiles = (Integer) selectResult[2];
         HashMap urlCache = (HashMap) selectResult[1]; // String (url-hash) / plasmaCrawlLURL.Entry
         if ((indexContainers == null) || (indexContainers.length == 0)) {
             log.logFine("No index available for index transfer, hash start-point " + startPointHash);
@@ -267,7 +261,12 @@
                 return -1; // interrupted
             }
             start = System.currentTimeMillis();
-            error = yacyClient.transferIndex(seeds[i], indexContainers, urlCache, this.gzipBody4Distribution, this.timeout4Distribution);
+            error = yacyClient.transferIndex(
+                        seeds[i],
+                        indexContainers,
+                        urlCache,
+                        this.gzipBody4Distribution,
+                        this.timeout4Distribution);
             if (error == null) {
                 log.logInfo("Index transfer of " + indexCount + " words [" + indexContainers[0].wordHash() + " .. " + indexContainers[indexContainers.length - 1].wordHash() + "] to peer " + seeds[i].getName() + ":" + seeds[i].hash + " in " + ((System.currentTimeMillis() - start) / 1000)
                     + " seconds successfull (" + (1000 * indexCount / (System.currentTimeMillis() - start + 1)) + " words/s)");
@@ -285,18 +284,9 @@
         if (hc1 >= peerCount) {
             // success
             if (delete) {
-                try {
-                    if (deleteTransferIndexes(indexContainers)) {
-                        log.logFine("Deleted all " + indexContainers.length + " transferred whole-word indexes locally");
-                        return indexCount;
-                    } else {
-                        log.logSevere("Deleted not all transferred whole-word indexes");
-                        return -1;
-                    }
-                } catch (IOException ee) {
-                    log.logSevere("Deletion of indexes not possible:" + ee.getMessage(), ee);
-                    return -1;
-                }
+                int deletedURLs = deleteTransferIndexes(indexContainers);
+                log.logFine("Deleted from " + indexContainers.length + " transferred RWIs locally, removed " + deletedURLs + " URL references");
+                return indexCount;
             } else {
                 // simply close the indexEntities
                 closeTransferIndexes(indexContainers);
@@ -323,86 +313,67 @@
     }
 
     Object[] /* of {plasmaWordIndexEntryContainer[], HashMap(String, plasmaCrawlLURL.Entry)}*/
-    selectTransferContainers(String hash, int count, int maxOpenFiles) {
+    selectTransferContainers(String hash, int count) {
         // the hash is a start hash from where the indexes are picked
         ArrayList tmpContainers = new ArrayList(count);
         String nexthash = "";
         try {
-            int currOpenFiles = 0;
             Iterator wordHashIterator = this.wordIndex.wordHashes(hash, true, true);
-            plasmaWordIndexEntity indexEntity;
             plasmaWordIndexEntryContainer indexContainer;
             Iterator urlIter;
-            Iterator hashIter;
             plasmaWordIndexEntry indexEntry;
             plasmaCrawlLURL.Entry lurl;
-            final HashSet unknownURLEntries = new HashSet();
+            int notBoundCounter = 0;
             final HashMap knownURLs = new HashMap();
             while (
                     (count > 0) &&
-                    (currOpenFiles < maxOpenFiles) &&
                     (wordHashIterator.hasNext()) &&
                     ((nexthash = (String) wordHashIterator.next()) != null) &&
                     (nexthash.trim().length() > 0) &&
-                    ((currOpenFiles == 0) ||
-                     (yacyDHTAction.dhtDistance(nexthash, ((plasmaWordIndexEntity)tmpContainers.get(0)).wordHash()) < 0.2))
+                    ((tmpContainers.size() == 0) ||
+                     (yacyDHTAction.dhtDistance(nexthash, ((plasmaWordIndexEntryContainer)tmpContainers.get(0)).wordHash()) < 0.2))
             ) {
-                indexEntity = this.wordIndex.getEntity(nexthash, true, -1);
-                if (indexEntity.size() == 0) {
-                    indexEntity.deleteComplete();
-                } else {
-                    // make an on-the-fly entity and insert values
-                    indexContainer = new plasmaWordIndexEntryContainer(indexEntity.wordHash());
-                    try {
-                        urlIter = indexEntity.elements(true);
-                        unknownURLEntries.clear();
-                        while ((urlIter.hasNext()) && (count > 0)) {
-                            indexEntry = (plasmaWordIndexEntry) urlIter.next();
-                            try {
-                                lurl = this.urlPool.loadedURL.getEntry(indexEntry.getUrlHash(), indexEntry);
-                                if ((lurl == null) || (lurl.url()==null)) {
-                                    unknownURLEntries.add(indexEntry.getUrlHash());
-                                } else {
-                                    knownURLs.put(indexEntry.getUrlHash(), lurl);
-                                    indexContainer.add(indexEntry);
-                                    count--;
-                                }
-                            } catch (IOException e) {
-                                unknownURLEntries.add(indexEntry.getUrlHash());
-                            }
-                        }
-                        // now delete all entries that have no url entry
-                        hashIter = unknownURLEntries.iterator();
-                        while (hashIter.hasNext()) {
-                            String nextUrlHash = (String) hashIter.next();
-                            indexEntity.removeEntry(nextUrlHash, true);
-                            this.urlPool.loadedURL.remove(nextUrlHash);
-                        }
-                        // deleting entity if there are no more entries left
-                        // This could occure if there are unknownURLs in the entity
-                        if (indexEntity.size() == 0) {
-                            indexEntity.deleteComplete();
-                        }
-                        // use whats remaining
-                        this.log.logFine("Selected partial index (" + indexContainer.size() + " from " + indexEntity.size() +" URLs, " + unknownURLEntries.size() + " not bound) for word " + indexContainer.wordHash());
-                        tmpContainers.add(indexContainer);
-                    } catch (kelondroException e) {
-                        this.log.logSevere("plasmaWordIndexDistribution/2: deleted DB for word " + indexEntity.wordHash(), e);
-                        indexEntity.deleteComplete();
-                    }
-                    indexEntity.close(); // important: is not closed elswhere and cannot be deleted afterwards
-                    indexEntity = null;
-                }
+                // make an on-the-fly entity and insert values
+                indexContainer = this.wordIndex.getContainer(nexthash, true, 10000);
+                try {
+                    urlIter = indexContainer.entries();
+                    // iterate over indexes to fetch url entries and store them in the urlCache
+                    while ((urlIter.hasNext()) && (count > 0)) {
+                        indexEntry = (plasmaWordIndexEntry) urlIter.next();
+                        try {
+                            lurl = this.urlPool.loadedURL.getEntry(indexEntry.getUrlHash(), indexEntry);
+                            if ((lurl == null) || (lurl.url() == null)) {
+                                notBoundCounter++;
+                                urlIter.remove();
+                                this.wordIndex.removeEntries(nexthash, new String[]{indexEntry.getUrlHash()}, true);
+                            } else {
+                                knownURLs.put(indexEntry.getUrlHash(), lurl);
+                                count--;
+                            }
+                        } catch (IOException e) {
+                            notBoundCounter++;
+                            urlIter.remove();
+                            this.wordIndex.removeEntries(nexthash, new String[]{indexEntry.getUrlHash()}, true);
+                        }
+                    }
+                    // remove all remaining; we have enough
+                    while (urlIter.hasNext()) {
+                        indexEntry = (plasmaWordIndexEntry) urlIter.next();
+                        urlIter.remove();
+                    }
+                    // use whats left
+                    this.log.logFine("Selected partial index (" + indexContainer.size() + " from " + this.wordIndex.indexSize(nexthash) +" URLs, " + notBoundCounter + " not bound) for word " + indexContainer.wordHash());
+                    tmpContainers.add(indexContainer);
+                } catch (kelondroException e) {
+                    this.log.logSevere("plasmaWordIndexDistribution/2: deleted DB for word " + nexthash, e);
+                    this.wordIndex.deleteIndex(nexthash);
+                }
             }
             // transfer to array
             plasmaWordIndexEntryContainer[] entryContainers = (plasmaWordIndexEntryContainer[]) tmpContainers.toArray(new plasmaWordIndexEntryContainer[tmpContainers.size()]);
-            return new Object[]{entryContainers, knownURLs, new Integer(currOpenFiles)};
-        } catch (IOException e) {
-            this.log.logSevere("selectTransferIndexes IO-Error (hash=" + nexthash + "): " + e.getMessage(), e);
-            return new Object[]{new plasmaWordIndexEntity[0], new HashMap(0)};
+            return new Object[]{entryContainers, knownURLs};
         } catch (kelondroException e) {
             this.log.logSevere("selectTransferIndexes database corrupted: " + e.getMessage(), e);
             return new Object[]{new plasmaWordIndexEntity[0], new HashMap(0)};
@@ -443,13 +414,11 @@
         }
     }
 
-    boolean deleteTransferIndexes(plasmaWordIndexEntryContainer[] indexContainers) throws IOException {
+    int deleteTransferIndexes(plasmaWordIndexEntryContainer[] indexContainers) {
         Iterator urlIter;
         plasmaWordIndexEntry indexEntry;
-        plasmaWordIndexEntity indexEntity;
        String[] urlHashes;
-        int sz;
-        boolean success = true;
+        int count = 0;
         for (int i = 0; i < indexContainers.length; i++) {
             // delete entries separately
             int c = 0;
@@ -459,15 +428,11 @@
                 indexEntry = (plasmaWordIndexEntry) urlIter.next();
                 urlHashes[c++] = indexEntry.getUrlHash();
             }
-            wordIndex.removeEntries(indexContainers[i].wordHash(), urlHashes, true);
-            indexEntity = wordIndex.getEntity(indexContainers[i].wordHash(), true, -1);
-            sz = indexEntity.size();
-            // indexEntity.close();
-            closeTransferIndex(indexEntity);
-            log.logFine("Deleted partial index (" + c + " URLs) for word " + indexContainers[i].wordHash() + "; " + sz + " entries left");
+            count += wordIndex.removeEntries(indexContainers[i].wordHash(), urlHashes, true);
+            log.logFine("Deleted partial index (" + c + " URLs) for word " + indexContainers[i].wordHash() + "; " + this.wordIndex.indexSize(indexContainers[i].wordHash()) + " entries left");
             indexContainers[i] = null;
         }
-        return success;
+        return count;
     }
 
     /*
@@ -706,7 +671,6 @@
             }
         }
-        }
     }
 
     public class transferIndexThread extends Thread {
@@ -715,7 +679,6 @@
         private boolean finished = false;
         private boolean gzipBody4Transfer = false;
         private int timeout4Transfer = 60000;
-        private int maxOpenFiles4Transfer = 800;
         private int transferedEntryCount = 0;
         private int transferedEntityCount = 0;
         private String status = "Running";
@@ -734,7 +697,7 @@
             this.initialWordsDBSize = sb.wordIndex.size();
             this.gzipBody4Transfer = "true".equalsIgnoreCase(sb.getConfig("indexTransfer.gzipBody","false"));
             this.timeout4Transfer = (int) sb.getConfigLong("indexTransfer.timeout",60000);
-            this.maxOpenFiles4Transfer = (int) sb.getConfigLong("indexTransfer.maxOpenFiles",800);
+            //this.maxOpenFiles4Transfer = (int) sb.getConfigLong("indexTransfer.maxOpenFiles",800);
         }
 
         public void run() {
@@ -821,7 +784,6 @@
             */
             long selectionStart = System.currentTimeMillis(), selectionEnd = 0, selectionTime = 0, iteration = 0;
-            Integer openedFiles = new Integer(0);
 
             while (!finished && !Thread.currentThread().isInterrupted()) {
                 iteration++;
                 int idxCount = 0;
@@ -830,10 +792,9 @@
                     // selecting 500 words to transfer
                     this.status = "Running: Selecting chunk " + iteration;
-                    Object[] selectResult = selectTransferContainers(this.startPointHash, this.chunkSize, this.maxOpenFiles4Transfer - openedFiles.intValue());
+                    Object[] selectResult = selectTransferContainers(this.startPointHash, this.chunkSize);
                     newIndexContainers = (plasmaWordIndexEntryContainer[]) selectResult[0];
                     HashMap urlCache = (HashMap) selectResult[1]; // String (url-hash) / plasmaCrawlLURL.Entry
-                    openedFiles = (Integer) selectResult[2];
 
                     /* If we havn't selected a word chunk this could be because of
                      * a) no words are left in the index
@@ -909,17 +870,10 @@
                     // deleting transfered words from index
                     if (delete) {
                         this.status = "Running: Deleting chunk " + iteration;
-                        try {
-                            if (deleteTransferIndexes(oldIndexContainers)) {
-                                plasmaWordIndexDistribution.this.log.logFine("Deleted all " + oldIndexContainers.length + " transferred whole-word indexes locally");
-                                transferedEntryCount += idxCount;
-                                transferedEntityCount += oldIndexContainers.length;
-                            } else {
-                                plasmaWordIndexDistribution.this.log.logSevere("Deleted not all transferred whole-word indexes");
-                            }
-                        } catch (IOException ee) {
-                            plasmaWordIndexDistribution.this.log.logSevere("Deletion of indexes not possible:" + ee.getMessage(), ee);
-                        }
+                        int urlReferences = deleteTransferIndexes(oldIndexContainers);
+                        plasmaWordIndexDistribution.this.log.logFine("Deleted from " + oldIndexContainers.length + " transferred RWIs locally " + urlReferences + " URL references");
+                        transferedEntryCount += idxCount;
+                        transferedEntityCount += oldIndexContainers.length;
                     } else {
                         this.closeContainers(oldIndexContainers);
                         transferedEntryCount += idxCount;

@@ -112,6 +112,7 @@ public final class plasmaWordIndexEntity {
     }
 
     public int size() {
+        if (theIndex == null) return 0;
         int size = theIndex.size();
         if ((size == 0) && (delete)) {
             deleteComplete();

@@ -842,14 +842,23 @@
             return null;
         }
     }
 
-    /*
-    public static byte[] singleGET(String host, int port, String path, int timeout,
-            String user, String password,
-            httpHeader requestHeader) throws IOException {
-    */
-
     public static String transferIndex(yacySeed targetSeed, plasmaWordIndexEntryContainer[] indexes, HashMap urlCache, boolean gzipBody, int timeout) {
+        // check if we got all necessary urls in the urlCache (only for debugging)
+        Iterator eenum;
+        plasmaWordIndexEntry entry;
+        for (int i = 0; i < indexes.length; i++) {
+            eenum = indexes[i].entries();
+            while (eenum.hasNext()) {
+                entry = (plasmaWordIndexEntry) eenum.next();
+                if (urlCache.get(entry.getUrlHash()) == null) {
+                    System.out.println("DEBUG transferIndex: to-send url hash '" + entry.getUrlHash() + "' is not contained in urlCache");
+                }
+            }
+        }
+
+        // transfer the RWI without the URLs
         HashMap in = transferRWI(targetSeed, indexes, gzipBody, timeout);
         if (in == null) { return "no_connection_1"; }
         String result = (String) in.get("result");
@@ -868,7 +877,9 @@
         plasmaCrawlLURL.Entry[] urls = new plasmaCrawlLURL.Entry[uhs.length];
         for (int i = 0; i < uhs.length; i++) {
             urls[i] = (plasmaCrawlLURL.Entry) urlCache.get(uhs[i]);
-            if (urls[i] == null) System.out.println("DEBUG transferIndex: error with requested url hash '" + uhs[i] + "', unknownURL='" + uhss + "'");
+            if (urls[i] == null) {
+                System.out.println("DEBUG transferIndex: requested url hash '" + uhs[i] + "', unknownURL='" + uhss + "'");
+            }
         }
 
         in = transferURL(targetSeed, urls, gzipBody, timeout);

@@ -53,6 +53,7 @@
 import java.io.InputStreamReader;
 import java.io.PrintWriter;
 import java.net.URL;
+import java.util.ConcurrentModificationException;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
@@ -432,11 +433,13 @@
         run.addShutdownHook(new shutdownHookThread(Thread.currentThread(), sb));
 
         // save information about available memory after all initializations
-        sb.setConfig("memoryFreeAfterInitBGC", Runtime.getRuntime().freeMemory());
-        sb.setConfig("memoryTotalAfterInitBGC", Runtime.getRuntime().totalMemory());
-        System.gc();
-        sb.setConfig("memoryFreeAfterInitAGC", Runtime.getRuntime().freeMemory());
-        sb.setConfig("memoryTotalAfterInitAGC", Runtime.getRuntime().totalMemory());
+        try {
+            sb.setConfig("memoryFreeAfterInitBGC", Runtime.getRuntime().freeMemory());
+            sb.setConfig("memoryTotalAfterInitBGC", Runtime.getRuntime().totalMemory());
+            System.gc();
+            sb.setConfig("memoryFreeAfterInitAGC", Runtime.getRuntime().freeMemory());
+            sb.setConfig("memoryTotalAfterInitAGC", Runtime.getRuntime().totalMemory());
+        } catch (ConcurrentModificationException e) {}
 
         // wait for server shutdown
         try {
@@ -834,22 +837,16 @@
                 // testing if import process was aborted
                 if (Thread.interrupted()) break;
 
-                plasmaWordIndexEntity importWordIdxEntity = null;
+                plasmaWordIndexEntryContainer newContainer;
                 try {
                     wordCounter++;
                     wordHash = (String) importWordHashIterator.next();
-                    importWordIdxEntity = importWordIndex.getEntity(wordHash, true, -1);
-
-                    if (importWordIdxEntity.size() == 0) {
-                        importWordIdxEntity.deleteComplete();
-                        continue;
-                    }
-
-                    // creating a container used to hold the imported entries
-                    plasmaWordIndexEntryContainer newContainer = new plasmaWordIndexEntryContainer(wordHash,importWordIdxEntity.size());
+                    newContainer = importWordIndex.getContainer(wordHash, true, -1);
+                    if (newContainer.size() == 0) continue;
 
                     // the combined container will fit, read the container
-                    Iterator importWordIdxEntries = importWordIdxEntity.elements(true);
+                    Iterator importWordIdxEntries = newContainer.entries();
                     plasmaWordIndexEntry importWordIdxEntry;
                     while (importWordIdxEntries.hasNext()) {
@@ -871,9 +868,6 @@
                         }
                     } catch (IOException e) {}
 
-                    // adding word index entity to container
-                    newContainer.add(importWordIdxEntry,System.currentTimeMillis());
-
                     if (entryCounter % 500 == 0) {
                         log.logFine(entryCounter + " word entries and " + wordCounter + " word entries processed so far.");
                     }
@@ -886,7 +880,6 @@
                     homeWordIndex.addEntries(newContainer, true);
 
                     // delete complete index entity file
-                    importWordIdxEntity.close();
                     importWordIndex.deleteIndex(wordHash);
 
                     // print out some statistical information
@@ -912,7 +905,6 @@
                 } catch (Exception e) {
                     log.logSevere("Import of word entity '" + wordHash + "' failed.",e);
                 } finally {
-                    if (importWordIdxEntity != null) try { importWordIdxEntity.close(); } catch (Exception e) {}
                 }
             }
