diff --git a/build.properties b/build.properties index 0ef99879d..a2096b21a 100644 --- a/build.properties +++ b/build.properties @@ -3,7 +3,7 @@ javacSource=1.4 javacTarget=1.4 # Release Configuration -releaseVersion=0.423 +releaseVersion=0.424 releaseFile=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz #releaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz releaseDir=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr} diff --git a/htroot/IndexControl_p.java b/htroot/IndexControl_p.java index 415cfc12f..f214185a9 100644 --- a/htroot/IndexControl_p.java +++ b/htroot/IndexControl_p.java @@ -60,7 +60,6 @@ import de.anomic.http.httpHeader; import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaURL; -import de.anomic.plasma.plasmaWordIndexEntity; import de.anomic.plasma.plasmaWordIndexEntry; import de.anomic.plasma.plasmaWordIndexEntryContainer; import de.anomic.server.serverObjects; @@ -150,22 +149,15 @@ public class IndexControl_p { if (post.containsKey("keyhashdeleteall")) { if (delurl || delurlref) { // generate an urlx array - plasmaWordIndexEntity index = null; - try { - index = switchboard.wordIndex.getEntity(keyhash, true, -1); - Iterator en = index.elements(true); - int i = 0; - urlx = new String[index.size()]; - while (en.hasNext()) { - urlx[i++] = ((plasmaWordIndexEntry) en.next()).getUrlHash(); - } - index.close(); - index = null; - } catch (IOException e) { - urlx = new String[0]; - } finally { - if (index != null) try { index.close(); } catch (Exception e) {} + plasmaWordIndexEntryContainer index = null; + index = switchboard.wordIndex.getContainer(keyhash, true, -1); + Iterator en = index.entries(); + int i = 0; + urlx = new String[index.size()]; + while (en.hasNext()) { + urlx[i++] = ((plasmaWordIndexEntry) en.next()).getUrlHash(); } + index = null; } if (delurlref) { for (int i = 0; i < urlx.length; i++) switchboard.removeAllUrlReferences(urlx[i], true); @@ -256,12 +248,12 @@ public class IndexControl_p { } prop.put("urlstring", ""); prop.put("urlhash", ""); - plasmaWordIndexEntryContainer[] indexes = new plasmaWordIndexEntryContainer[1]; + plasmaWordIndexEntryContainer index; String result; long starttime = System.currentTimeMillis(); - indexes[0] = switchboard.wordIndex.getContainer(keyhash, true, -1); + index = switchboard.wordIndex.getContainer(keyhash, true, -1); // built urlCache - Iterator urlIter = indexes[0].entries(); + Iterator urlIter = index.entries(); HashMap knownURLs = new HashMap(); HashSet unknownURLEntries = new HashSet(); plasmaWordIndexEntry indexEntry; @@ -271,8 +263,8 @@ public class IndexControl_p { try { lurl = switchboard.urlPool.loadedURL.getEntry(indexEntry.getUrlHash(), null); if (lurl.toString() == null) { - switchboard.urlPool.loadedURL.remove(indexEntry.getUrlHash()); unknownURLEntries.add(indexEntry.getUrlHash()); + urlIter.remove(); } else { knownURLs.put(indexEntry.getUrlHash(), lurl); } @@ -280,23 +272,17 @@ public class IndexControl_p { unknownURLEntries.add(indexEntry.getUrlHash()); } } - // now delete all entries that have no url entry - Iterator hashIter = unknownURLEntries.iterator(); - while (hashIter.hasNext()) { - indexes[0].remove((String) hashIter.next()); - } // use whats remaining String gzipBody = switchboard.getConfig("indexControl.gzipBody","false"); int timeout = (int) switchboard.getConfigLong("indexControl.timeout",60000); - result = yacyClient.transferIndex ( + result = yacyClient.transferIndex( yacyCore.seedDB.getConnected(post.get("hostHash", "")), - indexes, + new plasmaWordIndexEntryContainer[]{index}, knownURLs, "true".equalsIgnoreCase(gzipBody), timeout); - prop.put("result", (result == null) ? ("Successfully transferred " + indexes[0].size() + " words in " + ((System.currentTimeMillis() - starttime) / 1000) + " seconds") : result); - indexes[0] = null; - indexes = null; + prop.put("result", (result == null) ? ("Successfully transferred " + index.size() + " words in " + ((System.currentTimeMillis() - starttime) / 1000) + " seconds") : result); + index = null; } // generate list diff --git a/htroot/yacy/query.java b/htroot/yacy/query.java index 3f9c39744..b9df98553 100644 --- a/htroot/yacy/query.java +++ b/htroot/yacy/query.java @@ -47,7 +47,6 @@ // if the shell's current path is HTROOT import java.util.Date; -import java.io.IOException; import de.anomic.http.httpHeader; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.server.serverObjects; @@ -86,16 +85,7 @@ public final class query { if (obj.equals("rwiurlcount")) { // the total number of different urls in the rwi is returned // shall contain a word hash, the number of assigned lurls to this hash is returned - de.anomic.plasma.plasmaWordIndexEntity entity = null; - try { - entity = sb.wordIndex.getEntity(env, true, -1); - prop.put("response", entity.size()); - entity.close(); - } catch (IOException e) { - prop.put("response", -1); - } finally { - if (entity != null) try { entity.close(); } catch (Exception e) {} - } + prop.put("response", sb.wordIndex.indexSize(env)); return prop; } diff --git a/source/de/anomic/plasma/plasmaDbImporter.java b/source/de/anomic/plasma/plasmaDbImporter.java index f570c4a8b..bc0c702ca 100644 --- a/source/de/anomic/plasma/plasmaDbImporter.java +++ b/source/de/anomic/plasma/plasmaDbImporter.java @@ -225,22 +225,16 @@ public class plasmaDbImporter extends Thread { Iterator importWordHashIterator = this.importWordIndex.wordHashes(wordChunkStartHash, true, true); while (!isAborted() && importWordHashIterator.hasNext()) { - plasmaWordIndexEntity importWordIdxEntity = null; + plasmaWordIndexEntryContainer newContainer; try { wordCounter++; wordHash = (String) importWordHashIterator.next(); - importWordIdxEntity = importWordIndex.getEntity(wordHash, true, -1); + newContainer = importWordIndex.getContainer(wordHash, true, -1); - if (importWordIdxEntity.size() == 0) { - importWordIdxEntity.deleteComplete(); - continue; - } - - // creating a container used to hold the imported entries - plasmaWordIndexEntryContainer newContainer = new plasmaWordIndexEntryContainer(wordHash,importWordIdxEntity.size()); + if (newContainer.size() == 0) continue; // the combined container will fit, read the container - Iterator importWordIdxEntries = importWordIdxEntity.elements(true); + Iterator importWordIdxEntries = newContainer.entries(); plasmaWordIndexEntry importWordIdxEntry; while (importWordIdxEntries.hasNext()) { @@ -262,9 +256,6 @@ public class plasmaDbImporter extends Thread { } } catch (IOException e) {} - // adding word index entity to container - newContainer.add(importWordIdxEntry,System.currentTimeMillis()); - if (entryCounter % 500 == 0) { this.log.logFine(entryCounter + " word entries and " + wordCounter + " word entities processed so far."); } @@ -277,7 +268,6 @@ public class plasmaDbImporter extends Thread { homeWordIndex.addEntries(newContainer, true); // delete complete index entity file - importWordIdxEntity.close(); importWordIndex.deleteIndex(wordHash); // print out some statistical information @@ -300,7 +290,6 @@ public class plasmaDbImporter extends Thread { } catch (Exception e) { log.logSevere("Import of word entity '" + wordHash + "' failed.",e); } finally { - if (importWordIdxEntity != null) try { importWordIdxEntity.close(); } catch (Exception e) {} } } diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index f4b57214f..3e90e9500 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -551,8 +551,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser getConfig("allowDistributeIndex", "false").equalsIgnoreCase("true"), getConfig("allowDistributeIndexWhileCrawling","false").equalsIgnoreCase("true"), getConfig("indexDistribution.gzipBody","false").equalsIgnoreCase("true"), - (int)getConfigLong("indexDistribution.timeout",60000), - (int)getConfigLong("indexDistribution.maxOpenFiles",800) + (int)getConfigLong("indexDistribution.timeout",60000) /*, + (int)getConfigLong("indexDistribution.maxOpenFiles",800)*/ ); indexDistribution.setCounts(150, 1, 3, 10000); deployThread("20_dhtdistribution", "DHT Distribution", "selection, transfer and deletion of index entries that are not searched on your peer, but on others", null, @@ -1353,7 +1353,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser words = condenser.RESULT_SIMI_WORDS; // transfering the index to the storage peer - String error = yacyClient.transferIndex(seed,(plasmaWordIndexEntryContainer[])tmpContainers.toArray(new plasmaWordIndexEntity[tmpContainers.size()]),urlCache,true,120000); + String error = yacyClient.transferIndex( + seed, + (plasmaWordIndexEntryContainer[])tmpContainers.toArray(new plasmaWordIndexEntryContainer[tmpContainers.size()]), + urlCache, + true, + 120000); if (error != null) { words = wordIndex.addPageIndex(entry.url(), urlHash, docDate, (int) entry.size(), condenser, plasmaWordIndexEntry.language(entry.url()), plasmaWordIndexEntry.docType(document.getMimeType())); diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index e3814224d..d81ebd910 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -178,10 +178,14 @@ public final class plasmaWordIndex { return condenser.RESULT_SIMI_WORDS; } + public int indexSize(String wordHash) { + return ramCache.indexSize(wordHash); + } + public plasmaWordIndexEntryContainer getContainer(String wordHash, boolean deleteIfEmpty, long maxTime) { return ramCache.getContainer(wordHash, deleteIfEmpty, maxTime); } - + public plasmaWordIndexEntity getEntity(String wordHash, boolean deleteIfEmpty, long maxTime) { return ramCache.getEntity(wordHash, deleteIfEmpty, maxTime); } diff --git a/source/de/anomic/plasma/plasmaWordIndexAssortment.java b/source/de/anomic/plasma/plasmaWordIndexAssortment.java index a4925b817..843e5d6de 100644 --- a/source/de/anomic/plasma/plasmaWordIndexAssortment.java +++ b/source/de/anomic/plasma/plasmaWordIndexAssortment.java @@ -172,6 +172,23 @@ public final class plasmaWordIndexAssortment { } return row2container(wordHash, row); } + + public boolean contains(String wordHash) { + // gets a word index from assortment database + // and returns the content record + byte[][] row = null; + try { + row = assortments.get(wordHash.getBytes()); + return (row != null); + } catch (IOException e) { + return false; + } catch (kelondroException e) { + log.logSevere("removeAssortment/kelondro-error: " + e.getMessage() + + " - reset assortment-DB " + assortments.file(), e); + resetDatabase(); + return false; + } + } public plasmaWordIndexEntryContainer get(String wordHash) { // gets a word index from assortment database diff --git a/source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java b/source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java index e1b054255..209d07fc5 100644 --- a/source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java +++ b/source/de/anomic/plasma/plasmaWordIndexAssortmentCluster.java @@ -226,6 +226,14 @@ public final class plasmaWordIndexAssortmentCluster { return record; } + public int indexSize(String wordHash) { + int size = 0; + for (int i = 0; i < clusterCount; i++) { + if (assortments[i].contains(wordHash)) size += i + 1; + } + return size; + } + public Iterator hashConjunction(String startWordHash, boolean up, boolean rot) { HashSet iterators = new HashSet(); //if (rot) System.out.println("WARNING: kelondroMergeIterator does not work correctly when individual iterators rotate on their own!"); diff --git a/source/de/anomic/plasma/plasmaWordIndexCache.java b/source/de/anomic/plasma/plasmaWordIndexCache.java index 4e506fe94..2577d13a7 100644 --- a/source/de/anomic/plasma/plasmaWordIndexCache.java +++ b/source/de/anomic/plasma/plasmaWordIndexCache.java @@ -258,6 +258,21 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface { return java.lang.Math.max(assortmentCluster.sizeTotal(), java.lang.Math.max(backend.size(), cache.size())); } + public int indexSize(String wordHash) { + int size = 0; + try { + plasmaWordIndexEntity entity = backend.getEntity(wordHash, true, -1); + if (entity != null) { + size += entity.size(); + entity.close(); + } + } catch (IOException e) {} + size += assortmentCluster.indexSize(wordHash); + TreeMap cacheIndex = (TreeMap) cache.get(wordHash); + if (cacheIndex != null) size += cacheIndex.size(); + return size; + } + public Iterator wordHashes(String startWordHash, boolean up) { // Old convention implies rot = true //return new rotatingWordHashes(startWordHash, up); diff --git a/source/de/anomic/plasma/plasmaWordIndexDistribution.java b/source/de/anomic/plasma/plasmaWordIndexDistribution.java index 78e0dcceb..fbff12358 100644 --- a/source/de/anomic/plasma/plasmaWordIndexDistribution.java +++ b/source/de/anomic/plasma/plasmaWordIndexDistribution.java @@ -47,7 +47,6 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Enumeration; import java.util.Iterator; -import java.util.HashSet; import java.util.HashMap; import de.anomic.yacy.yacyCore; import de.anomic.yacy.yacySeed; @@ -77,8 +76,6 @@ public final class plasmaWordIndexDistribution { private boolean closed; private boolean gzipBody4Distribution; private int timeout4Distribution; - private int maxOpenFiles4Distribution; - public transferIndexThread transferIdxThread = null; public plasmaWordIndexDistribution( @@ -88,8 +85,7 @@ public final class plasmaWordIndexDistribution { boolean enable, boolean enabledWhileCrawling, boolean gzipBody, - int timeout, - int maxOpenFiles + int timeout ) { this.urlPool = urlPool; this.wordIndex = wordIndex; @@ -100,7 +96,6 @@ public final class plasmaWordIndexDistribution { setCounts(100 /*indexCount*/, 1 /*juniorPeerCount*/, 3 /*seniorPeerCount*/, 8000); this.gzipBody4Distribution = gzipBody; this.timeout4Distribution = timeout; - this.maxOpenFiles4Distribution = maxOpenFiles; } public void enable() { @@ -201,9 +196,8 @@ public final class plasmaWordIndexDistribution { // collect index String startPointHash = selectTransferStart(); log.logFine("Selected hash " + startPointHash + " as start point for index distribution, distance = " + yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, startPointHash)); - Object[] selectResult = selectTransferContainers(startPointHash, indexCount, this.maxOpenFiles4Distribution); + Object[] selectResult = selectTransferContainers(startPointHash, indexCount); plasmaWordIndexEntryContainer[] indexContainers = (plasmaWordIndexEntryContainer[]) selectResult[0]; - //Integer openedFiles = (Integer) selectResult[2]; HashMap urlCache = (HashMap) selectResult[1]; // String (url-hash) / plasmaCrawlLURL.Entry if ((indexContainers == null) || (indexContainers.length == 0)) { log.logFine("No index available for index transfer, hash start-point " + startPointHash); @@ -267,7 +261,12 @@ public final class plasmaWordIndexDistribution { return -1; // interrupted } start = System.currentTimeMillis(); - error = yacyClient.transferIndex(seeds[i], indexContainers, urlCache, this.gzipBody4Distribution, this.timeout4Distribution); + error = yacyClient.transferIndex( + seeds[i], + indexContainers, + urlCache, + this.gzipBody4Distribution, + this.timeout4Distribution); if (error == null) { log.logInfo("Index transfer of " + indexCount + " words [" + indexContainers[0].wordHash() + " .. " + indexContainers[indexContainers.length - 1].wordHash() + "] to peer " + seeds[i].getName() + ":" + seeds[i].hash + " in " + ((System.currentTimeMillis() - start) / 1000) + " seconds successfull (" + (1000 * indexCount / (System.currentTimeMillis() - start + 1)) + " words/s)"); @@ -285,18 +284,9 @@ public final class plasmaWordIndexDistribution { if (hc1 >= peerCount) { // success if (delete) { - try { - if (deleteTransferIndexes(indexContainers)) { - log.logFine("Deleted all " + indexContainers.length + " transferred whole-word indexes locally"); - return indexCount; - } else { - log.logSevere("Deleted not all transferred whole-word indexes"); - return -1; - } - } catch (IOException ee) { - log.logSevere("Deletion of indexes not possible:" + ee.getMessage(), ee); - return -1; - } + int deletedURLs = deleteTransferIndexes(indexContainers); + log.logFine("Deleted from " + indexContainers.length + " transferred RWIs locally, removed " + deletedURLs + " URL references"); + return indexCount; } else { // simply close the indexEntities closeTransferIndexes(indexContainers); @@ -323,86 +313,67 @@ public final class plasmaWordIndexDistribution { } Object[] /* of {plasmaWordIndexEntryContainer[], HashMap(String, plasmaCrawlLURL.Entry)}*/ - selectTransferContainers(String hash, int count, int maxOpenFiles) { + selectTransferContainers(String hash, int count) { // the hash is a start hash from where the indexes are picked ArrayList tmpContainers = new ArrayList(count); String nexthash = ""; try { - int currOpenFiles = 0; Iterator wordHashIterator = this.wordIndex.wordHashes(hash, true, true); - plasmaWordIndexEntity indexEntity; plasmaWordIndexEntryContainer indexContainer; Iterator urlIter; - Iterator hashIter; plasmaWordIndexEntry indexEntry; plasmaCrawlLURL.Entry lurl; - final HashSet unknownURLEntries = new HashSet(); + int notBoundCounter = 0; final HashMap knownURLs = new HashMap(); while ( - (count > 0) && - (currOpenFiles < maxOpenFiles) && + (count > 0) && (wordHashIterator.hasNext()) && ((nexthash = (String) wordHashIterator.next()) != null) && (nexthash.trim().length() > 0) && - ((currOpenFiles == 0) || - (yacyDHTAction.dhtDistance(nexthash, ((plasmaWordIndexEntity)tmpContainers.get(0)).wordHash()) < 0.2)) + ((tmpContainers.size() == 0) || + (yacyDHTAction.dhtDistance(nexthash, ((plasmaWordIndexEntryContainer)tmpContainers.get(0)).wordHash()) < 0.2)) ) { - indexEntity = this.wordIndex.getEntity(nexthash, true, -1); - if (indexEntity.size() == 0) { - indexEntity.deleteComplete(); - } else { - // make an on-the-fly entity and insert values - indexContainer = new plasmaWordIndexEntryContainer(indexEntity.wordHash()); + // make an on-the-fly entity and insert values + indexContainer = this.wordIndex.getContainer(nexthash, true, 10000); try { - urlIter = indexEntity.elements(true); - unknownURLEntries.clear(); + urlIter = indexContainer.entries(); + // iterate over indexes to fetch url entries and store them in the urlCache while ((urlIter.hasNext()) && (count > 0)) { indexEntry = (plasmaWordIndexEntry) urlIter.next(); try { lurl = this.urlPool.loadedURL.getEntry(indexEntry.getUrlHash(), indexEntry); - if ((lurl == null) || (lurl.url()==null)) { - unknownURLEntries.add(indexEntry.getUrlHash()); + if ((lurl == null) || (lurl.url() == null)) { + notBoundCounter++; + urlIter.remove(); + this.wordIndex.removeEntries(nexthash, new String[]{indexEntry.getUrlHash()}, true); } else { knownURLs.put(indexEntry.getUrlHash(), lurl); - indexContainer.add(indexEntry); count--; } } catch (IOException e) { - unknownURLEntries.add(indexEntry.getUrlHash()); + notBoundCounter++; + urlIter.remove(); + this.wordIndex.removeEntries(nexthash, new String[]{indexEntry.getUrlHash()}, true); } } - // now delete all entries that have no url entry - hashIter = unknownURLEntries.iterator(); - while (hashIter.hasNext()) { - String nextUrlHash = (String) hashIter.next(); - indexEntity.removeEntry(nextUrlHash, true); - this.urlPool.loadedURL.remove(nextUrlHash); - } - // deleting entity if there are no more entries left - // This could occure if there are unknownURLs in the entity - if (indexEntity.size() == 0) { - indexEntity.deleteComplete(); - } + // remove all remaining; we have enough + while (urlIter.hasNext()) { + indexEntry = (plasmaWordIndexEntry) urlIter.next(); + urlIter.remove(); + } - // use whats remaining - this.log.logFine("Selected partial index (" + indexContainer.size() + " from " + indexEntity.size() +" URLs, " + unknownURLEntries.size() + " not bound) for word " + indexContainer.wordHash()); + // use whats left + this.log.logFine("Selected partial index (" + indexContainer.size() + " from " + this.wordIndex.indexSize(nexthash) +" URLs, " + notBoundCounter + " not bound) for word " + indexContainer.wordHash()); tmpContainers.add(indexContainer); } catch (kelondroException e) { - this.log.logSevere("plasmaWordIndexDistribution/2: deleted DB for word " + indexEntity.wordHash(), e); - indexEntity.deleteComplete(); + this.log.logSevere("plasmaWordIndexDistribution/2: deleted DB for word " + nexthash, e); + this.wordIndex.deleteIndex(nexthash); } - indexEntity.close(); // important: is not closed elswhere and cannot be deleted afterwards - indexEntity = null; - } - } // transfer to array plasmaWordIndexEntryContainer[] entryContainers = (plasmaWordIndexEntryContainer[]) tmpContainers.toArray(new plasmaWordIndexEntryContainer[tmpContainers.size()]); - return new Object[]{entryContainers, knownURLs, new Integer(currOpenFiles)}; - } catch (IOException e) { - this.log.logSevere("selectTransferIndexes IO-Error (hash=" + nexthash + "): " + e.getMessage(), e); - return new Object[]{new plasmaWordIndexEntity[0], new HashMap(0)}; + return new Object[]{entryContainers, knownURLs}; } catch (kelondroException e) { this.log.logSevere("selectTransferIndexes database corrupted: " + e.getMessage(), e); return new Object[]{new plasmaWordIndexEntity[0], new HashMap(0)}; @@ -443,13 +414,11 @@ public final class plasmaWordIndexDistribution { } } - boolean deleteTransferIndexes(plasmaWordIndexEntryContainer[] indexContainers) throws IOException { + int deleteTransferIndexes(plasmaWordIndexEntryContainer[] indexContainers) { Iterator urlIter; plasmaWordIndexEntry indexEntry; - plasmaWordIndexEntity indexEntity; String[] urlHashes; - int sz; - boolean success = true; + int count = 0; for (int i = 0; i < indexContainers.length; i++) { // delete entries separately int c = 0; @@ -459,15 +428,11 @@ public final class plasmaWordIndexDistribution { indexEntry = (plasmaWordIndexEntry) urlIter.next(); urlHashes[c++] = indexEntry.getUrlHash(); } - wordIndex.removeEntries(indexContainers[i].wordHash(), urlHashes, true); - indexEntity = wordIndex.getEntity(indexContainers[i].wordHash(), true, -1); - sz = indexEntity.size(); - // indexEntity.close(); - closeTransferIndex(indexEntity); - log.logFine("Deleted partial index (" + c + " URLs) for word " + indexContainers[i].wordHash() + "; " + sz + " entries left"); + count += wordIndex.removeEntries(indexContainers[i].wordHash(), urlHashes, true); + log.logFine("Deleted partial index (" + c + " URLs) for word " + indexContainers[i].wordHash() + "; " + this.wordIndex.indexSize(indexContainers[i].wordHash()) + " entries left"); indexContainers[i] = null; } - return success; + return count; } /* @@ -706,7 +671,6 @@ public final class plasmaWordIndexDistribution { } } } - } public class transferIndexThread extends Thread { @@ -715,7 +679,6 @@ public final class plasmaWordIndexDistribution { private boolean finished = false; private boolean gzipBody4Transfer = false; private int timeout4Transfer = 60000; - private int maxOpenFiles4Transfer = 800; private int transferedEntryCount = 0; private int transferedEntityCount = 0; private String status = "Running"; @@ -734,7 +697,7 @@ public final class plasmaWordIndexDistribution { this.initialWordsDBSize = sb.wordIndex.size(); this.gzipBody4Transfer = "true".equalsIgnoreCase(sb.getConfig("indexTransfer.gzipBody","false")); this.timeout4Transfer = (int) sb.getConfigLong("indexTransfer.timeout",60000); - this.maxOpenFiles4Transfer = (int) sb.getConfigLong("indexTransfer.maxOpenFiles",800); + //this.maxOpenFiles4Transfer = (int) sb.getConfigLong("indexTransfer.maxOpenFiles",800); } public void run() { @@ -821,7 +784,6 @@ public final class plasmaWordIndexDistribution { */ long selectionStart = System.currentTimeMillis(), selectionEnd = 0, selectionTime = 0, iteration = 0; - Integer openedFiles = new Integer(0); while (!finished && !Thread.currentThread().isInterrupted()) { iteration++; int idxCount = 0; @@ -830,10 +792,9 @@ public final class plasmaWordIndexDistribution { // selecting 500 words to transfer this.status = "Running: Selecting chunk " + iteration; - Object[] selectResult = selectTransferContainers(this.startPointHash, this.chunkSize, this.maxOpenFiles4Transfer - openedFiles.intValue()); + Object[] selectResult = selectTransferContainers(this.startPointHash, this.chunkSize); newIndexContainers = (plasmaWordIndexEntryContainer[]) selectResult[0]; HashMap urlCache = (HashMap) selectResult[1]; // String (url-hash) / plasmaCrawlLURL.Entry - openedFiles = (Integer) selectResult[2]; /* If we havn't selected a word chunk this could be because of * a) no words are left in the index @@ -909,17 +870,10 @@ public final class plasmaWordIndexDistribution { // deleting transfered words from index if (delete) { this.status = "Running: Deleting chunk " + iteration; - try { - if (deleteTransferIndexes(oldIndexContainers)) { - plasmaWordIndexDistribution.this.log.logFine("Deleted all " + oldIndexContainers.length + " transferred whole-word indexes locally"); - transferedEntryCount += idxCount; - transferedEntityCount += oldIndexContainers.length; - } else { - plasmaWordIndexDistribution.this.log.logSevere("Deleted not all transferred whole-word indexes"); - } - } catch (IOException ee) { - plasmaWordIndexDistribution.this.log.logSevere("Deletion of indexes not possible:" + ee.getMessage(), ee); - } + int urlReferences = deleteTransferIndexes(oldIndexContainers); + plasmaWordIndexDistribution.this.log.logFine("Deleted from " + oldIndexContainers.length + " transferred RWIs locally " + urlReferences + " URL references"); + transferedEntryCount += idxCount; + transferedEntityCount += oldIndexContainers.length; } else { this.closeContainers(oldIndexContainers); transferedEntryCount += idxCount; diff --git a/source/de/anomic/plasma/plasmaWordIndexEntity.java b/source/de/anomic/plasma/plasmaWordIndexEntity.java index 969ec72a4..6396ba18c 100644 --- a/source/de/anomic/plasma/plasmaWordIndexEntity.java +++ b/source/de/anomic/plasma/plasmaWordIndexEntity.java @@ -112,6 +112,7 @@ public final class plasmaWordIndexEntity { } public int size() { + if (theIndex == null) return 0; int size = theIndex.size(); if ((size == 0) && (delete)) { deleteComplete(); diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index 76d550912..47149817f 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -842,14 +842,23 @@ public final class yacyClient { return null; } } - /* - public static byte[] singleGET(String host, int port, String path, int timeout, - String user, String password, - httpHeader requestHeader) throws IOException { - */ public static String transferIndex(yacySeed targetSeed, plasmaWordIndexEntryContainer[] indexes, HashMap urlCache, boolean gzipBody, int timeout) { + // check if we got all necessary urls in the urlCache (only for debugging) + Iterator eenum; + plasmaWordIndexEntry entry; + for (int i = 0; i < indexes.length; i++) { + eenum = indexes[i].entries(); + while (eenum.hasNext()) { + entry = (plasmaWordIndexEntry) eenum.next(); + if (urlCache.get(entry.getUrlHash()) == null) { + System.out.println("DEBUG transferIndex: to-send url hash '" + entry.getUrlHash() + "' is not contained in urlCache"); + } + } + } + + // transfer the RWI without the URLs HashMap in = transferRWI(targetSeed, indexes, gzipBody, timeout); if (in == null) { return "no_connection_1"; } String result = (String) in.get("result"); @@ -868,7 +877,9 @@ public final class yacyClient { plasmaCrawlLURL.Entry[] urls = new plasmaCrawlLURL.Entry[uhs.length]; for (int i = 0; i < uhs.length; i++) { urls[i] = (plasmaCrawlLURL.Entry) urlCache.get(uhs[i]); - if (urls[i] == null) System.out.println("DEBUG transferIndex: error with requested url hash '" + uhs[i] + "', unknownURL='" + uhss + "'"); + if (urls[i] == null) { + System.out.println("DEBUG transferIndex: requested url hash '" + uhs[i] + "', unknownURL='" + uhss + "'"); + } } in = transferURL(targetSeed, urls, gzipBody, timeout); diff --git a/source/yacy.java b/source/yacy.java index c59ca86e8..0901de4f2 100644 --- a/source/yacy.java +++ b/source/yacy.java @@ -53,6 +53,7 @@ import java.io.IOException; import java.io.InputStreamReader; import java.io.PrintWriter; import java.net.URL; +import java.util.ConcurrentModificationException; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; @@ -432,11 +433,13 @@ public final class yacy { run.addShutdownHook(new shutdownHookThread(Thread.currentThread(), sb)); // save information about available memory after all initializations - sb.setConfig("memoryFreeAfterInitBGC", Runtime.getRuntime().freeMemory()); - sb.setConfig("memoryTotalAfterInitBGC", Runtime.getRuntime().totalMemory()); - System.gc(); - sb.setConfig("memoryFreeAfterInitAGC", Runtime.getRuntime().freeMemory()); - sb.setConfig("memoryTotalAfterInitAGC", Runtime.getRuntime().totalMemory()); + try { + sb.setConfig("memoryFreeAfterInitBGC", Runtime.getRuntime().freeMemory()); + sb.setConfig("memoryTotalAfterInitBGC", Runtime.getRuntime().totalMemory()); + System.gc(); + sb.setConfig("memoryFreeAfterInitAGC", Runtime.getRuntime().freeMemory()); + sb.setConfig("memoryTotalAfterInitAGC", Runtime.getRuntime().totalMemory()); + } catch (ConcurrentModificationException e) {} // wait for server shutdown try { @@ -834,22 +837,16 @@ public final class yacy { // testing if import process was aborted if (Thread.interrupted()) break; - plasmaWordIndexEntity importWordIdxEntity = null; + plasmaWordIndexEntryContainer newContainer; try { wordCounter++; wordHash = (String) importWordHashIterator.next(); - importWordIdxEntity = importWordIndex.getEntity(wordHash, true, -1); - - if (importWordIdxEntity.size() == 0) { - importWordIdxEntity.deleteComplete(); - continue; - } + newContainer = importWordIndex.getContainer(wordHash, true, -1); - // creating a container used to hold the imported entries - plasmaWordIndexEntryContainer newContainer = new plasmaWordIndexEntryContainer(wordHash,importWordIdxEntity.size()); + if (newContainer.size() == 0) continue; // the combined container will fit, read the container - Iterator importWordIdxEntries = importWordIdxEntity.elements(true); + Iterator importWordIdxEntries = newContainer.entries(); plasmaWordIndexEntry importWordIdxEntry; while (importWordIdxEntries.hasNext()) { @@ -871,9 +868,6 @@ public final class yacy { } } catch (IOException e) {} - // adding word index entity to container - newContainer.add(importWordIdxEntry,System.currentTimeMillis()); - if (entryCounter % 500 == 0) { log.logFine(entryCounter + " word entries and " + wordCounter + " word entries processed so far."); } @@ -886,7 +880,6 @@ public final class yacy { homeWordIndex.addEntries(newContainer, true); // delete complete index entity file - importWordIdxEntity.close(); importWordIndex.deleteIndex(wordHash); // print out some statistical information @@ -912,7 +905,6 @@ public final class yacy { } catch (Exception e) { log.logSevere("Import of word entity '" + wordHash + "' failed.",e); } finally { - if (importWordIdxEntity != null) try { importWordIdxEntity.close(); } catch (Exception e) {} } }