From 138422990a47792db579d2730f2057177123dcaa Mon Sep 17 00:00:00 2001 From: orbiter Date: Wed, 22 Apr 2009 22:39:12 +0000 Subject: [PATCH] - removed useCell option: the indexCell data structure is now the default index structure; old collection data is still migrated - added some debugging output to balancer to find a bug - removed unused classes for index collection handling - changed some default values for the process handling: more memory needed to prevent OOM git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5856 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- build.properties | 2 +- defaults/performance_dht.profile | 28 +- defaults/yacy.init | 35 +- source/de/anomic/crawler/Balancer.java | 3 +- source/de/anomic/crawler/IndexingStack.java | 10 +- source/de/anomic/crawler/NoticedURL.java | 2 + source/de/anomic/kelondro/table/Stack.java | 16 +- .../text/BufferedIndexCollection.java | 344 ----------------- .../de/anomic/kelondro/text/IndexBuffer.java | 348 ------------------ .../anomic/kelondro/text/IndexCollection.java | 2 +- .../text/IndexCollectionMigration.java | 2 +- .../de/anomic/plasma/plasmaSwitchboard.java | 8 +- source/de/anomic/plasma/plasmaWordIndex.java | 21 +- source/de/anomic/yacy/dht/Dispatcher.java | 1 + source/yacy.java | 4 +- 15 files changed, 65 insertions(+), 761 deletions(-) delete mode 100644 source/de/anomic/kelondro/text/BufferedIndexCollection.java delete mode 100644 source/de/anomic/kelondro/text/IndexBuffer.java diff --git a/build.properties b/build.properties index d4d024152..fad27b6a3 100644 --- a/build.properties +++ b/build.properties @@ -3,7 +3,7 @@ javacSource=1.5 javacTarget=1.5 # Release Configuration -releaseVersion=0.75 +releaseVersion=0.76 stdReleaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz embReleaseFile=yacy_emb_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz proReleaseFile=yacy_pro_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz diff --git a/defaults/performance_dht.profile b/defaults/performance_dht.profile index 8a4cafbaa..8aa6d42cc 100644 --- a/defaults/performance_dht.profile +++ b/defaults/performance_dht.profile @@ -4,33 +4,31 @@ # performance-settings # delay-times for permanent loops (milliseconds) -# the idlesleep is the pause that an proces sleeps if the last call to the +# the idlesleep is the pause that an process sleeps if the last call to the # process job was without execution of anything; # the busysleep is the pause after a full job execution # the prereq-value is a memory pre-requisite: that much bytes must # be available/free in the heap; othervise the loop is not executed # and another idlesleep is performed + 20_dhtdistribution_idlesleep=5000 20_dhtdistribution_busysleep=2000 -20_dhtdistribution_memprereq=6291456 +20_dhtdistribution_memprereq=12582912 50_localcrawl_idlesleep=4000 -50_localcrawl_busysleep=500 -50_localcrawl_memprereq=4194304 +50_localcrawl_busysleep=50 +50_localcrawl_memprereq=12582912 50_localcrawl_isPaused=false -60_remotecrawlloader_idlesleep=60000 -60_remotecrawlloader_busysleep=40000 -60_remotecrawlloader_memprereq=2097152 +60_remotecrawlloader_idlesleep=120000 +60_remotecrawlloader_busysleep=60000 +60_remotecrawlloader_memprereq=12582912 60_remotecrawlloader_isPaused=false -62_remotetriggeredcrawl_idlesleep=10000 -62_remotetriggeredcrawl_busysleep=1000 -62_remotetriggeredcrawl_memprereq=6291456 +62_remotetriggeredcrawl_idlesleep=60000 +62_remotetriggeredcrawl_busysleep=10000 +62_remotetriggeredcrawl_memprereq=12582912 62_remotetriggeredcrawl_isPaused=false 80_indexing_idlesleep=1000 -80_indexing_busysleep=100 -80_indexing_memprereq=6291456 -85_cacheflush_idlesleep=120000 -85_cacheflush_busysleep=60000 -85_cacheflush_memprereq=0 +80_indexing_busysleep=10 +80_indexing_memprereq=12582912 82_crawlstack_idlesleep=5000 82_crawlstack_busysleep=1 82_crawlstack_memprereq=1048576 diff --git a/defaults/yacy.init b/defaults/yacy.init index 1f5de72de..00362ccc1 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -569,35 +569,37 @@ filterOutStopwordsFromTopwords=true # and another idlesleep is performed 20_dhtdistribution_idlesleep=30000 20_dhtdistribution_busysleep=10000 -20_dhtdistribution_memprereq=6291456 +20_dhtdistribution_memprereq=12582912 30_peerping_idlesleep=120000 30_peerping_busysleep=120000 -30_peerping_memprereq=1048576 +30_peerping_memprereq=2097152 40_peerseedcycle_idlesleep=1800000 40_peerseedcycle_busysleep=1200000 -40_peerseedcycle_memprereq=2097152 +40_peerseedcycle_memprereq=4194304 50_localcrawl_idlesleep=2000 50_localcrawl_busysleep=50 -50_localcrawl_memprereq=4194304 +50_localcrawl_memprereq=12582912 50_localcrawl_isPaused=false 60_remotecrawlloader_idlesleep=60000 -60_remotecrawlloader_busysleep=30000 -60_remotecrawlloader_memprereq=2097152 +60_remotecrawlloader_busysleep=10000 +60_remotecrawlloader_memprereq=12582912 60_remotecrawlloader_isPaused=false -62_remotetriggeredcrawl_idlesleep=30000 -62_remotetriggeredcrawl_busysleep=10000 -62_remotetriggeredcrawl_memprereq=6291456 +62_remotetriggeredcrawl_idlesleep=10000 +62_remotetriggeredcrawl_busysleep=1000 +62_remotetriggeredcrawl_memprereq=12582912 62_remotetriggeredcrawl_isPaused=false 80_indexing_idlesleep=1000 80_indexing_busysleep=10 -80_indexing_memprereq=6291456 -85_cacheflush_idlesleep=60000 -85_cacheflush_busysleep=10000 -85_cacheflush_memprereq=0 +80_indexing_memprereq=12582912 90_cleanup_idlesleep=300000 90_cleanup_busysleep=300000 90_cleanup_memprereq=0 +# autoReCrawl Options +autoReCrawl_idlesleep = 3600000 +autoReCrawl_busysleep = 3600000 +autoReCrawl_memprereq = -1 + # additional attributes: # performanceIO is a percent-value. a value of 10 means, that 10% of the busysleep time # is used to flush the RAM cache, which is the major part of the IO in YaCy @@ -886,10 +888,6 @@ routing.deleteOldSeeds.permission__pro = true routing.deleteOldSeeds.time = 7 routing.deleteOldSeeds.time__pro = 30 -# autoReCrawl Options -autoReCrawl_idlesleep = 3600000 -autoReCrawl_busysleep = 3600000 -autoReCrawl_memprereq = -1 # options to remember the default search engines when using the search compare features compare_yacy.left = YaCy @@ -904,6 +902,3 @@ cgi.suffixes = cgi,pl # whether this is a version for a web browser browserintegration = false - -# next index data structure -useCell = false \ No newline at end of file diff --git a/source/de/anomic/crawler/Balancer.java b/source/de/anomic/crawler/Balancer.java index e40beb9b7..9c1c0dcd0 100644 --- a/source/de/anomic/crawler/Balancer.java +++ b/source/de/anomic/crawler/Balancer.java @@ -533,7 +533,8 @@ public class Balancer { // 3rd: take entry from file if ((result == null) && (urlFileStack.size() > 0)) { - final Row.Entry nextentry = (top) ? urlFileStack.top() : urlFileStack.bot(); + Row.Entry nextentry = (top) ? urlFileStack.top() : urlFileStack.bot(); + if (nextentry == null) nextentry = (top) ? urlFileStack.bot() : urlFileStack.top(); if (nextentry == null) { // emergency case: this means that something with the stack organization is wrong // the file appears to be broken. We kill the file. diff --git a/source/de/anomic/crawler/IndexingStack.java b/source/de/anomic/crawler/IndexingStack.java index c28dc392a..aaaa12726 100644 --- a/source/de/anomic/crawler/IndexingStack.java +++ b/source/de/anomic/crawler/IndexingStack.java @@ -95,9 +95,15 @@ public class IndexingStack { } public synchronized QueueEntry pop() throws IOException { - if (sbQueueStack.size() == 0) return null; + if (sbQueueStack.size() == 0) { + Log.logInfo("IndexingStack", "sbQueueStack.size() == 0"); + return null; + } final Row.Entry b = sbQueueStack.pot(); - if (b == null) return null; + if (b == null) { + Log.logInfo("IndexingStack", "sbQueueStack.pot() == null"); + return null; + } return new QueueEntry(b); } diff --git a/source/de/anomic/crawler/NoticedURL.java b/source/de/anomic/crawler/NoticedURL.java index e92eeaebb..70773a638 100755 --- a/source/de/anomic/crawler/NoticedURL.java +++ b/source/de/anomic/crawler/NoticedURL.java @@ -72,6 +72,7 @@ public class NoticedURL { } public void clear() { + Log.logInfo("NoticedURL", "clearing all stacks"); coreStack.clear(); limitStack.clear(); remoteStack.clear(); @@ -207,6 +208,7 @@ public class NoticedURL { } public void clear(final int stackType) { + Log.logInfo("NoticedURL", "clearing stack " + stackType); switch (stackType) { case STACK_TYPE_CORE: coreStack.clear(); break; case STACK_TYPE_LIMIT: limitStack.clear(); break; diff --git a/source/de/anomic/kelondro/table/Stack.java b/source/de/anomic/kelondro/table/Stack.java index e0f91b68e..b2e0f9524 100644 --- a/source/de/anomic/kelondro/table/Stack.java +++ b/source/de/anomic/kelondro/table/Stack.java @@ -37,6 +37,7 @@ import java.util.StringTokenizer; import de.anomic.kelondro.index.Row; import de.anomic.kelondro.order.NaturalOrder; import de.anomic.kelondro.util.FileUtils; +import de.anomic.kelondro.util.Log; import de.anomic.kelondro.util.kelondroException; public final class Stack extends FullRecords { @@ -184,7 +185,10 @@ public final class Stack extends FullRecords { public synchronized Row.Entry pot() throws IOException { // return row on the bottom of the stack and remove record final Node n = botNode(); - if (n == null) return null; + if (n == null) { + Log.logInfo("Stack", "botNode() == null"); + return null; + } final Row.Entry ret = row().newEntry(n.getValueRow()); // remove node @@ -239,9 +243,15 @@ public final class Stack extends FullRecords { private Node botNode() throws IOException { // return node on bottom of the stack - if (size() == 0) return null; + if (size() == 0) { + Log.logInfo("Stack", "size() == 0"); + return null; + } final RecordHandle h = getHandle(root); - if (h == null) return null; + if (h == null) { + Log.logInfo("Stack", "getHandle(root) == null"); + return null; + } return new EcoNode(h); } diff --git a/source/de/anomic/kelondro/text/BufferedIndexCollection.java b/source/de/anomic/kelondro/text/BufferedIndexCollection.java deleted file mode 100644 index 4ed8b796f..000000000 --- a/source/de/anomic/kelondro/text/BufferedIndexCollection.java +++ /dev/null @@ -1,344 +0,0 @@ -// BufferedIndexCollection.java -// (C) 2005, 2006 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany -// first published 2005 on http://www.anomic.de -// -// This is a part of YaCy, a peer-to-peer based web search engine -// -// $LastChangedDate: 2009-03-13 11:34:51 +0100 (Fr, 13 Mrz 2009) $ -// $LastChangedRevision: 5709 $ -// $LastChangedBy: orbiter $ -// -// LICENSE -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - -package de.anomic.kelondro.text; - -import java.io.File; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Set; - -import de.anomic.kelondro.index.Row; -import de.anomic.kelondro.index.RowCollection; -import de.anomic.kelondro.order.Base64Order; -import de.anomic.kelondro.order.ByteOrder; -import de.anomic.kelondro.order.CloneableIterator; -import de.anomic.kelondro.order.MergeIterator; -import de.anomic.kelondro.order.Order; -import de.anomic.kelondro.order.RotateIterator; -import de.anomic.kelondro.text.Index; -import de.anomic.kelondro.text.IndexBuffer; -import de.anomic.kelondro.text.IndexCollection; -import de.anomic.kelondro.text.ReferenceContainer; -import de.anomic.kelondro.text.ReferenceContainerOrder; -import de.anomic.kelondro.text.referencePrototype.WordReferenceRow; -import de.anomic.kelondro.util.FileUtils; -import de.anomic.kelondro.util.MemoryControl; -import de.anomic.kelondro.util.Log; -import de.anomic.server.serverProfiling; - -public final class BufferedIndexCollection extends AbstractBufferedIndex implements Index, BufferedIndex { - - // environment constants - public static final long wCacheMaxAge = 1000 * 60 * 30; // milliseconds; 30 minutes - public static final int wCacheMaxChunk = 800; // maximum number of references for each urlhash - public static final int lowcachedivisor = 900; - public static final int maxCollectionPartition = 7; // should be 7 - - private final IndexBuffer buffer; - private final IndexCollection collections; - - public BufferedIndexCollection ( - File indexPrimaryTextLocation, - final ReferenceFactory factory, - final ByteOrder wordOrdering, - final Row payloadrow, - final int entityCacheMaxSize, - final boolean useCommons, - final int redundancy, - Log log) throws IOException { - super(factory); - - final File textindexcache = new File(indexPrimaryTextLocation, "RICACHE"); - if (!(textindexcache.exists())) textindexcache.mkdirs(); - if (new File(textindexcache, "index.dhtin.blob").exists()) { - // migration of the both caches into one - this.buffer = new IndexBuffer(textindexcache, factory, wordOrdering, payloadrow, entityCacheMaxSize, wCacheMaxChunk, wCacheMaxAge, "index.dhtout.blob", log); - IndexBuffer dhtInCache = new IndexBuffer(textindexcache, factory, wordOrdering, payloadrow, entityCacheMaxSize, wCacheMaxChunk, wCacheMaxAge, "index.dhtin.blob", log); - for (ReferenceContainer c: dhtInCache) { - this.buffer.add(c); - } - FileUtils.deletedelete(new File(textindexcache, "index.dhtin.blob")); - } else { - // read in new BLOB - this.buffer = new IndexBuffer(textindexcache, factory, wordOrdering, payloadrow, entityCacheMaxSize, wCacheMaxChunk, wCacheMaxAge, "index.dhtout.blob", log); - } - - // create collections storage path - final File textindexcollections = new File(indexPrimaryTextLocation, "RICOLLECTION"); - if (!(textindexcollections.exists())) textindexcollections.mkdirs(); - this.collections = new IndexCollection( - textindexcollections, - "collection", - factory, - 12, - Base64Order.enhancedCoder, - maxCollectionPartition, - WordReferenceRow.urlEntryRow, - useCommons); - } - - /* methods for interface Index */ - - public void add(final ReferenceContainer entries) { - assert (entries.row().objectsize == WordReferenceRow.urlEntryRow.objectsize); - - // add the entry - buffer.add(entries); - cacheFlushControl(); - } - - public void add(final byte[] wordHash, final ReferenceType entry) throws IOException { - // add the entry - buffer.add(wordHash, entry); - cacheFlushControl(); - } - - public boolean has(final byte[] wordHash) { - if (buffer.has(wordHash)) return true; - if (collections.has(wordHash)) return true; - return false; - } - - public int count(byte[] key) { - return buffer.count(key) + collections.count(key); - } - - public ReferenceContainer get(final byte[] wordHash, final Set urlselection) { - if (wordHash == null) { - // wrong input - return null; - } - - // get from cache - ReferenceContainer container; - container = buffer.get(wordHash, urlselection); - - // get from collection index - if (container == null) { - container = collections.get(wordHash, urlselection); - } else { - container.addAllUnique(collections.get(wordHash, urlselection)); - } - - if (container == null) return null; - - // check doubles - final int beforeDouble = container.size(); - container.sort(); - final ArrayList d = container.removeDoubles(); - RowCollection set; - for (int i = 0; i < d.size(); i++) { - // for each element in the double-set, take that one that is the most recent one - set = d.get(i); - WordReferenceRow e, elm = null; - long lm = 0; - for (int j = 0; j < set.size(); j++) { - e = new WordReferenceRow(set.get(j, true)); - if ((elm == null) || (e.lastModified() > lm)) { - elm = e; - lm = e.lastModified(); - } - } - if(elm != null) { - container.addUnique(elm.toKelondroEntry()); - } - } - if (container.size() < beforeDouble) System.out.println("*** DEBUG DOUBLECHECK - removed " + (beforeDouble - container.size()) + " index entries from word container " + container.getTermHashAsString()); - - return container; - } - - public ReferenceContainer delete(final byte[] wordHash) { - final ReferenceContainer c = new ReferenceContainer( - factory, - wordHash, - WordReferenceRow.urlEntryRow, - buffer.count(wordHash)); - c.addAllUnique(buffer.delete(wordHash)); - c.addAllUnique(collections.delete(wordHash)); - return c; - } - - public boolean remove(final byte[] wordHash, final String urlHash) { - boolean removed = false; - removed = removed | (buffer.remove(wordHash, urlHash)); - removed = removed | (collections.remove(wordHash, urlHash)); - return removed; - } - - public int remove(final byte[] wordHash, final Set urlHashes) { - int removed = 0; - removed += buffer.remove(wordHash, urlHashes); - removed += collections.remove(wordHash, urlHashes); - return removed; - } - - public synchronized CloneableIterator> references(final byte[] startHash, final boolean rot, final boolean ram) throws IOException { - final CloneableIterator> i = wordContainers(startHash, ram); - if (rot) { - return new RotateIterator>(i, Base64Order.zero(startHash.length), buffer.size() + ((ram) ? 0 : collections.size())); - } - return i; - } - - private synchronized CloneableIterator> wordContainers(final byte[] startWordHash, final boolean ram) throws IOException { - final Order> containerOrder = new ReferenceContainerOrder(factory, buffer.ordering().clone()); - ReferenceContainer emptyContainer = ReferenceContainer.emptyContainer(factory, startWordHash, 0); - containerOrder.rotate(emptyContainer); - if (ram) { - return buffer.references(startWordHash, false); - } - return collections.references(startWordHash, false); - /* - return new MergeIterator( - indexCache.referenceIterator(startWordHash, false, true), - collections.referenceIterator(startWordHash, false, false), - containerOrder, - ReferenceContainer.containerMergeMethod, - true); - */ - } - - public void clear() { - buffer.clear(); - try { - collections.clear(); - } catch (IOException e) { - e.printStackTrace(); - } - } - - public void close() { - buffer.close(); - collections.close(); - } - - public int size() { - return java.lang.Math.max(collections.size(), buffer.size()); - } - - public int minMem() { - return 1024*1024 /* indexing overhead */ + buffer.minMem() + collections.minMem(); - } - - - /* - * methods for cache management - */ - - public int getBufferMaxReferences() { - return buffer.getBufferMaxReferences(); - } - - public long getBufferMinAge() { - return buffer.getBufferMinAge(); - } - - public long getBufferMaxAge() { - return buffer.getBufferMaxAge(); - } - - public long getBufferSizeBytes() { - return buffer.getBufferSizeBytes(); - } - - public void setBufferMaxWordCount(final int maxWords) { - buffer.setMaxWordCount(maxWords); - } - - private void cacheFlushControl() { - // check for forced flush - int cs = getBufferSize(); - if (cs > 0) { - // flush elements that are too big. This flushing depends on the fact that the flush rule - // selects the biggest elements first for flushing. If it does not for any reason, the following - // loop would not terminate. - serverProfiling.update("wordcache", Long.valueOf(cs), true); - // To ensure termination an additional counter is used - int l = 0; - while (this.buffer.size() > 0 && (l++ < 100) && (this.buffer.getBufferMaxReferences() > wCacheMaxChunk)) { - flushCacheOne(this.buffer); - } - // next flush more entries if the size exceeds the maximum size of the cache - while (this.buffer.size() > 0 && - ((this.buffer.size() > this.buffer.getMaxWordCount()) || - (MemoryControl.available() < collections.minMem()))) { - flushCacheOne(this.buffer); - } - if (getBufferSize() != cs) serverProfiling.update("wordcache", Long.valueOf(getBufferSize()), true); - } - } - - public void cleanupBuffer(int time) { - flushCacheUntil(System.currentTimeMillis() + time); - } - - private synchronized void flushCacheUntil(long timeout) { - while (System.currentTimeMillis() < timeout && buffer.size() > 0) { - flushCacheOne(buffer); - } - } - - private synchronized void flushCacheOne(final IndexBuffer ram) { - if (ram.size() > 0) collections.add(flushContainer(ram)); - } - - private ReferenceContainer flushContainer(final IndexBuffer ram) { - byte[] wordHash; - ReferenceContainer c; - wordHash = ram.maxScoreWordHash(); - c = ram.get(wordHash, null); - if ((c != null) && (c.size() > wCacheMaxChunk)) { - return ram.delete(wordHash); - } else { - return ram.delete(ram.bestFlushWordHash()); - } - } - - public int getBackendSize() { - return collections.size(); - } - - public int getBufferSize() { - return buffer.size(); - } - - public ByteOrder ordering() { - return collections.ordering(); - } - - public CloneableIterator> references(byte[] startWordHash, boolean rot) { - final Order> containerOrder = new ReferenceContainerOrder(factory, this.buffer.ordering().clone()); - return new MergeIterator>( - this.buffer.references(startWordHash, false), - this.collections.references(startWordHash, false), - containerOrder, - ReferenceContainer.containerMergeMethod, - true); - } - -} diff --git a/source/de/anomic/kelondro/text/IndexBuffer.java b/source/de/anomic/kelondro/text/IndexBuffer.java deleted file mode 100644 index d7f76c536..000000000 --- a/source/de/anomic/kelondro/text/IndexBuffer.java +++ /dev/null @@ -1,348 +0,0 @@ -// IndexCache.java -// (C) 2005, 2006 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany -// first published 2005 on http://yacy.net -// -// This is a part of YaCy, a peer-to-peer based web search engine -// -// $LastChangedDate$ -// $LastChangedRevision$ -// $LastChangedBy$ -// -// LICENSE -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - -package de.anomic.kelondro.text; - -import java.io.File; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.Set; - -import de.anomic.kelondro.index.Row; -import de.anomic.kelondro.order.Base64Order; -import de.anomic.kelondro.order.ByteOrder; -import de.anomic.kelondro.order.CloneableIterator; -import de.anomic.kelondro.text.referencePrototype.WordReferenceRow; -import de.anomic.kelondro.util.MemoryControl; -import de.anomic.kelondro.util.ScoreCluster; -import de.anomic.kelondro.util.Log; - -/** - * A IndexCache is a ReferenceContainerCache with an attached cache flush logic - * - */ -public final class IndexBuffer extends AbstractIndex implements Index, IndexReader, Iterable> { - - // class variables - private final ScoreCluster hashScore; - private final ScoreCluster hashDate; - private long initTime; - private int cacheEntityMaxCount; // the maximum number of cache slots for RWI entries - public int cacheReferenceCountLimit; // the maximum number of references to a single RWI entity - public long cacheReferenceAgeLimit; // the maximum age (= time not changed) of a RWI entity - private final Log log; - private final File dumpFile; - private ReferenceContainerCache heap; - - @SuppressWarnings("unchecked") - public IndexBuffer( - final File databaseRoot, - final ReferenceFactory factory, - final ByteOrder wordOrdering, - final Row payloadrow, - final int entityCacheMaxSize, - final int wCacheReferenceCountLimitInit, - final long wCacheReferenceAgeLimitInit, - final String newHeapName, - final Log log) { - super(factory); - - // creates a new index cache - // the cache has a back-end where indexes that do not fit in the cache are flushed - this.hashScore = new ScoreCluster(Base64Order.enhancedCoder); - this.hashDate = new ScoreCluster(Base64Order.enhancedCoder); - this.initTime = System.currentTimeMillis(); - this.cacheEntityMaxCount = entityCacheMaxSize; - this.cacheReferenceCountLimit = wCacheReferenceCountLimitInit; - this.cacheReferenceAgeLimit = wCacheReferenceAgeLimitInit; - this.log = log; - this.dumpFile = new File(databaseRoot, newHeapName); - this.heap = new ReferenceContainerCache(factory, payloadrow, wordOrdering); - - // read in dump of last session - boolean initFailed = false; - if (dumpFile.exists()) try { - heap.initWriteModeFromBLOB(dumpFile); - } catch (IOException e) { - initFailed = true; - e.printStackTrace(); - } - if (initFailed) { - log.logSevere("unable to restore cache dump"); - // get empty dump - heap.initWriteMode(); - } else if (dumpFile.exists()) { - // initialize scores for cache organization - for (final ReferenceContainer ic : (Iterable) heap.references(null, false)) { - this.hashDate.setScore(ic.getTermHash(), intTime(ic.lastWrote())); - this.hashScore.setScore(ic.getTermHash(), ic.size()); - } - } else { - heap.initWriteMode(); - } - } - - /** - * clear the content - * @throws IOException - */ - public void clear() { - hashScore.clear(); - hashDate.clear(); - initTime = System.currentTimeMillis(); - heap.clear(); - } - - public int minMem() { - // there is no specific large array that needs to be maintained - // this value is just a guess of the possible overhead - return 100 * 1024; // 100 kb - } - - // cache settings - public int getBufferMaxReferences() { - if (hashScore.size() == 0) return 0; - return hashScore.getMaxScore(); - } - - public long getBufferMinAge() { - if (hashDate.size() == 0) return 0; - return System.currentTimeMillis() - longEmit(hashDate.getMaxScore()); - } - - public long getBufferMaxAge() { - if (hashDate.size() == 0) return 0; - return System.currentTimeMillis() - longEmit(hashDate.getMinScore()); - } - - public void setMaxWordCount(final int maxWords) { - this.cacheEntityMaxCount = maxWords; - } - - public int getMaxWordCount() { - return this.cacheEntityMaxCount; - } - - public int size() { - if (heap == null) return 0; - return heap.size(); - } - - public synchronized CloneableIterator> references(final byte[] startWordHash, final boolean rot) { - // we return an iterator object that creates top-level-clones of the indexContainers - // in the cache, so that manipulations of the iterated objects do not change - // objects in the cache. - return heap.references(startWordHash, rot); - } - - public synchronized byte[] maxScoreWordHash() { - if (heap == null || heap.size() == 0) return null; - try { - return hashScore.getMaxObject(); - } catch (final Exception e) { - log.logSevere("flushFromMem: " + e.getMessage(), e); - } - return null; - } - - public byte[] bestFlushWordHash() { - // select appropriate hash - // we have 2 different methods to find a good hash: - // - the oldest entry in the cache - // - the entry with maximum count - if (heap == null || heap.size() == 0) return null; - try { - //return hashScore.getMaxObject(); - byte[] hash = null; - final int count = hashScore.getMaxScore(); - if ((count >= cacheReferenceCountLimit) && - ((hash = hashScore.getMaxObject()) != null)) { - // we MUST flush high-score entries, because a loop deletes entries in cache until this condition fails - // in this cache we MUST NOT check wCacheMinAge - return hash; - } - final long oldestTime = longEmit(hashDate.getMinScore()); - if (((System.currentTimeMillis() - oldestTime) > cacheReferenceAgeLimit) && - ((hash = hashDate.getMinObject()) != null)) { - // flush out-dated entries - return hash; - } - // cases with respect to memory situation - if (MemoryControl.free() < 100000) { - // urgent low-memory case - hash = hashScore.getMaxObject(); // flush high-score entries (saves RAM) - } else { - // not-efficient-so-far case. cleans up unnecessary cache slots - hash = hashDate.getMinObject(); // flush oldest entries - } - if (hash == null) { - final ReferenceContainer ic = heap.references(null, false).next(); - if (ic != null) hash = ic.getTermHash(); - } - return hash; - - } catch (final Exception e) { - log.logSevere("flushFromMem: " + e.getMessage(), e); - } - return null; - } - - public synchronized ArrayList> bestFlushContainers(final int count) { - final ArrayList> containerList = new ArrayList>(); - byte[] hash; - ReferenceContainer container; - for (int i = 0; i < count; i++) { - hash = bestFlushWordHash(); - if (hash == null) return containerList; - container = heap.delete(hash); - assert (container != null); - if (container == null) return containerList; - hashScore.deleteScore(hash); - hashDate.deleteScore(hash); - containerList.add(container); - } - return containerList; - } - - private int intTime(final long longTime) { - return (int) Math.max(0, ((longTime - initTime) / 1000)); - } - - private long longEmit(final int intTime) { - return (((long) intTime) * (long) 1000) + initTime; - } - - public boolean has(final byte[] wordHash) { - return heap.has(wordHash); - } - - public int count(byte[] key) { - return this.heap.count(key); - } - - public synchronized ReferenceContainer get(final byte[] wordHash, final Set urlselection) { - if (wordHash == null) return null; - - // retrieve container - ReferenceContainer container = heap.get(wordHash, null); - - // We must not use the container from cache to store everything we find, - // as that container remains linked to in the cache and might be changed later - // while the returned container is still in use. - // create a clone from the container - if (container != null) container = container.topLevelClone(); - - // select the urlselection - if ((urlselection != null) && (container != null)) container.select(urlselection); - - return container; - } - - public synchronized ReferenceContainer delete(final byte[] wordHash) { - // returns the index that had been deleted - if (wordHash == null || heap == null) return null; - final ReferenceContainer container = heap.delete(wordHash); - hashScore.deleteScore(wordHash); - hashDate.deleteScore(wordHash); - return container; - } - - public synchronized boolean remove(final byte[] wordHash, final String urlHash) { - final boolean removed = heap.remove(wordHash, urlHash); - if (removed) { - if (heap.has(wordHash)) { - hashScore.decScore(wordHash); - hashDate.setScore(wordHash, intTime(System.currentTimeMillis())); - } else { - hashScore.deleteScore(wordHash); - hashDate.deleteScore(wordHash); - } - return true; - } - return false; - } - - public synchronized int remove(final byte[] wordHash, final Set urlHashes) { - if (urlHashes.size() == 0) return 0; - final int c = heap.remove(wordHash, urlHashes); - if (c > 0) { - // removal successful - if (heap.has(wordHash)) { - hashScore.addScore(wordHash, -c); - hashDate.setScore(wordHash, intTime(System.currentTimeMillis())); - } else { - hashScore.deleteScore(wordHash); - hashDate.deleteScore(wordHash); - } - return c; - } - return 0; - } - - public synchronized void add(final ReferenceContainer container) { - if (container == null || container.size() == 0 || heap == null) return; - - // put new words into cache - heap.add(container); - hashScore.setScore(container.getTermHash(), heap.count(container.getTermHash())); - hashDate.setScore(container.getTermHash(), intTime(System.currentTimeMillis())); - } - - public void add(final byte[] wordHash, final ReferenceType entry) throws IOException { - if (entry == null || heap == null) return; - - // put new words into cache - heap.add(wordHash, entry); - hashScore.incScore(wordHash); - hashDate.setScore(wordHash, intTime(System.currentTimeMillis())); - } - - public synchronized void close() { - heap.dump(this.dumpFile); - heap = null; - hashScore.clear(); - hashDate.clear(); - } - - public Iterator> iterator() { - return references(null, false); - } - - public ByteOrder ordering() { - return heap.ordering(); - } - - public synchronized long getBufferSizeBytes() { - // calculate the real size in bytes of the index cache - long cacheBytes = 0; - final long entryBytes = WordReferenceRow.urlEntryRow.objectsize; - final Iterator> it = references(null, false); - while (it.hasNext()) cacheBytes += it.next().size() * entryBytes; - return cacheBytes; - } - -} diff --git a/source/de/anomic/kelondro/text/IndexCollection.java b/source/de/anomic/kelondro/text/IndexCollection.java index f8de3eb8e..5dbadc51c 100644 --- a/source/de/anomic/kelondro/text/IndexCollection.java +++ b/source/de/anomic/kelondro/text/IndexCollection.java @@ -60,7 +60,7 @@ import de.anomic.kelondro.util.kelondroOutOfLimitsException; import de.anomic.kelondro.util.Log; import de.anomic.yacy.yacyURL; -public class IndexCollection extends AbstractIndex implements Index { +public class IndexCollection extends AbstractIndex { private static final int loadfactor = 4; private static final int serialNumber = 0; diff --git a/source/de/anomic/kelondro/text/IndexCollectionMigration.java b/source/de/anomic/kelondro/text/IndexCollectionMigration.java index 58034a772..f34282a9d 100644 --- a/source/de/anomic/kelondro/text/IndexCollectionMigration.java +++ b/source/de/anomic/kelondro/text/IndexCollectionMigration.java @@ -107,7 +107,7 @@ public final class IndexCollectionMigration ext factory, 12, Base64Order.enhancedCoder, - BufferedIndexCollection.maxCollectionPartition, + 7, WordReferenceRow.urlEntryRow, false); if (this.collections.size() == 0) { diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 1bb67a93c..db903ea16 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -321,7 +321,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch(plasmaWordIndex.wordReferenceFactory, 1, 1) : null; + this.merger = new IODispatcher(plasmaWordIndex.wordReferenceFactory, 1, 1); if (this.merger != null) this.merger.start(); - this.index = (useCell) ? - new IndexCollectionMigration( + this.index = new IndexCollectionMigration( indexPrimaryTextLocation, wordReferenceFactory, wordOrder, @@ -166,17 +163,7 @@ public final class plasmaWordIndex { targetFileSize, maxFileSize, this.merger, - log) - : - new BufferedIndexCollection( - indexPrimaryTextLocation, - wordReferenceFactory, - wordOrder, - WordReferenceRow.urlEntryRow, - entityCacheMaxSize, - useCommons, - redundancy, - log); + log); } else { this.merger = new IODispatcher(plasmaWordIndex.wordReferenceFactory, 1, 1); this.merger.start(); diff --git a/source/de/anomic/yacy/dht/Dispatcher.java b/source/de/anomic/yacy/dht/Dispatcher.java index 2e340e55e..71966cecd 100755 --- a/source/de/anomic/yacy/dht/Dispatcher.java +++ b/source/de/anomic/yacy/dht/Dispatcher.java @@ -328,6 +328,7 @@ public class Dispatcher { * This method returns true if a container was dequeued, false if not */ public boolean dequeueContainer() { + if (transmissionCloud == null) return false; if (this.indexingTransmissionProcessor.queueSize() > indexingTransmissionProcessor.concurrency()) return false; byte[] maxtarget = null; int maxsize = -1; diff --git a/source/yacy.java b/source/yacy.java index 83e1362eb..bac79ff01 100644 --- a/source/yacy.java +++ b/source/yacy.java @@ -675,7 +675,7 @@ public final class yacy { final int cacheMem = (int)(MemoryControl.maxMemory - MemoryControl.total()); if (cacheMem < 2048000) throw new OutOfMemoryError("Not enough memory available to start clean up."); - final plasmaWordIndex wordIndex = new plasmaWordIndex(networkName, log, indexPrimaryRoot, indexSecondaryRoot, 10000, false, 1, 0, false); + final plasmaWordIndex wordIndex = new plasmaWordIndex(networkName, log, indexPrimaryRoot, indexSecondaryRoot, 10000, false, 1, 0); final Iterator> indexContainerIterator = wordIndex.index().references("AAAAAAAAAAAA".getBytes(), false, false); long urlCounter = 0, wordCounter = 0; @@ -866,7 +866,7 @@ public final class yacy { try { Iterator> indexContainerIterator = null; if (resource.equals("all")) { - WordIndex = new plasmaWordIndex("freeworld", log, indexPrimaryRoot, indexSecondaryRoot, 10000, false, 1, 0, false); + WordIndex = new plasmaWordIndex("freeworld", log, indexPrimaryRoot, indexSecondaryRoot, 10000, false, 1, 0); indexContainerIterator = WordIndex.index().references(wordChunkStartHash.getBytes(), false, false); } int counter = 0;