- removed the useCell option: the IndexCell data structure is now the default index structure; old collection data is still migrated

- added some debugging output to the balancer to help find a bug
- removed now-unused classes for index collection handling (BufferedIndexCollection, IndexBuffer)
- changed some default values for process handling: jobs now require more free memory before they run, to prevent out-of-memory (OOM) errors

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5856 6c8d7289-2bf4-0310-a012-ef5d649a1542
commit 138422990a (parent 1b9e532c87), branch pull/1/head
Author: orbiter, 16 years ago

@@ -3,7 +3,7 @@ javacSource=1.5
 javacTarget=1.5
 # Release Configuration
-releaseVersion=0.75
+releaseVersion=0.76
 stdReleaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
 embReleaseFile=yacy_emb_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
 proReleaseFile=yacy_pro_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz

@@ -4,33 +4,31 @@
 # performance-settings
 # delay-times for permanent loops (milliseconds)
-# the idlesleep is the pause that an proces sleeps if the last call to the
+# the idlesleep is the pause that an process sleeps if the last call to the
 # process job was without execution of anything;
 # the busysleep is the pause after a full job execution
 # the prereq-value is a memory pre-requisite: that much bytes must
 # be available/free in the heap; othervise the loop is not executed
 # and another idlesleep is performed
 20_dhtdistribution_idlesleep=5000
 20_dhtdistribution_busysleep=2000
-20_dhtdistribution_memprereq=6291456
+20_dhtdistribution_memprereq=12582912
 50_localcrawl_idlesleep=4000
-50_localcrawl_busysleep=500
-50_localcrawl_memprereq=4194304
+50_localcrawl_busysleep=50
+50_localcrawl_memprereq=12582912
 50_localcrawl_isPaused=false
-60_remotecrawlloader_idlesleep=60000
-60_remotecrawlloader_busysleep=40000
-60_remotecrawlloader_memprereq=2097152
+60_remotecrawlloader_idlesleep=120000
+60_remotecrawlloader_busysleep=60000
+60_remotecrawlloader_memprereq=12582912
 60_remotecrawlloader_isPaused=false
-62_remotetriggeredcrawl_idlesleep=10000
-62_remotetriggeredcrawl_busysleep=1000
-62_remotetriggeredcrawl_memprereq=6291456
+62_remotetriggeredcrawl_idlesleep=60000
+62_remotetriggeredcrawl_busysleep=10000
+62_remotetriggeredcrawl_memprereq=12582912
 62_remotetriggeredcrawl_isPaused=false
 80_indexing_idlesleep=1000
-80_indexing_busysleep=100
-80_indexing_memprereq=6291456
-85_cacheflush_idlesleep=120000
-85_cacheflush_busysleep=60000
-85_cacheflush_memprereq=0
+80_indexing_busysleep=10
+80_indexing_memprereq=12582912
 82_crawlstack_idlesleep=5000
 82_crawlstack_busysleep=1
 82_crawlstack_memprereq=1048576
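
The comment block above defines the scheduler contract that all the settings follow: a process sleeps idlesleep milliseconds when its last job call did no work, busysleep milliseconds after a productive call, and skips the job (performing another idlesleep) whenever less than memprereq bytes of heap are free. The raised prerequisites are plain megabyte multiples: 12582912 bytes = 12 * 1024 * 1024 = 12 MB, up from 6291456 = 6 MB. A minimal sketch of such a loop, with hypothetical names (the actual YaCy thread scheduler is not part of this diff):

    import java.util.concurrent.Callable;

    // Illustrative only: one background process governed by the
    // idlesleep/busysleep/memprereq settings described above.
    final class BusyLoop extends Thread {
        private final long idlesleep, busysleep, memprereq;
        private final Callable<Boolean> job; // returns true if it did any work

        BusyLoop(long idlesleep, long busysleep, long memprereq, Callable<Boolean> job) {
            this.idlesleep = idlesleep;
            this.busysleep = busysleep;
            this.memprereq = memprereq;
            this.job = job;
        }

        @Override
        public void run() {
            while (!isInterrupted()) {
                try {
                    boolean busy = false;
                    // memory prerequisite: run the job only if enough heap is free,
                    // otherwise fall through to another idlesleep
                    if (Runtime.getRuntime().freeMemory() >= memprereq) {
                        busy = job.call();
                    }
                    Thread.sleep(busy ? busysleep : idlesleep);
                } catch (InterruptedException e) {
                    return; // shutdown
                } catch (Exception e) {
                    e.printStackTrace(); // a job failure must not kill the loop
                }
            }
        }
    }

Under the new values the local crawler, for example, pauses only 50 ms between productive fetches (50_localcrawl_busysleep=50) but backs off for the full 4000 ms idlesleep whenever it is idle or free heap drops below the 12 MB threshold.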

@@ -569,35 +569,37 @@ filterOutStopwordsFromTopwords=true
 # and another idlesleep is performed
 20_dhtdistribution_idlesleep=30000
 20_dhtdistribution_busysleep=10000
-20_dhtdistribution_memprereq=6291456
+20_dhtdistribution_memprereq=12582912
 30_peerping_idlesleep=120000
 30_peerping_busysleep=120000
-30_peerping_memprereq=1048576
+30_peerping_memprereq=2097152
 40_peerseedcycle_idlesleep=1800000
 40_peerseedcycle_busysleep=1200000
-40_peerseedcycle_memprereq=2097152
+40_peerseedcycle_memprereq=4194304
 50_localcrawl_idlesleep=2000
 50_localcrawl_busysleep=50
-50_localcrawl_memprereq=4194304
+50_localcrawl_memprereq=12582912
 50_localcrawl_isPaused=false
 60_remotecrawlloader_idlesleep=60000
-60_remotecrawlloader_busysleep=30000
-60_remotecrawlloader_memprereq=2097152
+60_remotecrawlloader_busysleep=10000
+60_remotecrawlloader_memprereq=12582912
 60_remotecrawlloader_isPaused=false
-62_remotetriggeredcrawl_idlesleep=30000
-62_remotetriggeredcrawl_busysleep=10000
-62_remotetriggeredcrawl_memprereq=6291456
+62_remotetriggeredcrawl_idlesleep=10000
+62_remotetriggeredcrawl_busysleep=1000
+62_remotetriggeredcrawl_memprereq=12582912
 62_remotetriggeredcrawl_isPaused=false
 80_indexing_idlesleep=1000
 80_indexing_busysleep=10
-80_indexing_memprereq=6291456
-85_cacheflush_idlesleep=60000
-85_cacheflush_busysleep=10000
-85_cacheflush_memprereq=0
+80_indexing_memprereq=12582912
 90_cleanup_idlesleep=300000
 90_cleanup_busysleep=300000
 90_cleanup_memprereq=0
+# autoReCrawl Options
+autoReCrawl_idlesleep = 3600000
+autoReCrawl_busysleep = 3600000
+autoReCrawl_memprereq = -1
 # additional attributes:
 # performanceIO is a percent-value. a value of 10 means, that 10% of the busysleep time
 # is used to flush the RAM cache, which is the major part of the IO in YaCy
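
A worked example for performanceIO: with performanceIO=10 and 60_remotecrawlloader_busysleep=10000, about 10000 * 10 / 100 = 1000 ms of every busy pause would be spent flushing the RAM cache.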
@@ -886,10 +888,6 @@ routing.deleteOldSeeds.permission__pro = true
 routing.deleteOldSeeds.time = 7
 routing.deleteOldSeeds.time__pro = 30
-# autoReCrawl Options
-autoReCrawl_idlesleep = 3600000
-autoReCrawl_busysleep = 3600000
-autoReCrawl_memprereq = -1
 # options to remember the default search engines when using the search compare features
 compare_yacy.left = YaCy
@@ -904,6 +902,3 @@ cgi.suffixes = cgi,pl
 # whether this is a version for a web browser
 browserintegration = false
-# next index data structure
-useCell = false

@@ -533,7 +533,8 @@ public class Balancer {
             // 3rd: take entry from file
             if ((result == null) && (urlFileStack.size() > 0)) {
-                final Row.Entry nextentry = (top) ? urlFileStack.top() : urlFileStack.bot();
+                Row.Entry nextentry = (top) ? urlFileStack.top() : urlFileStack.bot();
+                if (nextentry == null) nextentry = (top) ? urlFileStack.bot() : urlFileStack.top();
                 if (nextentry == null) {
                     // emergency case: this means that something with the stack organization is wrong
                     // the file appears to be broken. We kill the file.
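
The added fallback tries the opposite end of urlFileStack before the emergency branch is reached; presumably this keeps a stack whose top/bot end is damaged on only one side from being declared broken and deleted.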

@@ -95,9 +95,15 @@ public class IndexingStack {
     }
     public synchronized QueueEntry pop() throws IOException {
-        if (sbQueueStack.size() == 0) return null;
+        if (sbQueueStack.size() == 0) {
+            Log.logInfo("IndexingStack", "sbQueueStack.size() == 0");
+            return null;
+        }
         final Row.Entry b = sbQueueStack.pot();
-        if (b == null) return null;
+        if (b == null) {
+            Log.logInfo("IndexingStack", "sbQueueStack.pot() == null");
+            return null;
+        }
         return new QueueEntry(b);
     }

@@ -72,6 +72,7 @@ public class NoticedURL {
     }
     public void clear() {
+        Log.logInfo("NoticedURL", "clearing all stacks");
         coreStack.clear();
         limitStack.clear();
         remoteStack.clear();
@@ -207,6 +208,7 @@ public class NoticedURL {
     }
     public void clear(final int stackType) {
+        Log.logInfo("NoticedURL", "clearing stack " + stackType);
         switch (stackType) {
             case STACK_TYPE_CORE: coreStack.clear(); break;
             case STACK_TYPE_LIMIT: limitStack.clear(); break;

@@ -37,6 +37,7 @@ import java.util.StringTokenizer;
 import de.anomic.kelondro.index.Row;
 import de.anomic.kelondro.order.NaturalOrder;
 import de.anomic.kelondro.util.FileUtils;
+import de.anomic.kelondro.util.Log;
 import de.anomic.kelondro.util.kelondroException;
 public final class Stack extends FullRecords {
@@ -184,7 +185,10 @@ public final class Stack extends FullRecords {
     public synchronized Row.Entry pot() throws IOException {
         // return row on the bottom of the stack and remove record
         final Node n = botNode();
-        if (n == null) return null;
+        if (n == null) {
+            Log.logInfo("Stack", "botNode() == null");
+            return null;
+        }
         final Row.Entry ret = row().newEntry(n.getValueRow());
         // remove node
@@ -239,9 +243,15 @@ public final class Stack extends FullRecords {
     private Node botNode() throws IOException {
         // return node on bottom of the stack
-        if (size() == 0) return null;
+        if (size() == 0) {
+            Log.logInfo("Stack", "size() == 0");
+            return null;
+        }
         final RecordHandle h = getHandle(root);
-        if (h == null) return null;
+        if (h == null) {
+            Log.logInfo("Stack", "getHandle(root) == null");
+            return null;
+        }
         return new EcoNode(h);
     }

@@ -1,344 +0,0 @@
// BufferedIndexCollection.java
// (C) 2005, 2006 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 2005 on http://www.anomic.de
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2009-03-13 11:34:51 +0100 (Fr, 13 Mrz 2009) $
// $LastChangedRevision: 5709 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.kelondro.text;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Set;
import de.anomic.kelondro.index.Row;
import de.anomic.kelondro.index.RowCollection;
import de.anomic.kelondro.order.Base64Order;
import de.anomic.kelondro.order.ByteOrder;
import de.anomic.kelondro.order.CloneableIterator;
import de.anomic.kelondro.order.MergeIterator;
import de.anomic.kelondro.order.Order;
import de.anomic.kelondro.order.RotateIterator;
import de.anomic.kelondro.text.Index;
import de.anomic.kelondro.text.IndexBuffer;
import de.anomic.kelondro.text.IndexCollection;
import de.anomic.kelondro.text.ReferenceContainer;
import de.anomic.kelondro.text.ReferenceContainerOrder;
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
import de.anomic.kelondro.util.FileUtils;
import de.anomic.kelondro.util.MemoryControl;
import de.anomic.kelondro.util.Log;
import de.anomic.server.serverProfiling;
public final class BufferedIndexCollection<ReferenceType extends Reference> extends AbstractBufferedIndex<ReferenceType> implements Index<ReferenceType>, BufferedIndex<ReferenceType> {
// environment constants
public static final long wCacheMaxAge = 1000 * 60 * 30; // milliseconds; 30 minutes
public static final int wCacheMaxChunk = 800; // maximum number of references for each urlhash
public static final int lowcachedivisor = 900;
public static final int maxCollectionPartition = 7; // should be 7
private final IndexBuffer<ReferenceType> buffer;
private final IndexCollection<ReferenceType> collections;
public BufferedIndexCollection (
File indexPrimaryTextLocation,
final ReferenceFactory<ReferenceType> factory,
final ByteOrder wordOrdering,
final Row payloadrow,
final int entityCacheMaxSize,
final boolean useCommons,
final int redundancy,
Log log) throws IOException {
super(factory);
final File textindexcache = new File(indexPrimaryTextLocation, "RICACHE");
if (!(textindexcache.exists())) textindexcache.mkdirs();
if (new File(textindexcache, "index.dhtin.blob").exists()) {
// migration of the both caches into one
this.buffer = new IndexBuffer<ReferenceType>(textindexcache, factory, wordOrdering, payloadrow, entityCacheMaxSize, wCacheMaxChunk, wCacheMaxAge, "index.dhtout.blob", log);
IndexBuffer<ReferenceType> dhtInCache = new IndexBuffer<ReferenceType>(textindexcache, factory, wordOrdering, payloadrow, entityCacheMaxSize, wCacheMaxChunk, wCacheMaxAge, "index.dhtin.blob", log);
for (ReferenceContainer<ReferenceType> c: dhtInCache) {
this.buffer.add(c);
}
FileUtils.deletedelete(new File(textindexcache, "index.dhtin.blob"));
} else {
// read in new BLOB
this.buffer = new IndexBuffer<ReferenceType>(textindexcache, factory, wordOrdering, payloadrow, entityCacheMaxSize, wCacheMaxChunk, wCacheMaxAge, "index.dhtout.blob", log);
}
// create collections storage path
final File textindexcollections = new File(indexPrimaryTextLocation, "RICOLLECTION");
if (!(textindexcollections.exists())) textindexcollections.mkdirs();
this.collections = new IndexCollection<ReferenceType>(
textindexcollections,
"collection",
factory,
12,
Base64Order.enhancedCoder,
maxCollectionPartition,
WordReferenceRow.urlEntryRow,
useCommons);
}
/* methods for interface Index */
public void add(final ReferenceContainer<ReferenceType> entries) {
assert (entries.row().objectsize == WordReferenceRow.urlEntryRow.objectsize);
// add the entry
buffer.add(entries);
cacheFlushControl();
}
public void add(final byte[] wordHash, final ReferenceType entry) throws IOException {
// add the entry
buffer.add(wordHash, entry);
cacheFlushControl();
}
public boolean has(final byte[] wordHash) {
if (buffer.has(wordHash)) return true;
if (collections.has(wordHash)) return true;
return false;
}
public int count(byte[] key) {
return buffer.count(key) + collections.count(key);
}
public ReferenceContainer<ReferenceType> get(final byte[] wordHash, final Set<String> urlselection) {
if (wordHash == null) {
// wrong input
return null;
}
// get from cache
ReferenceContainer<ReferenceType> container;
container = buffer.get(wordHash, urlselection);
// get from collection index
if (container == null) {
container = collections.get(wordHash, urlselection);
} else {
container.addAllUnique(collections.get(wordHash, urlselection));
}
if (container == null) return null;
// check doubles
final int beforeDouble = container.size();
container.sort();
final ArrayList<RowCollection> d = container.removeDoubles();
RowCollection set;
for (int i = 0; i < d.size(); i++) {
// for each element in the double-set, take that one that is the most recent one
set = d.get(i);
WordReferenceRow e, elm = null;
long lm = 0;
for (int j = 0; j < set.size(); j++) {
e = new WordReferenceRow(set.get(j, true));
if ((elm == null) || (e.lastModified() > lm)) {
elm = e;
lm = e.lastModified();
}
}
if(elm != null) {
container.addUnique(elm.toKelondroEntry());
}
}
if (container.size() < beforeDouble) System.out.println("*** DEBUG DOUBLECHECK - removed " + (beforeDouble - container.size()) + " index entries from word container " + container.getTermHashAsString());
return container;
}
public ReferenceContainer<ReferenceType> delete(final byte[] wordHash) {
final ReferenceContainer<ReferenceType> c = new ReferenceContainer<ReferenceType>(
factory,
wordHash,
WordReferenceRow.urlEntryRow,
buffer.count(wordHash));
c.addAllUnique(buffer.delete(wordHash));
c.addAllUnique(collections.delete(wordHash));
return c;
}
public boolean remove(final byte[] wordHash, final String urlHash) {
boolean removed = false;
removed = removed | (buffer.remove(wordHash, urlHash));
removed = removed | (collections.remove(wordHash, urlHash));
return removed;
}
public int remove(final byte[] wordHash, final Set<String> urlHashes) {
int removed = 0;
removed += buffer.remove(wordHash, urlHashes);
removed += collections.remove(wordHash, urlHashes);
return removed;
}
public synchronized CloneableIterator<ReferenceContainer<ReferenceType>> references(final byte[] startHash, final boolean rot, final boolean ram) throws IOException {
final CloneableIterator<ReferenceContainer<ReferenceType>> i = wordContainers(startHash, ram);
if (rot) {
return new RotateIterator<ReferenceContainer<ReferenceType>>(i, Base64Order.zero(startHash.length), buffer.size() + ((ram) ? 0 : collections.size()));
}
return i;
}
private synchronized CloneableIterator<ReferenceContainer<ReferenceType>> wordContainers(final byte[] startWordHash, final boolean ram) throws IOException {
final Order<ReferenceContainer<ReferenceType>> containerOrder = new ReferenceContainerOrder<ReferenceType>(factory, buffer.ordering().clone());
ReferenceContainer<ReferenceType> emptyContainer = ReferenceContainer.emptyContainer(factory, startWordHash, 0);
containerOrder.rotate(emptyContainer);
if (ram) {
return buffer.references(startWordHash, false);
}
return collections.references(startWordHash, false);
/*
return new MergeIterator<ReferenceContainer>(
indexCache.referenceIterator(startWordHash, false, true),
collections.referenceIterator(startWordHash, false, false),
containerOrder,
ReferenceContainer.containerMergeMethod,
true);
*/
}
public void clear() {
buffer.clear();
try {
collections.clear();
} catch (IOException e) {
e.printStackTrace();
}
}
public void close() {
buffer.close();
collections.close();
}
public int size() {
return java.lang.Math.max(collections.size(), buffer.size());
}
public int minMem() {
return 1024*1024 /* indexing overhead */ + buffer.minMem() + collections.minMem();
}
/*
* methods for cache management
*/
public int getBufferMaxReferences() {
return buffer.getBufferMaxReferences();
}
public long getBufferMinAge() {
return buffer.getBufferMinAge();
}
public long getBufferMaxAge() {
return buffer.getBufferMaxAge();
}
public long getBufferSizeBytes() {
return buffer.getBufferSizeBytes();
}
public void setBufferMaxWordCount(final int maxWords) {
buffer.setMaxWordCount(maxWords);
}
private void cacheFlushControl() {
// check for forced flush
int cs = getBufferSize();
if (cs > 0) {
// flush elements that are too big. This flushing depends on the fact that the flush rule
// selects the biggest elements first for flushing. If it does not for any reason, the following
// loop would not terminate.
serverProfiling.update("wordcache", Long.valueOf(cs), true);
// To ensure termination an additional counter is used
int l = 0;
while (this.buffer.size() > 0 && (l++ < 100) && (this.buffer.getBufferMaxReferences() > wCacheMaxChunk)) {
flushCacheOne(this.buffer);
}
// next flush more entries if the size exceeds the maximum size of the cache
while (this.buffer.size() > 0 &&
((this.buffer.size() > this.buffer.getMaxWordCount()) ||
(MemoryControl.available() < collections.minMem()))) {
flushCacheOne(this.buffer);
}
if (getBufferSize() != cs) serverProfiling.update("wordcache", Long.valueOf(getBufferSize()), true);
}
}
public void cleanupBuffer(int time) {
flushCacheUntil(System.currentTimeMillis() + time);
}
private synchronized void flushCacheUntil(long timeout) {
while (System.currentTimeMillis() < timeout && buffer.size() > 0) {
flushCacheOne(buffer);
}
}
private synchronized void flushCacheOne(final IndexBuffer<ReferenceType> ram) {
if (ram.size() > 0) collections.add(flushContainer(ram));
}
private ReferenceContainer<ReferenceType> flushContainer(final IndexBuffer<ReferenceType> ram) {
byte[] wordHash;
ReferenceContainer<ReferenceType> c;
wordHash = ram.maxScoreWordHash();
c = ram.get(wordHash, null);
if ((c != null) && (c.size() > wCacheMaxChunk)) {
return ram.delete(wordHash);
} else {
return ram.delete(ram.bestFlushWordHash());
}
}
public int getBackendSize() {
return collections.size();
}
public int getBufferSize() {
return buffer.size();
}
public ByteOrder ordering() {
return collections.ordering();
}
public CloneableIterator<ReferenceContainer<ReferenceType>> references(byte[] startWordHash, boolean rot) {
final Order<ReferenceContainer<ReferenceType>> containerOrder = new ReferenceContainerOrder<ReferenceType>(factory, this.buffer.ordering().clone());
return new MergeIterator<ReferenceContainer<ReferenceType>>(
this.buffer.references(startWordHash, false),
this.collections.references(startWordHash, false),
containerOrder,
ReferenceContainer.containerMergeMethod,
true);
}
}

@@ -1,348 +0,0 @@
// IndexCache.java
// (C) 2005, 2006 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 2005 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.kelondro.text;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Set;
import de.anomic.kelondro.index.Row;
import de.anomic.kelondro.order.Base64Order;
import de.anomic.kelondro.order.ByteOrder;
import de.anomic.kelondro.order.CloneableIterator;
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
import de.anomic.kelondro.util.MemoryControl;
import de.anomic.kelondro.util.ScoreCluster;
import de.anomic.kelondro.util.Log;
/**
* A IndexCache is a ReferenceContainerCache with an attached cache flush logic
*
*/
public final class IndexBuffer<ReferenceType extends Reference> extends AbstractIndex<ReferenceType> implements Index<ReferenceType>, IndexReader<ReferenceType>, Iterable<ReferenceContainer<ReferenceType>> {
// class variables
private final ScoreCluster<byte[]> hashScore;
private final ScoreCluster<byte[]> hashDate;
private long initTime;
private int cacheEntityMaxCount; // the maximum number of cache slots for RWI entries
public int cacheReferenceCountLimit; // the maximum number of references to a single RWI entity
public long cacheReferenceAgeLimit; // the maximum age (= time not changed) of a RWI entity
private final Log log;
private final File dumpFile;
private ReferenceContainerCache<ReferenceType> heap;
@SuppressWarnings("unchecked")
public IndexBuffer(
final File databaseRoot,
final ReferenceFactory<ReferenceType> factory,
final ByteOrder wordOrdering,
final Row payloadrow,
final int entityCacheMaxSize,
final int wCacheReferenceCountLimitInit,
final long wCacheReferenceAgeLimitInit,
final String newHeapName,
final Log log) {
super(factory);
// creates a new index cache
// the cache has a back-end where indexes that do not fit in the cache are flushed
this.hashScore = new ScoreCluster<byte[]>(Base64Order.enhancedCoder);
this.hashDate = new ScoreCluster<byte[]>(Base64Order.enhancedCoder);
this.initTime = System.currentTimeMillis();
this.cacheEntityMaxCount = entityCacheMaxSize;
this.cacheReferenceCountLimit = wCacheReferenceCountLimitInit;
this.cacheReferenceAgeLimit = wCacheReferenceAgeLimitInit;
this.log = log;
this.dumpFile = new File(databaseRoot, newHeapName);
this.heap = new ReferenceContainerCache(factory, payloadrow, wordOrdering);
// read in dump of last session
boolean initFailed = false;
if (dumpFile.exists()) try {
heap.initWriteModeFromBLOB(dumpFile);
} catch (IOException e) {
initFailed = true;
e.printStackTrace();
}
if (initFailed) {
log.logSevere("unable to restore cache dump");
// get empty dump
heap.initWriteMode();
} else if (dumpFile.exists()) {
// initialize scores for cache organization
for (final ReferenceContainer ic : (Iterable<ReferenceContainer>) heap.references(null, false)) {
this.hashDate.setScore(ic.getTermHash(), intTime(ic.lastWrote()));
this.hashScore.setScore(ic.getTermHash(), ic.size());
}
} else {
heap.initWriteMode();
}
}
/**
* clear the content
* @throws IOException
*/
public void clear() {
hashScore.clear();
hashDate.clear();
initTime = System.currentTimeMillis();
heap.clear();
}
public int minMem() {
// there is no specific large array that needs to be maintained
// this value is just a guess of the possible overhead
return 100 * 1024; // 100 kb
}
// cache settings
public int getBufferMaxReferences() {
if (hashScore.size() == 0) return 0;
return hashScore.getMaxScore();
}
public long getBufferMinAge() {
if (hashDate.size() == 0) return 0;
return System.currentTimeMillis() - longEmit(hashDate.getMaxScore());
}
public long getBufferMaxAge() {
if (hashDate.size() == 0) return 0;
return System.currentTimeMillis() - longEmit(hashDate.getMinScore());
}
public void setMaxWordCount(final int maxWords) {
this.cacheEntityMaxCount = maxWords;
}
public int getMaxWordCount() {
return this.cacheEntityMaxCount;
}
public int size() {
if (heap == null) return 0;
return heap.size();
}
public synchronized CloneableIterator<ReferenceContainer<ReferenceType>> references(final byte[] startWordHash, final boolean rot) {
// we return an iterator object that creates top-level-clones of the indexContainers
// in the cache, so that manipulations of the iterated objects do not change
// objects in the cache.
return heap.references(startWordHash, rot);
}
public synchronized byte[] maxScoreWordHash() {
if (heap == null || heap.size() == 0) return null;
try {
return hashScore.getMaxObject();
} catch (final Exception e) {
log.logSevere("flushFromMem: " + e.getMessage(), e);
}
return null;
}
public byte[] bestFlushWordHash() {
// select appropriate hash
// we have 2 different methods to find a good hash:
// - the oldest entry in the cache
// - the entry with maximum count
if (heap == null || heap.size() == 0) return null;
try {
//return hashScore.getMaxObject();
byte[] hash = null;
final int count = hashScore.getMaxScore();
if ((count >= cacheReferenceCountLimit) &&
((hash = hashScore.getMaxObject()) != null)) {
// we MUST flush high-score entries, because a loop deletes entries in cache until this condition fails
// in this cache we MUST NOT check wCacheMinAge
return hash;
}
final long oldestTime = longEmit(hashDate.getMinScore());
if (((System.currentTimeMillis() - oldestTime) > cacheReferenceAgeLimit) &&
((hash = hashDate.getMinObject()) != null)) {
// flush out-dated entries
return hash;
}
// cases with respect to memory situation
if (MemoryControl.free() < 100000) {
// urgent low-memory case
hash = hashScore.getMaxObject(); // flush high-score entries (saves RAM)
} else {
// not-efficient-so-far case. cleans up unnecessary cache slots
hash = hashDate.getMinObject(); // flush oldest entries
}
if (hash == null) {
final ReferenceContainer<ReferenceType> ic = heap.references(null, false).next();
if (ic != null) hash = ic.getTermHash();
}
return hash;
} catch (final Exception e) {
log.logSevere("flushFromMem: " + e.getMessage(), e);
}
return null;
}
public synchronized ArrayList<ReferenceContainer<ReferenceType>> bestFlushContainers(final int count) {
final ArrayList<ReferenceContainer<ReferenceType>> containerList = new ArrayList<ReferenceContainer<ReferenceType>>();
byte[] hash;
ReferenceContainer<ReferenceType> container;
for (int i = 0; i < count; i++) {
hash = bestFlushWordHash();
if (hash == null) return containerList;
container = heap.delete(hash);
assert (container != null);
if (container == null) return containerList;
hashScore.deleteScore(hash);
hashDate.deleteScore(hash);
containerList.add(container);
}
return containerList;
}
private int intTime(final long longTime) {
return (int) Math.max(0, ((longTime - initTime) / 1000));
}
private long longEmit(final int intTime) {
return (((long) intTime) * (long) 1000) + initTime;
}
public boolean has(final byte[] wordHash) {
return heap.has(wordHash);
}
public int count(byte[] key) {
return this.heap.count(key);
}
public synchronized ReferenceContainer<ReferenceType> get(final byte[] wordHash, final Set<String> urlselection) {
if (wordHash == null) return null;
// retrieve container
ReferenceContainer<ReferenceType> container = heap.get(wordHash, null);
// We must not use the container from cache to store everything we find,
// as that container remains linked to in the cache and might be changed later
// while the returned container is still in use.
// create a clone from the container
if (container != null) container = container.topLevelClone();
// select the urlselection
if ((urlselection != null) && (container != null)) container.select(urlselection);
return container;
}
public synchronized ReferenceContainer<ReferenceType> delete(final byte[] wordHash) {
// returns the index that had been deleted
if (wordHash == null || heap == null) return null;
final ReferenceContainer<ReferenceType> container = heap.delete(wordHash);
hashScore.deleteScore(wordHash);
hashDate.deleteScore(wordHash);
return container;
}
public synchronized boolean remove(final byte[] wordHash, final String urlHash) {
final boolean removed = heap.remove(wordHash, urlHash);
if (removed) {
if (heap.has(wordHash)) {
hashScore.decScore(wordHash);
hashDate.setScore(wordHash, intTime(System.currentTimeMillis()));
} else {
hashScore.deleteScore(wordHash);
hashDate.deleteScore(wordHash);
}
return true;
}
return false;
}
public synchronized int remove(final byte[] wordHash, final Set<String> urlHashes) {
if (urlHashes.size() == 0) return 0;
final int c = heap.remove(wordHash, urlHashes);
if (c > 0) {
// removal successful
if (heap.has(wordHash)) {
hashScore.addScore(wordHash, -c);
hashDate.setScore(wordHash, intTime(System.currentTimeMillis()));
} else {
hashScore.deleteScore(wordHash);
hashDate.deleteScore(wordHash);
}
return c;
}
return 0;
}
public synchronized void add(final ReferenceContainer<ReferenceType> container) {
if (container == null || container.size() == 0 || heap == null) return;
// put new words into cache
heap.add(container);
hashScore.setScore(container.getTermHash(), heap.count(container.getTermHash()));
hashDate.setScore(container.getTermHash(), intTime(System.currentTimeMillis()));
}
public void add(final byte[] wordHash, final ReferenceType entry) throws IOException {
if (entry == null || heap == null) return;
// put new words into cache
heap.add(wordHash, entry);
hashScore.incScore(wordHash);
hashDate.setScore(wordHash, intTime(System.currentTimeMillis()));
}
public synchronized void close() {
heap.dump(this.dumpFile);
heap = null;
hashScore.clear();
hashDate.clear();
}
public Iterator<ReferenceContainer<ReferenceType>> iterator() {
return references(null, false);
}
public ByteOrder ordering() {
return heap.ordering();
}
public synchronized long getBufferSizeBytes() {
// calculate the real size in bytes of the index cache
long cacheBytes = 0;
final long entryBytes = WordReferenceRow.urlEntryRow.objectsize;
final Iterator<ReferenceContainer<ReferenceType>> it = references(null, false);
while (it.hasNext()) cacheBytes += it.next().size() * entryBytes;
return cacheBytes;
}
}

@@ -60,7 +60,7 @@ import de.anomic.kelondro.util.kelondroOutOfLimitsException;
 import de.anomic.kelondro.util.Log;
 import de.anomic.yacy.yacyURL;
-public class IndexCollection<ReferenceType extends Reference> extends AbstractIndex<ReferenceType> implements Index<ReferenceType> {
+public class IndexCollection<ReferenceType extends Reference> extends AbstractIndex<ReferenceType> {
     private static final int loadfactor = 4;
     private static final int serialNumber = 0;

@@ -107,7 +107,7 @@ public final class IndexCollectionMigration<ReferenceType extends Reference> ext
                 factory,
                 12,
                 Base64Order.enhancedCoder,
-                BufferedIndexCollection.maxCollectionPartition,
+                7,
                 WordReferenceRow.urlEntryRow,
                 false);
         if (this.collections.size() == 0) {

@@ -321,7 +321,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
         final boolean useCommons = getConfigBool("index.storeCommons", false);
         final int redundancy = (int) sb.getConfigLong("network.unit.dhtredundancy.senior", 1);
         final int paritionExponent = (int) sb.getConfigLong("network.unit.dht.partitionExponent", 0);
-        final boolean useCell = sb.getConfigBool("useCell", false);
         try {
             webIndex = new plasmaWordIndex(
                     networkName,
@@ -331,8 +330,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
                     wordCacheMaxCount,
                     useCommons,
                     redundancy,
-                    paritionExponent,
-                    useCell);
+                    paritionExponent);
         } catch (IOException e1) {
             e1.printStackTrace();
             webIndex = null;
@@ -800,7 +798,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
         final boolean useCommons = getConfigBool("index.storeCommons", false);
         final int redundancy = (int) sb.getConfigLong("network.unit.dhtredundancy.senior", 1);
         final int paritionExponent = (int) sb.getConfigLong("network.unit.dht.partitionExponent", 0);
-        final boolean useCell = sb.getConfigBool("useCell", false);
         try {
             this.webIndex = new plasmaWordIndex(
                     getConfig(plasmaSwitchboardConstants.NETWORK_NAME, ""),
@@ -810,8 +807,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
                     wordCacheMaxCount,
                     useCommons,
                     redundancy,
-                    paritionExponent,
-                    useCell);
+                    paritionExponent);
         } catch (IOException e) {
             e.printStackTrace();
             this.webIndex = null;

@@ -45,7 +45,6 @@ import de.anomic.kelondro.blob.BLOBArray;
 import de.anomic.kelondro.order.Base64Order;
 import de.anomic.kelondro.order.ByteOrder;
 import de.anomic.kelondro.text.BufferedIndex;
-import de.anomic.kelondro.text.BufferedIndexCollection;
 import de.anomic.kelondro.text.IndexCell;
 import de.anomic.kelondro.text.IndexCollectionMigration;
 import de.anomic.kelondro.text.ReferenceContainer;
@@ -123,8 +122,7 @@ public final class plasmaWordIndex {
             final int entityCacheMaxSize,
             final boolean useCommons,
             final int redundancy,
-            final int partitionExponent,
-            final boolean useCell) throws IOException {
+            final int partitionExponent) throws IOException {
         log.logInfo("Initializing Word Index for the network '" + networkName + "', word hash cache size is " + Word.hashCacheSize + ".");
@@ -154,10 +152,9 @@
         // check if the peer has migrated the index
         if (new File(indexPrimaryTextLocation, "RICOLLECTION").exists()) {
-            this.merger = (useCell) ? new IODispatcher<WordReference>(plasmaWordIndex.wordReferenceFactory, 1, 1) : null;
+            this.merger = new IODispatcher<WordReference>(plasmaWordIndex.wordReferenceFactory, 1, 1);
             if (this.merger != null) this.merger.start();
-            this.index = (useCell) ?
-                    new IndexCollectionMigration<WordReference>(
+            this.index = new IndexCollectionMigration<WordReference>(
                     indexPrimaryTextLocation,
                     wordReferenceFactory,
                     wordOrder,
@@ -166,16 +163,6 @@
                     targetFileSize,
                     maxFileSize,
                     this.merger,
-                    log)
-                    :
-                    new BufferedIndexCollection<WordReference>(
-                            indexPrimaryTextLocation,
-                            wordReferenceFactory,
-                            wordOrder,
-                            WordReferenceRow.urlEntryRow,
-                            entityCacheMaxSize,
-                            useCommons,
-                            redundancy,
-                            log);
+                    log);
         } else {
             this.merger = new IODispatcher<WordReference>(plasmaWordIndex.wordReferenceFactory, 1, 1);

@@ -328,6 +328,7 @@ public class Dispatcher {
      * This method returns true if a container was dequeued, false if not
      */
     public boolean dequeueContainer() {
+        if (transmissionCloud == null) return false;
         if (this.indexingTransmissionProcessor.queueSize() > indexingTransmissionProcessor.concurrency()) return false;
         byte[] maxtarget = null;
         int maxsize = -1;

@@ -675,7 +675,7 @@ public final class yacy {
         final int cacheMem = (int)(MemoryControl.maxMemory - MemoryControl.total());
         if (cacheMem < 2048000) throw new OutOfMemoryError("Not enough memory available to start clean up.");
-        final plasmaWordIndex wordIndex = new plasmaWordIndex(networkName, log, indexPrimaryRoot, indexSecondaryRoot, 10000, false, 1, 0, false);
+        final plasmaWordIndex wordIndex = new plasmaWordIndex(networkName, log, indexPrimaryRoot, indexSecondaryRoot, 10000, false, 1, 0);
         final Iterator<ReferenceContainer<WordReference>> indexContainerIterator = wordIndex.index().references("AAAAAAAAAAAA".getBytes(), false, false);
         long urlCounter = 0, wordCounter = 0;
@@ -866,7 +866,7 @@ public final class yacy {
         try {
             Iterator<ReferenceContainer<WordReference>> indexContainerIterator = null;
             if (resource.equals("all")) {
-                WordIndex = new plasmaWordIndex("freeworld", log, indexPrimaryRoot, indexSecondaryRoot, 10000, false, 1, 0, false);
+                WordIndex = new plasmaWordIndex("freeworld", log, indexPrimaryRoot, indexSecondaryRoot, 10000, false, 1, 0);
                 indexContainerIterator = WordIndex.index().references(wordChunkStartHash.getBytes(), false, false);
             }
             int counter = 0;
