- removed useCell option: the indexCell data structure is now the default index structure; old collection data is still migrated
- added some debugging output to balancer to find a bug
- removed unused classes for index collection handling
- changed some default values for the process handling: more memory needed to prevent OOM

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5856 6c8d7289-2bf4-0310-a012-ef5d649a1542

pull/1/head
parent 1b9e532c87
commit 138422990a
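The first deleted file below contains the migration pattern the commit message refers to: every container from the retired store is copied into the surviving one, and the old dump file is then removed. A minimal standalone sketch of that pattern, using hypothetical Store and Container types in place of the YaCy classes:

import java.io.File;
import java.io.IOException;

// Hypothetical stand-ins for IndexBuffer / ReferenceContainer.
interface Container {}
interface Store extends Iterable<Container> {
    void add(Container c) throws IOException;
    void close();
}

final class Migration {
    // Copy every container from the old store into the new one, then remove the
    // old dump file, as the constructor below does with index.dhtin.blob.
    static void migrate(final Store oldStore, final Store newStore, final File oldDump) throws IOException {
        for (final Container c : oldStore) newStore.add(c);
        oldStore.close();
        if (!oldDump.delete()) oldDump.deleteOnExit();
    }
}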
@@ -1,344 +0,0 @@
// BufferedIndexCollection.java
// (C) 2005, 2006 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 2005 on http://www.anomic.de
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2009-03-13 11:34:51 +0100 (Fr, 13 Mrz 2009) $
// $LastChangedRevision: 5709 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

package de.anomic.kelondro.text;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Set;

import de.anomic.kelondro.index.Row;
import de.anomic.kelondro.index.RowCollection;
import de.anomic.kelondro.order.Base64Order;
import de.anomic.kelondro.order.ByteOrder;
import de.anomic.kelondro.order.CloneableIterator;
import de.anomic.kelondro.order.MergeIterator;
import de.anomic.kelondro.order.Order;
import de.anomic.kelondro.order.RotateIterator;
import de.anomic.kelondro.text.Index;
import de.anomic.kelondro.text.IndexBuffer;
import de.anomic.kelondro.text.IndexCollection;
import de.anomic.kelondro.text.ReferenceContainer;
import de.anomic.kelondro.text.ReferenceContainerOrder;
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
import de.anomic.kelondro.util.FileUtils;
import de.anomic.kelondro.util.MemoryControl;
import de.anomic.kelondro.util.Log;
import de.anomic.server.serverProfiling;

public final class BufferedIndexCollection<ReferenceType extends Reference> extends AbstractBufferedIndex<ReferenceType> implements Index<ReferenceType>, BufferedIndex<ReferenceType> {

    // environment constants
    public static final long wCacheMaxAge = 1000 * 60 * 30; // milliseconds; 30 minutes
    public static final int wCacheMaxChunk = 800; // maximum number of references for each urlhash
    public static final int lowcachedivisor = 900;
    public static final int maxCollectionPartition = 7; // should be 7

    private final IndexBuffer<ReferenceType> buffer;
    private final IndexCollection<ReferenceType> collections;

    public BufferedIndexCollection (
            File indexPrimaryTextLocation,
            final ReferenceFactory<ReferenceType> factory,
            final ByteOrder wordOrdering,
            final Row payloadrow,
            final int entityCacheMaxSize,
            final boolean useCommons,
            final int redundancy,
            Log log) throws IOException {
        super(factory);

        final File textindexcache = new File(indexPrimaryTextLocation, "RICACHE");
        if (!(textindexcache.exists())) textindexcache.mkdirs();
        if (new File(textindexcache, "index.dhtin.blob").exists()) {
            // migration of the both caches into one
            this.buffer = new IndexBuffer<ReferenceType>(textindexcache, factory, wordOrdering, payloadrow, entityCacheMaxSize, wCacheMaxChunk, wCacheMaxAge, "index.dhtout.blob", log);
            IndexBuffer<ReferenceType> dhtInCache = new IndexBuffer<ReferenceType>(textindexcache, factory, wordOrdering, payloadrow, entityCacheMaxSize, wCacheMaxChunk, wCacheMaxAge, "index.dhtin.blob", log);
            for (ReferenceContainer<ReferenceType> c: dhtInCache) {
                this.buffer.add(c);
            }
            FileUtils.deletedelete(new File(textindexcache, "index.dhtin.blob"));
        } else {
            // read in new BLOB
            this.buffer = new IndexBuffer<ReferenceType>(textindexcache, factory, wordOrdering, payloadrow, entityCacheMaxSize, wCacheMaxChunk, wCacheMaxAge, "index.dhtout.blob", log);
        }

        // create collections storage path
        final File textindexcollections = new File(indexPrimaryTextLocation, "RICOLLECTION");
        if (!(textindexcollections.exists())) textindexcollections.mkdirs();
        this.collections = new IndexCollection<ReferenceType>(
                textindexcollections,
                "collection",
                factory,
                12,
                Base64Order.enhancedCoder,
                maxCollectionPartition,
                WordReferenceRow.urlEntryRow,
                useCommons);
    }

    /* methods for interface Index */

    public void add(final ReferenceContainer<ReferenceType> entries) {
        assert (entries.row().objectsize == WordReferenceRow.urlEntryRow.objectsize);

        // add the entry
        buffer.add(entries);
        cacheFlushControl();
    }

    public void add(final byte[] wordHash, final ReferenceType entry) throws IOException {
        // add the entry
        buffer.add(wordHash, entry);
        cacheFlushControl();
    }

    public boolean has(final byte[] wordHash) {
        if (buffer.has(wordHash)) return true;
        if (collections.has(wordHash)) return true;
        return false;
    }

    public int count(byte[] key) {
        return buffer.count(key) + collections.count(key);
    }

    public ReferenceContainer<ReferenceType> get(final byte[] wordHash, final Set<String> urlselection) {
        if (wordHash == null) {
            // wrong input
            return null;
        }

        // get from cache
        ReferenceContainer<ReferenceType> container;
        container = buffer.get(wordHash, urlselection);

        // get from collection index
        if (container == null) {
            container = collections.get(wordHash, urlselection);
        } else {
            container.addAllUnique(collections.get(wordHash, urlselection));
        }

        if (container == null) return null;

        // check doubles
        final int beforeDouble = container.size();
        container.sort();
        final ArrayList<RowCollection> d = container.removeDoubles();
        RowCollection set;
        for (int i = 0; i < d.size(); i++) {
            // for each element in the double-set, take that one that is the most recent one
            set = d.get(i);
            WordReferenceRow e, elm = null;
            long lm = 0;
            for (int j = 0; j < set.size(); j++) {
                e = new WordReferenceRow(set.get(j, true));
                if ((elm == null) || (e.lastModified() > lm)) {
                    elm = e;
                    lm = e.lastModified();
                }
            }
            if (elm != null) {
                container.addUnique(elm.toKelondroEntry());
            }
        }
        if (container.size() < beforeDouble) System.out.println("*** DEBUG DOUBLECHECK - removed " + (beforeDouble - container.size()) + " index entries from word container " + container.getTermHashAsString());

        return container;
    }

    public ReferenceContainer<ReferenceType> delete(final byte[] wordHash) {
        final ReferenceContainer<ReferenceType> c = new ReferenceContainer<ReferenceType>(
                factory,
                wordHash,
                WordReferenceRow.urlEntryRow,
                buffer.count(wordHash));
        c.addAllUnique(buffer.delete(wordHash));
        c.addAllUnique(collections.delete(wordHash));
        return c;
    }

    public boolean remove(final byte[] wordHash, final String urlHash) {
        boolean removed = false;
        removed = removed | (buffer.remove(wordHash, urlHash));
        removed = removed | (collections.remove(wordHash, urlHash));
        return removed;
    }

    public int remove(final byte[] wordHash, final Set<String> urlHashes) {
        int removed = 0;
        removed += buffer.remove(wordHash, urlHashes);
        removed += collections.remove(wordHash, urlHashes);
        return removed;
    }

    public synchronized CloneableIterator<ReferenceContainer<ReferenceType>> references(final byte[] startHash, final boolean rot, final boolean ram) throws IOException {
        final CloneableIterator<ReferenceContainer<ReferenceType>> i = wordContainers(startHash, ram);
        if (rot) {
            return new RotateIterator<ReferenceContainer<ReferenceType>>(i, Base64Order.zero(startHash.length), buffer.size() + ((ram) ? 0 : collections.size()));
        }
        return i;
    }

    private synchronized CloneableIterator<ReferenceContainer<ReferenceType>> wordContainers(final byte[] startWordHash, final boolean ram) throws IOException {
        final Order<ReferenceContainer<ReferenceType>> containerOrder = new ReferenceContainerOrder<ReferenceType>(factory, buffer.ordering().clone());
        ReferenceContainer<ReferenceType> emptyContainer = ReferenceContainer.emptyContainer(factory, startWordHash, 0);
        containerOrder.rotate(emptyContainer);
        if (ram) {
            return buffer.references(startWordHash, false);
        }
        return collections.references(startWordHash, false);
        /*
        return new MergeIterator<ReferenceContainer>(
                indexCache.referenceIterator(startWordHash, false, true),
                collections.referenceIterator(startWordHash, false, false),
                containerOrder,
                ReferenceContainer.containerMergeMethod,
                true);
        */
    }

    public void clear() {
        buffer.clear();
        try {
            collections.clear();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    public void close() {
        buffer.close();
        collections.close();
    }

    public int size() {
        return java.lang.Math.max(collections.size(), buffer.size());
    }

    public int minMem() {
        return 1024*1024 /* indexing overhead */ + buffer.minMem() + collections.minMem();
    }

    /*
     * methods for cache management
     */

    public int getBufferMaxReferences() {
        return buffer.getBufferMaxReferences();
    }

    public long getBufferMinAge() {
        return buffer.getBufferMinAge();
    }

    public long getBufferMaxAge() {
        return buffer.getBufferMaxAge();
    }

    public long getBufferSizeBytes() {
        return buffer.getBufferSizeBytes();
    }

    public void setBufferMaxWordCount(final int maxWords) {
        buffer.setMaxWordCount(maxWords);
    }

    private void cacheFlushControl() {
        // check for forced flush
        int cs = getBufferSize();
        if (cs > 0) {
            // flush elements that are too big. This flushing depends on the fact that the flush rule
            // selects the biggest elements first for flushing. If it does not for any reason, the following
            // loop would not terminate.
            serverProfiling.update("wordcache", Long.valueOf(cs), true);
            // To ensure termination an additional counter is used
            int l = 0;
            while (this.buffer.size() > 0 && (l++ < 100) && (this.buffer.getBufferMaxReferences() > wCacheMaxChunk)) {
                flushCacheOne(this.buffer);
            }
            // next flush more entries if the size exceeds the maximum size of the cache
            while (this.buffer.size() > 0 &&
                   ((this.buffer.size() > this.buffer.getMaxWordCount()) ||
                    (MemoryControl.available() < collections.minMem()))) {
                flushCacheOne(this.buffer);
            }
            if (getBufferSize() != cs) serverProfiling.update("wordcache", Long.valueOf(getBufferSize()), true);
        }
    }

    public void cleanupBuffer(int time) {
        flushCacheUntil(System.currentTimeMillis() + time);
    }

    private synchronized void flushCacheUntil(long timeout) {
        while (System.currentTimeMillis() < timeout && buffer.size() > 0) {
            flushCacheOne(buffer);
        }
    }

    private synchronized void flushCacheOne(final IndexBuffer<ReferenceType> ram) {
        if (ram.size() > 0) collections.add(flushContainer(ram));
    }

    private ReferenceContainer<ReferenceType> flushContainer(final IndexBuffer<ReferenceType> ram) {
        byte[] wordHash;
        ReferenceContainer<ReferenceType> c;
        wordHash = ram.maxScoreWordHash();
        c = ram.get(wordHash, null);
        if ((c != null) && (c.size() > wCacheMaxChunk)) {
            return ram.delete(wordHash);
        } else {
            return ram.delete(ram.bestFlushWordHash());
        }
    }

    public int getBackendSize() {
        return collections.size();
    }

    public int getBufferSize() {
        return buffer.size();
    }

    public ByteOrder ordering() {
        return collections.ordering();
    }

    public CloneableIterator<ReferenceContainer<ReferenceType>> references(byte[] startWordHash, boolean rot) {
        final Order<ReferenceContainer<ReferenceType>> containerOrder = new ReferenceContainerOrder<ReferenceType>(factory, this.buffer.ordering().clone());
        return new MergeIterator<ReferenceContainer<ReferenceType>>(
                this.buffer.references(startWordHash, false),
                this.collections.references(startWordHash, false),
                containerOrder,
                ReferenceContainer.containerMergeMethod,
                true);
    }

}
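The get() method deleted above merges hits from the RAM buffer with hits from the collection backend and resolves duplicate URL entries by keeping the most recently modified reference. A compact standalone sketch of that merge-and-dedupe rule, with a hypothetical Entry type standing in for WordReferenceRow:

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public final class MergeDedupe {

    // Hypothetical stand-in for WordReferenceRow: a URL hash plus its last-modified time.
    public static final class Entry {
        final String urlHash;
        final long lastModified;
        Entry(final String urlHash, final long lastModified) {
            this.urlHash = urlHash;
            this.lastModified = lastModified;
        }
    }

    // Merge hits from the RAM buffer and the collection backend; when the same
    // urlHash occurs in both, keep the most recently modified entry, mirroring
    // the "take that one that is the most recent one" rule in get() above.
    public static List<Entry> merge(final List<Entry> buffer, final List<Entry> backend) {
        final Map<String, Entry> byUrl = new HashMap<String, Entry>();
        for (final List<Entry> source : List.of(buffer, backend)) {
            for (final Entry e : source) {
                final Entry old = byUrl.get(e.urlHash);
                if (old == null || e.lastModified > old.lastModified) byUrl.put(e.urlHash, e);
            }
        }
        return new ArrayList<Entry>(byUrl.values());
    }
}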
@@ -1,348 +0,0 @@
// IndexCache.java
// (C) 2005, 2006 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 2005 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

package de.anomic.kelondro.text;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Set;

import de.anomic.kelondro.index.Row;
import de.anomic.kelondro.order.Base64Order;
import de.anomic.kelondro.order.ByteOrder;
import de.anomic.kelondro.order.CloneableIterator;
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
import de.anomic.kelondro.util.MemoryControl;
import de.anomic.kelondro.util.ScoreCluster;
import de.anomic.kelondro.util.Log;

/**
 * A IndexCache is a ReferenceContainerCache with an attached cache flush logic
 *
 */
public final class IndexBuffer<ReferenceType extends Reference> extends AbstractIndex<ReferenceType> implements Index<ReferenceType>, IndexReader<ReferenceType>, Iterable<ReferenceContainer<ReferenceType>> {

    // class variables
    private final ScoreCluster<byte[]> hashScore;
    private final ScoreCluster<byte[]> hashDate;
    private long initTime;
    private int cacheEntityMaxCount; // the maximum number of cache slots for RWI entries
    public int cacheReferenceCountLimit; // the maximum number of references to a single RWI entity
    public long cacheReferenceAgeLimit; // the maximum age (= time not changed) of a RWI entity
    private final Log log;
    private final File dumpFile;
    private ReferenceContainerCache<ReferenceType> heap;

    @SuppressWarnings("unchecked")
    public IndexBuffer(
            final File databaseRoot,
            final ReferenceFactory<ReferenceType> factory,
            final ByteOrder wordOrdering,
            final Row payloadrow,
            final int entityCacheMaxSize,
            final int wCacheReferenceCountLimitInit,
            final long wCacheReferenceAgeLimitInit,
            final String newHeapName,
            final Log log) {
        super(factory);

        // creates a new index cache
        // the cache has a back-end where indexes that do not fit in the cache are flushed
        this.hashScore = new ScoreCluster<byte[]>(Base64Order.enhancedCoder);
        this.hashDate = new ScoreCluster<byte[]>(Base64Order.enhancedCoder);
        this.initTime = System.currentTimeMillis();
        this.cacheEntityMaxCount = entityCacheMaxSize;
        this.cacheReferenceCountLimit = wCacheReferenceCountLimitInit;
        this.cacheReferenceAgeLimit = wCacheReferenceAgeLimitInit;
        this.log = log;
        this.dumpFile = new File(databaseRoot, newHeapName);
        this.heap = new ReferenceContainerCache(factory, payloadrow, wordOrdering);

        // read in dump of last session
        boolean initFailed = false;
        if (dumpFile.exists()) try {
            heap.initWriteModeFromBLOB(dumpFile);
        } catch (IOException e) {
            initFailed = true;
            e.printStackTrace();
        }
        if (initFailed) {
            log.logSevere("unable to restore cache dump");
            // get empty dump
            heap.initWriteMode();
        } else if (dumpFile.exists()) {
            // initialize scores for cache organization
            for (final ReferenceContainer ic : (Iterable<ReferenceContainer>) heap.references(null, false)) {
                this.hashDate.setScore(ic.getTermHash(), intTime(ic.lastWrote()));
                this.hashScore.setScore(ic.getTermHash(), ic.size());
            }
        } else {
            heap.initWriteMode();
        }
    }

    /**
     * clear the content
     * @throws IOException
     */
    public void clear() {
        hashScore.clear();
        hashDate.clear();
        initTime = System.currentTimeMillis();
        heap.clear();
    }

    public int minMem() {
        // there is no specific large array that needs to be maintained
        // this value is just a guess of the possible overhead
        return 100 * 1024; // 100 kb
    }

    // cache settings
    public int getBufferMaxReferences() {
        if (hashScore.size() == 0) return 0;
        return hashScore.getMaxScore();
    }

    public long getBufferMinAge() {
        if (hashDate.size() == 0) return 0;
        return System.currentTimeMillis() - longEmit(hashDate.getMaxScore());
    }

    public long getBufferMaxAge() {
        if (hashDate.size() == 0) return 0;
        return System.currentTimeMillis() - longEmit(hashDate.getMinScore());
    }

    public void setMaxWordCount(final int maxWords) {
        this.cacheEntityMaxCount = maxWords;
    }

    public int getMaxWordCount() {
        return this.cacheEntityMaxCount;
    }

    public int size() {
        if (heap == null) return 0;
        return heap.size();
    }

    public synchronized CloneableIterator<ReferenceContainer<ReferenceType>> references(final byte[] startWordHash, final boolean rot) {
        // we return an iterator object that creates top-level-clones of the indexContainers
        // in the cache, so that manipulations of the iterated objects do not change
        // objects in the cache.
        return heap.references(startWordHash, rot);
    }

    public synchronized byte[] maxScoreWordHash() {
        if (heap == null || heap.size() == 0) return null;
        try {
            return hashScore.getMaxObject();
        } catch (final Exception e) {
            log.logSevere("flushFromMem: " + e.getMessage(), e);
        }
        return null;
    }

    public byte[] bestFlushWordHash() {
        // select appropriate hash
        // we have 2 different methods to find a good hash:
        // - the oldest entry in the cache
        // - the entry with maximum count
        if (heap == null || heap.size() == 0) return null;
        try {
            //return hashScore.getMaxObject();
            byte[] hash = null;
            final int count = hashScore.getMaxScore();
            if ((count >= cacheReferenceCountLimit) &&
                ((hash = hashScore.getMaxObject()) != null)) {
                // we MUST flush high-score entries, because a loop deletes entries in cache until this condition fails
                // in this cache we MUST NOT check wCacheMinAge
                return hash;
            }
            final long oldestTime = longEmit(hashDate.getMinScore());
            if (((System.currentTimeMillis() - oldestTime) > cacheReferenceAgeLimit) &&
                ((hash = hashDate.getMinObject()) != null)) {
                // flush out-dated entries
                return hash;
            }
            // cases with respect to memory situation
            if (MemoryControl.free() < 100000) {
                // urgent low-memory case
                hash = hashScore.getMaxObject(); // flush high-score entries (saves RAM)
            } else {
                // not-efficient-so-far case. cleans up unnecessary cache slots
                hash = hashDate.getMinObject(); // flush oldest entries
            }
            if (hash == null) {
                final ReferenceContainer<ReferenceType> ic = heap.references(null, false).next();
                if (ic != null) hash = ic.getTermHash();
            }
            return hash;

        } catch (final Exception e) {
            log.logSevere("flushFromMem: " + e.getMessage(), e);
        }
        return null;
    }

    public synchronized ArrayList<ReferenceContainer<ReferenceType>> bestFlushContainers(final int count) {
        final ArrayList<ReferenceContainer<ReferenceType>> containerList = new ArrayList<ReferenceContainer<ReferenceType>>();
        byte[] hash;
        ReferenceContainer<ReferenceType> container;
        for (int i = 0; i < count; i++) {
            hash = bestFlushWordHash();
            if (hash == null) return containerList;
            container = heap.delete(hash);
            assert (container != null);
            if (container == null) return containerList;
            hashScore.deleteScore(hash);
            hashDate.deleteScore(hash);
            containerList.add(container);
        }
        return containerList;
    }

    private int intTime(final long longTime) {
        return (int) Math.max(0, ((longTime - initTime) / 1000));
    }

    private long longEmit(final int intTime) {
        return (((long) intTime) * (long) 1000) + initTime;
    }

    public boolean has(final byte[] wordHash) {
        return heap.has(wordHash);
    }

    public int count(byte[] key) {
        return this.heap.count(key);
    }

    public synchronized ReferenceContainer<ReferenceType> get(final byte[] wordHash, final Set<String> urlselection) {
        if (wordHash == null) return null;

        // retrieve container
        ReferenceContainer<ReferenceType> container = heap.get(wordHash, null);

        // We must not use the container from cache to store everything we find,
        // as that container remains linked to in the cache and might be changed later
        // while the returned container is still in use.
        // create a clone from the container
        if (container != null) container = container.topLevelClone();

        // select the urlselection
        if ((urlselection != null) && (container != null)) container.select(urlselection);

        return container;
    }

    public synchronized ReferenceContainer<ReferenceType> delete(final byte[] wordHash) {
        // returns the index that had been deleted
        if (wordHash == null || heap == null) return null;
        final ReferenceContainer<ReferenceType> container = heap.delete(wordHash);
        hashScore.deleteScore(wordHash);
        hashDate.deleteScore(wordHash);
        return container;
    }

    public synchronized boolean remove(final byte[] wordHash, final String urlHash) {
        final boolean removed = heap.remove(wordHash, urlHash);
        if (removed) {
            if (heap.has(wordHash)) {
                hashScore.decScore(wordHash);
                hashDate.setScore(wordHash, intTime(System.currentTimeMillis()));
            } else {
                hashScore.deleteScore(wordHash);
                hashDate.deleteScore(wordHash);
            }
            return true;
        }
        return false;
    }

    public synchronized int remove(final byte[] wordHash, final Set<String> urlHashes) {
        if (urlHashes.size() == 0) return 0;
        final int c = heap.remove(wordHash, urlHashes);
        if (c > 0) {
            // removal successful
            if (heap.has(wordHash)) {
                hashScore.addScore(wordHash, -c);
                hashDate.setScore(wordHash, intTime(System.currentTimeMillis()));
            } else {
                hashScore.deleteScore(wordHash);
                hashDate.deleteScore(wordHash);
            }
            return c;
        }
        return 0;
    }

    public synchronized void add(final ReferenceContainer<ReferenceType> container) {
        if (container == null || container.size() == 0 || heap == null) return;

        // put new words into cache
        heap.add(container);
        hashScore.setScore(container.getTermHash(), heap.count(container.getTermHash()));
        hashDate.setScore(container.getTermHash(), intTime(System.currentTimeMillis()));
    }

    public void add(final byte[] wordHash, final ReferenceType entry) throws IOException {
        if (entry == null || heap == null) return;

        // put new words into cache
        heap.add(wordHash, entry);
        hashScore.incScore(wordHash);
        hashDate.setScore(wordHash, intTime(System.currentTimeMillis()));
    }

    public synchronized void close() {
        heap.dump(this.dumpFile);
        heap = null;
        hashScore.clear();
        hashDate.clear();
    }

    public Iterator<ReferenceContainer<ReferenceType>> iterator() {
        return references(null, false);
    }

    public ByteOrder ordering() {
        return heap.ordering();
    }

    public synchronized long getBufferSizeBytes() {
        // calculate the real size in bytes of the index cache
        long cacheBytes = 0;
        final long entryBytes = WordReferenceRow.urlEntryRow.objectsize;
        final Iterator<ReferenceContainer<ReferenceType>> it = references(null, false);
        while (it.hasNext()) cacheBytes += it.next().size() * entryBytes;
        return cacheBytes;
    }

}
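bestFlushWordHash() above encodes the buffer's flush priority: over-full entries go first, then entries older than the age limit, then either the biggest entry (under memory pressure, since it frees the most RAM) or the oldest one. A self-contained sketch of that selection order, with hypothetical map-based bookkeeping standing in for the two ScoreCluster fields:

import java.util.Map;

public final class FlushPolicy {

    static final int  referenceCountLimit = 800;              // cf. wCacheMaxChunk / cacheReferenceCountLimit
    static final long referenceAgeLimitMs = 1000L * 60 * 30;  // cf. wCacheMaxAge / cacheReferenceAgeLimit
    static final long lowMemoryBytes = 100000L;               // cf. the MemoryControl.free() threshold

    // counts: word hash -> reference count; lastWrite: word hash -> last write time (ms).
    // Both maps are assumed to hold the same key set, like hashScore and hashDate above.
    static String pickFlushCandidate(final Map<String, Integer> counts,
                                     final Map<String, Long> lastWrite,
                                     final long freeMemory) {
        if (counts.isEmpty()) return null;
        final String biggest = counts.entrySet().stream()
                .max(Map.Entry.comparingByValue()).get().getKey();
        // 1. an over-full entry must always be flushed first
        if (counts.get(biggest) >= referenceCountLimit) return biggest;
        // 2. then anything that has not been written to for too long
        final Map.Entry<String, Long> oldest = lastWrite.entrySet().stream()
                .min(Map.Entry.comparingByValue()).get();
        if (System.currentTimeMillis() - oldest.getValue() > referenceAgeLimitMs) return oldest.getKey();
        // 3. otherwise choose by memory pressure: the biggest entry frees the
        //    most RAM, the oldest entry cleans up a stale cache slot
        return (freeMemory < lowMemoryBytes) ? biggest : oldest.getKey();
    }
}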