refactoring of plasmaWordIndex: fewer methods in the class, separated the index into CachedIndexCollection
git-svn-id: 6c8d7289-2bf4-0310-a012-ef5d649a1542pull/1/head
@ -0,0 +1,449 @@
// (C) 2005, 2006 by Michael Peter Christen;, Frankfurt a. M., Germany
// first published 2005 on
// This is a part of YaCy, a peer-to-peer based web search engine
// $LastChangedDate: 2009-03-13 11:34:51 +0100 (Fr, 13 Mrz 2009) $
// $LastChangedRevision: 5709 $
// $LastChangedBy: orbiter $
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.kelondro.text;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Set;
import java.util.TreeSet;
import de.anomic.kelondro.index.RowCollection;
import de.anomic.kelondro.order.Base64Order;
import de.anomic.kelondro.order.ByteOrder;
import de.anomic.kelondro.order.CloneableIterator;
import de.anomic.kelondro.order.Order;
import de.anomic.kelondro.order.RotateIterator;
import de.anomic.kelondro.text.Index;
import de.anomic.kelondro.text.IndexCache;
import de.anomic.kelondro.text.IndexCollection;
import de.anomic.kelondro.text.ReferenceContainer;
import de.anomic.kelondro.text.ReferenceContainerOrder;
import de.anomic.kelondro.text.ReferenceRow;
import de.anomic.kelondro.text.Word;
import de.anomic.kelondro.util.MemoryControl;
import de.anomic.kelondro.util.Log;
import de.anomic.server.serverProfiling;
/**
 * Word index that answers the {@link Index} interface from two stores:
 * an {@link IndexCache} and an {@link IndexCollection}.
 * New entries are added to the cache (see addEntry); flushCacheOne moves
 * containers from a cache into the collections.
 */
public final class CachedIndexCollection implements Index {
// environment constants
// passed to the IndexCache constructor together with wCacheMaxChunk
public static final long wCacheMaxAge = 1000 * 60 * 30; // milliseconds; 30 minutes
public static final int wCacheMaxChunk = 800; // maximum number of references for each urlhash
// NOTE(review): not referenced in the visible part of this file
public static final int lowcachedivisor = 900;
public static final int maxCollectionPartition = 7; // should be 7
// base order cloned to build the ReferenceContainerOrder in indexContainerSet() and wordContainers()
private static final ByteOrder indexOrder = Base64Order.enhancedCoder;
// first store: receives new entries
private final IndexCache indexCache;
private final IndexCollection collections; // new database structure to replace AssortmentCluster and FileCluster
/**
 * Opens the cache below the directory {@code RICACHE} and prepares the
 * collection storage below {@code RICOLLECTION}; both directories are
 * created inside {@code indexPrimaryTextLocation} when they do not exist.
 * When a legacy file {@code index.dhtin.blob} is present it is opened as a
 * second cache, iterated, and the file is deleted.
 *
 * NOTE(review): this copy of the constructor is incomplete. The body of the
 * migration loop, the closing braces and the arguments of the
 * IndexCollection constructor are missing and must be restored from
 * version control.
 *
 * @param indexPrimaryTextLocation parent directory of RICACHE and RICOLLECTION
 * @param entityCacheMaxSize handed to the IndexCache constructor
 * @param useCommons not used in the visible lines; presumably an IndexCollection argument (verify)
 * @param redundancy not used in the visible lines; presumably an IndexCollection argument (verify)
 * @param log handed to the IndexCache constructor
 * @throws IOException declared by the signature; the throwing call is not identifiable from the visible lines
 */
public CachedIndexCollection(
File indexPrimaryTextLocation,
final int entityCacheMaxSize,
final boolean useCommons,
final int redundancy,
Log log) throws IOException {
final File textindexcache = new File(indexPrimaryTextLocation, "RICACHE");
if (!(textindexcache.exists())) textindexcache.mkdirs();
if (new File(textindexcache, "index.dhtin.blob").exists()) {
// migrate both caches into one
this.indexCache = new IndexCache(textindexcache, ReferenceRow.urlEntryRow, entityCacheMaxSize, wCacheMaxChunk, wCacheMaxAge, "index.dhtout.blob", log);
IndexCache dhtInCache = new IndexCache(textindexcache, ReferenceRow.urlEntryRow, entityCacheMaxSize, wCacheMaxChunk, wCacheMaxAge, "index.dhtin.blob", log);
// NOTE(review): the loop body and its closing brace are missing here; presumably each
// container c is copied into this.indexCache before the old file is deleted
for (ReferenceContainer c: dhtInCache) {
new File(textindexcache, "index.dhtin.blob").delete();
} else {
// read in new BLOB
this.indexCache = new IndexCache(textindexcache, ReferenceRow.urlEntryRow, entityCacheMaxSize, wCacheMaxChunk, wCacheMaxAge, "index.dhtout.blob", log);
// create collections storage path
final File textindexcollections = new File(indexPrimaryTextLocation, "RICOLLECTION");
if (!(textindexcollections.exists())) textindexcollections.mkdirs();
// NOTE(review): the argument list of this constructor call is missing
this.collections = new IndexCollection(
/**
 * Presumably empties the index (name only; verify).
 * NOTE(review): the statements of the try block and of the IOException
 * handler are missing from this copy.
 */
public void clear() {
try {
} catch (IOException e) {
public int minMem() {
return 1024*1024 /* indexing overhead */ + indexCache.minMem() + collections.minMem();
public int maxURLinCache() {
return indexCache.maxURLinCache();
public long minAgeOfCache() {
return indexCache.minAgeOfCache();
public long maxAgeOfCache() {
return indexCache.maxAgeOfCache();
public int indexCacheSize() {
return indexCache.size();
public long indexCacheSizeBytes() {
// calculate the real size in bytes of the index cache
long cacheBytes = 0;
final long entryBytes = ReferenceRow.urlEntryRow.objectsize;
final IndexCache cache = (indexCache);
synchronized (cache) {
final Iterator<ReferenceContainer> it = cache.referenceIterator(null, false, true);
while (it.hasNext()) cacheBytes += * entryBytes;
return cacheBytes;
// NOTE(review): the body of this method is missing from this copy; presumably it forwards
// maxWords to the cache, whose getMaxWordCount() is read by cacheFlushControl (verify)
public void setMaxWordCount(final int maxWords) {
/**
 * Decides whether entries of the given cache have to be flushed.
 * Nothing happens when cacheSize() is zero. Otherwise the "wordcache"
 * profile is updated, then two loops run: the first while the cache holds an
 * entry with more than wCacheMaxChunk URLs (at most 100 rounds), the second
 * while the cache exceeds its maximum word count or available memory is
 * below collections.minMem(). The profile is updated again when the cache
 * size changed.
 *
 * NOTE(review): the bodies of both loops and all closing braces are missing
 * from this copy; presumably each round flushes one container (verify).
 *
 * @param theCache the cache whose size is checked in the loop conditions
 */
public void cacheFlushControl(final IndexCache theCache) {
// check for forced flush
int cs = cacheSize();
if (cs > 0) {
// flush elements that are too big. This flushing depends on the fact that the flush rule
// selects the biggest elements first for flushing. If it does not for any reason, the following
// loop would not terminate.
serverProfiling.update("wordcache", Long.valueOf(cs), true);
// To ensure termination an additional counter is used
int l = 0;
while (theCache.size() > 0 && (l++ < 100) && (theCache.maxURLinCache() > wCacheMaxChunk)) {
// next flush more entries if the size exceeds the maximum size of the cache
while (theCache.size() > 0 &&
((theCache.size() > theCache.getMaxWordCount()) ||
(MemoryControl.available() < collections.minMem()))) {
// report the new size only if something was flushed
if (cacheSize() != cs) serverProfiling.update("wordcache", Long.valueOf(cacheSize()), true);
public static ReferenceContainer emptyContainer(final String wordHash, final int elementCount) {
return new ReferenceContainer(wordHash, ReferenceRow.urlEntryRow, elementCount);
public void addEntry(final String wordHash, final ReferenceRow entry, final long updateTime) {
// add the entry
indexCache.addEntry(wordHash, entry, updateTime, true);
/**
 * Adds a whole container of references.
 * With assertions enabled, the row size of the container must equal the row
 * size of ReferenceRow.urlEntryRow.
 *
 * NOTE(review): the statement that performs the insertion is missing from
 * this copy.
 *
 * @param entries the references to add
 */
public void addReferences(final ReferenceContainer entries) {
assert (entries.row().objectsize == ReferenceRow.urlEntryRow.objectsize);
// add the entry
public void flushCacheFor(int time) {
flushCacheUntil(System.currentTimeMillis() + time);
/**
 * Loops while the given point in time is not reached and the cache is not empty.
 *
 * NOTE(review): the loop body is missing from this copy; presumably it
 * flushes one container per round (verify).
 *
 * @param timeout absolute time in milliseconds, compared with System.currentTimeMillis()
 */
private synchronized void flushCacheUntil(long timeout) {
while (System.currentTimeMillis() < timeout && indexCache.size() > 0) {
private synchronized void flushCacheOne(final IndexCache ram) {
if (ram.size() > 0) collections.addReferences(flushContainer(ram));
private ReferenceContainer flushContainer(final IndexCache ram) {
String wordHash;
ReferenceContainer c;
wordHash = ram.maxScoreWordHash();
c = ram.getReferences(wordHash, null);
if ((c != null) && (c.size() > wCacheMaxChunk)) {
return ram.deleteAllReferences(wordHash);
} else {
return ram.deleteAllReferences(ram.bestFlushWordHash());
public boolean hasReferences(final String wordHash) {
if (indexCache.hasReferences(wordHash)) return true;
if (collections.hasReferences(wordHash)) return true;
return false;
/**
 * Collects the references of one word from the cache and from the collections.
 * When the cache has a container, the references from the collections are
 * added to it with addAllUnique; otherwise the container from the
 * collections is used as it is. Afterwards doubles are removed from the
 * container and, for each group of doubles, the entry with the greatest
 * lastModified() value is selected.
 *
 * NOTE(review): the statement inside {@code if(elm != null)} and all closing
 * braces are missing from this copy; presumably the selected entry is put
 * back into the container (verify).
 *
 * @param wordHash the word hash to look up; null yields null
 * @param urlselection handed to both stores unchanged
 * @return the merged container, or null when wordHash is null or neither store has a container
 */
public ReferenceContainer getReferences(final String wordHash, final Set<String> urlselection) {
if (wordHash == null) {
// wrong input
return null;
// get from cache
ReferenceContainer container;
container = indexCache.getReferences(wordHash, urlselection);
// get from collection index
if (container == null) {
container = collections.getReferences(wordHash, urlselection);
} else {
container.addAllUnique(collections.getReferences(wordHash, urlselection));
if (container == null) return null;
// check doubles
// remember the size so that a shrinking container can be reported below
final int beforeDouble = container.size();
final ArrayList<RowCollection> d = container.removeDoubles();
RowCollection set;
for (int i = 0; i < d.size(); i++) {
// for each element in the double-set, take that one that is the most recent one
set = d.get(i);
// elm is the most recent entry seen so far, lm its lastModified() value
ReferenceRow e, elm = null;
long lm = 0;
for (int j = 0; j < set.size(); j++) {
e = new ReferenceRow(set.get(j, true));
if ((elm == null) || (e.lastModified() > lm)) {
elm = e;
lm = e.lastModified();
if(elm != null) {
if (container.size() < beforeDouble) System.out.println("*** DEBUG DOUBLECHECK - removed " + (beforeDouble - container.size()) + " index entries from word container " + container.getWordHash());
return container;
* return map of wordhash:indexContainer
* @param wordHashes
* @param urlselection
* @param deleteIfEmpty
* @param interruptIfEmpty
* @return
public HashMap<String, ReferenceContainer> getContainers(final Set<String> wordHashes, final Set<String> urlselection, final boolean interruptIfEmpty) {
// retrieve entities that belong to the hashes
final HashMap<String, ReferenceContainer> containers = new HashMap<String, ReferenceContainer>(wordHashes.size());
String singleHash;
ReferenceContainer singleContainer;
final Iterator<String> i = wordHashes.iterator();
while (i.hasNext()) {
// get next word hash:
singleHash =;
// retrieve index
singleContainer = getReferences(singleHash, urlselection);
// check result
if (((singleContainer == null) || (singleContainer.size() == 0)) && (interruptIfEmpty)) return new HashMap<String, ReferenceContainer>(0);
containers.put(singleHash, singleContainer);
return containers;
/**
 * Searches for two sets of hashes and returns their containers.
 * The inclusion map is emptied when it holds fewer entries than there are
 * query hashes, so that never only a subset is returned. The exclusion map
 * is only filled when the inclusion map is not empty.
 *
 * NOTE(review): the argument lists of both getContainers( calls are missing
 * from this copy.
 *
 * @param queryHashes hashes of the words that are searched
 * @param excludeHashes presumably the hashes looked up for the exclusion map (the call is truncated; verify)
 * @param urlselection presumably handed to getContainers (the call is truncated; verify)
 * @return array of two maps: index 0 the inclusion containers, index 1 the exclusion containers
 */
public HashMap<String, ReferenceContainer>[] localSearchContainers(
final TreeSet<String> queryHashes,
final TreeSet<String> excludeHashes,
final Set<String> urlselection) {
// search for the set of hashes and return a map of wordhash:indexContainer containing the search result
// retrieve entities that belong to the hashes
HashMap<String, ReferenceContainer> inclusionContainers = (queryHashes.size() == 0) ? new HashMap<String, ReferenceContainer>(0) : getContainers(
if ((inclusionContainers.size() != 0) && (inclusionContainers.size() < queryHashes.size())) inclusionContainers = new HashMap<String, ReferenceContainer>(0); // prevent that only a subset is returned
final HashMap<String, ReferenceContainer> exclusionContainers = (inclusionContainers.size() == 0) ? new HashMap<String, ReferenceContainer>(0) : getContainers(
return new HashMap[]{inclusionContainers, exclusionContainers};
public int size() {
return java.lang.Math.max(collections.size(), indexCache.size());
public int collectionsSize() {
return collections.size();
public int cacheSize() {
return indexCache.size();
// NOTE(review): the body of this method is missing from this copy; presumably it closes
// the cache and the collections (verify)
public void close() {
/**
 * Builds a new container and returns it.
 *
 * NOTE(review): the arguments of the ReferenceContainer constructor and any
 * statements between construction and return are missing from this copy;
 * presumably the references of wordHash are deleted from both stores and
 * collected in the returned container (verify).
 *
 * @param wordHash the word hash whose references are affected
 * @return the container created in this method
 */
public ReferenceContainer deleteAllReferences(final String wordHash) {
final ReferenceContainer c = new ReferenceContainer(
return c;
public boolean removeReference(final String wordHash, final String urlHash) {
boolean removed = false;
removed = removed | (indexCache.removeReference(wordHash, urlHash));
removed = removed | (collections.removeReference(wordHash, urlHash));
return removed;
public int removeEntryMultiple(final Set<String> wordHashes, final String urlHash) {
// remove the same url hashes for multiple words
// this is mainly used when correcting a index after a search
final Iterator<String> i = wordHashes.iterator();
int count = 0;
while (i.hasNext()) {
if (removeReference(, urlHash)) count++;
return count;
public int removeReferences(final String wordHash, final Set<String> urlHashes) {
int removed = 0;
removed += indexCache.removeReferences(wordHash, urlHashes);
removed += collections.removeReferences(wordHash, urlHashes);
return removed;
public String removeEntriesExpl(final String wordHash, final Set<String> urlHashes) {
String removed = "";
removed += indexCache.removeReferences(wordHash, urlHashes) + ", ";
removed += collections.removeReferences(wordHash, urlHashes);
return removed;
public void removeEntriesMultiple(final Set<String> wordHashes, final Set<String> urlHashes) {
// remove the same url hashes for multiple words
// this is mainly used when correcting a index after a search
final Iterator<String> i = wordHashes.iterator();
while (i.hasNext()) {
removeReferences(, urlHashes);
public int removeWordReferences(final Set<String> words, final String urlhash) {
// sequentially delete all word references
// returns number of deletions
final Iterator<String> iter = words.iterator();
int count = 0;
while (iter.hasNext()) {
// delete the URL reference in this word index
if (removeReference(Word.word2hash(, urlhash)) count++;
return count;
public synchronized TreeSet<ReferenceContainer> indexContainerSet(final String startHash, final boolean ram, final boolean rot, int count) {
// creates a set of indexContainers
// this does not use the cache
final Order<ReferenceContainer> containerOrder = new ReferenceContainerOrder(indexOrder.clone());
containerOrder.rotate(emptyContainer(startHash, 0));
final TreeSet<ReferenceContainer> containers = new TreeSet<ReferenceContainer>(containerOrder);
final Iterator<ReferenceContainer> i = referenceIterator(startHash, rot, ram);
if (ram) count = Math.min(indexCache.size(), count);
ReferenceContainer container;
// this loop does not terminate using the i.hasNex() predicate when rot == true
// because then the underlying iterator is a rotating iterator without termination
// in this case a termination must be ensured with a counter
// It must also be ensured that the counter is in/decreased every loop
while ((count > 0) && (i.hasNext())) {
container =;
if ((container != null) && (container.size() > 0)) {
count--; // decrease counter even if the container was null or empty to ensure termination
return containers; // this may return less containers as demanded
/**
 * Returns an iterator over the containers, starting at the given hash.
 * Without rotation the iterator of wordContainers is returned directly;
 * with rotation it is wrapped into a RotateIterator whose last argument is
 * the cache size, plus the collections size when ram is false.
 *
 * NOTE(review): the argument of {@code new String(} is missing from this
 * copy, so the line does not compile; restore it from version control.
 *
 * @param startHash hash at which the iteration starts
 * @param rot if true, a RotateIterator is returned
 * @param ram handed to wordContainers
 * @return the plain or the rotating iterator
 */
public synchronized CloneableIterator<ReferenceContainer> referenceIterator(final String startHash, final boolean rot, final boolean ram) {
final CloneableIterator<ReferenceContainer> i = wordContainers(startHash, ram);
if (rot) {
return new RotateIterator<ReferenceContainer>(i, new String(, indexCache.size() + ((ram) ? 0 : collections.size()));
return i;
/**
 * Provides the container iterator used by referenceIterator.
 * When ram is true, the iterator of the cache is returned. The remaining
 * lines return either the iterator of the collections or a MergeIterator
 * over both iterators.
 *
 * NOTE(review): the closing braces, the condition that separates the last
 * two return statements and the remaining MergeIterator arguments are
 * missing from this copy; containerOrder is presumably one of those
 * arguments (verify).
 *
 * @param startWordHash hash at which the iteration starts
 * @param ram if true, only the cache is iterated
 * @return iterator over the containers
 */
private synchronized CloneableIterator<ReferenceContainer> wordContainers(final String startWordHash, final boolean ram) {
final Order<ReferenceContainer> containerOrder = new ReferenceContainerOrder(indexOrder.clone());
containerOrder.rotate(emptyContainer(startWordHash, 0));
if (ram) {
return indexCache.referenceIterator(startWordHash, false, true);
return collections.referenceIterator(startWordHash, false, false);
return new MergeIterator<ReferenceContainer>(
indexCache.referenceIterator(startWordHash, false, true),
collections.referenceIterator(startWordHash, false, false),
public int countReferences(String key) {
return indexCache.countReferences(key) + collections.countReferences(key);
Reference in new issue