You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
430 lines
16 KiB
430 lines
16 KiB
// ReferenceContainerArray.java
|
|
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
|
|
// first published 04.01.2009 on http://yacy.net
|
|
//
|
|
// $LastChangedDate$
|
|
// $LastChangedRevision$
|
|
// $LastChangedBy$
|
|
//
|
|
// LICENSE
|
|
//
|
|
// This program is free software; you can redistribute it and/or modify
|
|
// it under the terms of the GNU General Public License as published by
|
|
// the Free Software Foundation; either version 2 of the License, or
|
|
// (at your option) any later version.
|
|
//
|
|
// This program is distributed in the hope that it will be useful,
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
// GNU General Public License for more details.
|
|
//
|
|
// You should have received a copy of the GNU General Public License
|
|
// along with this program; if not, write to the Free Software
|
|
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
|
|
package net.yacy.kelondro.rwi;
|
|
|
|
import java.io.File;
|
|
import java.io.IOException;
|
|
import java.util.Date;
|
|
import java.util.Iterator;
|
|
|
|
import net.yacy.cora.order.ByteOrder;
|
|
import net.yacy.cora.order.CloneableIterator;
|
|
import net.yacy.cora.sorting.Rating;
|
|
import net.yacy.cora.util.ConcurrentLog;
|
|
import net.yacy.cora.util.SpaceExceededException;
|
|
import net.yacy.kelondro.blob.ArrayStack;
|
|
import net.yacy.kelondro.blob.BLOB;
|
|
import net.yacy.kelondro.data.word.Word;
|
|
import net.yacy.kelondro.index.Row;
|
|
import net.yacy.kelondro.index.RowSet;
|
|
|
|
|
|
public final class ReferenceContainerArray<ReferenceType extends Reference> {
|
|
|
|
private final static long METHOD_MAXRUNTIME = 5000L;
|
|
|
|
private final ReferenceFactory<ReferenceType> factory;
|
|
private final ArrayStack array;
|
|
|
|
/**
|
|
* open a index container array based on BLOB dumps. The content of the BLOBs will not be read
|
|
* unless a .idx file exists. Only the .idx file is opened to get a fast read access to
|
|
* the BLOB. This class provides no write methods, because BLOB files should not be
|
|
* written in random access. To support deletion, a write access to the BLOB for deletion
|
|
* is still possible
|
|
* @param payloadrow the row definition for the BLOB data structure
|
|
* @param log
|
|
* @throws IOException
|
|
*/
|
|
public ReferenceContainerArray(
|
|
final File heapLocation,
|
|
final String prefix,
|
|
final ReferenceFactory<ReferenceType> factory,
|
|
final ByteOrder termOrder,
|
|
final int termSize) throws IOException {
|
|
this.factory = factory;
|
|
this.array = new ArrayStack(
|
|
heapLocation,
|
|
prefix,
|
|
termOrder,
|
|
termSize,
|
|
0,
|
|
true,
|
|
true);
|
|
}
|
|
|
|
public synchronized void close() {
|
|
this.array.close(true);
|
|
}
|
|
|
|
public void clear() throws IOException {
|
|
this.array.clear();
|
|
}
|
|
|
|
public int[] sizes() {
|
|
return (this.array == null) ? new int[0] : this.array.sizes();
|
|
}
|
|
|
|
public ByteOrder ordering() {
|
|
return this.array.ordering();
|
|
}
|
|
|
|
public File newContainerBLOBFile() {
|
|
return this.array.newBLOB(new Date());
|
|
}
|
|
|
|
public void mountBLOBFile(final File location) throws IOException {
|
|
this.array.mountBLOB(location, false);
|
|
}
|
|
|
|
public Row rowdef() {
|
|
return this.factory.getRow();
|
|
}
|
|
|
|
/**
|
|
* return an iterator object that creates top-level-clones of the indexContainers
|
|
* in the cache, so that manipulations of the iterated objects do not change
|
|
* objects in the cache.
|
|
* @throws IOException
|
|
*/
|
|
public CloneableIterator<ReferenceContainer<ReferenceType>> referenceContainerIterator(final byte[] startWordHash, final boolean rot, final boolean excludePrivate) {
|
|
try {
|
|
return new ReferenceContainerIterator(startWordHash, rot, excludePrivate);
|
|
} catch (final IOException e) {
|
|
ConcurrentLog.logException(e);
|
|
return null;
|
|
}
|
|
}
|
|
|
|
public class ReferenceContainerIterator implements CloneableIterator<ReferenceContainer<ReferenceType>>, Iterable<ReferenceContainer<ReferenceType>> {
|
|
|
|
// this class exists, because the wCache cannot be iterated with rotation
|
|
// and because every indexContainer Object that is iterated must be returned as top-level-clone
|
|
// so this class simulates wCache.tailMap(startWordHash).values().iterator()
|
|
// plus the mentioned features
|
|
|
|
private final boolean rot, excludePrivate;
|
|
protected CloneableIterator<byte[]> iterator;
|
|
|
|
public ReferenceContainerIterator(final byte[] startWordHash, final boolean rot, final boolean excludePrivate) throws IOException {
|
|
this.rot = rot;
|
|
this.excludePrivate = excludePrivate;
|
|
this.iterator = ReferenceContainerArray.this.array.keys(true, startWordHash);
|
|
// The collection's iterator will return the values in the order that their corresponding keys appear in the tree.
|
|
}
|
|
|
|
@Override
|
|
public ReferenceContainerIterator clone(final Object secondWordHash) {
|
|
try {
|
|
return new ReferenceContainerIterator((byte[]) secondWordHash, this.rot, this.excludePrivate);
|
|
} catch (final IOException e) {
|
|
ConcurrentLog.logException(e);
|
|
return null;
|
|
}
|
|
}
|
|
|
|
@Override
|
|
public boolean hasNext() {
|
|
if (this.iterator == null) return false;
|
|
if (this.rot) return true;
|
|
return this.iterator.hasNext();
|
|
}
|
|
|
|
@Override
|
|
public ReferenceContainer<ReferenceType> next() {
|
|
while (this.iterator.hasNext()) try {
|
|
byte[] b = this.iterator.next();
|
|
if (this.excludePrivate && Word.isPrivate(b)) continue;
|
|
return get(b);
|
|
} catch (final Throwable e) {
|
|
ConcurrentLog.logException(e);
|
|
return null;
|
|
}
|
|
// rotation iteration
|
|
if (!this.rot) {
|
|
return null;
|
|
}
|
|
try {
|
|
this.iterator = ReferenceContainerArray.this.array.keys(true, null);
|
|
while (this.iterator.hasNext()) {
|
|
byte[] b = this.iterator.next();
|
|
if (this.excludePrivate && Word.isPrivate(b)) continue;
|
|
return get(b);
|
|
}
|
|
return null;
|
|
} catch (final Throwable e) {
|
|
ConcurrentLog.logException(e);
|
|
return null;
|
|
}
|
|
}
|
|
|
|
@Override
|
|
public void remove() {
|
|
this.iterator.remove();
|
|
}
|
|
|
|
@Override
|
|
public Iterator<ReferenceContainer<ReferenceType>> iterator() {
|
|
return this;
|
|
}
|
|
|
|
@Override
|
|
public void close() {
|
|
this.iterator.close();
|
|
}
|
|
|
|
}
|
|
|
|
/**
|
|
* return an iterator object that counts the number of references in indexContainers
|
|
* the startWordHash may be null to iterate all from the beginning
|
|
* @throws IOException
|
|
*/
|
|
public CloneableIterator<Rating<byte[]>> referenceCountIterator(final byte[] startWordHash, final boolean rot, final boolean excludePrivate) {
|
|
try {
|
|
return new ReferenceCountIterator(startWordHash, rot, excludePrivate);
|
|
} catch (final IOException e) {
|
|
ConcurrentLog.logException(e);
|
|
return null;
|
|
}
|
|
}
|
|
|
|
public class ReferenceCountIterator implements CloneableIterator<Rating<byte[]>>, Iterable<Rating<byte[]>> {
|
|
|
|
private final boolean rot, excludePrivate;
|
|
private CloneableIterator<byte[]> iterator;
|
|
|
|
public ReferenceCountIterator(final byte[] startWordHash, final boolean rot, final boolean excludePrivate) throws IOException {
|
|
this.rot = rot;
|
|
this.excludePrivate = excludePrivate;
|
|
this.iterator = ReferenceContainerArray.this.array.keys(true, startWordHash);
|
|
// The collection's iterator will return the values in the order that their corresponding keys appear in the tree.
|
|
}
|
|
|
|
@Override
|
|
public ReferenceCountIterator clone(final Object secondWordHash) {
|
|
try {
|
|
return new ReferenceCountIterator((byte[]) secondWordHash, this.rot, this.excludePrivate);
|
|
} catch (final IOException e) {
|
|
ConcurrentLog.logException(e);
|
|
return null;
|
|
}
|
|
}
|
|
|
|
@Override
|
|
public boolean hasNext() {
|
|
if (this.iterator == null) return false;
|
|
if (this.rot) return true;
|
|
return this.iterator.hasNext();
|
|
}
|
|
|
|
@Override
|
|
public Rating<byte[]> next() {
|
|
byte[] reference;
|
|
while (this.iterator.hasNext()) try {
|
|
reference = this.iterator.next();
|
|
if (this.excludePrivate && Word.isPrivate(reference)) continue;
|
|
return new Rating<byte[]>(reference, count(reference));
|
|
} catch (final Throwable e) {
|
|
ConcurrentLog.logException(e);
|
|
return null;
|
|
}
|
|
// rotation iteration
|
|
if (!this.rot) {
|
|
return null;
|
|
}
|
|
while (this.iterator.hasNext()) try {
|
|
this.iterator = ReferenceContainerArray.this.array.keys(true, null);
|
|
reference = this.iterator.next();
|
|
if (this.excludePrivate && Word.isPrivate(reference)) continue;
|
|
return new Rating<byte[]>(reference, count(reference));
|
|
} catch (final Throwable e) {
|
|
ConcurrentLog.logException(e);
|
|
return null;
|
|
}
|
|
return null;
|
|
}
|
|
|
|
@Override
|
|
public void remove() {
|
|
this.iterator.remove();
|
|
}
|
|
|
|
@Override
|
|
public Iterator<Rating<byte[]>> iterator() {
|
|
return this;
|
|
}
|
|
|
|
@Override
|
|
public void close() {
|
|
this.iterator.close();
|
|
}
|
|
|
|
}
|
|
|
|
/**
|
|
* test if a given key is in the heap
|
|
* this works with heaps in write- and read-mode
|
|
* @param key
|
|
* @return true, if the key is used in the heap; false otherwise
|
|
* @throws IOException
|
|
*/
|
|
public boolean has(final byte[] termHash) {
|
|
return this.array.containsKey(termHash);
|
|
}
|
|
|
|
/**
|
|
* get a indexContainer from a heap
|
|
* @param key
|
|
* @return the indexContainer if one exist, null otherwise
|
|
* @throws IOException
|
|
* @throws SpaceExceededException
|
|
*/
|
|
public ReferenceContainer<ReferenceType> get(final byte[] termHash) throws IOException, SpaceExceededException {
|
|
final long timeout = System.currentTimeMillis() + METHOD_MAXRUNTIME;
|
|
final Iterator<byte[]> entries = this.array.getAll(termHash).iterator();
|
|
if (entries == null || !entries.hasNext()) return null;
|
|
final byte[] a = entries.next();
|
|
int k = 1;
|
|
ReferenceContainer<ReferenceType> c = new ReferenceContainer<ReferenceType>(this.factory, termHash, RowSet.importRowSet(a, this.factory.getRow()));
|
|
if (System.currentTimeMillis() > timeout) {
|
|
ConcurrentLog.warn("ReferenceContainerArray", "timout in get() (1): " + k + " tables searched. timeout = " + METHOD_MAXRUNTIME);
|
|
return c;
|
|
}
|
|
while (entries.hasNext()) {
|
|
c = c.merge(new ReferenceContainer<ReferenceType>(this.factory, termHash, RowSet.importRowSet(entries.next(), this.factory.getRow())));
|
|
k++;
|
|
if (System.currentTimeMillis() > timeout) {
|
|
ConcurrentLog.warn("ReferenceContainerArray", "timout in get() (2): " + k + " tables searched. timeout = " + METHOD_MAXRUNTIME);
|
|
return c;
|
|
}
|
|
}
|
|
return c;
|
|
}
|
|
|
|
public int count(final byte[] termHash) throws IOException {
|
|
final long timeout = System.currentTimeMillis() + METHOD_MAXRUNTIME;
|
|
final Iterator<Long> entries = this.array.lengthAll(termHash).iterator();
|
|
if (entries == null || !entries.hasNext()) return 0;
|
|
final Long a = entries.next();
|
|
int k = 1;
|
|
int c = RowSet.importRowCount(a, this.factory.getRow());
|
|
assert c >= 0;
|
|
if (System.currentTimeMillis() > timeout) {
|
|
ConcurrentLog.warn("ReferenceContainerArray", "timout in count() (1): " + k + " tables searched. timeout = " + METHOD_MAXRUNTIME);
|
|
return c;
|
|
}
|
|
while (entries.hasNext()) {
|
|
c += RowSet.importRowCount(entries.next(), this.factory.getRow());
|
|
assert c >= 0;
|
|
k++;
|
|
if (System.currentTimeMillis() > timeout) {
|
|
ConcurrentLog.warn("ReferenceContainerArray", "timout in count() (2): " + k + " tables searched. timeout = " + METHOD_MAXRUNTIME);
|
|
return c;
|
|
}
|
|
}
|
|
assert c >= 0;
|
|
return c;
|
|
}
|
|
|
|
/**
|
|
* delete a indexContainer from the heap cache. This can only be used for write-enabled heaps
|
|
* @param wordHash
|
|
* @return the indexContainer if the cache contained the container, null otherwise
|
|
* @throws IOException
|
|
*/
|
|
public void delete(final byte[] termHash) throws IOException {
|
|
// returns the index that had been deleted
|
|
this.array.delete(termHash);
|
|
}
|
|
|
|
public int reduce(final byte[] termHash, final ContainerReducer<ReferenceType> reducer) throws IOException, SpaceExceededException {
|
|
return this.array.reduce(termHash, new BLOBReducer(termHash, reducer));
|
|
}
|
|
|
|
public class BLOBReducer implements BLOB.Reducer {
|
|
|
|
ContainerReducer<ReferenceType> rewriter;
|
|
byte[] wordHash;
|
|
|
|
public BLOBReducer(final byte[] wordHash, final ContainerReducer<ReferenceType> rewriter) {
|
|
this.rewriter = rewriter;
|
|
this.wordHash = wordHash;
|
|
}
|
|
|
|
@Override
|
|
public byte[] rewrite(final byte[] b) throws SpaceExceededException {
|
|
if (b == null) return null;
|
|
final ReferenceContainer<ReferenceType> c = this.rewriter.reduce(new ReferenceContainer<ReferenceType>(ReferenceContainerArray.this.factory, this.wordHash, RowSet.importRowSet(b, ReferenceContainerArray.this.factory.getRow())));
|
|
if (c == null) return null;
|
|
final byte bb[] = c.exportCollection();
|
|
assert bb.length <= b.length;
|
|
return bb;
|
|
}
|
|
}
|
|
|
|
public interface ContainerReducer<ReferenceType extends Reference> {
|
|
|
|
public ReferenceContainer<ReferenceType> reduce(ReferenceContainer<ReferenceType> container);
|
|
|
|
}
|
|
|
|
public int entries() {
|
|
return this.array.entries();
|
|
}
|
|
|
|
public boolean shrinkBestSmallFiles(final IODispatcher merger, final long targetFileSize) {
|
|
final File[] ff = this.array.unmountBestMatch(2.0f, targetFileSize);
|
|
if (ff == null) return false;
|
|
ConcurrentLog.info("RICELL-shrink1", "unmountBestMatch(2.0, " + targetFileSize + ")");
|
|
merger.merge(ff[0], ff[1], this.factory, this.array, newContainerBLOBFile());
|
|
return true;
|
|
}
|
|
|
|
public boolean shrinkAnySmallFiles(final IODispatcher merger, final long targetFileSize) {
|
|
final File[] ff = this.array.unmountSmallest(targetFileSize);
|
|
if (ff == null) return false;
|
|
ConcurrentLog.info("RICELL-shrink2", "unmountSmallest(" + targetFileSize + ")");
|
|
merger.merge(ff[0], ff[1], this.factory, this.array, newContainerBLOBFile());
|
|
return true;
|
|
}
|
|
|
|
public boolean shrinkUpToMaxSizeFiles(final IODispatcher merger, final long maxFileSize) {
|
|
final File[] ff = this.array.unmountBestMatch(2.0f, maxFileSize);
|
|
if (ff == null) return false;
|
|
ConcurrentLog.info("RICELL-shrink3", "unmountBestMatch(2.0, " + maxFileSize + ")");
|
|
merger.merge(ff[0], ff[1], this.factory, this.array, newContainerBLOBFile());
|
|
return true;
|
|
}
|
|
|
|
public boolean shrinkOldFiles(final IODispatcher merger) {
|
|
final File ff = this.array.unmountOldest();
|
|
if (ff == null) return false;
|
|
ConcurrentLog.info("RICELL-shrink4/rewrite", "unmountOldest()");
|
|
merger.merge(ff, null, this.factory, this.array, newContainerBLOBFile());
|
|
return true;
|
|
}
|
|
}
|