You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
539 lines
20 KiB
539 lines
20 KiB
// MapDataMining.java
|
|
// -----------------------
|
|
// (C) 29.01.2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
|
|
// first published 2004 as part of kelondroMap on http://www.anomic.de
|
|
//
|
|
// This is a part of YaCy, a peer-to-peer based web search engine
|
|
//
|
|
// $LastChangedDate$
|
|
// $LastChangedRevision$
|
|
// $LastChangedBy$
|
|
//
|
|
// LICENSE
|
|
//
|
|
// This program is free software; you can redistribute it and/or modify
|
|
// it under the terms of the GNU General Public License as published by
|
|
// the Free Software Foundation; either version 2 of the License, or
|
|
// (at your option) any later version.
|
|
//
|
|
// This program is distributed in the hope that it will be useful,
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
// GNU General Public License for more details.
|
|
//
|
|
// You should have received a copy of the GNU General Public License
|
|
// along with this program; if not, write to the Free Software
|
|
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
|
|
package net.yacy.kelondro.blob;
|
|
|
|
import java.io.File;
|
|
import java.io.IOException;
|
|
import java.lang.reflect.Array;
|
|
import java.time.DateTimeException;
|
|
import java.time.LocalDateTime;
|
|
import java.time.ZoneOffset;
|
|
import java.time.ZonedDateTime;
|
|
import java.util.Collection;
|
|
import java.util.HashMap;
|
|
import java.util.Iterator;
|
|
import java.util.Map;
|
|
import java.util.concurrent.ConcurrentHashMap;
|
|
|
|
import net.yacy.cora.date.GenericFormatter;
|
|
import net.yacy.cora.document.encoding.UTF8;
|
|
import net.yacy.cora.order.Base64Order;
|
|
import net.yacy.cora.order.ByteOrder;
|
|
import net.yacy.cora.order.CloneableIterator;
|
|
import net.yacy.cora.sorting.ConcurrentScoreMap;
|
|
import net.yacy.cora.sorting.ScoreMap;
|
|
import net.yacy.cora.util.ConcurrentLog;
|
|
import net.yacy.cora.util.SpaceExceededException;
|
|
import net.yacy.kelondro.data.word.Word;
|
|
|
|
|
|
public class MapDataMining extends MapHeap {
|
|
|
|
private final static Long LONG0 = Long.valueOf(0);
|
|
private final static Float FLOAT0 = Float.valueOf(0.0f);
|
|
|
|
private final String[] sortfields, longaccfields, floataccfields;
|
|
private Map<String, ScoreMap<String>> sortClusterMap; // a String-kelondroMScoreCluster - relation
|
|
private Map<String, Long> accLong; // to store accumulations of Long cells
|
|
private Map<String, Float> accFloat; // to store accumulations of Float cells
|
|
private final MapColumnIndex columnIndex; // to store fast select-where indexes
|
|
|
|
/**
 * Opens the underlying map heap and rebuilds the in-memory sort clusters and
 * accumulators with a single pass over all stored entries.
 *
 * <p>The pass stops early when an entry cannot be loaded because of a
 * {@link SpaceExceededException}, when an entry resolves to {@code null}, or when
 * an {@link IOException} occurs while iterating; in the last case the exception is
 * logged and the values collected so far are kept.</p>
 *
 * @param heapFile       forwarded unchanged to the {@code MapHeap} constructor
 * @param keylength      forwarded unchanged to the {@code MapHeap} constructor
 * @param ordering       forwarded unchanged to the {@code MapHeap} constructor
 * @param buffermax      forwarded unchanged to the {@code MapHeap} constructor
 * @param cachesize      forwarded unchanged to the {@code MapHeap} constructor
 * @param sortfields     field names to keep a sort cluster for, or {@code null} for none
 * @param longaccfields  field names to sum up as long, or {@code null} for none
 * @param floataccfields field names to sum up as float, or {@code null} for none
 * @throws IOException if the super constructor fails
 */
@SuppressWarnings("unchecked")
public MapDataMining(final File heapFile,
        final int keylength,
        final ByteOrder ordering,
        final int buffermax,
        final int cachesize,
        final String[] sortfields,
        final String[] longaccfields,
        final String[] floataccfields) throws IOException {
    super(heapFile, keylength, ordering, buffermax, cachesize, ' ');

    // create fast ordering clusters and acc fields
    this.sortfields = sortfields;
    this.longaccfields = longaccfields;
    this.floataccfields = floataccfields;

    this.columnIndex = new MapColumnIndex();

    // one score cluster per sort field, collected in an array first and published at the end
    ScoreMap<String>[] cluster = null;
    if (sortfields == null) {
        this.sortClusterMap = null;
    } else {
        this.sortClusterMap = new ConcurrentHashMap<String, ScoreMap<String>>();
        cluster = (ScoreMap<String>[]) Array.newInstance(ScoreMap.class, sortfields.length);
        for (int i = 0; i < sortfields.length; i++) {
            cluster[i] = new ConcurrentScoreMap<String>();
        }
    }

    // primitive running sums (start at zero); boxed only once when published
    long[] longaccumulator = null;
    if (longaccfields == null) {
        this.accLong = null;
    } else {
        this.accLong = new ConcurrentHashMap<String, Long>();
        longaccumulator = new long[longaccfields.length];
    }

    float[] floataccumulator = null;
    if (floataccfields == null) {
        this.accFloat = null;
    } else {
        this.accFloat = new ConcurrentHashMap<String, Float>();
        floataccumulator = new float[floataccfields.length];
    }

    // fill cluster and accumulator with values
    if (cluster != null || longaccumulator != null || floataccumulator != null) {
        try {
            final CloneableIterator<byte[]> it = super.keys(true, false);
            while (it.hasNext()) {
                final byte[] mapnameb = it.next();
                Map<String, String> map;
                try {
                    map = super.get(mapnameb);
                } catch (final SpaceExceededException e) {
                    ConcurrentLog.warn("MapDataMining", e.getMessage());
                    break;
                }
                if (map == null) break;

                if (cluster != null) {
                    for (int i = 0; i < sortfields.length; i++) {
                        final String cell = map.get(sortfields[i]);
                        if (cell != null) cluster[i].set(UTF8.String(mapnameb), object2score(cell));
                    }
                }

                if (longaccumulator != null) {
                    for (int i = 0; i < longaccfields.length; i++) {
                        final String cell = map.get(longaccfields[i]);
                        if (cell == null) continue;
                        try {
                            longaccumulator[i] += Long.parseLong(cell);
                        } catch (final NumberFormatException ignored) {
                            // best effort: cells that are not a long do not contribute to the sum
                        }
                    }
                }

                if (floataccumulator != null) {
                    for (int i = 0; i < floataccfields.length; i++) {
                        final String cell = map.get(floataccfields[i]);
                        if (cell == null) continue;
                        try {
                            floataccumulator[i] += Float.parseFloat(cell);
                        } catch (final NumberFormatException ignored) {
                            // best effort: cells that are not a float do not contribute to the sum
                        }
                    }
                }
            }
        } catch (final IOException e) {
            // keep what was collected so far, but do not hide the failure
            ConcurrentLog.logException(e);
        }
    }

    // fill cluster
    if (cluster != null) {
        for (int i = 0; i < sortfields.length; i++) {
            this.sortClusterMap.put(sortfields[i], cluster[i]);
        }
    }

    // fill acc map
    if (longaccumulator != null) {
        for (int i = 0; i < longaccfields.length; i++) {
            this.accLong.put(longaccfields[i], Long.valueOf(longaccumulator[i]));
        }
    }
    if (floataccumulator != null) {
        for (int i = 0; i < floataccfields.length; i++) {
            this.accFloat.put(floataccfields[i], Float.valueOf(floataccumulator[i]));
        }
    }
}
|
|
|
|
@Override
|
|
public synchronized void clear() {
|
|
super.clear();
|
|
if (this.sortfields == null) this.sortClusterMap = null; else {
|
|
this.sortClusterMap = new HashMap<String, ScoreMap<String>>();
|
|
for (final String sortfield : this.sortfields) {
|
|
this.sortClusterMap.put(sortfield, new ConcurrentScoreMap<String>());
|
|
}
|
|
}
|
|
|
|
if (this.longaccfields == null) {
|
|
this.accLong = null;
|
|
} else {
|
|
this.accLong = new HashMap<String, Long>();
|
|
for (final String longaccfield : this.longaccfields) {
|
|
this.accLong.put(longaccfield, LONG0);
|
|
}
|
|
}
|
|
if (this.floataccfields == null) {
|
|
this.accFloat = null;
|
|
} else {
|
|
this.accFloat = new HashMap<String, Float>();
|
|
for (final String floataccfield : this.floataccfields) {
|
|
this.accFloat.put(floataccfield, FLOAT0);
|
|
}
|
|
}
|
|
|
|
this.columnIndex.clear();
|
|
}
|
|
|
|
@Override
|
|
public synchronized void insert(final byte[] key, final Map<String, String> newMap) throws IOException, SpaceExceededException {
|
|
assert (key != null);
|
|
assert (key.length > 0);
|
|
assert (newMap != null);
|
|
|
|
// update elementCount
|
|
if ((this.longaccfields != null) || (this.floataccfields != null)) {
|
|
final Map<String, String> oldMap = super.get(key, false);
|
|
if (oldMap != null) {
|
|
// element exists, update acc
|
|
updateAcc(oldMap, false);
|
|
}
|
|
|
|
// update accumulators with new values (add)
|
|
updateAcc(newMap, true);
|
|
}
|
|
|
|
super.insert(key, newMap);
|
|
|
|
// update sortCluster
|
|
if (this.sortClusterMap != null) updateSortCluster(UTF8.String(key), newMap);
|
|
|
|
this.columnIndex.update(key, newMap);
|
|
}
|
|
|
|
private void updateAcc(final Map<String, String> map, final boolean add) {
|
|
String value;
|
|
long valuel;
|
|
float valued;
|
|
Long longaccumulator;
|
|
Float floataccumulator;
|
|
if (this.longaccfields != null) {
|
|
for (final String longaccfield : this.longaccfields) {
|
|
value = map.get(longaccfield);
|
|
if (value != null) {
|
|
try {
|
|
valuel = Long.parseLong(value);
|
|
longaccumulator = this.accLong.get(longaccfield);
|
|
if (add) {
|
|
this.accLong.put(longaccfield, Long.valueOf(longaccumulator.longValue() + valuel));
|
|
} else {
|
|
this.accLong.put(longaccfield, Long.valueOf(longaccumulator.longValue() - valuel));
|
|
}
|
|
} catch (final NumberFormatException e) {}
|
|
}
|
|
}
|
|
}
|
|
if (this.floataccfields != null) {
|
|
for (final String floataccfield : this.floataccfields) {
|
|
value = map.get(floataccfield);
|
|
if (value != null) {
|
|
try {
|
|
valued = Float.parseFloat(value);
|
|
floataccumulator = this.accFloat.get(floataccfield);
|
|
if (add) {
|
|
this.accFloat.put(floataccfield, Float.valueOf(floataccumulator.floatValue() + valued));
|
|
} else {
|
|
this.accFloat.put(floataccfield, Float.valueOf(floataccumulator.floatValue() - valued));
|
|
}
|
|
} catch (final NumberFormatException e) {}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
private void updateSortCluster(final String key, final Map<String, String> map) {
|
|
Object cell;
|
|
ScoreMap<String> cluster;
|
|
for (final String sortfield : this.sortfields) {
|
|
cell = map.get(sortfield);
|
|
if (cell != null) {
|
|
cluster = this.sortClusterMap.get(sortfield);
|
|
cluster.set(key, object2score(cell));
|
|
this.sortClusterMap.put(sortfield, cluster);
|
|
}
|
|
}
|
|
}
|
|
|
|
@Override
|
|
public synchronized void delete(final byte[] key) throws IOException {
|
|
if (key == null) return;
|
|
|
|
// update elementCount
|
|
if (this.sortfields != null || this.longaccfields != null || this.floataccfields != null) {
|
|
Map<String, String> map;
|
|
try {
|
|
map = super.get(key, false);
|
|
if (map != null) {
|
|
|
|
// update accumulators (subtract)
|
|
if (this.longaccfields != null || this.floataccfields != null) updateAcc(map, false);
|
|
|
|
// remove from sortCluster
|
|
if (this.sortfields != null) deleteSortCluster(UTF8.String(key));
|
|
}
|
|
} catch (final SpaceExceededException e) {
|
|
map = null;
|
|
ConcurrentLog.logException(e);
|
|
}
|
|
}
|
|
super.delete(key);
|
|
|
|
this.columnIndex.delete(key);
|
|
}
|
|
|
|
private void deleteSortCluster(final String key) {
|
|
if (key == null) return;
|
|
ScoreMap<String> cluster;
|
|
for (final String sortfield : this.sortfields) {
|
|
cluster = this.sortClusterMap.get(sortfield);
|
|
cluster.delete(key);
|
|
this.sortClusterMap.put(sortfield, cluster);
|
|
}
|
|
}
|
|
|
|
private synchronized Iterator<byte[]> keys(final boolean up, /* sorted by */ final String field) {
|
|
// sorted iteration using the sortClusters
|
|
if (this.sortClusterMap == null) return null;
|
|
final ScoreMap<String> cluster = this.sortClusterMap.get(field);
|
|
if (cluster == null) return null; // sort field does not exist
|
|
//System.out.println("DEBUG: cluster for field " + field + ": " + cluster.toString());
|
|
return new string2bytearrayIterator(cluster.keys(up));
|
|
}
|
|
|
|
private synchronized Iterator<byte[]> keys() throws IOException {
|
|
return super.keys(true, null);
|
|
}
|
|
|
|
private static class string2bytearrayIterator implements Iterator<byte[]> {
|
|
|
|
private final Iterator<String> s;
|
|
|
|
private string2bytearrayIterator(final Iterator<String> s) {
|
|
this.s = s;
|
|
}
|
|
|
|
@Override
|
|
public boolean hasNext() {
|
|
return this.s.hasNext();
|
|
}
|
|
|
|
@Override
|
|
public byte[] next() {
|
|
final String r = this.s.next();
|
|
if (r == null) return null;
|
|
return UTF8.getBytes(r);
|
|
}
|
|
|
|
@Override
|
|
public void remove() {
|
|
this.s.remove();
|
|
}
|
|
|
|
}
|
|
|
|
public synchronized Collection<byte[]> select(final String whereKey, final String isValue) throws IOException {
|
|
Collection<byte[]> idx = null;
|
|
try {
|
|
idx = this.columnIndex.getIndex(whereKey, isValue);
|
|
} catch (final UnsupportedOperationException e) {
|
|
this.columnIndex.init(whereKey, isValue, new FullMapIterator(keys()));
|
|
try {
|
|
idx = this.columnIndex.getIndex(whereKey, isValue);
|
|
} catch (final UnsupportedOperationException ee) {
|
|
throw ee;
|
|
}
|
|
}
|
|
return idx;
|
|
}
|
|
|
|
public synchronized Iterator<Map.Entry<byte[], Map<String, String>>> entries(final boolean up, final String field) {
|
|
return new FullMapIterator(keys(up, field));
|
|
}
|
|
|
|
public synchronized long getLongAcc(final String field) {
|
|
final Long accumulator = this.accLong.get(field);
|
|
if (accumulator == null) return -1;
|
|
return accumulator.longValue();
|
|
}
|
|
|
|
public synchronized float getFloatAcc(final String field) {
|
|
final Float accumulator = this.accFloat.get(field);
|
|
if (accumulator == null) return -1;
|
|
return accumulator.floatValue();
|
|
}
|
|
|
|
@Override
|
|
public int size() {
|
|
return super.size();
|
|
}
|
|
|
|
@Override
|
|
public boolean isEmpty() {
|
|
return super.isEmpty();
|
|
}
|
|
|
|
@Override
|
|
public synchronized void close() {
|
|
// close cluster
|
|
if (this.sortClusterMap != null) {
|
|
for (final String sortfield : this.sortfields)
|
|
this.sortClusterMap.remove(sortfield);
|
|
this.sortClusterMap = null;
|
|
}
|
|
|
|
super.close();
|
|
}
|
|
|
|
/** Number of milliseconds in one minute. */
private static final long minutemillis = 60L * 1000L;

/** Epoch milliseconds of 2000-01-01T00:00:00 UTC; stays 0 if that instant cannot be computed. */
private static final long date2000;

static {
    long epochMillis = 0L;
    try {
        final ZonedDateTime startOf2000 = ZonedDateTime.of(2000, 1, 1, 0, 0, 0, 0, ZoneOffset.UTC);
        epochMillis = startOf2000.toInstant().toEpochMilli();
    } catch (final DateTimeException e) {}
    date2000 = epochMillis;
}

/**
 * Lookup table with 256 entries used by the lexical fallback of the string scoring:
 * indexes 0..31 map to themselves, 32..95 to 0..63, 96..127 to 32..63, and
 * 128..255 to {@code index & 0x20}.
 */
private static final byte[] plainByteArray = new byte[256];

static {
    for (int i = 0; i < plainByteArray.length; i++) {
        final int mapped;
        if (i < 32) {
            mapped = i;
        } else if (i < 96) {
            mapped = i - 32;
        } else if (i < 128) {
            mapped = i - 64;
        } else {
            mapped = i & 0X20;
        }
        plainByteArray[i] = (byte) mapped;
    }
}
|
|
|
|
private static int object2score(Object o) {
|
|
if (o instanceof Integer) return ((Integer) o).intValue();
|
|
if (o instanceof Long) {
|
|
final long l = ((Long) o).longValue();
|
|
if (l < Integer.MAX_VALUE) return (int) l;
|
|
return (int) (l & Integer.MAX_VALUE);
|
|
}
|
|
if (o instanceof Float) {
|
|
final double d = 1000f * ((Float) o).floatValue();
|
|
return (int) Math.round(d);
|
|
}
|
|
if (o instanceof Double) {
|
|
final double d = 1000d * ((Double) o).doubleValue();
|
|
return (int) Math.round(d);
|
|
}
|
|
String s = null;
|
|
if (o instanceof String) s = (String) o;
|
|
if (o instanceof byte[]) s = UTF8.String((byte[]) o);
|
|
|
|
// this can be used to calculate a score from a string
|
|
if (s == null || s.isEmpty() || s.charAt(0) == '-') return 0;
|
|
try {
|
|
long l = 0;
|
|
if (s.length() == GenericFormatter.PATTERN_SHORT_SECOND.length()) {
|
|
// try a date
|
|
l = ((LocalDateTime.parse(s, GenericFormatter.FORMAT_SHORT_SECOND).toInstant(ZoneOffset.UTC)
|
|
.toEpochMilli() - date2000) / minutemillis);
|
|
if (l < 0) l = 0;
|
|
} else {
|
|
// try a number
|
|
l = Long.parseLong(s);
|
|
}
|
|
// fix out-of-ranges
|
|
if (l > Integer.MAX_VALUE) return (int) (l & Integer.MAX_VALUE);
|
|
if (l < 0) {
|
|
System.out.println("string2score: negative score for input " + s);
|
|
return 0;
|
|
}
|
|
return (int) l;
|
|
} catch (final Throwable e) {
|
|
// try it lex
|
|
int len = s.length();
|
|
if (len > 5) len = 5;
|
|
int c = 0;
|
|
for (int i = 0; i < len; i++) {
|
|
c <<= 6;
|
|
c += plainByteArray[(byte) s.charAt(i)];
|
|
}
|
|
for (int i = len; i < 5; i++) c <<= 6;
|
|
if (c < 0) {
|
|
System.out.println("string2score: negative score for input " + s);
|
|
return 0;
|
|
}
|
|
return c;
|
|
}
|
|
}
|
|
|
|
/*
|
|
public byte[] lookupBy(
|
|
final String whereKey,
|
|
final String isValue
|
|
) {
|
|
|
|
|
|
}
|
|
*/
|
|
|
|
public static void main(final String[] args) {
|
|
try {
|
|
File f = new File("/tmp/MapDataMinig.test.db");
|
|
f.delete();
|
|
final MapDataMining db = new MapDataMining(f, Word.commonHashLength, Base64Order.enhancedCoder, 1024 * 512, 500, new String[] {"X"}, new String[] {"X"}, new String[] {});
|
|
final Map<String, String> m1 = new HashMap<String, String>();
|
|
long t = System.currentTimeMillis();
|
|
m1.put("X", Long.toString(t));
|
|
db.put("abcdefghijk1".getBytes(), m1);
|
|
final Map<String, String> m2 = new HashMap<String, String>();
|
|
m2.put("X", Long.toString(t - 1000));
|
|
db.put("abcdefghijk2".getBytes(), m2);
|
|
final Map<String, String> m3 = new HashMap<String, String>();
|
|
m3.put("X", Long.toString(t + 2000));
|
|
db.put("abcdefghijk3".getBytes(), m3);
|
|
|
|
// iterate the keys, sorted by field X in ascending order (must be: abcdefghijk2 - abcdefghijk1 - abcdefghijk3)
|
|
final Iterator<byte[]> i1 = db.keys(true, "X");
|
|
byte[] k;
|
|
while (i1.hasNext()) {
|
|
k = i1.next();
|
|
System.out.println(new String(k));
|
|
}
|
|
|
|
// iterate the maps, sorted by field X in descending order (must be: abcdefghijk3 - abcdefghijk1 - abcdefghijk2)
|
|
final Iterator<Map.Entry<byte[], Map<String, String>>> i2 = db.entries(false, "X");
|
|
Map.Entry<byte[], Map<String, String>> e;
|
|
while (i2.hasNext()) {
|
|
e = i2.next();
|
|
System.out.println(UTF8.String(e.getKey()) + ":" + e.getValue());
|
|
}
|
|
|
|
System.exit(0);
|
|
} catch (final IOException e) {
|
|
e.printStackTrace();
|
|
}
|
|
|
|
}
|
|
}
|