diff --git a/source/de/anomic/data/URLAnalysis.java b/source/de/anomic/data/URLAnalysis.java index 14f69ca3e..2edf1590f 100644 --- a/source/de/anomic/data/URLAnalysis.java +++ b/source/de/anomic/data/URLAnalysis.java @@ -53,12 +53,13 @@ import java.util.zip.GZIPOutputStream; import de.anomic.kelondro.index.HandleSet; import de.anomic.kelondro.index.IntegerHandleIndex; import de.anomic.kelondro.order.Base64Order; -import de.anomic.kelondro.text.IndexCollection; import de.anomic.kelondro.text.MetadataRepository; +import de.anomic.kelondro.text.ReferenceContainerArray; import de.anomic.kelondro.text.MetadataRepository.Export; import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow; import de.anomic.kelondro.text.referencePrototype.WordReferenceRow; import de.anomic.kelondro.util.MemoryControl; +import de.anomic.plasma.plasmaWordIndex; import de.anomic.yacy.yacyURL; public class URLAnalysis { @@ -393,12 +394,11 @@ public class URLAnalysis { System.out.println("finished"); } - public static void incollection(String collectionPath, String statisticPath) { + public static void incell(File cellPath, String statisticPath) { try { - IntegerHandleIndex idx = IndexCollection.referenceHashes( - new File(collectionPath), - "collection", - 12, + IntegerHandleIndex idx = ReferenceContainerArray.referenceHashes( + cellPath, + plasmaWordIndex.wordReferenceFactory, Base64Order.enhancedCoder, WordReferenceRow.urlEntryRow); System.out.println("COLLECTION INDEX REFERENCE COLLECTION starting dump of statistics"); @@ -475,11 +475,11 @@ public class URLAnalysis { } else if (args[0].equals("-sort") && args.length >= 2) { // generate file .x.sort with sorted lists and split the file in smaller pieces for (int i = 1; i < args.length; i++) sortsplit(args[i]); - } else if (args[0].equals("-incollection") && args.length >= 2) { - // generate a dump of all referenced URL hashes from a given RICOLLECTION + } else if (args[0].equals("-incell") && args.length >= 2) { + // generate a dump of all referenced URL hashes from a given RICELL // example: - // java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -incollection DATA/INDEX/freeworld/TEXT/RICOLLECTION used.dump - incollection(args[1], args[2]); + // java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -incell DATA/INDEX/freeworld/TEXT/RICELL used.dump + incell(new File(args[1]), args[2]); } else if (args[0].equals("-diffurlcol") && args.length >= 3) { // make a diff-file that contains hashes from the url database that do not occur in the collection reference dump // example: diff --git a/source/de/anomic/kelondro/text/IndexCell.java b/source/de/anomic/kelondro/text/IndexCell.java index 7d3c3f388..f091abcbd 100644 --- a/source/de/anomic/kelondro/text/IndexCell.java +++ b/source/de/anomic/kelondro/text/IndexCell.java @@ -346,5 +346,5 @@ public final class IndexCell extends AbstractBu this.maxRamEntries = maxWords; this.cleanCache(); } - + } diff --git a/source/de/anomic/kelondro/text/IndexCollection.java b/source/de/anomic/kelondro/text/IndexCollection.java deleted file mode 100644 index 5dbadc51c..000000000 --- a/source/de/anomic/kelondro/text/IndexCollection.java +++ /dev/null @@ -1,1046 +0,0 @@ -// iIndexCollection.java -// (C) 2006 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany -// first published 03.07.2006 on http://yacy.net -// -// This is a part of YaCy, a peer-to-peer based web search engine -// -// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ -// $LastChangedRevision: 1986 $ -// $LastChangedBy: orbiter $ -// -// LICENSE -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - -package de.anomic.kelondro.text; - -import java.io.File; -import java.io.IOException; -import java.text.SimpleDateFormat; -import java.util.ArrayList; -import java.util.Date; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Iterator; -import java.util.Map; -import java.util.Random; -import java.util.Set; -import java.util.TimeZone; - -import de.anomic.kelondro.index.IntegerHandleIndex; -import de.anomic.kelondro.index.ObjectIndex; -import de.anomic.kelondro.index.Row; -import de.anomic.kelondro.index.RowCollection; -import de.anomic.kelondro.index.RowSet; -import de.anomic.kelondro.index.Row.EntryIndex; -import de.anomic.kelondro.order.Base64Order; -import de.anomic.kelondro.order.ByteOrder; -import de.anomic.kelondro.order.CloneableIterator; -import de.anomic.kelondro.order.Digest; -import de.anomic.kelondro.order.RotateIterator; -import de.anomic.kelondro.table.EcoTable; -import de.anomic.kelondro.table.FixedWidthArray; -import de.anomic.kelondro.table.FlexTable; -import de.anomic.kelondro.util.FileUtils; -import de.anomic.kelondro.util.MemoryControl; -import de.anomic.kelondro.util.kelondroException; -import de.anomic.kelondro.util.kelondroOutOfLimitsException; -import de.anomic.kelondro.util.Log; -import de.anomic.yacy.yacyURL; - -public class IndexCollection extends AbstractIndex { - - private static final int loadfactor = 4; - private static final int serialNumber = 0; - private static final long minimumRAM4Eco = 20 * 1024 * 1024; - private static final int EcoFSBufferSize = 1000; - private static final int errorLimit = 500; // if the index exceeds this number of errors, it is re-built next time the application starts - - private ObjectIndex index; - private final int keylength; - private final File path; - private final String filenameStub; - private final File commonsPath; - private final Row payloadrow; // definition of the payload (chunks inside the collections) - private final int maxPartitions; // this is the maxmimum number of array files - private int indexErrors; // counter for exceptions when index returned wrong value - private Map arrays; // Map of (partitionNumber"-"chunksize)/kelondroFixedWidthArray - Objects - - private static final int idx_col_key = 0; // the index - private static final int idx_col_chunksize = 1; // chunksize (number of bytes in a single chunk, needed for migration option) - private static final int idx_col_chunkcount = 2; // chunkcount (number of chunks in this collection) - private static final int idx_col_clusteridx = 3; // selector for right cluster file, must be >= arrayIndex(chunkcount) - private static final int idx_col_flags = 4; // flags (for future use) - private static final int idx_col_indexpos = 5; // indexpos (position in array file) - private static final int idx_col_lastread = 6; // a time stamp, update time in days since 1.1.2000 - private static final int idx_col_lastwrote = 7; // a time stamp, update time in days since 1.1.2000 - - public IndexCollection( - final File path, - final String filenameStub, - final ReferenceFactory factory, - final int keyLength, - final ByteOrder wordOrder, - final int maxpartitions, - final Row payloadrow, - boolean useCommons) throws IOException { - super(factory); - - // the buffersize is number of bytes that are only used if the kelondroFlexTable is backed up with a kelondroTree - indexErrors = 0; - this.path = path; - this.filenameStub = filenameStub; - this.keylength = keyLength; - this.payloadrow = payloadrow; - this.maxPartitions = maxpartitions; - File cop = new File(path, filenameStub + "." + fillZ(Integer.toHexString(payloadrow.objectsize).toUpperCase(), 4) + ".commons"); - this.commonsPath = (useCommons) ? cop : null; - if (this.commonsPath == null) { - FileUtils.deletedelete(cop); - } else { - this.commonsPath.mkdirs(); - } - final File f = new File(path, filenameStub + ".index"); - if (f.isDirectory()) { - FlexTable.delete(path, filenameStub + ".index"); - } - if (f.exists()) { - Log.logFine("COLLECTION INDEX STARTUP", "OPENING COLLECTION INDEX"); - - // open index and array files - this.arrays = new HashMap(); // all entries will be dynamically created with getArray() - index = openIndexFile(path, this.keylength, filenameStub, wordOrder, loadfactor, payloadrow, 0); - openAllArrayFiles(false, wordOrder); - } else { - // calculate initialSpace - final String[] list = this.path.list(); - FixedWidthArray array; - int initialSpace = 0; - for (int i = 0; i < list.length; i++) if (list[i].endsWith(".kca")) { - // open array - final int pos = list[i].indexOf('.'); - if (pos < 0) continue; - final int partitionNumber = Integer.parseInt(list[i].substring(pos + 9, pos + 11), 16); - final int serialNumber = Integer.parseInt(list[i].substring(pos + 12, pos + 14), 16); - try { - array = openArrayFile(this.path, this.filenameStub, this.keylength, partitionNumber, serialNumber, wordOrder, this.payloadrow.objectsize, true); - initialSpace += array.size(); - array.close(); - } catch (final IOException e) { - e.printStackTrace(); - continue; - } - } - Log.logFine("COLLECTION INDEX STARTUP", "STARTED INITIALIZATION OF NEW COLLECTION INDEX WITH " + initialSpace + " ENTRIES. THIS WILL TAKE SOME TIME. " + (MemoryControl.available() / 1024 / 1024) + "MB AVAILABLE."); - final Row indexRowdef = indexRow(keyLength, wordOrder); - final long necessaryRAM4fullTable = minimumRAM4Eco + (indexRowdef.objectsize + 4) * initialSpace * 3 / 2; - - // initialize (new generation) index table from file - index = new EcoTable(f, indexRowdef, (MemoryControl.request(necessaryRAM4fullTable, false)) ? EcoTable.tailCacheUsageAuto : EcoTable.tailCacheDenyUsage, EcoFSBufferSize, initialSpace); - - // open array files - this.arrays = new HashMap(); // all entries will be dynamically created with getArray() - openAllArrayFiles(true, wordOrder); - Log.logFine("COLLECTION INDEX STARTUP", "FINISHED INITIALIZATION OF NEW COLLECTION INDEX."); - } - } - - public ByteOrder ordering() { - return index.row().objectOrder; - } - - public synchronized CloneableIterator> references(final byte[] startWordHash, final boolean rot) { - return new wordContainersIterator(startWordHash, rot); - } - - public class wordContainersIterator implements CloneableIterator> { - - private final Iterator wci; - private final boolean rot; - - public wordContainersIterator(final byte[] startWordHash, final boolean rot) { - this.rot = rot; - this.wci = keycollections(startWordHash, Base64Order.zero(startWordHash.length), rot); - } - - public wordContainersIterator clone(final Object secondWordHash) { - return new wordContainersIterator((byte[]) secondWordHash, rot); - } - - public boolean hasNext() { - return wci.hasNext(); - } - - public ReferenceContainer next() { - final Object[] oo = wci.next(); - if (oo == null) return null; - final byte[] key = (byte[]) oo[0]; - final RowSet collection = (RowSet) oo[1]; - if (collection == null) return null; - return new ReferenceContainer(factory, key, collection); - } - - public void remove() { - wci.remove(); - } - - } - - public ReferenceContainer get(final byte[] wordHash, final Set urlselection) { - try { - final RowSet collection = this.get(wordHash); - if (collection != null) collection.select(urlselection); - if ((collection == null) || (collection.size() == 0)) return null; - return new ReferenceContainer(factory, wordHash, collection); - } catch (final IOException e) { - return null; - } - } - - public ReferenceContainer delete(final byte[] wordHash) { - try { - // find an entry, if one exists - final Row.Entry indexrow = index.remove(wordHash); - if (indexrow == null) return null; - final RowSet removedCollection = getdelete(indexrow, true); - assert (removedCollection != null); - if (removedCollection == null) return null; - return new ReferenceContainer(factory, wordHash, removedCollection); - } catch (final IOException e) { - return null; - } - } - - public boolean remove(final byte[] wordHash, final String urlHash) { - final HashSet hs = new HashSet(); - hs.add(urlHash); - try { - return remove(wordHash, hs) == 1; - } catch (kelondroOutOfLimitsException e) { - e.printStackTrace(); - return false; - } - } - - public void add(final ReferenceContainer newEntries) { - if (newEntries == null) return; - try { - this.merge(newEntries); - } catch (final kelondroOutOfLimitsException e) { - e.printStackTrace(); - } catch (final IOException e) { - e.printStackTrace(); - } - } - - public void add(byte[] wordhash, ReferenceType entry) { - if (entry == null) return; - try { - ReferenceContainer container = new ReferenceContainer(factory, wordhash, this.payloadrow, 1); - container.add(entry); - this.merge(container); - } catch (final kelondroOutOfLimitsException e) { - e.printStackTrace(); - } catch (final IOException e) { - e.printStackTrace(); - } - } - - public int count(byte[] key) { - try { - final RowSet collection = this.get(key); - if (collection == null) return 0; - return collection.size(); - } catch (final IOException e) { - return 0; - } - } - - //---------------------------------------------------------------------------------- - - - private static Row indexRow(final int keylength, final ByteOrder payloadOrder) { - return new Row( - "byte[] key-" + keylength + "," + - "int chunksize-4 {b256}," + - "int chunkcount-4 {b256}," + - "byte clusteridx-1 {b256}," + - "byte flags-1 {b256}," + - "int indexpos-4 {b256}," + - "short lastread-2 {b256}, " + - "short lastwrote-2 {b256}", - payloadOrder - ); - } - - public Row payloadRow() { - return this.payloadrow; - } - - private static String fillZ(String s, final int len) { - while (s.length() < len) s = "0" + s; - return s; - } - - private static File arrayFile(final File path, final String filenameStub, final int loadfactor, final int chunksize, final int partitionNumber, final int serialNumber) { - final String lf = fillZ(Integer.toHexString(loadfactor).toUpperCase(), 2); - final String cs = fillZ(Integer.toHexString(chunksize).toUpperCase(), 4); - final String pn = fillZ(Integer.toHexString(partitionNumber).toUpperCase(), 2); - final String sn = fillZ(Integer.toHexString(serialNumber).toUpperCase(), 2); - return new File(path, filenameStub + "." + lf + "." + cs + "." + pn + "." + sn + ".kca"); // kelondro collection array - } - - public void clear() throws IOException { - index.clear(); - for (final FixedWidthArray array: arrays.values()) { - array.clear(); - } - } - - public void deleteIndexOnExit() { - // will be rebuilt on next start - this.index.deleteOnExit(); - } - - private void openAllArrayFiles(final boolean indexGeneration, final ByteOrder wordOrder) throws IOException { - - final String[] list = this.path.list(); - FixedWidthArray array; - - final Row irow = indexRow(keylength, wordOrder); - final int t = RowCollection.daysSince2000(System.currentTimeMillis()); - for (int i = 0; i < list.length; i++) if (list[i].endsWith(".kca")) { - - // open array - final int pos = list[i].indexOf('.'); - if (pos < 0) continue; - final int chunksize = Integer.parseInt(list[i].substring(pos + 4, pos + 8), 16); - final int partitionNumber = Integer.parseInt(list[i].substring(pos + 9, pos + 11), 16); - final int serialNumber = Integer.parseInt(list[i].substring(pos + 12, pos + 14), 16); - try { - array = openArrayFile(this.path, this.filenameStub, this.keylength, partitionNumber, serialNumber, wordOrder, this.payloadrow.objectsize, true); - } catch (final IOException e) { - e.printStackTrace(); - continue; - } - - // remember that we opened the array - arrays.put(partitionNumber + "-" + chunksize, array); - - if ((index != null) && (indexGeneration)) { - // loop over all elements in array and create index entry for each row - Row.EntryIndex aentry; - Row.Entry ientry; - final Iterator ei = array.contentRows(10000); - byte[] key; - final long start = System.currentTimeMillis(); - long lastlog = start; - int count = 0; - int chunkcount; - while (ei.hasNext()) { - aentry = ei.next(); - key = aentry.getColBytes(0); - assert (key != null); - if (key == null) continue; // skip deleted entries - chunkcount = RowCollection.sizeOfExportedCollectionRows(aentry, 1); - assert chunkcount > 0; - if (chunkcount == 0) continue; - ientry = irow.newEntry(); - ientry.setCol(idx_col_key, key); - ientry.setCol(idx_col_chunksize, chunksize); - ientry.setCol(idx_col_chunkcount, chunkcount); - ientry.setCol(idx_col_clusteridx, (byte) partitionNumber); - ientry.setCol(idx_col_flags, (byte) 0); - ientry.setCol(idx_col_indexpos, aentry.index()); - ientry.setCol(idx_col_lastread, t); - ientry.setCol(idx_col_lastwrote, t); - index.addUnique(ientry); // FIXME: this should avoid doubles - count++; - - // write a log - if (System.currentTimeMillis() - lastlog > 30000) { - Log.logInfo("COLLECTION INDEX STARTUP", "created " + count + " RWI index entries. " + (((System.currentTimeMillis() - start) * (array.size() + array.free() - count) / count) / 60000) + " minutes remaining for this array"); - lastlog = System.currentTimeMillis(); - } - } - } - } - // care for double entries - int partition, maxpartition; - Row.Entry maxentry; - int doublecount = 0; - ArrayList doubles = index.removeDoubles(); - if (doubles.size() > 0) Log.logWarning("COLLECTION INDEX STARTUP", "found " + doubles.size() + " doubles in collections, removing them in arrays"); - for (final RowCollection doubleset: doubles) { - // for each entry in doubleset choose one which we want to keep - maxentry = null; - maxpartition = -1; - for (Row.Entry entry: doubleset) { - partition = (int) entry.getColLong(idx_col_clusteridx); - if (partition > maxpartition) { - maxpartition = partition; - maxentry = entry; - } - } - if (maxentry != null) { - // put back a single entry to the index, which is then not double to any other entry - index.put(maxentry); - doublecount++; - } - } - if (doublecount > 0) Log.logWarning("STARTUP", "found " + doublecount + " RWI entries with references to several collections. All have been fixed (zombies still exists)."); - } - - /** - * enumerate all index files and return a set of reference hashes - * @param path - * @param filenameStub - * @param keylength - * @param wordOrder - * @param payloadrow - * @return - * @throws IOException - */ - public static IntegerHandleIndex referenceHashes( - final File path, - final String filenameStub, - final int keylength, - final ByteOrder wordOrder, - final Row payloadrow) throws IOException { - - final String[] list = path.list(); - FixedWidthArray array; - System.out.println("COLLECTION INDEX REFERENCE COLLECTION startup"); - IntegerHandleIndex references = new IntegerHandleIndex(keylength, wordOrder, 0, 1000000); - for (int i = 0; i < list.length; i++) if (list[i].endsWith(".kca")) { - // open array - final int pos = list[i].indexOf('.'); - if (pos < 0) continue; - final int partitionNumber = Integer.parseInt(list[i].substring(pos + 9, pos + 11), 16); - final int serialNumber = Integer.parseInt(list[i].substring(pos + 12, pos + 14), 16); - System.out.println("COLLECTION INDEX REFERENCE COLLECTION opening partition " + partitionNumber + ", " + i + " of " + list.length); - try { - array = openArrayFile(path, filenameStub, keylength, partitionNumber, serialNumber, wordOrder, payloadrow.objectsize, true); - } catch (final IOException e) { - e.printStackTrace(); - continue; - } - System.out.println("COLLECTION INDEX REFERENCE COLLECTION opened partition " + partitionNumber + ", initializing iterator"); - // loop over all elements in array and collect reference hashes - Row.EntryIndex arrayrow; - final Iterator ei = array.contentRows(10000); - System.out.println("COLLECTION INDEX REFERENCE COLLECTION opened partition " + partitionNumber + ", starting reference scanning"); - final long start = System.currentTimeMillis(); - long lastlog = start - 27000; - int count = 0; - while (ei.hasNext()) { - arrayrow = ei.next(); - if (arrayrow == null) continue; - final RowSet collection = new RowSet(payloadrow, arrayrow); - final int chunkcountInArray = collection.size(); - for (int j = 0; j < chunkcountInArray; j++) { - references.inc(collection.get(j, false).getColBytes(0), 1); - } - count++; - // write a log - if (System.currentTimeMillis() - lastlog > 30000) { - System.out.println("COLLECTION INDEX REFERENCE COLLECTION scanned " + count + " RWI index entries. " + (((System.currentTimeMillis() - start) * (array.size() + array.free() - count) / count) / 60000) + " minutes remaining for this array"); - //Log.logInfo("COLLECTION INDEX REFERENCE COLLECTION", "scanned " + count + " RWI index entries. " + (((System.currentTimeMillis() - start) * (array.size() + array.free() - count) / count) / 60000) + " minutes remaining for this array"); - lastlog = System.currentTimeMillis(); - } - } - } - System.out.println("COLLECTION INDEX REFERENCE COLLECTION finished with reference collection"); - return references; - } - - private static ObjectIndex openIndexFile( - final File path, int keylength, final String filenameStub, final ByteOrder indexOrder, - final int loadfactor, final Row rowdef, final int initialSpace) throws IOException { - // open/create index table - final File f = new File(path, filenameStub + ".index"); - final Row indexRowdef = indexRow(keylength, indexOrder); - ObjectIndex theindex; - if (f.isDirectory()) { - FlexTable.delete(path, filenameStub + ".index"); - } - // open a ecotable - final long records = f.length() / indexRowdef.objectsize; - final long necessaryRAM4fullTable = minimumRAM4Eco + (indexRowdef.objectsize + 4) * records * 3 / 2; - final boolean fullCache = MemoryControl.request(necessaryRAM4fullTable, false); - if (fullCache) { - theindex = new EcoTable(f, indexRowdef, EcoTable.tailCacheUsageAuto, EcoFSBufferSize, initialSpace); - //if (!((kelondroEcoTable) theindex).usesFullCopy()) theindex = new kelondroCache(theindex); - } else { - //theindex = new kelondroCache(new kelondroEcoTable(f, indexRowdef, kelondroEcoTable.tailCacheDenyUsage, EcoFSBufferSize, initialSpace)); - theindex = new EcoTable(f, indexRowdef, EcoTable.tailCacheDenyUsage, EcoFSBufferSize, initialSpace); - } - return theindex; - } - - private static FixedWidthArray openArrayFile( - File path, String filenameStub, int keylength, - final int partitionNumber, final int serialNumber, - final ByteOrder wordOrder, int objectsize, - final boolean create) throws IOException { - final File f = arrayFile(path, filenameStub, loadfactor, objectsize, partitionNumber, serialNumber); - final int load = arrayCapacity(partitionNumber); - final Row rowdef = new Row( - "byte[] key-" + keylength + "," + - "byte[] collection-" + (RowCollection.exportOverheadSize + load * objectsize), - wordOrder ); - if ((!(f.exists())) && (!create)) return null; - final FixedWidthArray a = new FixedWidthArray(f, rowdef, 0); - Log.logFine("STARTUP", "opened array file " + f + " with " + a.size() + " RWIs"); - return a; - } - - private FixedWidthArray getArray(final int partitionNumber, final int serialNumber, final ByteOrder wordOrder, final int chunksize) { - final String accessKey = partitionNumber + "-" + chunksize; - FixedWidthArray array = arrays.get(accessKey); - if (array != null) return array; - try { - array = openArrayFile(this.path, this.filenameStub, this.keylength, partitionNumber, serialNumber, wordOrder, this.payloadrow.objectsize, true); - } catch (final IOException e) { - e.printStackTrace(); - return null; - } - arrays.put(accessKey, array); - return array; - } - - private static int arrayCapacity(final int arrayCounter) { - if (arrayCounter < 0) return 0; - int load = loadfactor; - for (int i = 0; i < arrayCounter; i++) load = load * loadfactor; - return load; - } - - private static int arrayIndex(final int requestedCapacity) throws kelondroOutOfLimitsException{ - // the requestedCapacity is the number of wanted chunks - int load = 1, i = 0; - while (true) { - load = load * loadfactor; - if (load >= requestedCapacity) return i; - i++; - } - } - - public int size() { - return index.size(); - } - - public int minMem() { - // calculate a minimum amount of memory that is necessary to use the collection - // during runtime (after the index was initialized) - - // caclculate an upper limit (not the correct size) of the maximum number of indexes for a wordHash - // this is computed by the size of the biggest used collection - // this must be multiplied with the payload size - // and doubled for necessary memory transformation during sort operation - return (int) (arrayCapacity(arrays.size() - 1) * this.payloadrow.objectsize * RowCollection.growfactor); - } - - private void array_remove( - final int oldPartitionNumber, final int serialNumber, final int chunkSize, - final int oldRownumber) throws IOException { - // we need a new slot, that means we must first delete the old entry - // find array file - final FixedWidthArray array = getArray(oldPartitionNumber, serialNumber, index.row().objectOrder, chunkSize); - - // delete old entry - array.remove(oldRownumber); - } - - private Row.Entry array_new( - final byte[] key, final RowCollection collection) throws IOException { - // the collection is new - final int partitionNumber = arrayIndex(collection.size()); - final Row.Entry indexrow = index.row().newEntry(); - final FixedWidthArray array = getArray(partitionNumber, serialNumber, index.row().objectOrder, this.payloadrow.objectsize); - - // define row - final Row.Entry arrayEntry = array.row().newEntry(); - arrayEntry.setCol(0, key); - arrayEntry.setCol(1, collection.exportCollection()); - - // write a new entry in this array - try { - final int newRowNumber = array.add(arrayEntry); - - // store the new row number in the index - indexrow.setCol(idx_col_key, key); - indexrow.setCol(idx_col_chunksize, this.payloadrow.objectsize); - indexrow.setCol(idx_col_chunkcount, collection.size()); - indexrow.setCol(idx_col_clusteridx, (byte) partitionNumber); - indexrow.setCol(idx_col_flags, (byte) 0); - indexrow.setCol(idx_col_indexpos, newRowNumber); - indexrow.setCol(idx_col_lastread, RowCollection.daysSince2000(System.currentTimeMillis())); - indexrow.setCol(idx_col_lastwrote, RowCollection.daysSince2000(System.currentTimeMillis())); - - // after calling this method there must be an index.addUnique(indexrow); - return indexrow; - } catch (Exception e) { - // the index appears to be corrupted at a particular point - Log.logWarning("kelondroCollectionIndex", "array " + arrayFile(this.path, this.filenameStub, loadfactor, this.payloadrow.objectsize, partitionNumber, serialNumber).toString() + " has errors \"" + e.getMessage() + "\" (error #" + indexErrors + ")"); - return null; - } - } - - private void array_add( - final byte[] key, final RowCollection collection, final Row.Entry indexrow, - final int partitionNumber, final int serialNumber, final int chunkSize) throws IOException { - - // write a new entry in the other array - final FixedWidthArray array = getArray(partitionNumber, serialNumber, index.row().objectOrder, chunkSize); - - // define new row - final Row.Entry arrayEntry = array.row().newEntry(); - arrayEntry.setCol(0, key); - arrayEntry.setCol(1, collection.exportCollection()); - - // write a new entry in this array - final int rowNumber = array.add(arrayEntry); - - // store the new row number in the index - indexrow.setCol(idx_col_chunkcount, collection.size()); - indexrow.setCol(idx_col_clusteridx, (byte) partitionNumber); - indexrow.setCol(idx_col_indexpos, rowNumber); - indexrow.setCol(idx_col_lastwrote, RowCollection.daysSince2000(System.currentTimeMillis())); - - // after calling this method there must be a index.put(indexrow); - } - - private void array_replace( - final byte[] key, final RowCollection collection, final Row.Entry indexrow, - final int partitionNumber, final int serialNumber, final int chunkSize, - final int rowNumber) throws IOException { - // we don't need a new slot, just write collection into the old one - - // find array file - final FixedWidthArray array = getArray(partitionNumber, serialNumber, index.row().objectOrder, chunkSize); - - // define new row - final Row.Entry arrayEntry = array.row().newEntry(); - arrayEntry.setCol(0, key); - arrayEntry.setCol(1, collection.exportCollection()); - - // overwrite entry in this array - array.set(rowNumber, arrayEntry); - - // update the index entry - final int collectionsize = collection.size(); // extra variable for easier debugging - indexrow.setCol(idx_col_chunkcount, collectionsize); - indexrow.setCol(idx_col_clusteridx, (byte) partitionNumber); - indexrow.setCol(idx_col_lastwrote, RowCollection.daysSince2000(System.currentTimeMillis())); - - // after calling this method there must be a index.put(indexrow); - } - - public synchronized void put(final byte[] key, final RowCollection collection) throws IOException, kelondroOutOfLimitsException { - assert (key != null); - assert (collection != null); - assert (collection.size() != 0); - - // first find an old entry, if one exists - Row.Entry indexrow = index.get(key); - - if (indexrow == null) { - // create new row and index entry - if ((collection != null) && (collection.size() > 0)) { - indexrow = array_new(key, collection); // modifies indexrow - if (indexrow != null) index.addUnique(indexrow); - } - return; - } - - // overwrite the old collection - // read old information - //int oldchunksize = (int) indexrow.getColLong(idx_col_chunksize); // needed only for migration - final int oldchunkcount = (int) indexrow.getColLong(idx_col_chunkcount); // the number if rows in the collection - final int oldrownumber = (int) indexrow.getColLong(idx_col_indexpos); // index of the entry in array - final int oldPartitionNumber = indexrow.getColByte(idx_col_clusteridx); // points to array file - assert (oldPartitionNumber >= arrayIndex(oldchunkcount)); - - final int newPartitionNumber = arrayIndex(collection.size()); - - // see if we need new space or if we can overwrite the old space - if (oldPartitionNumber == newPartitionNumber) { - array_replace( - key, collection, indexrow, - oldPartitionNumber, serialNumber, this.payloadrow.objectsize, - oldrownumber); // modifies indexrow - } else { - array_remove( - oldPartitionNumber, serialNumber, this.payloadrow.objectsize, - oldrownumber); - array_add( - key, collection, indexrow, - newPartitionNumber, serialNumber, this.payloadrow.objectsize); // modifies indexrow - } - - if ((int) indexrow.getColLong(idx_col_chunkcount) != collection.size()) { - this.indexErrors++; - if (this.indexErrors == errorLimit) deleteIndexOnExit(); // delete index on exit for rebuild - Log.logSevere("kelondroCollectionIndex", "UPDATE (put) ERROR: array has different chunkcount than index after merge: index = " + (int) indexrow.getColLong(idx_col_chunkcount) + ", collection.size() = " + collection.size() + " (error #" + indexErrors + ")"); - } - index.put(indexrow); // write modified indexrow - } - - private synchronized void merge(final ReferenceContainer container) throws IOException, kelondroOutOfLimitsException { - if ((container == null) || (container.size() == 0)) return; - final byte[] key = container.getTermHash(); - - // first find an old entry, if one exists - Row.Entry indexrow = index.get(key); - if (indexrow == null) { - indexrow = array_new(key, container); // modifies indexrow - if (indexrow != null) index.addUnique(indexrow); // write modified indexrow - } else { - // merge with the old collection - // attention! this modifies the indexrow entry which must be written with index.put(indexrow) afterwards! - RowSet collection = container; - - // read old information - final int oldchunksize = (int) indexrow.getColLong(idx_col_chunksize); // needed only for migration - final int oldchunkcount = (int) indexrow.getColLong(idx_col_chunkcount); // the number if rows in the collection - final int oldrownumber = (int) indexrow.getColLong(idx_col_indexpos); // index of the entry in array - final int oldPartitionNumber = indexrow.getColByte(idx_col_clusteridx); // points to array file - assert (oldPartitionNumber >= arrayIndex(oldchunkcount)) : "oldPartitionNumber = " + oldPartitionNumber + ", arrayIndex(oldchunkcount) = " + arrayIndex(oldchunkcount); - final int oldSerialNumber = 0; - - // load the old collection and join it - try { - RowSet krc = getwithparams(indexrow, oldchunksize, oldchunkcount, oldPartitionNumber, oldrownumber, oldSerialNumber, false); - //System.out.println("***DEBUG kelondroCollectionIndex.merge before merge*** krc.size = " + krc.size() + ", krc.sortbound = " + krc.sortBound + ", collection.size = " + collection.size() + ", collection.sortbound = " + collection.sortBound); - collection = collection.merge(krc); - //System.out.println("***DEBUG kelondroCollectionIndex.merge after merge*** collection.size = " + collection.size() + ", collection.sortbound = " + collection.sortBound); - - } catch (kelondroException e) { - // an error like "array does not contain expected row" may appear here. Just go on like if the collection does not exist - e.printStackTrace(); - } - collection.trim(false); - - // check for size of collection: - // if necessary shrink the collection and dump a part of that collection - // to avoid that this grows too big - if (arrayIndex(collection.size()) > maxPartitions) { - shrinkCollection(key, collection, arrayCapacity(maxPartitions)); - } - - // determine new partition location - final int newPartitionNumber = arrayIndex(collection.size()); - - // see if we need new space or if we can overwrite the old space - if (oldPartitionNumber == newPartitionNumber) { - array_replace( - key, collection, indexrow, - oldPartitionNumber, oldSerialNumber, this.payloadrow.objectsize, - oldrownumber); // modifies indexrow - } else { - array_remove( - oldPartitionNumber, oldSerialNumber, this.payloadrow.objectsize, - oldrownumber); - array_add( - key, collection, indexrow, - newPartitionNumber, oldSerialNumber, this.payloadrow.objectsize); // modifies indexrow - } - - final int collectionsize = collection.size(); // extra variable for easier debugging - final int indexrowcount = (int) indexrow.getColLong(idx_col_chunkcount); - if (indexrowcount != collectionsize) { - this.indexErrors++; - if (this.indexErrors == errorLimit) deleteIndexOnExit(); // delete index on exit for rebuild - Log.logSevere("kelondroCollectionIndex", "UPDATE (merge) ERROR: array has different chunkcount than index after merge: index = " + indexrowcount + ", collection.size() = " + collectionsize + " (error #" + indexErrors + ")"); - } - index.put(indexrow); // write modified indexrow - } - } - - private void shrinkCollection(final byte[] key, final RowCollection collection, final int targetSize) { - // removes entries from collection - // the removed entries are stored in a 'commons' dump file - - if (key.length != 12) return; - // check if the collection is already small enough - final int oldsize = collection.size(); - if (oldsize <= targetSize) return; - final RowSet newcommon = new RowSet(collection.rowdef, 0); - - // delete some entries, which are bad rated - Iterator i = collection.iterator(); - Row.Entry entry; - byte[] ref; - while (i.hasNext()) { - entry = i.next(); - ref = entry.getColBytes(0); - if ((ref.length != 12) || (!yacyURL.probablyRootURL(new String(ref)))) { - newcommon.addUnique(entry); - i.remove(); - } - } - final int firstnewcommon = newcommon.size(); - - // check if we shrinked enough - final Random rand = new Random(System.currentTimeMillis()); - while (collection.size() > targetSize) { - // now delete randomly more entries from the survival collection - i = collection.iterator(); - while (i.hasNext()) { - entry = i.next(); - ref = entry.getColBytes(0); - if (rand.nextInt() % 4 != 0) { - newcommon.addUnique(entry); - i.remove(); - } - } - } - collection.trim(false); - - Log.logInfo("kelondroCollectionIndex", "shrinked common word " + new String(key) + "; old size = " + oldsize + ", new size = " + collection.size() + ", maximum size = " + targetSize + ", newcommon size = " + newcommon.size() + ", first newcommon = " + firstnewcommon); - - // finally dump the removed entries to a file - if (commonsPath != null) { - newcommon.sort(); - final SimpleDateFormat formatter = new SimpleDateFormat("yyyyMMddHHmmss"); - formatter.setTimeZone(TimeZone.getTimeZone("GMT")); - final String filename = Digest.encodeHex(Base64Order.enhancedCoder.decode(new String(key), "de.anomic.kelondro.kelondroCollectionIndex.shrinkCollection(...)")) + "_" + formatter.format(new Date()) + ".collection"; - final File storagePath = new File(commonsPath, filename.substring(0, 2)); // make a subpath - storagePath.mkdirs(); - final File file = new File(storagePath, filename); - try { - newcommon.saveCollection(file); - Log.logInfo("kelondroCollectionIndex", "dumped common word " + new String(key) + " to " + file.toString() + "; size = " + newcommon.size()); - } catch (final IOException e) { - e.printStackTrace(); - Log.logWarning("kelondroCollectionIndex", "failed to dump common word " + new String(key) + " to " + file.toString() + "; size = " + newcommon.size()); - } - } - } - - public synchronized int remove(final byte[] key, final Set removekeys) { - - if ((removekeys == null) || (removekeys.size() == 0)) return 0; - - // first find an old entry, if one exists - Row.Entry indexrow; - try { - indexrow = index.get(key); - - - if (indexrow == null) return 0; - - // overwrite the old collection - // read old information - final int oldchunksize = (int) indexrow.getColLong(idx_col_chunksize); // needed only for migration - final int oldchunkcount = (int) indexrow.getColLong(idx_col_chunkcount); // the number if rows in the collection - final int oldrownumber = (int) indexrow.getColLong(idx_col_indexpos); // index of the entry in array - final int oldPartitionNumber = indexrow.getColByte(idx_col_clusteridx); // points to array file - assert (oldPartitionNumber >= arrayIndex(oldchunkcount)); - - int removed = 0; - assert (removekeys != null); - // load the old collection and remove keys - RowSet oldcollection = null; - try { - oldcollection = getwithparams(indexrow, oldchunksize, oldchunkcount, oldPartitionNumber, oldrownumber, serialNumber, false); - } catch (kelondroException e) { - // some internal problems occurred. however, go on like there was no old collection - e.printStackTrace(); - } - // the oldcollection may be null here! - - if (oldcollection != null && oldcollection.size() > 0) { - // remove the keys from the set - final Iterator i = removekeys.iterator(); - while (i.hasNext()) { - if (oldcollection.remove(i.next().getBytes()) != null) removed++; - } - oldcollection.sort(); - oldcollection.trim(false); - } - - // in case of an error or an empty collection, remove the collection: - if (oldcollection == null || oldcollection.size() == 0) { - // delete the index entry and the array - array_remove( - oldPartitionNumber, serialNumber, this.payloadrow.objectsize, - oldrownumber); - index.remove(key); - return removed; - } - - // now write to a new partition (which may be the same partition as the old one) - final int newPartitionNumber = arrayIndex(oldcollection.size()); - - // see if we need new space or if we can overwrite the old space - if (oldPartitionNumber == newPartitionNumber) { - array_replace( - key, oldcollection, indexrow, - oldPartitionNumber, serialNumber, this.payloadrow.objectsize, - oldrownumber); // modifies indexrow - } else { - array_remove( - oldPartitionNumber, serialNumber, this.payloadrow.objectsize, - oldrownumber); - array_add( - key, oldcollection, indexrow, - newPartitionNumber, serialNumber, this.payloadrow.objectsize); // modifies indexrow - } - index.put(indexrow); // write modified indexrow - return removed; - } catch (IOException e1) { - e1.printStackTrace(); - } - return 0; - } - - public synchronized boolean has(final byte[] key) { - return index.has(key); - } - - private synchronized RowSet get(final byte[] key) throws IOException { - // find an entry, if one exists - final Row.Entry indexrow = index.get(key); - if (indexrow == null) return null; - final RowSet col = getdelete(indexrow, false); - assert (col != null); - return col; - } - - - private RowSet getdelete(final Row.Entry indexrow, final boolean remove) throws IOException { - // call this only within a synchronized(index) environment - - // read values - final int chunksize = (int) indexrow.getColLong(idx_col_chunksize); - final int chunkcount = (int) indexrow.getColLong(idx_col_chunkcount); - final int rownumber = (int) indexrow.getColLong(idx_col_indexpos); - final int partitionnumber = indexrow.getColByte(idx_col_clusteridx); - assert(partitionnumber >= arrayIndex(chunkcount)) : "partitionnumber = " + partitionnumber + ", arrayIndex(chunkcount) = " + arrayIndex(chunkcount); - final int serialnumber = 0; - - return getwithparams(indexrow, chunksize, chunkcount, partitionnumber, rownumber, serialnumber, remove); - } - - private synchronized RowSet getwithparams(final Row.Entry indexrow, final int chunksize, final int chunkcount, final int clusteridx, final int rownumber, final int serialnumber, final boolean remove) throws IOException { - // open array entry - final FixedWidthArray array = getArray(clusteridx, serialnumber, index.row().objectOrder, chunksize); - final Row.Entry arrayrow = array.get(rownumber); - if (arrayrow == null) { - // the index appears to be corrupted - this.indexErrors++; - if (this.indexErrors == errorLimit) deleteIndexOnExit(); // delete index on exit for rebuild - Log.logWarning("kelondroCollectionIndex", "array " + arrayFile(this.path, this.filenameStub, loadfactor, chunksize, clusteridx, serialnumber).toString() + " does not contain expected row (error #" + indexErrors + ")"); - return new RowSet(this.payloadrow, 0); - } - - // read the row and define a collection - final byte[] indexkey = indexrow.getColBytes(idx_col_key); - final byte[] arraykey = arrayrow.getColBytes(0); - if (!(index.row().objectOrder.wellformed(arraykey))) { - // cleanup for a bad bug that corrupted the database - index.remove(indexkey); // the RowCollection must be considered lost - array.remove(rownumber); // loose the RowCollection (we don't know how much is lost) - this.indexErrors++; - if (this.indexErrors == errorLimit) deleteIndexOnExit(); // delete index on exit for rebuild - Log.logSevere("kelondroCollectionIndex." + array.filename, "lost a RowCollection because of a bad arraykey (error #" + indexErrors + ")"); - return new RowSet(this.payloadrow, 0); - } - - final RowSet collection = new RowSet(this.payloadrow, arrayrow); // FIXME: this does not yet work with different rowdef in case of several rowdef.objectsize() - if ((!(index.row().objectOrder.wellformed(indexkey))) || (!index.row().objectOrder.equal(arraykey, indexkey))) { - // check if we got the right row; this row is wrong. Fix it: - index.remove(indexkey); // the wrong row cannot be fixed - // store the row number in the index; this may be a double-entry, but better than nothing - final Row.Entry indexEntry = index.row().newEntry(); - indexEntry.setCol(idx_col_key, arrayrow.getColBytes(0)); - indexEntry.setCol(idx_col_chunksize, this.payloadrow.objectsize); - indexEntry.setCol(idx_col_chunkcount, collection.size()); - indexEntry.setCol(idx_col_clusteridx, (byte) clusteridx); - indexEntry.setCol(idx_col_flags, (byte) 0); - indexEntry.setCol(idx_col_indexpos, rownumber); - indexEntry.setCol(idx_col_lastread, RowCollection.daysSince2000(System.currentTimeMillis())); - indexEntry.setCol(idx_col_lastwrote, RowCollection.daysSince2000(System.currentTimeMillis())); - index.put(indexEntry); - this.indexErrors++; - if (this.indexErrors == errorLimit) deleteIndexOnExit(); // delete index on exit for rebuild - Log.logSevere("kelondroCollectionIndex." + array.filename, "array contains wrong row '" + new String(arrayrow.getColBytes(0)) + "', expected is '" + new String(indexrow.getColBytes(idx_col_key)) + "', the row has been fixed (error #" + indexErrors + ")"); - } - final int chunkcountInArray = collection.size(); - if (chunkcountInArray != chunkcount) { - // fix the entry in index - indexrow.setCol(idx_col_chunkcount, chunkcountInArray); - index.put(indexrow); - this.indexErrors++; - if (this.indexErrors == errorLimit) deleteIndexOnExit(); // delete index on exit for rebuild - array.logFailure("INCONSISTENCY (get) in " + arrayFile(this.path, this.filenameStub, loadfactor, chunksize, clusteridx, serialnumber).toString() + ": array has different chunkcount than index: index = " + chunkcount + ", array = " + chunkcountInArray + "; the index has been auto-fixed (error #" + indexErrors + ")"); - } - if (remove) array.remove(rownumber); // index is removed in calling method - return collection; - } - - protected synchronized Iterator keycollections(final byte[] startKey, final byte[] secondKey, final boolean rot) { - // returns an iteration of {byte[], kelondroRowSet} Objects - try { - return new keycollectionIterator(startKey, secondKey, rot); - } catch (final IOException e) { - e.printStackTrace(); - return null; - } - } - - public class keycollectionIterator implements Iterator { - - Iterator indexRowIterator; - - public keycollectionIterator(final byte[] startKey, final byte[] secondKey, final boolean rot) throws IOException { - // iterator of {byte[], kelondroRowSet} Objects - final CloneableIterator i = index.rows(true, startKey); - indexRowIterator = (rot) ? new RotateIterator(i, secondKey, index.size()) : i; - } - - public boolean hasNext() { - return indexRowIterator.hasNext(); - } - - public Object[] next() { - final Row.Entry indexrow = indexRowIterator.next(); - assert (indexrow != null); - if (indexrow == null) return null; - try { - return new Object[]{indexrow.getColBytes(0), getdelete(indexrow, false)}; - } catch (final Exception e) { - e.printStackTrace(); - return null; - } - } - - public void remove() { - indexRowIterator.remove(); - } - - } - - public synchronized void close() { - this.index.close(); - this.index = null; - final Iterator i = arrays.values().iterator(); - while (i.hasNext()) i.next().close(); - this.arrays = null; - } - -} diff --git a/source/de/anomic/kelondro/text/IndexCollectionMigration.java b/source/de/anomic/kelondro/text/IndexCollectionMigration.java deleted file mode 100644 index d496edbd6..000000000 --- a/source/de/anomic/kelondro/text/IndexCollectionMigration.java +++ /dev/null @@ -1,345 +0,0 @@ -// IndexCollectionMigration.java -// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany -// first published 30.03.2009 on http://yacy.net -// -// This is a part of YaCy, a peer-to-peer based web search engine -// -// $LastChangedDate: 2009-03-13 11:34:51 +0100 (Fr, 13 Mrz 2009) $ -// $LastChangedRevision: 5709 $ -// $LastChangedBy: orbiter $ -// -// LICENSE -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - -package de.anomic.kelondro.text; - -import java.io.File; -import java.io.IOException; -import java.util.Set; - -import de.anomic.kelondro.index.Row; -import de.anomic.kelondro.order.Base64Order; -import de.anomic.kelondro.order.ByteOrder; -import de.anomic.kelondro.order.CloneableIterator; -import de.anomic.kelondro.order.MergeIterator; -import de.anomic.kelondro.order.Order; -import de.anomic.kelondro.order.RotateIterator; -import de.anomic.kelondro.text.Index; -import de.anomic.kelondro.text.IndexCollection; -import de.anomic.kelondro.text.ReferenceContainer; -import de.anomic.kelondro.text.ReferenceContainerOrder; -import de.anomic.kelondro.text.referencePrototype.WordReferenceRow; -import de.anomic.kelondro.util.FileUtils; -import de.anomic.kelondro.util.Log; - -public final class IndexCollectionMigration extends AbstractBufferedIndex implements Index, BufferedIndex { - - private final IndexCell cell; - private IndexCollection collections; - private final IODispatcher merger; - - public IndexCollectionMigration ( - final File indexPrimaryTextLocation, - final ReferenceFactory factory, - final ByteOrder wordOrdering, - final Row payloadrow, - final int entityCacheMaxSize, - final long targetFileSize, - final long maxFileSize, - final IODispatcher merger, - final int writeBufferSize, - final Log log) throws IOException { - super(factory); - - this.merger = merger; - final File celldir = new File(indexPrimaryTextLocation, "RICELL"); - this.cell = new IndexCell( - celldir, - factory, - wordOrdering, - WordReferenceRow.urlEntryRow, - entityCacheMaxSize, - targetFileSize, - maxFileSize, - this.merger, - writeBufferSize); - final File textindexcache = new File(indexPrimaryTextLocation, "RICACHE"); - if (textindexcache.exists()) { - // migrate the "index.dhtout.blob" into RICELL directory - File f = new File(textindexcache, "index.dhtout.blob"); - if (f.exists()) { - File n = this.cell.newContainerBLOBFile(); - f.renameTo(n); - this.cell.mountBLOBFile(n); - } - f = new File(textindexcache, "index.dhtin.blob"); - if (f.exists()) { - File n = this.cell.newContainerBLOBFile(); - f.renameTo(n); - this.cell.mountBLOBFile(n); - } - // delete everything else - String[] l = textindexcache.list(); - for (String s: l) { - f = new File(textindexcache, s); - FileUtils.deletedelete(f); - } - FileUtils.deletedelete(textindexcache); - } - - // open collections, this is for migration only. - final File textindexcollections = new File(indexPrimaryTextLocation, "RICOLLECTION"); - if (textindexcollections.exists()) { - this.collections = new IndexCollection( - textindexcollections, - "collection", - factory, - 12, - Base64Order.enhancedCoder, - 7, - WordReferenceRow.urlEntryRow, - false); - if (this.collections.size() == 0) { - // delete everything here - this.collections.close(); - this.collections = null; - String[] l = textindexcollections.list(); - File f; - for (String s: l) { - f = new File(textindexcollections, s); - FileUtils.deletedelete(f); - } - FileUtils.deletedelete(textindexcollections); - } - } else { - this.collections = null; - } - } - - /* methods for interface Index */ - - public void add(final ReferenceContainer entries) throws IOException { - assert (entries.row().objectsize == WordReferenceRow.urlEntryRow.objectsize); - - if (this.collections != null) { - ReferenceContainer e = this.collections.delete(entries.getTermHash()); - if (e != null) { - e.merge(entries); - cell.add(e); - } else { - cell.add(entries); - } - } else { - cell.add(entries); - } - } - - public void add(final byte[] wordHash, final ReferenceType entry) throws IOException { - if (this.collections != null) { - ReferenceContainer e = this.collections.delete(wordHash); - if (e != null) { - e.add(entry); - cell.add(e); - } else { - cell.add(wordHash, entry); - } - } else { - cell.add(wordHash, entry); - } - } - - public boolean has(final byte[] wordHash) { - if (this.collections != null) { - ReferenceContainer e = this.collections.delete(wordHash); - if (e != null) { - try { - cell.add(e); - } catch (IOException e1) { - e1.printStackTrace(); - } - return true; - } else { - return cell.has(wordHash); - } - } else { - return cell.has(wordHash); - } - } - - public int count(byte[] wordHash) { - if (this.collections != null) { - ReferenceContainer e = this.collections.delete(wordHash); - if (e != null) { - try { - cell.add(e); - } catch (IOException e1) { - e1.printStackTrace(); - } - return cell.count(wordHash); - } else { - return cell.count(wordHash); - } - } else { - return cell.count(wordHash); - } - } - - public ReferenceContainer get(final byte[] wordHash, final Set urlselection) throws IOException { - if (wordHash == null) { - // wrong input - return null; - } - - if (this.collections != null) { - ReferenceContainer e = this.collections.delete(wordHash); - if (e != null) cell.add(e); - } - - return this.cell.get(wordHash, urlselection); - } - - public ReferenceContainer delete(final byte[] wordHash) throws IOException { - ReferenceContainer cc = cell.delete(wordHash); - if (cc == null) { - if (collections == null) return null; - return collections.delete(wordHash); - } else { - if (collections == null) return cc; - ReferenceContainer cd = collections.delete(wordHash); - if (cd == null) return cc; - return cc.merge(cd); - } - } - - public boolean remove(final byte[] wordHash, final String urlHash) throws IOException { - if (this.collections != null) { - ReferenceContainer e = this.collections.delete(wordHash); - if (e != null) cell.add(e); - } - return cell.remove(wordHash, urlHash); - } - - public int remove(final byte[] wordHash, final Set urlHashes) throws IOException { - if (this.collections != null) { - ReferenceContainer e = this.collections.delete(wordHash); - if (e != null) cell.add(e); - } - return cell.remove(wordHash, urlHashes); - } - - public synchronized CloneableIterator> references(final byte[] startHash, final boolean rot, final boolean ram) throws IOException { - final CloneableIterator> i = wordContainers(startHash, ram); - if (rot) { - return new RotateIterator>(i, Base64Order.zero(startHash.length), cell.size() + ((ram) ? 0 : collections.size())); - } - return i; - } - - private synchronized CloneableIterator> wordContainers(final byte[] startWordHash, final boolean ram) throws IOException { - final Order> containerOrder = new ReferenceContainerOrder(factory, cell.ordering().clone()); - ReferenceContainer emptyContainer = ReferenceContainer.emptyContainer(factory, startWordHash, 0); - containerOrder.rotate(emptyContainer); - if (ram) { - return cell.references(startWordHash, true); - } - if (collections == null) return cell.references(startWordHash, false); - return new MergeIterator>( - cell.references(startWordHash, false), - collections.references(startWordHash, false), - containerOrder, - ReferenceContainer.containerMergeMethod, - true); - } - - public void clear() { - try { - cell.clear(); - } catch (IOException e1) { - e1.printStackTrace(); - } - if (collections != null) try { - collections.clear(); - } catch (IOException e) { - e.printStackTrace(); - } - } - - public void close() { - cell.close(); - if (collections != null) collections.close(); - } - - public int size() { - return (collections == null) ? cell.size() : java.lang.Math.max(collections.size(), cell.size()); - } - - public int minMem() { - return 1024*1024 /* indexing overhead */ + cell.minMem() + ((collections == null) ? 0 : collections.minMem()); - } - - - /* - * methods for cache management - */ - - public int getBufferMaxReferences() { - return cell.getBufferMaxReferences(); - } - - public long getBufferMinAge() { - return cell.getBufferMinAge(); - } - - public long getBufferMaxAge() { - return cell.getBufferMaxAge(); - } - - public long getBufferSizeBytes() { - return cell.getBufferSizeBytes(); - } - - public void setBufferMaxWordCount(final int maxWords) { - cell.setBufferMaxWordCount(maxWords); - } - - public int getBackendSize() { - return (collections == null) ? cell.getBackendSize() : collections.size(); - } - - public int getBufferSize() { - return cell.getBufferSize(); - } - - public ByteOrder ordering() { - return cell.ordering(); - } - - public CloneableIterator> references(byte[] startWordHash, boolean rot) { - final Order> containerOrder = new ReferenceContainerOrder(factory, this.cell.ordering().clone()); - if (this.collections == null) return this.cell.references(startWordHash, rot); - //else - return new MergeIterator>( - this.cell.references(startWordHash, false), - this.collections.references(startWordHash, false), - containerOrder, - ReferenceContainer.containerMergeMethod, - true); - } - - public void cleanupBuffer(int time) { - this.cell.cleanupBuffer(time); - } -} diff --git a/source/de/anomic/kelondro/text/ReferenceContainerArray.java b/source/de/anomic/kelondro/text/ReferenceContainerArray.java index e58049ac3..0f1a07e44 100644 --- a/source/de/anomic/kelondro/text/ReferenceContainerArray.java +++ b/source/de/anomic/kelondro/text/ReferenceContainerArray.java @@ -32,6 +32,7 @@ import java.util.List; import de.anomic.kelondro.blob.BLOB; import de.anomic.kelondro.blob.BLOBArray; +import de.anomic.kelondro.index.IntegerHandleIndex; import de.anomic.kelondro.index.Row; import de.anomic.kelondro.index.RowSet; import de.anomic.kelondro.order.ByteOrder; @@ -283,5 +284,44 @@ public final class ReferenceContainerArray { return donesomething; } + public static IntegerHandleIndex referenceHashes( + final File heapLocation, + final ReferenceFactory factory, + final ByteOrder termOrder, + final Row payloadrow) throws IOException { + + System.out.println("CELL REFERENCE COLLECTION startup"); + IntegerHandleIndex references = new IntegerHandleIndex(payloadrow.primaryKeyLength, termOrder, 0, 1000000); + String[] files = heapLocation.list(); + for (String f: files) { + if (f.length() < 22 && !f.startsWith("index") && !f.endsWith(".blob")) continue; + File fl = new File(heapLocation, f); + System.out.println("CELL REFERENCE COLLECTION opening blob " + fl); + CloneableIterator> ei = new ReferenceContainerCache.blobFileEntries(fl, factory, payloadrow); + + ReferenceContainer container; + final long start = System.currentTimeMillis(); + long lastlog = start - 27000; + int count = 0; + while (ei.hasNext()) { + container = ei.next(); + if (container == null) continue; + Iterator refi = container.entries(); + while (refi.hasNext()) { + references.inc(refi.next().metadataHash().getBytes(), 1); + } + count++; + // write a log + if (System.currentTimeMillis() - lastlog > 30000) { + System.out.println("CELL REFERENCE COLLECTION scanned " + count + " RWI index entries. "); + //Log.logInfo("COLLECTION INDEX REFERENCE COLLECTION", "scanned " + count + " RWI index entries. " + (((System.currentTimeMillis() - start) * (array.size() + array.free() - count) / count) / 60000) + " minutes remaining for this array"); + lastlog = System.currentTimeMillis(); + } + } + } + System.out.println("CELL REFERENCE COLLECTION finished"); + return references; + } + } diff --git a/source/de/anomic/kelondro/text/citationPrototype/CitationReferenceRow.java b/source/de/anomic/kelondro/text/citationPrototype/CitationReferenceRow.java index fc3111821..2402c31d6 100644 --- a/source/de/anomic/kelondro/text/citationPrototype/CitationReferenceRow.java +++ b/source/de/anomic/kelondro/text/citationPrototype/CitationReferenceRow.java @@ -34,7 +34,7 @@ import de.anomic.kelondro.order.MicroDate; import de.anomic.kelondro.text.Reference; import de.anomic.yacy.yacySeedDB; -public final class CitationReferenceRow /*implements Reference, Cloneable*/ { +public final class CitationReferenceRow implements Reference /*, Cloneable*/ { // this object stores citation attributes to URL references @@ -131,10 +131,10 @@ public final class CitationReferenceRow /*implements Reference, Cloneable*/ { return this.entry; } - public String urlHash() { + public String metadataHash() { return this.entry.getColString(col_urlhash, null); } - + public int virtualAge() { return (int) this.entry.getColLong(col_lastModified); // this is the time in MicoDateDays format } @@ -170,12 +170,6 @@ public final class CitationReferenceRow /*implements Reference, Cloneable*/ { public String toString() { return toPropertyForm(); } - - public boolean isNewer(final Reference other) { - if (other == null) return true; - if (this.lastModified() > other.lastModified()) return true; - return false; - } public boolean isOlder(final Reference other) { if (other == null) return false; @@ -184,6 +178,31 @@ public final class CitationReferenceRow /*implements Reference, Cloneable*/ { } public int hashCode() { - return this.urlHash().hashCode(); + return this.metadataHash().hashCode(); + } + + + public int distance() { + throw new UnsupportedOperationException(); + } + + public void join(Reference oe) { + throw new UnsupportedOperationException(); + } + + public int maxposition() { + throw new UnsupportedOperationException(); + } + + public int minposition() { + throw new UnsupportedOperationException(); + } + + public int position(int p) { + throw new UnsupportedOperationException(); + } + + public int positions() { + throw new UnsupportedOperationException(); } } diff --git a/source/de/anomic/plasma/parser/Condenser.java b/source/de/anomic/plasma/parser/Condenser.java index 425c5f400..bb93a69bd 100644 --- a/source/de/anomic/plasma/parser/Condenser.java +++ b/source/de/anomic/plasma/parser/Condenser.java @@ -538,6 +538,74 @@ public final class Condenser { } + /* + private static class unsievedWordsEnum implements Enumeration { + // returns an enumeration of StringBuilder Objects + StringBuilder buffer = null; + sentencesFromInputStreamEnum e; + StringBuilder s; + int off; + + public unsievedWordsEnum(final InputStream is) throws UnsupportedEncodingException { + e = new sentencesFromInputStreamEnum(is); + s = new StringBuilder(0); + off = 0; + buffer = nextElement0(); + } + + public void pre(final boolean x) { + e.pre(x); + } + + private StringBuilder nextElement0() { + StringBuilder r; + StringBuilder sb; + char c; + while (s.length() - off <= 0) { + if (e.hasNext()) { + r = e.next(); + if (r == null) return null; + r = trim(r); + sb = new StringBuilder(r.length() * 2); + for (int i = 0; i < r.length(); i++) { + c = r.charAt(i); + if (invisible(c)) sb = sb.append(' '); // TODO: Bugfix needed for UTF-8 + else if (htmlFilterContentScraper.punctuation(c)) sb = sb.append(' ').append(c).append(' '); + else sb = sb.append(c); + } + s = trim(sb); + off = 0; + //System.out.println("PARSING-LINE '" + r + "'->'" + s + "'"); + } else { + return null; + } + } + final int p = s.indexOf(" ", off); + if (p < 0) { + r = new StringBuilder(s.substring(off)); + s = new StringBuilder(0); + off = 0; + return r; + } + r = trim(new StringBuilder(s.substring(off, p))); + off = p + 1; + while (off < s.length() && s.charAt(off) <= ' ') off++; + return r; + } + + public boolean hasMoreElements() { + return buffer != null; + } + + public StringBuilder nextElement() { + final StringBuilder r = buffer; + buffer = nextElement0(); + return r; + } + + } + */ + private static class unsievedWordsEnum implements Enumeration { // returns an enumeration of StringBuilder Objects StringBuilder buffer = null; diff --git a/source/de/anomic/plasma/plasmaRankingCRProcess.java b/source/de/anomic/plasma/plasmaRankingCRProcess.java index 4b81ac122..be3b2973c 100644 --- a/source/de/anomic/plasma/plasmaRankingCRProcess.java +++ b/source/de/anomic/plasma/plasmaRankingCRProcess.java @@ -38,8 +38,9 @@ import de.anomic.kelondro.order.Bitfield; import de.anomic.kelondro.order.CloneableIterator; import de.anomic.kelondro.order.MicroDate; import de.anomic.kelondro.table.EcoTable; -import de.anomic.kelondro.text.IndexCollection; +import de.anomic.kelondro.text.IndexCell; import de.anomic.kelondro.text.ReferenceContainer; +import de.anomic.kelondro.text.citationPrototype.CitationReferenceRow; import de.anomic.kelondro.text.referencePrototype.WordReference; import de.anomic.kelondro.util.DateFormatter; import de.anomic.kelondro.util.MemoryControl; @@ -142,7 +143,7 @@ public class plasmaRankingCRProcess { return true; } - private static boolean accumulate_upd(final File f, final ObjectIndex acc, final IndexCollection seq) throws IOException { + private static boolean accumulate_upd(final File f, final ObjectIndex acc, final IndexCell seq) throws IOException { // open file AttrSeq source_cr = null; try { @@ -174,7 +175,7 @@ public class plasmaRankingCRProcess { Vita = (int) acc_entry.getColLong("Vita", 0); // update counters and dates - seq.put(key.getBytes(), new_entry.getSeqCollection()); // FIXME: old and new collection must be joined + //seq.add(key.getBytes(), new_entry.getSeqCollection()); UCount++; // increase update counter PCount += (new_flags.get(1)) ? 1 : 0; @@ -194,7 +195,7 @@ public class plasmaRankingCRProcess { for (int i = 1; i < acc.row().columns(); i++) { acc_entry.setCol(i, new_entry.getAttr(acc.row().column(i).nickname, 0)); } - seq.put(key.getBytes(), new_entry.getSeqCollection()); + //seq.put(key.getBytes(), new_entry.getSeqCollection()); FUDate = MicroDate.microDateHoursInt(System.currentTimeMillis()); // first update date FDDate = MicroDate.microDateHoursInt(System.currentTimeMillis()); // very difficult to compute; this is only a quick-hack LUDate = (int) new_entry.getAttr("VDate", 0); @@ -242,11 +243,16 @@ public class plasmaRankingCRProcess { // open target file AttrSeq acc = null; ObjectIndex newacc = null; - IndexCollection newseq = null; + IndexCell newseq = null; if (newdb) { final File path = to_file.getParentFile(); // path to storage place newacc = new EcoTable(new File(path, CRG_accname), CRG_accrow, EcoTable.tailCacheUsageAuto, 0, 0); - newseq = new IndexCollection(path, CRG_seqname, plasmaWordIndex.wordReferenceFactory, 12, Base64Order.enhancedCoder, 9, CRG_colrow, false); + newseq = new IndexCell( + path, + plasmaWordIndex.wordReferenceFactory, + Base64Order.enhancedCoder, + CRG_colrow, + 10000, 1000000000L, 20, null, 1000000); } else { if (!(to_file.exists())) { acc = new AttrSeq("Global Ranking Accumulator File", @@ -266,6 +272,7 @@ public class plasmaRankingCRProcess { // open file source_file = new File(from_dir, files[i]); if (newdb) { + /* if (accumulate_upd(source_file, newacc, newseq)) { // move CR file to temporary folder source_file.renameTo(new File(tmp_dir, files[i])); @@ -273,6 +280,7 @@ public class plasmaRankingCRProcess { // error case: the CR-file is not valid; move to error path source_file.renameTo(new File(err_dir, files[i])); } + */ } else { if (accumulate_upd(source_file, acc)) { // move CR file to temporary folder @@ -374,8 +382,10 @@ public class plasmaRankingCRProcess { public static int genrcix(final File cr_path_in, final File rci_path_out) throws IOException { //kelondroFlexTable acc = new kelondroFlexTable(cr_path_in, CRG_accname, kelondroBase64Order.enhancedCoder, 128 * 1024 * 1024, -1, CRG_accrow, true); - final IndexCollection seq = new IndexCollection(cr_path_in, CRG_seqname, plasmaWordIndex.wordReferenceFactory, 12, Base64Order.enhancedCoder, 9, CRG_colrow, false); - final IndexCollection rci = new IndexCollection(rci_path_out, RCI_colname, plasmaWordIndex.wordReferenceFactory, 6, Base64Order.enhancedCoder, 9, RCI_coli, false); + final IndexCell seq = new IndexCell( + cr_path_in, plasmaWordIndex.wordReferenceFactory, Base64Order.enhancedCoder, CRG_colrow, 10000, 1000000000L, 20, null, 1000000); + final IndexCell rci = new IndexCell( + rci_path_out, plasmaWordIndex.wordReferenceFactory, Base64Order.enhancedCoder, RCI_coli, 10000, 1000000000L, 20, null, 1000000); // loop over all referees int count = 0; @@ -406,7 +416,7 @@ public class plasmaRankingCRProcess { rci_entry.add(refereeDom.getBytes()); // insert entry - rci.put(anchorDom.getBytes(), rci_entry); + //rci.put(anchorDom.getBytes(), rci_entry); } count++; if ((count % 1000) == 0) { diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index a3e0f3205..e6eafe252 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -46,7 +46,6 @@ import de.anomic.kelondro.order.Base64Order; import de.anomic.kelondro.order.ByteOrder; import de.anomic.kelondro.text.BufferedIndex; import de.anomic.kelondro.text.IndexCell; -import de.anomic.kelondro.text.IndexCollectionMigration; import de.anomic.kelondro.text.ReferenceContainer; import de.anomic.kelondro.text.IODispatcher; import de.anomic.kelondro.text.MetadataRepository; @@ -151,37 +150,18 @@ public final class plasmaWordIndex { } } - // check if the peer has migrated the index - if (new File(indexPrimaryTextLocation, "RICOLLECTION").exists()) { - this.merger = new IODispatcher(plasmaWordIndex.wordReferenceFactory, 1, 1, writeBufferSize); - if (this.merger != null) this.merger.start(); - this.index = new IndexCollectionMigration( - indexPrimaryTextLocation, - wordReferenceFactory, - wordOrder, - WordReferenceRow.urlEntryRow, - entityCacheMaxSize, - targetFileSize, - maxFileSize, - this.merger, - writeBufferSize, - log); - } else { - this.merger = new IODispatcher(plasmaWordIndex.wordReferenceFactory, 1, 1, writeBufferSize); - this.merger.start(); - this.index = new IndexCell( - new File(indexPrimaryTextLocation, "RICELL"), - wordReferenceFactory, - wordOrder, - WordReferenceRow.urlEntryRow, - entityCacheMaxSize, - targetFileSize, - maxFileSize, - this.merger, - writeBufferSize); - } - - + this.merger = new IODispatcher(plasmaWordIndex.wordReferenceFactory, 1, 1, writeBufferSize); + this.merger.start(); + this.index = new IndexCell( + new File(indexPrimaryTextLocation, "RICELL"), + wordReferenceFactory, + wordOrder, + WordReferenceRow.urlEntryRow, + entityCacheMaxSize, + targetFileSize, + maxFileSize, + this.merger, + writeBufferSize); // migrate LURL-db files into new subdirectory METADATA File textdir = new File(this.secondaryRoot, "TEXT");