From db1da3345d53d6d789cdb986ba66423d5bdb4d53 Mon Sep 17 00:00:00 2001 From: orbiter Date: Sun, 8 May 2005 14:42:09 +0000 Subject: [PATCH] introduced singleton-database git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@92 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- makerelease.sh | 4 +- .../kelondro/kelondroMergeIterator.java | 108 ++++++ source/de/anomic/kelondro/kelondroTree.java | 286 ++++++++------- source/de/anomic/plasma/plasmaWordIndex.java | 2 +- .../anomic/plasma/plasmaWordIndexCache.java | 327 +++++++++++++----- .../anomic/plasma/plasmaWordIndexEntry.java | 3 + .../plasma/plasmaWordIndexEntryContainer.java | 23 +- 7 files changed, 543 insertions(+), 210 deletions(-) create mode 100644 source/de/anomic/kelondro/kelondroMergeIterator.java diff --git a/makerelease.sh b/makerelease.sh index 8b3b7d41c..6e616b041 100755 --- a/makerelease.sh +++ b/makerelease.sh @@ -45,7 +45,7 @@ # Contributions and changes to the program code must be marked as such. # define variables -version='0.371' +version='0.372' datestr=`date +%Y%m%d` #release='yacy_v'$version'_'$datestr release='yacy_dev_v'$version'_'$datestr @@ -64,7 +64,7 @@ echo "[`date +%Y/%m/%d\ %H:%M:%S`] Building yacy version $version - $datestr ... classpath="$classes" for N in `ls -1 lib/*.jar`; do classpath="$classpath:$N"; done for N in `ls -1 libx/*.jar`; do classpath="$classpath:$N"; done -echo "[`date +%Y/%m/%d\ %H:%M:%S`] Unsing classpath: $classpath" +echo "[`date +%Y/%m/%d\ %H:%M:%S`] Using classpath: $classpath" #classpath='$classes:lib/commons-collections.jar:lib/commons-pool-1.2.jar:libx/PDFBox-0.7.1.jar:libx/log4j-1.2.9.jar:libx/tm-extractors-0.4.jar' diff --git a/source/de/anomic/kelondro/kelondroMergeIterator.java b/source/de/anomic/kelondro/kelondroMergeIterator.java new file mode 100644 index 000000000..46fdcccb8 --- /dev/null +++ b/source/de/anomic/kelondro/kelondroMergeIterator.java @@ -0,0 +1,108 @@ +// kelondroMergeIterator.java +// -------------------------- +// part of The Kelondro Database +// (C) by Michael Peter Christen; mc@anomic.de +// first published on http://www.anomic.de +// Frankfurt, Germany, 2005 +// last major change: 08.05.2005 +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +// Using this software in any meaning (reading, learning, copying, compiling, +// running) means that you agree that the Author(s) is (are) not responsible +// for cost, loss of data or any harm that may be caused directly or indirectly +// by usage of this softare or this documentation. The usage of this software +// is on your own risk. 
The installation and usage (starting/running) of this +// software may allow other people or application to access your computer and +// any attached devices and is highly dependent on the configuration of the +// software which must be done by the user of the software; the author(s) is +// (are) also not responsible for proper configuration and usage of the +// software, even if provoked by documentation provided together with +// the software. +// +// Any changes to this file according to the GPL as documented in the file +// gpl.txt aside this file in the shipment you received can be done to the +// lines that follows this copyright notice here, but changes must not be +// done inside the copyright notive above. A re-distribution must contain +// the intact and unchanged copyright notice. +// Contributions and changes to the program code must be marked as such. + +package de.anomic.kelondro; + +import java.util.Iterator; +import java.util.Comparator; + +public class kelondroMergeIterator implements Iterator { + + Comparator comp; + Iterator a, b; + String na, nb; + boolean up; + + public kelondroMergeIterator(Iterator a, Iterator b, boolean up) { + // this works currently only for String-type key iterations + this.a = a; + this.b = b; + this.up = up; + this.comp = kelondroMSetTools.fastStringComparator(up); + nexta(); + nextb(); + } + + private void nexta() { + if (a.hasNext()) na = (String) a.next(); else na = null; + } + private void nextb() { + if (b.hasNext()) nb = (String) b.next(); else nb = null; + } + + public boolean hasNext() { + return (na != null) || (nb != null); + } + + public Object next() { + String s; + if (na == null) { + s = nb; + nextb(); + return s; + } + if (nb == null) { + s = na; + nexta(); + return s; + } + // compare the strings + int c = comp.compare(na, nb); + if (c == 0) { + s = na; + nexta(); + nextb(); + return s; + } else if ((up) && (c < 0)) { + s = na; + nexta(); + return s; + } else { + s = nb; + nextb(); + return s; + } + } + + public void remove() { + throw new java.lang.UnsupportedOperationException("merge does not support remove"); + } +} diff --git a/source/de/anomic/kelondro/kelondroTree.java b/source/de/anomic/kelondro/kelondroTree.java index 83b087e0e..2105efe5d 100644 --- a/source/de/anomic/kelondro/kelondroTree.java +++ b/source/de/anomic/kelondro/kelondroTree.java @@ -85,13 +85,13 @@ public class kelondroTree extends kelondroRecords implements Comparator { } public kelondroTree(File file, long buffersize, int[] columns) throws IOException { - // this creates a new tree + // this creates a new tree file this(file, buffersize, columns, columns.length /*txtProps*/, 80 /*txtPropWidth*/); } public kelondroTree(File file, long buffersize, int[] columns, int txtProps, int txtPropsWidth) throws IOException { - // this creates a new tree + // this creates a new tree file super(file, buffersize, thisOHBytes, thisOHHandles, columns, thisFHandles, columns.length /*txtProps*/, 80 /*txtPropWidth*/); @@ -99,23 +99,23 @@ public class kelondroTree extends kelondroRecords implements Comparator { } public kelondroTree(kelondroRA ra, long buffersize, int[] columns) throws IOException { - // this creates a new tree + // this creates a new tree within a kelondroRA this(ra, buffersize, columns, columns.length /*txtProps*/, 80 /*txtPropWidth*/); } public kelondroTree(kelondroRA ra, long buffersize, int[] columns, int txtProps, int txtPropsWidth) throws IOException { - // this creates a new tree + // this creates a new tree within a kelondroRA super(ra, buffersize, 
thisOHBytes, thisOHHandles, columns, thisFHandles, txtProps, txtPropsWidth); setHandle(root, null); // define the root value } public kelondroTree(File file, long buffersize) throws IOException{ - // this opens a file with an existing tree + // this opens a file with an existing tree file super(file, buffersize); } public kelondroTree(kelondroRA ra, long buffersize) throws IOException{ - // this opens a file with an existing tree + // this opens a file with an existing tree in a kelondroRA super(ra, buffersize); } @@ -277,117 +277,6 @@ public class kelondroTree extends kelondroRecords implements Comparator { return (lc.equals(childn.handle())); } - private class nodeIterator implements Iterator { - // we implement an iteration! (not a recursive function as the structure would suggest...) - // the iterator iterates Handle objects - Node nextNode = null; - boolean up, rot; - LinkedList nodeStack; - int count; - - public nodeIterator(boolean up, boolean rotating) throws IOException { - this(up, rotating, (up) ? firstNode() : lastNode()); - } - - public nodeIterator(boolean up, boolean rotating, Node start) throws IOException { - this.count = 0; - this.up = up; - this.rot = rotating; - this.nextNode = start; - - // fill node stack for start node - nodeStack = new LinkedList(); - - Handle searchHandle = getHandle(root); - if (searchHandle == null) {nextNode = null; return;} - - Node searchNode = getNode(searchHandle, null, 0); - byte[] startKey = start.getKey(); - int c, ct; - while ((c = compare(startKey, searchNode.getKey())) != 0) { - // the current 'thisNode' is not the start node, put it on the stack - ct = (c < 0) ? leftchild : rightchild; - nodeStack.addLast(new Object[]{searchNode, new Integer(ct)}); - - // go to next node - searchHandle = searchNode.getOHHandle()[ct]; - if (searchHandle == null) throw new kelondroException(filename, "start node does not exist (handle null)"); - searchNode = getNode(searchHandle, searchNode, ct); - if (searchNode == null) throw new kelondroException(filename, "start node does not exist (node null)"); - } - // now every parent node to the start node is on the stack - } - - public boolean hasNext() { - return nextNode != null; - } - - public Object next() { - count++; - if (nextNode == null) throw new kelondroException(filename, "no more entries available"); - if (count > size()) throw new kelondroException(filename, "internal loopback; database corrupted"); - Object ret = nextNode; - - // middle-case - - try { - int childtype = (up) ? rightchild : leftchild; - Handle childHandle = nextNode.getOHHandle()[childtype]; - if (childHandle != null) { - //System.out.println("go to other leg, stack size=" + nodeStack.size()); - // we have walked one leg of the tree; now go to the other one: step down to next child - nodeStack.addLast(new Object[]{nextNode, new Integer(childtype)}); - nextNode = getNode(childHandle, nextNode, childtype); - childtype = (up) ? leftchild : rightchild; - while ((childHandle = nextNode.getOHHandle()[childtype]) != null) { - try { - nodeStack.addLast(new Object[]{nextNode, new Integer(childtype)}); - nextNode = getNode(childHandle, nextNode, childtype); - } catch (IllegalArgumentException e) { - // return what we have - nodeStack.removeLast(); - return ret; - } - } - // thats it: we are at a place where we can't go further - // nextNode is correct - } else { - //System.out.println("go up"); - // we have walked along both legs of the child-trees. - - // Now step up. 
- if (nodeStack.size() == 0) { - nextNode = null; - } else { - Object[] stacktop; - Node parent = null; - int parentpointer = (up) ? rightchild : leftchild; - while ((nodeStack.size() != 0) && (parentpointer == ((up) ? rightchild : leftchild))) { - //System.out.println("step up"); - // go on, walk up further - stacktop = (Object[]) nodeStack.removeLast(); // top of stack: Node/parentpointer pair - parent = (Node) stacktop[0]; - parentpointer = ((Integer) stacktop[1]).intValue(); - } - if ((nodeStack.size() == 0) && (parentpointer == ((up) ? rightchild : leftchild))) { - nextNode = null; - } else { - nextNode = parent; - } - } - } - } catch (IOException e) { - nextNode = null; - } - - return ret; - } - - public void remove() { - throw new java.lang.UnsupportedOperationException("kelondroTree: remove in kelondro Tables not yet supported"); - } - } - public long[] putLong(byte[] key, long[] newlongs) throws IOException { byte[][] newrow = new byte[newlongs.length + 1][]; newrow[0] = key; @@ -847,9 +736,120 @@ public class kelondroTree extends kelondroRecords implements Comparator { } } + private class nodeIterator implements Iterator { + // we implement an iteration! (not a recursive function as the structure would suggest...) + // the iterator iterates Node objects + Node nextNode = null; + boolean up, rot; + LinkedList nodeStack; + int count; + + public nodeIterator(boolean up, boolean rotating) throws IOException { + this(up, rotating, (up) ? firstNode() : lastNode()); + } + + public nodeIterator(boolean up, boolean rotating, Node start) throws IOException { + this.count = 0; + this.up = up; + this.rot = rotating; + this.nextNode = start; + + // fill node stack for start node + nodeStack = new LinkedList(); + + Handle searchHandle = getHandle(root); + if (searchHandle == null) {nextNode = null; return;} + + Node searchNode = getNode(searchHandle, null, 0); + byte[] startKey = start.getKey(); + int c, ct; + while ((c = compare(startKey, searchNode.getKey())) != 0) { + // the current 'thisNode' is not the start node, put it on the stack + ct = (c < 0) ? leftchild : rightchild; + nodeStack.addLast(new Object[]{searchNode, new Integer(ct)}); + + // go to next node + searchHandle = searchNode.getOHHandle()[ct]; + if (searchHandle == null) throw new kelondroException(filename, "start node does not exist (handle null)"); + searchNode = getNode(searchHandle, searchNode, ct); + if (searchNode == null) throw new kelondroException(filename, "start node does not exist (node null)"); + } + // now every parent node to the start node is on the stack + } + + public boolean hasNext() { + return nextNode != null; + } + + public Object next() { + count++; + if (nextNode == null) throw new kelondroException(filename, "no more entries available"); + if ((count > size()) && (!(rot))) throw new kelondroException(filename, "internal loopback; database corrupted"); + Object ret = nextNode; + + // middle-case + + try { + int childtype = (up) ? rightchild : leftchild; + Handle childHandle = nextNode.getOHHandle()[childtype]; + if (childHandle != null) { + //System.out.println("go to other leg, stack size=" + nodeStack.size()); + // we have walked one leg of the tree; now go to the other one: step down to next child + nodeStack.addLast(new Object[]{nextNode, new Integer(childtype)}); + nextNode = getNode(childHandle, nextNode, childtype); + childtype = (up) ? 
leftchild : rightchild; + while ((childHandle = nextNode.getOHHandle()[childtype]) != null) { + try { + nodeStack.addLast(new Object[]{nextNode, new Integer(childtype)}); + nextNode = getNode(childHandle, nextNode, childtype); + } catch (IllegalArgumentException e) { + // return what we have + nodeStack.removeLast(); + return ret; + } + } + // thats it: we are at a place where we can't go further + // nextNode is correct + } else { + //System.out.println("go up"); + // we have walked along both legs of the child-trees. + + // Now step up. + if (nodeStack.size() == 0) { + nextNode = null; + } else { + Object[] stacktop; + Node parent = null; + int parentpointer = (up) ? rightchild : leftchild; + while ((nodeStack.size() != 0) && (parentpointer == ((up) ? rightchild : leftchild))) { + //System.out.println("step up"); + // go on, walk up further + stacktop = (Object[]) nodeStack.removeLast(); // top of stack: Node/parentpointer pair + parent = (Node) stacktop[0]; + parentpointer = ((Integer) stacktop[1]).intValue(); + } + if ((nodeStack.size() == 0) && (parentpointer == ((up) ? rightchild : leftchild))) { + nextNode = null; + } else { + nextNode = parent; + } + } + } + } catch (IOException e) { + nextNode = null; + } + + return ret; + } + + public void remove() { + throw new java.lang.UnsupportedOperationException("kelondroTree: remove in kelondro Tables not yet supported"); + } + } + public synchronized rowIterator rows(boolean up, boolean rotating) throws IOException { - // iterates only the keys of the Nodes - // enumerated objects are of type byte[] + // iterates the rows of the Nodes + // enumerated objects are of type byte[][] // iterates the elements in a sorted way. return new rowIterator(new nodeIterator(up, rotating)); } @@ -895,6 +895,54 @@ public class kelondroTree extends kelondroRecords implements Comparator { } + public synchronized keyIterator keys(boolean up, boolean rotating) throws IOException { + // iterates only the keys of the Nodes + // enumerated objects are of type String + // iterates the elements in a sorted way. 
+ return new keyIterator(new nodeIterator(up, rotating)); + } + + public synchronized Iterator keys(boolean up, boolean rotating, byte[] firstKey) throws IOException { + Search s = new Search(firstKey); + if (s.found()) { + return new keyIterator(new nodeIterator(up, rotating, s.getMatcher())); + } else { + Node nn = s.getParent(); + if (nn == null) { + return (Iterator) (new HashSet()).iterator(); + } else { + return new keyIterator(new nodeIterator(up, rotating, nn)); + } + } + } + + public class keyIterator implements Iterator { + + Iterator nodeIterator; + + public keyIterator(Iterator nodeIterator) { + this.nodeIterator = nodeIterator; + } + + public boolean hasNext() { + return (nodeIterator.hasNext()); + } + + public Object next() { + try { + Node nextNode = (Node) nodeIterator.next(); + if (nextNode == null) throw new kelondroException(filename, "no more elements available"); + return new String(nextNode.getKey()); + } catch (IOException e) { + throw new kelondroException(filename, "io-error: " + e.getMessage()); + } + } + + public void remove() { + } + + } + public int imp(File file, String separator) throws IOException { // imports a value-separated file, returns number of records that have been read diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index 0d9bb1aea..fd7a70c7a 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -65,7 +65,7 @@ public class plasmaWordIndex { public plasmaWordIndex(File databaseRoot, int bufferkb, serverLog log) throws IOException { this.databaseRoot = databaseRoot; plasmaWordIndexClassicDB fileDB = new plasmaWordIndexClassicDB(databaseRoot, log); - this.ramCache = new plasmaWordIndexCache(databaseRoot, fileDB, log); + this.ramCache = new plasmaWordIndexCache(databaseRoot, fileDB, 1000000, log); } public int maxURLinWordCache() { diff --git a/source/de/anomic/plasma/plasmaWordIndexCache.java b/source/de/anomic/plasma/plasmaWordIndexCache.java index 8594e423f..f2be090d5 100644 --- a/source/de/anomic/plasma/plasmaWordIndexCache.java +++ b/source/de/anomic/plasma/plasmaWordIndexCache.java @@ -51,10 +51,17 @@ import de.anomic.yacy.yacySeedDB; public class plasmaWordIndexCache implements plasmaWordIndexInterface { + // environment constants private static final String indexDumpFileName = "indexDump0.stack"; + private static final String singletonFileName = "indexSingletons0.db"; + private static final int[] bufferStructure = new int[]{ + plasmaWordIndexEntry.wordHashLength, // a wordHash + 4, // occurrence counter + 8, // timestamp of last access + plasmaWordIndexEntry.urlHashLength, // corresponding URL hash + plasmaWordIndexEntry.attrSpaceLong // URL attributes + }; - static String minKey, maxKey; - // class variables private File databaseRoot; private plasmaWordIndexInterface backend; @@ -63,7 +70,11 @@ public class plasmaWordIndexCache implements plasmaWordIndexInterface { private HashMap hashDate; private int maxWords; private serverLog log; + private kelondroTree singletons; + private long singletonBufferSize; + // calculated constants + private static String minKey, maxKey; static { maxKey = ""; for (int i = 0; i < yacySeedDB.commonHashLength; i++) maxKey += 'z'; @@ -71,14 +82,38 @@ public class plasmaWordIndexCache implements plasmaWordIndexInterface { for (int i = 0; i < yacySeedDB.commonHashLength; i++) maxKey += '-'; } - public plasmaWordIndexCache(File databaseRoot, plasmaWordIndexInterface backend, serverLog log) { + public 
plasmaWordIndexCache(File databaseRoot, plasmaWordIndexInterface backend, long singletonBufferSize, serverLog log) { + // creates a new index cache + // the cache has a back-end where indexes that do not fit in the cache are flushed this.databaseRoot = databaseRoot; + this.singletonBufferSize = singletonBufferSize; this.cache = new TreeMap(); this.hashScore = new kelondroMScoreCluster(); this.hashDate = new HashMap(); this.maxWords = 10000; this.backend = backend; this.log = log; + File singletonFile = new File(databaseRoot, singletonFileName); + if (singletonFile.exists()) { + // open existing singeton tree file + try { + singletons = new kelondroTree(singletonFile, singletonBufferSize); + log.logSystem("Opened Singleton Database, " + singletons.size() + " entries."); + } catch (IOException e){ + log.logError("unable to open singleton database: " + e.getMessage()); + e.printStackTrace(); + } + } else { + // create new sigleton tree file + try { + singletons = new kelondroTree(singletonFile, singletonBufferSize, bufferStructure); + log.logSystem("Created new Singleton Database"); + } catch (IOException e){ + log.logError("unable to create singleton database: " + e.getMessage()); + e.printStackTrace(); + } + } + // read in dump of last session try { restore(); } catch (IOException e){ @@ -91,7 +126,7 @@ public class plasmaWordIndexCache implements plasmaWordIndexInterface { log.logSystem("creating dump for index cache, " + cache.size() + " words (and much more urls)"); File indexDumpFile = new File(databaseRoot, indexDumpFileName); if (indexDumpFile.exists()) indexDumpFile.delete(); - kelondroStack dumpStack = new kelondroStack(indexDumpFile, 0, new int[]{plasmaWordIndexEntry.wordHashLength, 4, 8, plasmaWordIndexEntry.wordHashLength, plasmaWordIndexEntry.attrSpaceLong}); + kelondroStack dumpStack = new kelondroStack(indexDumpFile, 0, bufferStructure); long startTime = System.currentTimeMillis(); long messageTime = System.currentTimeMillis() + 5000; long wordsPerSecond = 0, wordcount = 0, urlcount = 0; @@ -151,19 +186,18 @@ public class plasmaWordIndexCache implements plasmaWordIndexInterface { synchronized (cache) { Iterator i = dumpStack.iterator(); kelondroRecords.Node node; - String wordHash, urlHash; + String wordHash; plasmaWordIndexEntryContainer container; long creationTime; plasmaWordIndexEntry wordEntry; - byte[][] row = new byte[4][]; + byte[][] row; while (i.hasNext()) { // get out one entry node = (kelondroRecords.Node) i.next(); row = node.getValues(); wordHash = new String(row[0]); creationTime = kelondroRecords.bytes2long(row[2]); - urlHash = new String(row[3]); - wordEntry = new plasmaWordIndexEntry(urlHash, new String(row[4])); + wordEntry = new plasmaWordIndexEntry(new String(row[3]), new String(row[4])); // store to cache addEntry(wordHash, wordEntry, creationTime); @@ -180,6 +214,99 @@ public class plasmaWordIndexCache implements plasmaWordIndexInterface { log.logSystem("restored " + cache.size() + " words in " + ((System.currentTimeMillis() - startTime) / 1000) + " seconds"); return urlCount; } + + // singleton access methods + + private void storeSingleton(String wordHash, plasmaWordIndexEntry entry, long creationTime) { + // stores a word index to singleton database + // this throws an exception if the word hash already existed + //log.logDebug("storeSingleton: wordHash=" + wordHash + ", urlHash=" + entry.getUrlHash() + ", time=" + creationTime); + byte[][] row = new byte[5][]; + row[0] = wordHash.getBytes(); + row[1] = kelondroRecords.long2bytes(1, 4); + row[2] = 
kelondroRecords.long2bytes(creationTime, 8); + row[3] = entry.getUrlHash().getBytes(); + row[4] = entry.toEncodedForm(true).getBytes(); + byte[][] oldrow = null; + try { + oldrow = singletons.put(row); + } catch (IOException e) { + log.logFailure("storeSingleton/IO-error: " + e.getMessage() + " - reset singleton-DB"); + e.printStackTrace(); + resetSingletonDatabase(); + } catch (kelondroException e) { + log.logFailure("storeSingleton/kelondro-error: " + e.getMessage() + " - reset singleton-DB"); + e.printStackTrace(); + resetSingletonDatabase(); + } + if (oldrow != null) throw new RuntimeException("Store to singleton ambiguous"); + } + + public Object[] /*{plasmaWordIndexEntry, Long(creationTime)}*/ readSingleton(String wordHash) { + // returns a single word index from singleton database; returns null if index does not exist + //log.logDebug("readSingleton: wordHash=" + wordHash); + byte[][] row = null; + try { + row = singletons.get(wordHash.getBytes()); + } catch (IOException e) { + log.logFailure("readSingleton/IO-error: " + e.getMessage() + " - reset singleton-DB"); + e.printStackTrace(); + resetSingletonDatabase(); + } catch (kelondroException e) { + log.logFailure("readSingleton/kelondro-error: " + e.getMessage() + " - reset singleton-DB"); + e.printStackTrace(); + resetSingletonDatabase(); + } + if (row == null) return null; + long creationTime = kelondroRecords.bytes2long(row[2]); + plasmaWordIndexEntry wordEntry = new plasmaWordIndexEntry(new String(row[3]), new String(row[4])); + return new Object[]{wordEntry, new Long(creationTime)}; + } + + private void removeSingleton(String wordHash) { + // deletes a word index from singleton database + //log.logDebug("removeSingleton: wordHash=" + wordHash); + byte[][] row = null; + try { + row = singletons.remove(wordHash.getBytes()); + } catch (IOException e) { + log.logFailure("removeSingleton/IO-error: " + e.getMessage() + " - reset singleton-DB"); + e.printStackTrace(); + resetSingletonDatabase(); + } catch (kelondroException e) { + log.logFailure("removeSingleton/kelondro-error: " + e.getMessage() + " - reset singleton-DB"); + e.printStackTrace(); + resetSingletonDatabase(); + } + } + + private void resetSingletonDatabase() { + // deletes the singleton database and creates a new one + try { + singletons.close(); + } catch (IOException e) {} + File singletonFile = new File(databaseRoot, singletonFileName); + if (!(singletonFile.delete())) throw new RuntimeException("cannot delete singleton database"); + try { + singletons = new kelondroTree(singletonFile, singletonBufferSize, bufferStructure); + } catch (IOException e){ + log.logError("unable to re-create singleton database: " + e.getMessage()); + e.printStackTrace(); + } + } + + public Iterator singletonHashes(String startWordHash, boolean up, boolean rot) { + try { + return singletons.keys(up, rot, startWordHash.getBytes()); + } catch (IOException e) { + log.logFailure("iterateSingleton/IO-error: " + e.getMessage() + " - reset singleton-DB"); + e.printStackTrace(); + resetSingletonDatabase(); + return null; + } + } + + // cache settings public int maxURLinWordCache() { return hashScore.getScore(hashScore.getMaxObject()); @@ -194,92 +321,90 @@ public class plasmaWordIndexCache implements plasmaWordIndexInterface { } public int size() { - if (backend.size() > cache.size()) return backend.size(); else return cache.size(); + return java.lang.Math.max(singletons.size(), java.lang.Math.max(backend.size(), cache.size())); } public Iterator wordHashes(String startWordHash, boolean up) { + // 
here we merge 3 databases into one view: + // - the RAM Cache + // - the singleton File Cache + // - the backend if (!(up)) throw new RuntimeException("plasmaWordIndexCache.wordHashes can only count up"); - return new iterateCombined(cache.keySet().iterator(), backend.wordHashes(startWordHash, true), true); - } - - public class iterateCombined implements Iterator { - - Comparator comp; - Iterator a, b; - String na, nb; - boolean up; - - public iterateCombined(Iterator a, Iterator b, boolean up) { - this.a = a; - this.b = b; - this.up = up; - this.comp = kelondroMSetTools.fastStringComparator(up); - nexta(); - nextb(); - } - - private void nexta() { - if (a.hasNext()) na = (String) a.next(); else na = null; - } - private void nextb() { - if (b.hasNext()) nb = (String) b.next(); else nb = null; - } - - public boolean hasNext() { - return (na != null) || (nb != null); - } - - public Object next() { - String s; - if (na == null) { - s = nb; - nextb(); - return s; - } - if (nb == null) { - s = na; - nexta(); - return s; - } - // compare the strings - int c = comp.compare(na, nb); - if (c == 0) { - s = na; - //System.out.println("Iterate Hash: take " + s + " from file&cache"); - nexta(); - nextb(); - return s; - } else if ((up) && (c < 0)) { - s = na; - nexta(); - return s; - } else { - s = nb; - nextb(); - return s; - } - } - - public void remove() { - - } + return new kelondroMergeIterator( + new kelondroMergeIterator( + cache.keySet().iterator(), + singletonHashes(startWordHash, true, false), + true), + backend.wordHashes(startWordHash, true), + true); } - private int flushKey(String key) { + private int flushFromMem(String key) { plasmaWordIndexEntryContainer container = null; long time; synchronized (cache) { + // get the container container = (plasmaWordIndexEntryContainer) cache.get(key); if (container == null) return 0; // flushing of nonexisting key time = getCreationTime(key); - cache.remove(key); + + // remove it from the cache + cache.remove(key); hashScore.deleteScore(key); hashDate.remove(key); } - return backend.addEntries(container, time); + // now decide where to flush that container + Object[] singleton = readSingleton(key); + if (singleton == null) { + if (container.size() == 1) { + // store to singleton + storeSingleton(key, container.getOne(), time); + return 1; + } else { + // store to back-end + return backend.addEntries(container, time); + } + } else { + // we have a singleton and need to integrate this in the flush + plasmaWordIndexEntry oldEntry = (plasmaWordIndexEntry) singleton[0]; + long oldTime = ((Long) singleton[1]).longValue(); + if (container.contains(oldEntry.getUrlHash())) { + // we have an double-occurrence + if (container.size() == 1) { + // it is superfluous to flush this, simple do nothing + return 0; + } else { + // we flush to the backend, but remove the entry from the singletons + removeSingleton(key); + return backend.addEntries(container, java.lang.Math.max(time, oldTime)); + } + } else { + // now we have more than one entry, + // we must remove the key from the singleton database + removeSingleton(key); + // add this to the backend + container.add(oldEntry); + return backend.addEntries(container, java.lang.Math.max(time, oldTime)); + } + } + } + + private boolean flushFromSingleton(String key) { + Object[] singleton = readSingleton(key); + if (singleton == null) { + return false; + } else { + // we have a singleton + plasmaWordIndexEntry entry = (plasmaWordIndexEntry) singleton[0]; + long time = ((Long) singleton[1]).longValue(); + // remove it from 
the singleton database + removeSingleton(key); + // integrate it to the backend + return backend.addEntries(plasmaWordIndexEntryContainer.instantContainer(key, entry), time) > 0; + } } - private int flushToLimit() { + private int flushFromMemToLimit() { if ((hashScore.size() == 0) && (cache.size() == 0)) { serverLog.logDebug("PLASMA INDEXING", "flushToLimit: called but cache is empty"); return 0; @@ -299,25 +424,47 @@ public class plasmaWordIndexCache implements plasmaWordIndexInterface { String key; int count; Long createTime; - while (hashScore.size() >= maxWords) { + + // flush high-scores + while ((total < 100) && (hashScore.size() >= maxWords)) { key = (String) hashScore.getMaxObject(); createTime = (Long) hashDate.get(key); count = hashScore.getScore(key); + if (count < 5) { + log.logWarning("flushing of high-key " + key + " not appropriate (too less entries, count=" + count + "): increase cache size"); + break; + } + if ((createTime != null) && ((System.currentTimeMillis() - createTime.longValue()) < 9000)) { + //log.logDebug("high-key " + key + " is too fresh, interrupting flush (count=" + count + ", cachesize=" + cache.size() + ", singleton-size=" + singletons.size() + ")"); + break; + } + //log.logDebug("flushing high-key " + key + ", count=" + count + ", cachesize=" + cache.size() + ", singleton-size=" + singletons.size()); + total += flushFromMem(key); + } + + // flush singletons + while ((total < 200) && (hashScore.size() >= maxWords)) { + key = (String) hashScore.getMinObject(); + createTime = (Long) hashDate.get(key); + count = hashScore.getScore(key); + if (count > 1) { + //log.logDebug("flush of singleton-key " + key + ": count too high (count=" + count + ")"); + break; + } if ((createTime != null) && ((System.currentTimeMillis() - createTime.longValue()) < 9000)) { - log.logDebug("key " + key + " is too fresh, abandon flush (count=" + count + ", cachesize=" + cache.size() + ")"); + //log.logDebug("singleton-key " + key + " is too fresh, interruptiong flush (count=" + count + ", cachesize=" + cache.size() + ", singleton-size=" + singletons.size() + ")"); break; } - if (count < 5) log.logWarning("flushing of key " + key + " not appropriate (too less entries, count=" + count + "): increase cache size"); - log.logDebug("flushing key " + key + ", count=" + count + ", cachesize=" + cache.size()); - total += flushKey(key); - if (total > 100) break; + //log.logDebug("flushing singleton-key " + key + ", count=" + count + ", cachesize=" + cache.size() + ", singleton-size=" + singletons.size()); + total += flushFromMem(key); } } return total; } public plasmaWordIndexEntity getIndex(String wordHash, boolean deleteIfEmpty) { - flushKey(wordHash); + flushFromMem(wordHash); + flushFromSingleton(wordHash); return backend.getIndex(wordHash, deleteIfEmpty); } @@ -333,17 +480,19 @@ public class plasmaWordIndexCache implements plasmaWordIndexInterface { hashScore.deleteScore(wordHash); hashDate.remove(wordHash); } + removeSingleton(wordHash); backend.deleteIndex(wordHash); } public int removeEntries(String wordHash, String[] urlHashes, boolean deleteComplete) { - flushKey(wordHash); + flushFromMem(wordHash); + flushFromSingleton(wordHash); return backend.removeEntries(wordHash, urlHashes, deleteComplete); } public int addEntries(plasmaWordIndexEntryContainer container, long creationTime) { //serverLog.logDebug("PLASMA INDEXING", "addEntryToIndexMem: cache.size=" + cache.size() + "; hashScore.size=" + hashScore.size()); - flushToLimit(); + flushFromMemToLimit(); //if (flushc > 0) 
serverLog.logDebug("PLASMA INDEXING", "addEntryToIndexMem - flushed " + flushc + " entries"); // put new words into cache @@ -374,6 +523,12 @@ public class plasmaWordIndexCache implements plasmaWordIndexInterface { } public void close(int waitingSeconds) { + try { + singletons.close(); + } catch (IOException e){ + log.logError("unable to close singleton database: " + e.getMessage()); + e.printStackTrace(); + } try { dump(waitingSeconds); } catch (IOException e){ diff --git a/source/de/anomic/plasma/plasmaWordIndexEntry.java b/source/de/anomic/plasma/plasmaWordIndexEntry.java index 3dcf3f0d2..a12681feb 100644 --- a/source/de/anomic/plasma/plasmaWordIndexEntry.java +++ b/source/de/anomic/plasma/plasmaWordIndexEntry.java @@ -59,6 +59,7 @@ public class plasmaWordIndexEntry { // the size of a word hash public static final int wordHashLength = yacySeedDB.commonHashLength; // 12 + public static final int urlHashLength = yacySeedDB.commonHashLength; // 12 // the size of the index entry attributes public static final int attrSpaceShort = 12; @@ -201,6 +202,8 @@ public class plasmaWordIndexEntry { } public String toEncodedForm(boolean longAttr) { + // attention: this integrates NOT the URL into the encoding + // if you need a complete dump, use toExternalForm() if (code == null) { String shortAttr = b64save(quality, plasmaCrawlLURL.urlQualityLength) + diff --git a/source/de/anomic/plasma/plasmaWordIndexEntryContainer.java b/source/de/anomic/plasma/plasmaWordIndexEntryContainer.java index 13c252e9a..133ccce23 100644 --- a/source/de/anomic/plasma/plasmaWordIndexEntryContainer.java +++ b/source/de/anomic/plasma/plasmaWordIndexEntryContainer.java @@ -41,9 +41,11 @@ package de.anomic.plasma; -import java.util.*; +import java.util.HashMap; +import java.util.Iterator; +import de.anomic.server.serverCodings; -public class plasmaWordIndexEntryContainer { +public class plasmaWordIndexEntryContainer implements Comparable { private String wordHash; private HashMap container; @@ -79,6 +81,14 @@ public class plasmaWordIndexEntryContainer { return x; } + public boolean contains(String urlHash) { + return container.containsKey(urlHash); + } + + public plasmaWordIndexEntry getOne() { + return (plasmaWordIndexEntry) container.values().toArray()[0]; + } + public Iterator entries() { // returns an iterator of plasmaWordIndexEntry objects return container.values().iterator(); @@ -94,4 +104,13 @@ public class plasmaWordIndexEntryContainer { return "C[" + wordHash + "] has " + container.size() + " entries"; } + public int compareTo(Object obj) { + plasmaWordIndexEntryContainer other = (plasmaWordIndexEntryContainer) obj; + return this.wordHash.compareTo(other.wordHash); + } + + public int hashCode() { + return (int) serverCodings.enhancedCoder.decodeBase64Long(this.wordHash.substring(0, 4)); + } + }