diff --git a/source/de/anomic/crawler/RobotsTxt.java b/source/de/anomic/crawler/RobotsTxt.java index adbd4b908..341bf6e53 100644 --- a/source/de/anomic/crawler/RobotsTxt.java +++ b/source/de/anomic/crawler/RobotsTxt.java @@ -197,7 +197,13 @@ public class RobotsTxt { } // store the data into the robots DB + int sz = this.robotsTable.size(); addEntry(robotsTxt4Host); + if (this.robotsTable.size() <= sz) { + Log.logSevere("RobotsTxt", "new entry in robots.txt table failed, reseing database"); + this.resetDatabase(); + addEntry(robotsTxt4Host); + } } else { final robotsParser parserResult = new robotsParser((byte[]) result[DOWNLOAD_ROBOTS_TXT]); ArrayList denyPath = parserResult.denyList(); diff --git a/source/de/anomic/http/httpTemplate.java b/source/de/anomic/http/httpTemplate.java index efaeec8b1..89226b415 100644 --- a/source/de/anomic/http/httpTemplate.java +++ b/source/de/anomic/http/httpTemplate.java @@ -359,7 +359,7 @@ public final class httpTemplate { if (java.util.Arrays.equals(keyStream.toByteArray(),appendBytes(slash, key, null,null))) { pis2 = new PushbackInputStream(new ByteArrayInputStream(text.getBytes())); //this maybe the wrong, but its the last - structure.append('<').append(key).append(" type=\"alternative\" which=\"".getBytes()).append(Integer.toString(whichPattern).getBytes("UTF-8")).append("\" found=\"0\">\n".getBytes()); + structure.append('<').append(key).append(" type=\"alternative\" which=\"".getBytes()).append(Integer.toString(whichPattern).getBytes()).append("\" found=\"0\">\n".getBytes()); structure.append(writeTemplate(pis2, out, pattern, dflt, newPrefix(prefix,key))); structure.append("\n".getBytes()); found=true; @@ -381,7 +381,7 @@ public final class httpTemplate { if ((bb & 0xFF) == ':'){ if(currentPattern == whichPattern){ //found the pattern pis2 = new PushbackInputStream(new ByteArrayInputStream(text.getBytes())); - structure.append("<".getBytes()).append(key).append(" type=\"alternative\" which=\"".getBytes()).append(Integer.toString(whichPattern).getBytes("UTF-8")).append("\" found=\"0\">\n".getBytes()); + structure.append('<').append(key).append(" type=\"alternative\" which=\"".getBytes()).append(Integer.toString(whichPattern).getBytes()).append("\" found=\"0\">\n".getBytes()); structure.append(writeTemplate(pis2, out, pattern, dflt, newPrefix(prefix,key))); structure.append("\n".getBytes()); diff --git a/source/de/anomic/kelondro/blob/BLOBArray.java b/source/de/anomic/kelondro/blob/BLOBArray.java index 65e9f12d8..895b0809c 100755 --- a/source/de/anomic/kelondro/blob/BLOBArray.java +++ b/source/de/anomic/kelondro/blob/BLOBArray.java @@ -418,7 +418,7 @@ public class BLOBArray implements BLOB { * ask for the number of blob entries in each blob of the blob array * @return the number of entries in each blob */ - public synchronized int[] sizes() { + public int[] sizes() { int[] s = new int[blobs.size()]; int c = 0; for (blobItem bi: blobs) s[c++] = bi.blob.size(); diff --git a/source/de/anomic/kelondro/blob/BLOBHeap.java b/source/de/anomic/kelondro/blob/BLOBHeap.java index 19536142c..79de6442a 100755 --- a/source/de/anomic/kelondro/blob/BLOBHeap.java +++ b/source/de/anomic/kelondro/blob/BLOBHeap.java @@ -272,6 +272,7 @@ public final class BLOBHeap extends BLOBHeapModifier implements BLOB { if (b.length == 0) return; // first remove the old entry (removes from buffer and file) + // TODO: this can be enhanced! this.remove(key); // then look if we can use a free entry diff --git a/source/de/anomic/kelondro/blob/BLOBHeapModifier.java b/source/de/anomic/kelondro/blob/BLOBHeapModifier.java index ad89a810d..57a4105b8 100644 --- a/source/de/anomic/kelondro/blob/BLOBHeapModifier.java +++ b/source/de/anomic/kelondro/blob/BLOBHeapModifier.java @@ -120,6 +120,9 @@ public class BLOBHeapModifier extends HeapReader implements BLOB { final long seek = index.get(key); if (seek < 0) return; + // check consistency of the index + assert (checkKey(key, seek)) : "key compare failed; key = " + new String(key) + ", seek = " + seek; + // access the file and read the container this.file.seek(seek); int size = file.readInt(); @@ -248,13 +251,16 @@ public class BLOBHeapModifier extends HeapReader implements BLOB { throw new UnsupportedOperationException("put is not supported in BLOBHeapModifier"); } - public int replace(byte[] key, Rewriter rewriter) throws IOException { + public synchronized int replace(byte[] key, Rewriter rewriter) throws IOException { assert index.row().primaryKeyLength == key.length : index.row().primaryKeyLength + "!=" + key.length; // check if the index contains the key final long pos = index.get(key); if (pos < 0) return 0; + // check consistency of the index + assert (checkKey(key, pos)) : "key compare failed; key = " + new String(key) + ", seek = " + pos; + // access the file and read the container file.seek(pos); final int len = file.readInt() - index.row().primaryKeyLength; diff --git a/source/de/anomic/kelondro/blob/HeapReader.java b/source/de/anomic/kelondro/blob/HeapReader.java index 09bf2d9dc..d689f35ca 100644 --- a/source/de/anomic/kelondro/blob/HeapReader.java +++ b/source/de/anomic/kelondro/blob/HeapReader.java @@ -29,7 +29,6 @@ import java.io.DataInputStream; import java.io.File; import java.io.FileInputStream; import java.io.IOException; -import java.io.UnsupportedEncodingException; import java.util.Iterator; import java.util.Map; import java.util.Map.Entry; @@ -49,7 +48,7 @@ public class HeapReader { public final static long keepFreeMem = 20 * 1024 * 1024; protected int keylength; // the length of the primary key - protected HandleMap index; // key/seek relation for used records + protected HandleMap index; // key/seek relation for used records protected Gap free; // set of {seek, size} pairs denoting space and position of free records protected File heapFile; // the file of the heap protected final ByteOrder ordering; // the ordering on keys @@ -260,7 +259,7 @@ public class HeapReader { file.readFully(keyf, 0, keyf.length); if (!this.ordering.equal(key, keyf)) { // verification of the indexed access failed. we must re-read the index - Log.logWarning("kelondroBLOBHeap", "verification indexed access for " + heapFile.toString() + " failed, re-building index"); + Log.logSevere("kelondroBLOBHeap", "verification indexed access for " + heapFile.toString() + " failed, re-building index"); // this is a severe operation, it should never happen. // but if the process ends in this state, it would completely fail // if the index is not rebuild now at once @@ -273,9 +272,19 @@ public class HeapReader { return blob; } + + protected boolean checkKey(final byte[] key, final long pos) throws IOException { + file.seek(pos); + file.readInt(); // skip the size value + + // read the key + final byte[] keyf = new byte[index.row().primaryKeyLength]; + file.readFully(keyf, 0, keyf.length); + return this.ordering.equal(key, keyf); + } /** - * retrieve the size of the BLOB + * retrieve the size of the BLOB. This should not be used excessively, because it depends on IO operations. * @param key * @return the size of the BLOB or -1 if the BLOB does not exist * @throws IOException @@ -362,17 +371,6 @@ public class HeapReader { public long length() throws IOException { return this.heapFile.length(); } - - public String excave(final byte[] rawKey, char fillChar) { - int n = this.keylength - 1; - if (n >= rawKey.length) n = rawKey.length - 1; - while ((n > 0) && (rawKey[n] == (byte) fillChar)) n--; - try { - return new String(rawKey, 0, n + 1, "UTF-8"); - } catch (UnsupportedEncodingException e) { - return new String(rawKey, 0, n + 1); - } - } /** * static iterator of entries in BLOBHeap files: diff --git a/source/de/anomic/kelondro/index/IndexTest.java b/source/de/anomic/kelondro/index/IndexTest.java index 8921c461d..538106cd5 100644 --- a/source/de/anomic/kelondro/index/IndexTest.java +++ b/source/de/anomic/kelondro/index/IndexTest.java @@ -57,14 +57,9 @@ public class IndexTest { public static final long mb = 1024 * 1024; public static void main(String[] args) { - System.out.println("Performance test: comparing HashMap, TreeMap and kelondroRow"); - if (args.length == 0) { - System.out.println("use one parameter: number of test entries"); - System.exit(0); - } // pre-generate test data so it will not influence test case time - int count = Integer.parseInt(args[0]); + int count = args.length == 0 ? 1000000 : Integer.parseInt(args[0]); byte[][] tests = new byte[count][]; Random r = new Random(0); for (int i = 0; i < count; i++) tests[i] = randomHash(r); @@ -120,7 +115,7 @@ public class IndexTest { for (int i = 0; i < count; i++) ii.putUnique(tests[i], 1); ii.get(randomHash(r)); // trigger sort long t6 = System.currentTimeMillis(); - System.out.println("time for kelondroMap generation: " + (t6 - t5)); + System.out.println("time for HandleMap generation: " + (t6 - t5)); bugs = 0; for (int i = 0; i < count; i++) if (ii.get(tests[i]) != 1) bugs++; @@ -128,8 +123,8 @@ public class IndexTest { long freeEndKelondro = MemoryControl.available(); ii.clear(); ii = null; long t7 = System.currentTimeMillis(); - System.out.println("time for kelondroMap test: " + (t7 - t6) + ", " + bugs + " bugs"); - System.out.println("memory for kelondroMap: " + (freeStartKelondro - freeEndKelondro) / mb + " MB\n"); + System.out.println("time for HandleMap test: " + (t7 - t6) + ", " + bugs + " bugs"); + System.out.println("memory for HandleMap: " + (freeStartKelondro - freeEndKelondro) / mb + " MB\n"); // test ByteArray System.out.println("unsorted map"); diff --git a/source/de/anomic/kelondro/text/ReferenceContainer.java b/source/de/anomic/kelondro/text/ReferenceContainer.java index 299d9011b..b193cce04 100644 --- a/source/de/anomic/kelondro/text/ReferenceContainer.java +++ b/source/de/anomic/kelondro/text/ReferenceContainer.java @@ -37,6 +37,7 @@ import java.util.TreeMap; import de.anomic.kelondro.index.Row; import de.anomic.kelondro.index.RowSet; import de.anomic.kelondro.order.Base64Order; +import de.anomic.kelondro.order.ByteOrder; import de.anomic.kelondro.text.referencePrototype.WordReferenceRow; import de.anomic.kelondro.util.ByteBuffer; @@ -380,6 +381,7 @@ public class ReferenceContainer extends RowSet assert (keylength == i2.rowdef.width(0)); final ReferenceContainer conj = new ReferenceContainer(factory, null, i1.rowdef, 0); // start with empty search result if (!((i1.rowdef.getOrdering().signature().equals(i2.rowdef.getOrdering().signature())))) return conj; // ordering must be equal + ByteOrder ordering = i1.rowdef.getOrdering(); final Iterator e1 = i1.entries(); final Iterator e2 = i2.entries(); int c; @@ -392,7 +394,7 @@ public class ReferenceContainer extends RowSet while (true) { assert (ie1.metadataHash().length() == keylength) : "ie1.urlHash() = " + ie1.metadataHash(); assert (ie2.metadataHash().length() == keylength) : "ie2.urlHash() = " + ie2.metadataHash(); - c = i1.rowdef.getOrdering().compare(ie1.metadataHash().getBytes(), ie2.metadataHash().getBytes()); + c = ordering.compare(ie1.metadataHash().getBytes(), ie2.metadataHash().getBytes()); //System.out.println("** '" + ie1.getUrlHash() + "'.compareTo('" + ie2.getUrlHash() + "')="+c); if (c < 0) { if (e1.hasNext()) ie1 = e1.next(); else break; diff --git a/source/de/anomic/kelondro/text/ReferenceContainerArray.java b/source/de/anomic/kelondro/text/ReferenceContainerArray.java index e3b21f296..255bbd717 100644 --- a/source/de/anomic/kelondro/text/ReferenceContainerArray.java +++ b/source/de/anomic/kelondro/text/ReferenceContainerArray.java @@ -203,14 +203,14 @@ public final class ReferenceContainerArray { int k = 1; ReferenceContainer c = new ReferenceContainer(this.factory, termHash, RowSet.importRowSet(a, payloadrow)); if (System.currentTimeMillis() > timeout) { - Log.logWarning("ReferenceContainerArray", "timout in index retrieval: " + k + " tables searched. timeout = 1000"); + Log.logWarning("ReferenceContainerArray", "timout in index retrieval (1): " + k + " tables searched. timeout = 1000"); return c; } while (entries.hasNext()) { c = c.merge(new ReferenceContainer(this.factory, termHash, RowSet.importRowSet(entries.next(), payloadrow))); k++; if (System.currentTimeMillis() > timeout) { - Log.logWarning("ReferenceContainerArray", "timout in index retrieval: " + k + " tables searched. timeout = 1000"); + Log.logWarning("ReferenceContainerArray", "timout in index retrieval (2): " + k + " tables searched. timeout = 1000"); return c; } } diff --git a/source/de/anomic/kelondro/util/ByteBuffer.java b/source/de/anomic/kelondro/util/ByteBuffer.java index be068424e..abc5950f1 100644 --- a/source/de/anomic/kelondro/util/ByteBuffer.java +++ b/source/de/anomic/kelondro/util/ByteBuffer.java @@ -178,7 +178,7 @@ public final class ByteBuffer extends OutputStream { } public ByteBuffer append(final byte[] bb) { - write(bb); + write(bb, 0, bb.length); return this; } diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 4277ea465..86c03b1a1 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -1855,7 +1855,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch