diff --git a/source/de/anomic/kelondro/blob/BLOBArray.java b/source/de/anomic/kelondro/blob/BLOBArray.java index 8244001b3..61b8c2a9b 100755 --- a/source/de/anomic/kelondro/blob/BLOBArray.java +++ b/source/de/anomic/kelondro/blob/BLOBArray.java @@ -172,14 +172,17 @@ public class BLOBArray implements BLOB { return f; } - public synchronized File[] unmountBestMatch(double maxq) { - double l, r, min = Double.MAX_VALUE; + public synchronized File[] unmountBestMatch(double maxq, long maxResultSize) { + long l, r; + double min = Double.MAX_VALUE; int[] idx = new int[2]; + maxResultSize = maxResultSize >> 1; for (int i = 0; i < this.blobs.size() - 1; i++) { for (int j = i + 1; j < this.blobs.size(); j++) { - l = this.blobs.get(i).location.length(); - r = this.blobs.get(j).location.length(); - double q = Math.max(l/r, r/l); + l = 1 + (this.blobs.get(i).location.length() >> 1); + r = 1 + (this.blobs.get(j).location.length() >> 1); + if (l + r > maxResultSize) continue; + double q = Math.max(((double) l)/((double) r), ((double) r)/((double) l)); if (q < min) { min = q; idx[0] = i; @@ -194,17 +197,34 @@ public class BLOBArray implements BLOB { return bestmatch; } + public synchronized File[] unmountSmallest(long maxResultSize) { + File f0 = smallestBLOB(null); + if (f0 == null) return null; + File f1 = smallestBLOB(f0); + if (f1 == null) return null; + + unmountBLOB(f0, false); + unmountBLOB(f1, false); + return new File[]{f0, f1}; + } + public synchronized File unmountSmallestBLOB() { + return smallestBLOB(null); + } + + public synchronized File smallestBLOB(File excluding) { if (this.blobs.size() == 0) return null; int bestIndex = -1; long smallest = Long.MAX_VALUE; for (int i = 0; i < this.blobs.size(); i++) { + /* skip the excluded file: compare path Strings, since String.equals(File) is always false */ if (excluding != null && this.blobs.get(i).location.getAbsolutePath().equals(excluding.getAbsolutePath())) continue; if (this.blobs.get(i).location.length() < smallest) { smallest = this.blobs.get(i).location.length(); bestIndex = i; } } - return 
unmount(bestIndex); + if (bestIndex == -1) return null; + return this.blobs.get(bestIndex).location; } public synchronized File unmountOldestBLOB(boolean smallestFromFirst2) { diff --git a/source/de/anomic/kelondro/blob/HeapWriter.java b/source/de/anomic/kelondro/blob/HeapWriter.java index 44a6b08d9..44e6eaac4 100644 --- a/source/de/anomic/kelondro/blob/HeapWriter.java +++ b/source/de/anomic/kelondro/blob/HeapWriter.java @@ -114,7 +114,9 @@ public final class HeapWriter { protected static String fingerprintFileHash(File f) { assert f != null; - return Digest.fastFingerprintB64(f, false).substring(0, 12); + String fp = Digest.fastFingerprintB64(f, false); + assert fp != null : "file = " + f.toString(); + return fp.substring(0, 12); } public static void deleteAllFingerprints(File f) { diff --git a/source/de/anomic/kelondro/io/CachedRandomAccess.java b/source/de/anomic/kelondro/io/CachedRandomAccess.java index 3fd65b1b6..16f641f63 100644 --- a/source/de/anomic/kelondro/io/CachedRandomAccess.java +++ b/source/de/anomic/kelondro/io/CachedRandomAccess.java @@ -127,7 +127,7 @@ public final class CachedRandomAccess extends AbstractRandomAccess implements Ra public synchronized void close() { if (RAFile != null) try { - RAFile.getChannel().close(); + try{RAFile.getChannel().close();} catch (IOException e) {} //System.out.println("***DEBUG*** closed file " + this.file + ", FD is " + ((RAFile.getFD().valid()) ? "VALID" : "VOID") + ", channel is " + ((RAFile.getChannel().isOpen()) ? "OPEN" : "CLOSE")); RAFile.close(); //System.out.println("***DEBUG*** closed file " + this.file + ", FD is " + ((RAFile.getFD().valid()) ? "VALID" : "VOID") + ", channel is " + ((RAFile.getChannel().isOpen()) ? 
"OPEN" : "CLOSE")); diff --git a/source/de/anomic/kelondro/order/Digest.java b/source/de/anomic/kelondro/order/Digest.java index 2406f90a9..daf7436ea 100644 --- a/source/de/anomic/kelondro/order/Digest.java +++ b/source/de/anomic/kelondro/order/Digest.java @@ -233,8 +233,12 @@ public class Digest { public static String fastFingerprintB64(final File file, boolean includeDate) { try { - return Base64Order.enhancedCoder.encode(fastFingerprintRaw(file, includeDate)); + byte[] b = fastFingerprintRaw(file, includeDate); + assert b != null : "file = " + file.toString(); + assert b.length != 0 : "file = " + file.toString(); + return Base64Order.enhancedCoder.encode(b); } catch (IOException e) { + e.printStackTrace(); return null; } } @@ -280,7 +284,7 @@ public class Digest { if (includeDate) digest.update(NaturalOrder.encodeLong(file.lastModified(), 8), 0, 8); } finally { raf.close(); - raf.getChannel().close(); + try {raf.getChannel().close();} catch (IOException e) {} } return digest.digest(); } diff --git a/source/de/anomic/kelondro/text/IndexCell.java b/source/de/anomic/kelondro/text/IndexCell.java index 9b4e3cf00..13ba03915 100644 --- a/source/de/anomic/kelondro/text/IndexCell.java +++ b/source/de/anomic/kelondro/text/IndexCell.java @@ -56,9 +56,10 @@ public final class IndexCell extends AbstractBufferedIndex implements BufferedIn // class variables private final ReferenceContainerArray array; private ReferenceContainerCache ram; - private int maxRamEntries, maxArrayFiles; + private int maxRamEntries; private final IODispatcher merger; - private final long lastCleanup; + private long lastCleanup; + private final long targetFileSize, maxFileSize; public IndexCell( @@ -66,16 +67,18 @@ public final class IndexCell extends AbstractBufferedIndex implements BufferedIn final ByteOrder wordOrder, final Row payloadrow, final int maxRamEntries, - final int maxArrayFiles, + final long targetFileSize, + final long maxFileSize, IODispatcher merger ) throws IOException { this.array 
= new ReferenceContainerArray(cellPath, wordOrder, payloadrow, merger); this.ram = new ReferenceContainerCache(payloadrow, wordOrder); this.ram.initWriteMode(); this.maxRamEntries = maxRamEntries; - this.maxArrayFiles = maxArrayFiles; this.merger = merger; this.lastCleanup = System.currentTimeMillis(); + this.targetFileSize = targetFileSize; + this.maxFileSize = maxFileSize; } @@ -281,9 +284,8 @@ public final class IndexCell extends AbstractBufferedIndex implements BufferedIn private synchronized void cacheCleanup() throws IOException { if (this.lastCleanup + cleanupCycle > System.currentTimeMillis()) return; - if (this.array.entries() > this.maxArrayFiles) { - this.array.shrink(true); - } + this.array.shrink(this.targetFileSize, this.maxFileSize); + this.lastCleanup = System.currentTimeMillis(); } public File newContainerBLOBFile() { diff --git a/source/de/anomic/kelondro/text/IndexCollectionMigration.java b/source/de/anomic/kelondro/text/IndexCollectionMigration.java index 172bad75d..5dba8352b 100644 --- a/source/de/anomic/kelondro/text/IndexCollectionMigration.java +++ b/source/de/anomic/kelondro/text/IndexCollectionMigration.java @@ -56,7 +56,8 @@ public final class IndexCollectionMigration extends AbstractBufferedIndex implem final ByteOrder wordOrdering, final Row payloadrow, final int entityCacheMaxSize, - final int maxCellArrayFiles, + final long targetFileSize, + final long maxFileSize, final IODispatcher merger, final Log log) throws IOException { @@ -66,7 +67,10 @@ public final class IndexCollectionMigration extends AbstractBufferedIndex implem celldir, wordOrdering, ReferenceRow.urlEntryRow, - entityCacheMaxSize, maxCellArrayFiles, this.merger); + entityCacheMaxSize, + targetFileSize, + maxFileSize, + this.merger); final File textindexcache = new File(indexPrimaryTextLocation, "RICACHE"); if (textindexcache.exists()) { // migrate the "index.dhtout.blob" into RICELL directory diff --git a/source/de/anomic/kelondro/text/ReferenceContainerArray.java 
b/source/de/anomic/kelondro/text/ReferenceContainerArray.java index 131f3a5e8..f55177d50 100644 --- a/source/de/anomic/kelondro/text/ReferenceContainerArray.java +++ b/source/de/anomic/kelondro/text/ReferenceContainerArray.java @@ -36,7 +36,6 @@ import de.anomic.kelondro.index.Row; import de.anomic.kelondro.index.RowSet; import de.anomic.kelondro.order.ByteOrder; import de.anomic.kelondro.order.CloneableIterator; -import de.anomic.kelondro.util.FileUtils; public final class ReferenceContainerArray { @@ -245,34 +244,29 @@ public final class ReferenceContainerArray { return this.array.entries(); } - public synchronized boolean shrink(boolean similar) throws IOException { + public synchronized boolean shrink(long targetFileSize, long maxFileSize) throws IOException { if (this.array.entries() < 2) return false; if (this.merger.queueLength() > 0) return false; - File[] ff = this.array.unmountBestMatch(2.0); - if (ff == null) { - ff = new File[2]; - ff[0] = this.array.unmountSmallestBLOB(); - if (ff[0].length() == 0) { - FileUtils.deletedelete(ff[0]); - return true; - } - ff[1] = this.array.unmountSmallestBLOB(); - if (ff[1].length() == 0) { - this.array.mountBLOB(ff[0]); - FileUtils.deletedelete(ff[1]); - return true; - } - /* - ff[0] = this.array.unmountOldestBLOB(similar); - if (ff[0].length() == 0) { - FileUtils.deletedelete(ff[0]); - return true; - } - ff[1] = (similar) ? 
this.array.unmountSimilarSizeBLOB(ff[0].length()) : this.array.unmountOldestBLOB(false); - */ + + File[] ff = this.array.unmountBestMatch(2.0, targetFileSize); + if (ff != null) { + merger.merge(ff[0], ff[1], this.array, this.payloadrow, newContainerBLOBFile()); + return true; + } + + ff = this.array.unmountSmallest(targetFileSize); + if (ff != null) { + merger.merge(ff[0], ff[1], this.array, this.payloadrow, newContainerBLOBFile()); + return true; } - merger.merge(ff[0], ff[1], this.array, this.payloadrow, newContainerBLOBFile()); - return true; + + ff = this.array.unmountBestMatch(2.0, maxFileSize); + if (ff != null) { + merger.merge(ff[0], ff[1], this.array, this.payloadrow, newContainerBLOBFile()); + return true; + } + + return false; } diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index f0c0b7d84..1b8af4914 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -69,7 +69,8 @@ public final class plasmaWordIndex { public static final int wCacheMaxChunk = 800; // maximum number of references for each urlhash public static final int lowcachedivisor = 900; public static final int maxCollectionPartition = 7; // should be 7 - public static final int maxCellArrayFiles = 10; + public static final long targetFileSize = 100 * 1024 * 1024; + public static final long maxFileSize = Long.MAX_VALUE >> 1; public static final String CRAWL_PROFILE_PROXY = "proxy"; public static final String CRAWL_PROFILE_REMOTE = "remote"; @@ -146,7 +147,8 @@ public final class plasmaWordIndex { wordOrder, ReferenceRow.urlEntryRow, entityCacheMaxSize, - maxCellArrayFiles, + targetFileSize, + maxFileSize, this.merger, log) : @@ -166,7 +168,8 @@ public final class plasmaWordIndex { wordOrder, ReferenceRow.urlEntryRow, entityCacheMaxSize, - maxCellArrayFiles, + targetFileSize, + maxFileSize, this.merger); } diff --git a/source/de/anomic/tools/mediawikiIndex.java 
b/source/de/anomic/tools/mediawikiIndex.java index 2c124dcdc..3738b213b 100644 --- a/source/de/anomic/tools/mediawikiIndex.java +++ b/source/de/anomic/tools/mediawikiIndex.java @@ -321,7 +321,7 @@ public class mediawikiIndex { } finally { if (raf != null) try { raf.close(); - raf.getChannel().close(); + try{raf.getChannel().close();} catch (IOException e) {} } catch (IOException e) { } } return b;