- better selection of files to be merged

- fix for getChannel().close(), which works on Windows but not on Macs and Linux

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5761 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 16 years ago
parent d39a5b42ca
commit 9da69d6b68

@ -172,14 +172,17 @@ public class BLOBArray implements BLOB {
return f;
}
public synchronized File[] unmountBestMatch(double maxq) {
double l, r, min = Double.MAX_VALUE;
public synchronized File[] unmountBestMatch(double maxq, long maxResultSize) {
long l, r;
double min = Double.MAX_VALUE;
int[] idx = new int[2];
maxResultSize = maxResultSize >> 1;
for (int i = 0; i < this.blobs.size() - 1; i++) {
for (int j = i + 1; j < this.blobs.size(); j++) {
l = this.blobs.get(i).location.length();
r = this.blobs.get(j).location.length();
double q = Math.max(l/r, r/l);
l = 1 + (this.blobs.get(i).location.length() >> 1);
r = 1 + (this.blobs.get(j).location.length() >> 1);
if (l + r > maxResultSize) continue;
double q = Math.max(((double) l)/((double) r), ((double) r)/((double) l));
if (q < min) {
min = q;
idx[0] = i;
@ -194,17 +197,34 @@ public class BLOBArray implements BLOB {
return bestmatch;
}
/**
 * Picks the two smallest BLOB files and unmounts both so that they can be merged.
 *
 * @param maxResultSize upper bound, in bytes, for the combined length of the two selected files
 * @return an array holding the two unmounted files, or {@code null} if fewer than two
 *         distinct files are available or their combined length exceeds {@code maxResultSize}
 */
public synchronized File[] unmountSmallest(long maxResultSize) {
    File f0 = smallestBLOB(null);
    if (f0 == null) return null;
    File f1 = smallestBLOB(f0);
    if (f1 == null) return null;
    // never hand out the same file twice; merging a file with itself would be meaningless
    if (f1.getAbsolutePath().equals(f0.getAbsolutePath())) return null;
    // honour the size limit before anything is unmounted, so a rejected pair stays mounted
    if (f0.length() + f1.length() > maxResultSize) return null;
    unmountBLOB(f0, false);
    unmountBLOB(f1, false);
    return new File[]{f0, f1};
}
/**
 * Selects the smallest BLOB file and unmounts it.
 *
 * The file is unmounted with the same {@code unmountBLOB(file, false)} call that
 * {@code unmountSmallest} uses.
 * NOTE(review): the meaning of the {@code false} argument is not visible here; confirm it
 * matches what callers of this method expect.
 *
 * @return the unmounted file, or {@code null} if no BLOB file is available
 */
public synchronized File unmountSmallestBLOB() {
    final File smallest = smallestBLOB(null);
    if (smallest == null) return null;
    unmountBLOB(smallest, false);
    return smallest;
}
/**
 * Finds the BLOB file with the smallest length. The file is only looked up, not unmounted.
 *
 * @param excluding a file that must not be selected, or {@code null} to consider all files
 * @return the location of the smallest file, or {@code null} if there is no candidate
 */
public synchronized File smallestBLOB(File excluding) {
    if (this.blobs.size() == 0) return null;
    // Compare path strings with path strings: String.equals(File) is always false,
    // which would silently disable the exclusion.
    final String excludedPath = (excluding == null) ? null : excluding.getAbsolutePath();
    int bestIndex = -1;
    long smallest = Long.MAX_VALUE;
    for (int i = 0; i < this.blobs.size(); i++) {
        final File location = this.blobs.get(i).location;
        if (excludedPath != null && location.getAbsolutePath().equals(excludedPath)) continue;
        final long length = location.length();
        if (length < smallest) {
            smallest = length;
            bestIndex = i;
        }
    }
    // bestIndex stays -1 when every entry was excluded
    if (bestIndex == -1) return null;
    return this.blobs.get(bestIndex).location;
}
public synchronized File unmountOldestBLOB(boolean smallestFromFirst2) {

@ -114,7 +114,9 @@ public final class HeapWriter {
/**
 * Computes a short hash for a file: the first 12 characters of the Base64 fingerprint
 * returned by {@code Digest.fastFingerprintB64(f, false)}.
 *
 * @param f the file to fingerprint; must not be {@code null}
 * @return the 12-character prefix of the fingerprint
 */
protected static String fingerprintFileHash(File f) {
    assert f != null;
    String fp = Digest.fastFingerprintB64(f, false);
    // check the fingerprint before it is dereferenced so a failure names the file
    assert fp != null : "file = " + f.toString();
    return fp.substring(0, 12);
}
public static void deleteAllFingerprints(File f) {

@ -127,7 +127,7 @@ public final class CachedRandomAccess extends AbstractRandomAccess implements Ra
public synchronized void close() {
if (RAFile != null) try {
RAFile.getChannel().close();
try{RAFile.getChannel().close();} catch (IOException e) {}
//System.out.println("***DEBUG*** closed file " + this.file + ", FD is " + ((RAFile.getFD().valid()) ? "VALID" : "VOID") + ", channel is " + ((RAFile.getChannel().isOpen()) ? "OPEN" : "CLOSE"));
RAFile.close();
//System.out.println("***DEBUG*** closed file " + this.file + ", FD is " + ((RAFile.getFD().valid()) ? "VALID" : "VOID") + ", channel is " + ((RAFile.getChannel().isOpen()) ? "OPEN" : "CLOSE"));

@ -233,8 +233,12 @@ public class Digest {
/**
 * Computes the raw fingerprint of a file via {@code fastFingerprintRaw} and encodes it
 * with {@code Base64Order.enhancedCoder}.
 *
 * @param file        the file to fingerprint
 * @param includeDate passed through unchanged to {@code fastFingerprintRaw}
 * @return the encoded fingerprint, or {@code null} if an {@link IOException} occurred
 */
public static String fastFingerprintB64(final File file, boolean includeDate) {
    try {
        byte[] b = fastFingerprintRaw(file, includeDate);
        // sanity checks on the raw digest before encoding; the message names the file
        assert b != null : "file = " + file.toString();
        assert b.length != 0 : "file = " + file.toString();
        return Base64Order.enhancedCoder.encode(b);
    } catch (IOException e) {
        e.printStackTrace();
        return null;
    }
}
@ -280,7 +284,7 @@ public class Digest {
if (includeDate) digest.update(NaturalOrder.encodeLong(file.lastModified(), 8), 0, 8);
} finally {
raf.close();
raf.getChannel().close();
try {raf.getChannel().close();} catch (IOException e) {}
}
return digest.digest();
}

@ -56,9 +56,10 @@ public final class IndexCell extends AbstractBufferedIndex implements BufferedIn
// class variables
private final ReferenceContainerArray array;
private ReferenceContainerCache ram;
private int maxRamEntries, maxArrayFiles;
private int maxRamEntries;
private final IODispatcher merger;
private final long lastCleanup;
private long lastCleanup;
private final long targetFileSize, maxFileSize;
public IndexCell(
@ -66,16 +67,18 @@ public final class IndexCell extends AbstractBufferedIndex implements BufferedIn
final ByteOrder wordOrder,
final Row payloadrow,
final int maxRamEntries,
final int maxArrayFiles,
final long targetFileSize,
final long maxFileSize,
IODispatcher merger
) throws IOException {
this.array = new ReferenceContainerArray(cellPath, wordOrder, payloadrow, merger);
this.ram = new ReferenceContainerCache(payloadrow, wordOrder);
this.ram.initWriteMode();
this.maxRamEntries = maxRamEntries;
this.maxArrayFiles = maxArrayFiles;
this.merger = merger;
this.lastCleanup = System.currentTimeMillis();
this.targetFileSize = targetFileSize;
this.maxFileSize = maxFileSize;
}
@ -281,9 +284,8 @@ public final class IndexCell extends AbstractBufferedIndex implements BufferedIn
/**
 * Periodic maintenance: asks the array to shrink, using the configured target and
 * maximum file sizes. Does nothing if the previous cleanup happened less than
 * {@code cleanupCycle} milliseconds ago.
 *
 * @throws IOException if shrinking the array fails
 */
private synchronized void cacheCleanup() throws IOException {
    // rate limit: skip until a full cleanup cycle has elapsed
    if (this.lastCleanup + cleanupCycle > System.currentTimeMillis()) return;
    this.array.shrink(this.targetFileSize, this.maxFileSize);
    // remember when this cleanup ran so the next call can be rate limited
    this.lastCleanup = System.currentTimeMillis();
}
public File newContainerBLOBFile() {

@ -56,7 +56,8 @@ public final class IndexCollectionMigration extends AbstractBufferedIndex implem
final ByteOrder wordOrdering,
final Row payloadrow,
final int entityCacheMaxSize,
final int maxCellArrayFiles,
final long targetFileSize,
final long maxFileSize,
final IODispatcher merger,
final Log log) throws IOException {
@ -66,7 +67,10 @@ public final class IndexCollectionMigration extends AbstractBufferedIndex implem
celldir,
wordOrdering,
ReferenceRow.urlEntryRow,
entityCacheMaxSize, maxCellArrayFiles, this.merger);
entityCacheMaxSize,
targetFileSize,
maxFileSize,
this.merger);
final File textindexcache = new File(indexPrimaryTextLocation, "RICACHE");
if (textindexcache.exists()) {
// migrate the "index.dhtout.blob" into RICELL directory

@ -36,7 +36,6 @@ import de.anomic.kelondro.index.Row;
import de.anomic.kelondro.index.RowSet;
import de.anomic.kelondro.order.ByteOrder;
import de.anomic.kelondro.order.CloneableIterator;
import de.anomic.kelondro.util.FileUtils;
public final class ReferenceContainerArray {
@ -245,34 +244,29 @@ public final class ReferenceContainerArray {
return this.array.entries();
}
public synchronized boolean shrink(boolean similar) throws IOException {
public synchronized boolean shrink(long targetFileSize, long maxFileSize) throws IOException {
if (this.array.entries() < 2) return false;
if (this.merger.queueLength() > 0) return false;
File[] ff = this.array.unmountBestMatch(2.0);
if (ff == null) {
ff = new File[2];
ff[0] = this.array.unmountSmallestBLOB();
if (ff[0].length() == 0) {
FileUtils.deletedelete(ff[0]);
return true;
}
ff[1] = this.array.unmountSmallestBLOB();
if (ff[1].length() == 0) {
this.array.mountBLOB(ff[0]);
FileUtils.deletedelete(ff[1]);
return true;
}
/*
ff[0] = this.array.unmountOldestBLOB(similar);
if (ff[0].length() == 0) {
FileUtils.deletedelete(ff[0]);
return true;
}
ff[1] = (similar) ? this.array.unmountSimilarSizeBLOB(ff[0].length()) : this.array.unmountOldestBLOB(false);
*/
File[] ff = this.array.unmountBestMatch(2.0, targetFileSize);
if (ff != null) {
merger.merge(ff[0], ff[1], this.array, this.payloadrow, newContainerBLOBFile());
return true;
}
ff = this.array.unmountSmallest(targetFileSize);
if (ff != null) {
merger.merge(ff[0], ff[1], this.array, this.payloadrow, newContainerBLOBFile());
return true;
}
merger.merge(ff[0], ff[1], this.array, this.payloadrow, newContainerBLOBFile());
return true;
ff = this.array.unmountBestMatch(2.0, maxFileSize);
if (ff != null) {
merger.merge(ff[0], ff[1], this.array, this.payloadrow, newContainerBLOBFile());
return true;
}
return false;
}

@ -69,7 +69,8 @@ public final class plasmaWordIndex {
public static final int wCacheMaxChunk = 800; // maximum number of references for each urlhash
public static final int lowcachedivisor = 900;
public static final int maxCollectionPartition = 7; // should be 7
public static final int maxCellArrayFiles = 10;
public static final long targetFileSize = 100 * 1024 * 1024;
public static final long maxFileSize = Long.MAX_VALUE >> 1;
public static final String CRAWL_PROFILE_PROXY = "proxy";
public static final String CRAWL_PROFILE_REMOTE = "remote";
@ -146,7 +147,8 @@ public final class plasmaWordIndex {
wordOrder,
ReferenceRow.urlEntryRow,
entityCacheMaxSize,
maxCellArrayFiles,
targetFileSize,
maxFileSize,
this.merger,
log)
:
@ -166,7 +168,8 @@ public final class plasmaWordIndex {
wordOrder,
ReferenceRow.urlEntryRow,
entityCacheMaxSize,
maxCellArrayFiles,
targetFileSize,
maxFileSize,
this.merger);
}

@ -321,7 +321,7 @@ public class mediawikiIndex {
} finally {
if (raf != null) try {
raf.close();
raf.getChannel().close();
try{raf.getChannel().close();} catch (IOException e) {}
} catch (IOException e) { }
}
return b;

Loading…
Cancel
Save