- migrate the files written by SplitTable (which is used for the URL-DB) to a new file name format.

- the file generation logic is slightly different: a file is now limited to a maximum size of one gigabyte and a maximum age of one month; once either limit is exceeded, a new file is started.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5773 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 16 years ago
parent 1f37cc6107
commit f6691411b5

@ -28,13 +28,14 @@ package de.anomic.kelondro.table;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.text.ParseException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Calendar;
import java.util.Date; import java.util.Date;
import java.util.HashMap; import java.util.HashMap;
import java.util.Iterator; import java.util.Iterator;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Random;
import java.util.concurrent.Callable; import java.util.concurrent.Callable;
import java.util.concurrent.CompletionService; import java.util.concurrent.CompletionService;
import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutionException;
@ -46,6 +47,7 @@ import java.util.concurrent.RejectedExecutionException;
import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeUnit;
import de.anomic.kelondro.blob.BLOBArray;
import de.anomic.kelondro.blob.Cache; import de.anomic.kelondro.blob.Cache;
import de.anomic.kelondro.index.Column; import de.anomic.kelondro.index.Column;
import de.anomic.kelondro.index.ObjectIndexCache; import de.anomic.kelondro.index.ObjectIndexCache;
@ -57,6 +59,7 @@ import de.anomic.kelondro.order.NaturalOrder;
import de.anomic.kelondro.order.MergeIterator; import de.anomic.kelondro.order.MergeIterator;
import de.anomic.kelondro.order.Order; import de.anomic.kelondro.order.Order;
import de.anomic.kelondro.order.StackIterator; import de.anomic.kelondro.order.StackIterator;
import de.anomic.kelondro.util.DateFormatter;
import de.anomic.kelondro.util.FileUtils; import de.anomic.kelondro.util.FileUtils;
import de.anomic.kelondro.util.Log; import de.anomic.kelondro.util.Log;
import de.anomic.kelondro.util.NamePrefixThreadFactory; import de.anomic.kelondro.util.NamePrefixThreadFactory;
@ -76,18 +79,42 @@ public class SplitTable implements ObjectIndex {
private HashMap<String, ObjectIndex> tables; // a map from a date string to a kelondroIndex object private HashMap<String, ObjectIndex> tables; // a map from a date string to a kelondroIndex object
private final Row rowdef; private final Row rowdef;
private final File path; private final File path;
private final String tablename; private final String prefix;
private final Order<Row.Entry> entryOrder; private final Order<Row.Entry> entryOrder;
private String current;
private long fileAgeLimit;
private long fileSizeLimit;
public SplitTable(final File path, final String tablename, final Row rowdef, final boolean resetOnFail) { public SplitTable(
final File path,
final String tablename,
final Row rowdef,
final boolean resetOnFail) {
this(path, tablename, rowdef, BLOBArray.oneMonth, BLOBArray.oneGigabyte, resetOnFail);
}
public SplitTable(
final File path,
final String tablename,
final Row rowdef,
final long fileAgeLimit,
final long fileSizeLimit,
final boolean resetOnFail) {
this.path = path; this.path = path;
this.tablename = tablename; this.prefix = tablename;
this.rowdef = rowdef; this.rowdef = rowdef;
this.fileAgeLimit = fileAgeLimit;
this.fileSizeLimit = fileSizeLimit;
this.entryOrder = new Row.EntryComparator(rowdef.objectOrder); this.entryOrder = new Row.EntryComparator(rowdef.objectOrder);
init(resetOnFail); init(resetOnFail);
} }
/**
 * Builds the file name for a new partial table.
 * The name consists of the table prefix, a dot, the current time as rendered
 * by DateFormatter.formatShortMilliSecond, and the ".table" extension.
 *
 * @return the file name only; it is not resolved against the table path
 */
String newFilename() {
    final String timestamp = DateFormatter.formatShortMilliSecond(new Date());
    return prefix + "." + timestamp + ".table";
}
public void init(final boolean resetOnFail) { public void init(final boolean resetOnFail) {
current = null;
// init the thread pool for the keeperOf executor service // init the thread pool for the keeperOf executor service
this.executor = new ThreadPoolExecutor( this.executor = new ThreadPoolExecutor(
@ -95,23 +122,48 @@ public class SplitTable implements ObjectIndex {
Runtime.getRuntime().availableProcessors() + 1, 10, Runtime.getRuntime().availableProcessors() + 1, 10,
TimeUnit.SECONDS, TimeUnit.SECONDS,
new LinkedBlockingQueue<Runnable>(), new LinkedBlockingQueue<Runnable>(),
new NamePrefixThreadFactory(tablename)); new NamePrefixThreadFactory(prefix));
// initialized tables map // initialized tables map
this.tables = new HashMap<String, ObjectIndex>(); this.tables = new HashMap<String, ObjectIndex>();
if (!(path.exists())) path.mkdirs(); if (!(path.exists())) path.mkdirs();
final String[] tablefile = path.list(); String[] tablefile = path.list();
String date;
// zero pass: migrate old table names
File f;
Random r = new Random(System.currentTimeMillis());
for (int i = 0; i < tablefile.length; i++) {
if ((tablefile[i].startsWith(prefix)) &&
(tablefile[i].charAt(prefix.length()) == '.') &&
(tablefile[i].length() == prefix.length() + 7)) {
f = new File(path, tablefile[i]);
String newname = tablefile[i] + "0100000" + (Long.toString(r.nextLong())+"00000").substring(1,5) + ".table";
f.renameTo(new File(path, newname));
}
}
tablefile = path.list();
// first pass: find tables // first pass: find tables
final HashMap<String, Long> t = new HashMap<String, Long>(); final HashMap<String, Long> t = new HashMap<String, Long>();
long ram, sum = 0; long ram, sum = 0, time, maxtime = 0;
File f; Date d;
for (int i = 0; i < tablefile.length; i++) { for (int i = 0; i < tablefile.length; i++) {
if ((tablefile[i].startsWith(tablename)) && if ((tablefile[i].startsWith(prefix)) &&
(tablefile[i].charAt(tablename.length()) == '.') && (tablefile[i].charAt(prefix.length()) == '.') &&
(tablefile[i].length() == tablename.length() + 7)) { (tablefile[i].length() == prefix.length() + 24)) {
f = new File(path, tablefile[i]); f = new File(path, tablefile[i]);
try {
d = DateFormatter.parseShortMilliSecond(tablefile[i].substring(prefix.length() + 1, prefix.length() + 18));
} catch (ParseException e) {
e.printStackTrace();
continue;
}
time = d.getTime();
if (time > maxtime) {
current = tablefile[i];
maxtime = time;
}
if (f.isDirectory()) { if (f.isDirectory()) {
ram = FlexTable.staticRAMIndexNeed(path, tablefile[i], rowdef); ram = FlexTable.staticRAMIndexNeed(path, tablefile[i], rowdef);
} else { } else {
@ -146,8 +198,7 @@ public class SplitTable implements ObjectIndex {
// open next biggest table // open next biggest table
t.remove(maxf); t.remove(maxf);
if(maxf != null) { if (maxf != null) {
date = maxf.substring(tablename.length() + 1);
f = new File(path, maxf); f = new File(path, maxf);
if (f.isDirectory()) { if (f.isDirectory()) {
// this is a kelonodroFlex table // this is a kelonodroFlex table
@ -156,7 +207,7 @@ public class SplitTable implements ObjectIndex {
} }
Log.logInfo("kelondroSplitTable", "opening partial eco table " + f); Log.logInfo("kelondroSplitTable", "opening partial eco table " + f);
table = new EcoTable(f, rowdef, EcoTable.tailCacheUsageAuto, EcoFSBufferSize, 0); table = new EcoTable(f, rowdef, EcoTable.tailCacheUsageAuto, EcoFSBufferSize, 0);
tables.put(date, table); tables.put(maxf, table);
} }
} }
} }
@ -165,7 +216,7 @@ public class SplitTable implements ObjectIndex {
this.close(); this.close();
final String[] l = path.list(); final String[] l = path.list();
for (int i = 0; i < l.length; i++) { for (int i = 0; i < l.length; i++) {
if (l[i].startsWith(tablename)) { if (l[i].startsWith(prefix)) {
final File f = new File(path, l[i]); final File f = new File(path, l[i]);
if (f.isDirectory()) FlexWidthArray.delete(path, l[i]); else FileUtils.deletedelete(f); if (f.isDirectory()) FlexWidthArray.delete(path, l[i]); else FileUtils.deletedelete(f);
} }
@ -174,24 +225,7 @@ public class SplitTable implements ObjectIndex {
} }
public String filename() { public String filename() {
return new File(path, tablename).toString(); return new File(path, prefix).toString();
}
private static final Calendar thisCalendar = Calendar.getInstance();
public static final String dateSuffix(final Date date) {
int month, year;
final StringBuilder suffix = new StringBuilder(6);
synchronized (thisCalendar) {
thisCalendar.setTime(date);
month = thisCalendar.get(Calendar.MONTH) + 1;
year = thisCalendar.get(Calendar.YEAR);
}
if ((year < 1970) && (year >= 70)) suffix.append("19").append(Integer.toString(year));
else if (year < 1970) suffix.append("20").append(Integer.toString(year));
else if (year > 3000) return null;
else suffix.append(Integer.toString(year));
if (month < 10) suffix.append("0").append(Integer.toString(month)); else suffix.append(Integer.toString(month));
return new String(suffix);
} }
public int size() { public int size() {
@ -223,61 +257,50 @@ public class SplitTable implements ObjectIndex {
return keeper.get(key); return keeper.get(key);
} }
/**
 * Opens an EcoTable on a newly generated file name and makes it the current
 * write target.
 * The table is registered in the tables map under its file name.
 *
 * @return the table that was just opened
 */
private ObjectIndex newTable() {
    // the generated name becomes the current one before the table is opened
    this.current = newFilename();
    final EcoTable fresh = new EcoTable(
            new File(path, this.current),
            rowdef,
            EcoTable.tailCacheDenyUsage,
            EcoFSBufferSize,
            0);
    tables.put(this.current, fresh);
    return fresh;
}
/**
 * Checks whether the given table may still take new rows.
 * The creation time is parsed from the 17-character timestamp that follows
 * "prefix." in the table's file name. A replacement is created through
 * newTable() when the table is older than fileAgeLimit, when its file has
 * reached fileSizeLimit bytes, or when no table was passed in.
 *
 * @param table the table currently used for writing; may be null
 * @return the given table if it is still usable, otherwise a new table
 */
private ObjectIndex checkTable(ObjectIndex table) {
    // addUnique() treats a missing current table as "create one"; do the same
    // here so that put() and replace() cannot fail with a NullPointerException
    if (table == null) return newTable();

    final String name = new File(table.filename()).getName();
    long created;
    try {
        created = DateFormatter.parseShortMilliSecond(name.substring(prefix.length() + 1, prefix.length() + 18)).getTime();
    } catch (ParseException e) {
        e.printStackTrace();
        created = 0; // unparseable timestamp: treat the file as created at time 0
    }

    // compare by subtraction: "created + fileAgeLimit" overflows for very large
    // limits (e.g. Long.MAX_VALUE) and would then force a new table on every call
    final boolean tooOld = System.currentTimeMillis() - created > this.fileAgeLimit;
    if (tooOld || new File(this.path, name).length() >= this.fileSizeLimit) {
        return newTable();
    }
    return table;
}
public synchronized void put(final List<Row.Entry> rows) throws IOException { public synchronized void put(final List<Row.Entry> rows) throws IOException {
throw new UnsupportedOperationException("not yet implemented"); for (Row.Entry entry: rows) put(entry);
} }
public synchronized Row.Entry replace(final Row.Entry row) throws IOException { public synchronized Row.Entry replace(final Row.Entry row) throws IOException {
assert row.objectsize() <= this.rowdef.objectsize; assert row.objectsize() <= this.rowdef.objectsize;
final ObjectIndex keeper = keeperOf(row.getColBytes(0)); ObjectIndex keeper = keeperOf(row.getColBytes(0));
if (keeper != null) return keeper.replace(row); if (keeper != null) return keeper.replace(row);
Date entryDate = new Date(); keeper = (this.current == null) ? newTable() : checkTable(this.tables.get(this.current));
final String suffix = dateSuffix(entryDate); keeper.put(row);
if (suffix == null) return null;
ObjectIndex table = tables.get(suffix);
if (table == null) {
// open table
final File f = new File(path, tablename + "." + suffix);
if (f.exists()) {
if (f.isDirectory()) {
FlexTable.delete(path, tablename + "." + suffix);
}
// open a eco table
table = new EcoTable(f, rowdef, EcoTable.tailCacheDenyUsage, EcoFSBufferSize, 0);
} else {
// make new table
table = new EcoTable(f, rowdef, EcoTable.tailCacheDenyUsage, EcoFSBufferSize, 0);
}
tables.put(suffix, table);
}
table.put(row);
return null; return null;
} }
public synchronized void put(final Row.Entry row) throws IOException { public synchronized void put(final Row.Entry row) throws IOException {
assert row.objectsize() <= this.rowdef.objectsize; assert row.objectsize() <= this.rowdef.objectsize;
final ObjectIndex keeper = keeperOf(row.getColBytes(0)); ObjectIndex keeper = keeperOf(row.getColBytes(0));
if (keeper != null) {keeper.put(row); return;} if (keeper != null) {keeper.put(row); return;}
Date entryDate = new Date(); keeper = (this.current == null) ? newTable() : checkTable(this.tables.get(this.current));
final String suffix = dateSuffix(entryDate); keeper.put(row);
if (suffix == null) return;
ObjectIndex table = tables.get(suffix);
if (table == null) {
// open table
final File f = new File(path, tablename + "." + suffix);
if (f.exists()) {
if (f.isDirectory()) {
FlexTable.delete(path, tablename + "." + suffix);
}
// open a eco table
table = new EcoTable(f, rowdef, EcoTable.tailCacheDenyUsage, EcoFSBufferSize, 0);
} else {
// make new table
table = new EcoTable(f, rowdef, EcoTable.tailCacheDenyUsage, EcoFSBufferSize, 0);
}
tables.put(suffix, table);
}
table.put(row);
} }
public synchronized ObjectIndex keeperOf(final byte[] key) { public synchronized ObjectIndex keeperOf(final byte[] key) {
@ -329,15 +352,8 @@ public class SplitTable implements ObjectIndex {
public synchronized void addUnique(final Row.Entry row) throws IOException { public synchronized void addUnique(final Row.Entry row) throws IOException {
assert row.objectsize() <= this.rowdef.objectsize; assert row.objectsize() <= this.rowdef.objectsize;
Date entryDate = new Date(); ObjectIndex table = (this.current == null) ? null : tables.get(this.current);
final String suffix = dateSuffix(entryDate); if (table == null) table = newTable(); else table = checkTable(table);
if (suffix == null) return;
ObjectIndex table = tables.get(suffix);
if (table == null) {
// make new table
table = new EcoTable(new File(path, tablename + "." + suffix), rowdef, EcoTable.tailCacheDenyUsage, EcoFSBufferSize, 0);
tables.put(suffix, table);
}
table.addUnique(row); table.addUnique(row);
} }
@ -401,26 +417,6 @@ public class SplitTable implements ObjectIndex {
} }
return StackIterator.stack(c); return StackIterator.stack(c);
} }
/**
 * Dummy method; no value is computed.
 *
 * @return always -1
 */
public final int cacheObjectChunkSize() {
    final int notAvailable = -1;
    return notAvailable;
}
/**
 * Dummy method; no status values are collected.
 *
 * @return always null
 */
public long[] cacheObjectStatus() {
    final long[] noStatus = null;
    return noStatus;
}
/**
 * Reports the size that the node cache uses for a single entry.
 * This implementation is a placeholder.
 *
 * @return always -1
 */
public final int cacheNodeChunkSize() {
    final int notAvailable = -1;
    return notAvailable;
}
/**
 * Returns a collection of node cache status values.
 * This implementation reports ten zeros.
 *
 * @return a newly allocated array of ten zero entries
 */
public final int[] cacheNodeStatus() {
    // a new int array is zero-filled, which matches the ten literal zeros
    return new int[10];
}
public synchronized void close() { public synchronized void close() {
if (tables == null) return; if (tables == null) return;
@ -436,10 +432,6 @@ public class SplitTable implements ObjectIndex {
} }
this.tables = null; this.tables = null;
} }
/**
 * Prints the date suffix for the current time to standard output.
 *
 * @param args ignored
 */
public static void main(final String[] args) {
    final String suffixForNow = dateSuffix(new Date());
    System.out.println(suffixForNow);
}
public void deleteOnExit() { public void deleteOnExit() {
for (ObjectIndex i: this.tables.values()) i.deleteOnExit(); for (ObjectIndex i: this.tables.values()) i.deleteOnExit();

@ -40,6 +40,7 @@ import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.IndexingStack; import de.anomic.crawler.IndexingStack;
import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.http.httpdProxyCacheEntry; import de.anomic.http.httpdProxyCacheEntry;
import de.anomic.kelondro.blob.BLOBArray;
import de.anomic.kelondro.order.Base64Order; import de.anomic.kelondro.order.Base64Order;
import de.anomic.kelondro.order.ByteOrder; import de.anomic.kelondro.order.ByteOrder;
import de.anomic.kelondro.text.BufferedIndex; import de.anomic.kelondro.text.BufferedIndex;
@ -69,8 +70,8 @@ public final class plasmaWordIndex {
public static final int wCacheMaxChunk = 800; // maximum number of references for each urlhash public static final int wCacheMaxChunk = 800; // maximum number of references for each urlhash
public static final int lowcachedivisor = 900; public static final int lowcachedivisor = 900;
public static final int maxCollectionPartition = 7; // should be 7 public static final int maxCollectionPartition = 7; // should be 7
public static final long targetFileSize = 100 * 1024 * 1024; public static final long targetFileSize = 100 * 1024 * 1024; // 100 MB
public static final long maxFileSize = Long.MAX_VALUE >> 1; public static final long maxFileSize = BLOBArray.oneGigabyte; // 1GB
public static final String CRAWL_PROFILE_PROXY = "proxy"; public static final String CRAWL_PROFILE_PROXY = "proxy";
public static final String CRAWL_PROFILE_REMOTE = "remote"; public static final String CRAWL_PROFILE_REMOTE = "remote";

Loading…
Cancel
Save