fine-tuning of rci-generation

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1105 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent 89fab9f200
commit 88e3234393

@ -1,3 +1,3 @@
cd `dirname $0`/..
java -classpath source:classes de.anomic.plasma.plasmaRankingCRProcess -accumulate .
java -classpath source:classes de.anomic.plasma.plasmaRankingCRProcess -recycle . 168
java -Xms300m -Xmx900m -classpath source:classes de.anomic.plasma.plasmaRankingCRProcess -accumulate .
java -Xms300m -Xmx900m -classpath source:classes de.anomic.plasma.plasmaRankingCRProcess -recycle . 168

@ -1,2 +1,2 @@
cd `dirname $0`/..
java -Xms300m -Xmx900m -classpath source:classes de.anomic.plasma.plasmaRankingCRProcess -genrci .
java -server -Xms1400m -Xmx1400m -classpath source:classes de.anomic.plasma.plasmaRankingCRProcess -genrci .

@ -67,7 +67,7 @@ public class kelondroAttrSeq {
// class objects
private File file;
private TreeMap entries;
private Map entries;
private Structure structure;
private String name;
private long created;
@ -75,20 +75,21 @@ public class kelondroAttrSeq {
// optional logger
protected Logger theLogger = null;
public kelondroAttrSeq(File file) throws IOException {
public kelondroAttrSeq(File file, boolean tree) throws IOException {
this.file = file;
this.structure = null;
this.created = 0;
this.name = "";
this.entries = readAttrFile(file);
this.entries = (tree) ? (Map) new TreeMap() : (Map) new HashMap();
readAttrFile(file);
}
public kelondroAttrSeq(String name, String struct) {
public kelondroAttrSeq(String name, String struct, boolean tree) {
this.file = null;
this.structure = new Structure(struct);
this.created = System.currentTimeMillis();
this.name = name;
this.entries = new TreeMap();
this.entries = (tree) ? (Map) new TreeMap() : (Map) new HashMap();
}
public void setLogger(Logger newLogger) {
@ -109,8 +110,7 @@ public class kelondroAttrSeq {
this.theLogger.warning("ATTRSEQ WARNING for file " + this.file + ": " + message);
}
private TreeMap readAttrFile(File file) throws IOException {
TreeMap entries = new TreeMap();
private void readAttrFile(File file) throws IOException {
BufferedReader br = null;
int p;
if (file.toString().endsWith(".gz")) {
@ -145,8 +145,10 @@ public class kelondroAttrSeq {
}
}
br.close();
return entries;
}
// returns the number of entries currently held in the entries map
public int size() {
return entries.size();
}
public long created() {
@ -162,12 +164,16 @@ public class kelondroAttrSeq {
sb.append("# ---"); sb.append((char) 13); sb.append((char) 10);
Iterator i = entries.entrySet().iterator();
Map.Entry entry;
String k,v;
String k;
Object v;
while (i.hasNext()) {
entry = (Map.Entry) i.next();
k = (String) entry.getKey();
v = (String) entry.getValue();
sb.append(k); sb.append('='); sb.append(v); sb.append((char) 13); sb.append((char) 10);
v = entry.getValue();
sb.append(k); sb.append('=');
if (v instanceof String) sb.append((String) v);
if (v instanceof Entry) sb.append(((Entry) v).toString());
sb.append((char) 13); sb.append((char) 10);
}
if (out.toString().endsWith(".gz")) {
serverFileUtils.writeAndZip(sb.toString().getBytes(), out);
@ -188,24 +194,33 @@ public class kelondroAttrSeq {
return new Entry(pivot, props, seq);
}
public void addEntry(String pivot, String attrseq) {
/*
public void putEntry(String pivot, String attrseq) {
entries.put(pivot, attrseq);
}
*/
public void addEntry(Entry entry) {
entries.put(entry.pivot, entry.toString());
public void putEntry(Entry entry) {
if (shortmem())
entries.put(entry.pivot, entry.toString());
else
entries.put(entry.pivot, entry);
}
public Entry getEntry(String pivot) {
String struct = (String) entries.get(pivot);
if (struct == null) return null;
return new Entry(pivot, struct);
Object e = entries.get(pivot);
if (e == null) return null;
if (e instanceof String) return new Entry(pivot, (String) e);
if (e instanceof Entry) return (Entry) e;
return null;
}
public Entry removeEntry(String pivot) {
String struct = (String) entries.remove(pivot);
if (struct == null) return null;
return new Entry(pivot, struct);
Object e = entries.remove(pivot);
if (e == null) return null;
if (e instanceof String) return new Entry(pivot, (String) e);
if (e instanceof Entry) return (Entry) e;
return null;
}
public class Structure {
@ -285,7 +300,7 @@ public class kelondroAttrSeq {
}
public String toString() {
StringBuffer sb = new StringBuffer(70);
StringBuffer sb = new StringBuffer(100);
sb.append('<'); sb.append(pivot_name); sb.append('-'); sb.append(Integer.toString(pivot_len)); sb.append(">,'=',");
if (prop_names.length > 0) {
for (int i = 0; i < prop_names.length; i++) {
@ -318,8 +333,7 @@ public class kelondroAttrSeq {
}
int p = attrseq.indexOf('|');
attrseq = attrseq.substring(p + 1);
for (int i = 0; i < attrseq.length(); i = i + structure.seq_len) {
for (int i = p + 1; i < attrseq.length(); i = i + structure.seq_len) {
seq.add(attrseq.substring(i, i + structure.seq_len));
}
}
@ -351,7 +365,7 @@ public class kelondroAttrSeq {
public String toString() {
// creates only the attribute field and the sequence, not the pivot
StringBuffer sb = new StringBuffer(70);
StringBuffer sb = new StringBuffer(100 + structure.seq_len * seq.size());
Long val;
for (int i = 0; i < structure.prop_names.length; i++) {
val = (Long) attrs.get(structure.prop_names[i]);
@ -366,17 +380,27 @@ public class kelondroAttrSeq {
}
}
private static final Runtime runtime = Runtime.getRuntime();
// number of shortmem() calls so far; drives the sampling interval below.
// This must be mutable: as a final constant 0 the modulo test was always
// true and the free-memory probe ran on every single call.
private static long cc = 0;
// outcome of the most recent free-memory probe
private static boolean shortmemstate = false;

/**
 * Reports whether the JVM is low on free memory.
 * The free memory is only probed on every 300th call (starting with the
 * first one); in between, the result of the last probe is returned.
 * Memory counts as short when less than 20000000 bytes are free.
 * Like the original, this method is not synchronized.
 *
 * @return true if the most recent probe found less than 20000000 free bytes
 */
private static boolean shortmem() {
    if ((cc++ % 300) == 0) {
        shortmemstate = (runtime.freeMemory() < 20000000L);
    }
    return shortmemstate;
}
public static void transcode(File from_file, File to_file) throws IOException {
kelondroAttrSeq crp = new kelondroAttrSeq(from_file);
kelondroAttrSeq crp = new kelondroAttrSeq(from_file, true);
//crp.toFile(new File(args[1]));
kelondroAttrSeq cro = new kelondroAttrSeq(crp.name + "/Transcoded from " + crp.file.getName(), crp.structure.toString());
kelondroAttrSeq cro = new kelondroAttrSeq(crp.name + "/Transcoded from " + crp.file.getName(), crp.structure.toString(), true);
Iterator i = crp.entries.keySet().iterator();
String key;
kelondroAttrSeq.Entry entry;
while (i.hasNext()) {
key = (String) i.next();
entry = crp.getEntry(key);
cro.addEntry(entry);
cro.putEntry(entry);
}
cro.toFile(to_file);
}

@ -67,7 +67,7 @@ public class plasmaRankingCRProcess {
// open file
kelondroAttrSeq source_cr = null;
try {
source_cr = new kelondroAttrSeq(f);
source_cr = new kelondroAttrSeq(f, false);
} catch (IOException e) {
return false;
}
@ -83,7 +83,7 @@ public class plasmaRankingCRProcess {
new_entry = source_cr.getEntry(key);
new_flags = new bitfield(serverCodings.enhancedCoder.encodeBase64Long((long) new_entry.getAttr("Flags", 0), 1).getBytes());
// enrich information with additional values
if ((acc_entry = acc.removeEntry(key)) != null) {
if ((acc_entry = acc.getEntry(key)) != null) {
FUDate = (int) acc_entry.getAttr("FUDate", 0);
FDDate = (int) acc_entry.getAttr("FDDate", 0);
LUDate = (int) acc_entry.getAttr("LUDate", 0);
@ -130,7 +130,7 @@ public class plasmaRankingCRProcess {
acc_entry.setAttr("ACount", (long) ACount);
acc_entry.setAttr("VCount", (long) VCount);
acc_entry.setAttr("Vita", (long) Vita);
acc.addEntry(acc_entry);
acc.putEntry(acc_entry);
}
return true;
@ -161,10 +161,10 @@ public class plasmaRankingCRProcess {
"<Referee-12>,'='," +
"<UDate-3>,<VDate-3>,<LCount-2>,<GCount-2>,<ICount-2>,<DCount-2>,<TLength-3>,<WACount-3>,<WUCount-3>,<Flags-1>," +
"<FUDate-3>,<FDDate-3>,<LUDate-3>,<UCount-2>,<PCount-2>,<ACount-2>,<VCount-2>,<Vita-2>," +
"'|',*<Anchor-12>");
"'|',*<Anchor-12>", false);
acc.toFile(to_file);
}
acc = new kelondroAttrSeq(to_file);
acc = new kelondroAttrSeq(to_file, false);
// collect source files
kelondroAttrSeq source_cr = null;
@ -202,21 +202,25 @@ public class plasmaRankingCRProcess {
}
public static void genrci(File cr_in, File rci_out) throws IOException {
if (!(cr_in.exists())) return;
kelondroAttrSeq cr = new kelondroAttrSeq(cr_in);
kelondroAttrSeq rci;
public static int genrci(File cr_in, File rci_out) throws IOException {
if (!(cr_in.exists())) return 0;
final kelondroAttrSeq cr = new kelondroAttrSeq(cr_in, false);
if (rci_out.exists()) rci_out.delete(); // we want only fresh rci here (during testing)
if (!(rci_out.exists())) {
rci = new kelondroAttrSeq("Global Ranking Reverse Citation Index",
kelondroAttrSeq rcix = new kelondroAttrSeq("Global Ranking Reverse Citation Index",
"<AnchorDom-6>,'='," +
"<UDate-3>," +
"'|',*<Referee-12>");
rci.toFile(rci_out);
"'|',*<Referee-12>", false);
rcix.toFile(rci_out);
}
rci = new kelondroAttrSeq(rci_out);
final kelondroAttrSeq rci = new kelondroAttrSeq(rci_out, false);
// loop over all referees
Iterator i = cr.keys();
int count = 0;
int size = cr.size();
long start = System.currentTimeMillis();
long l;
final Iterator i = cr.keys();
String referee, anchor, anchorDom;
kelondroAttrSeq.Entry cr_entry, rci_entry;
long cr_UDate, rci_UDate;
@ -233,7 +237,7 @@ public class plasmaRankingCRProcess {
if (anchor.length() == 6) anchorDom = anchor; else anchorDom = anchor.substring(6);
// update domain-specific entry
rci_entry = rci.removeEntry(anchorDom);
rci_entry = rci.getEntry(anchorDom);
if (rci_entry == null) rci_entry = rci.newEntry(anchorDom);
rci_entry.addSeq(referee);
@ -242,12 +246,18 @@ public class plasmaRankingCRProcess {
if (cr_UDate > rci_UDate) rci_entry.setAttr("UDate", cr_UDate);
// insert entry
rci.addEntry(rci_entry);
rci.putEntry(rci_entry);
}
count++;
if ((count % 1000) == 0) {
l = java.lang.Math.max(1, (System.currentTimeMillis() - start) / 1000);
System.out.println("processed " + count + " citations, " + (count / l) + " per second, rci.size = " + rci.size() + ", " + ((size - count) / (count / l)) + " seconds remaining; mem = " + Runtime.getRuntime().freeMemory());
}
}
// finished. write to file
rci.toFile(rci_out);
return count;
}
public static void main(String[] args) {
@ -270,7 +280,11 @@ public class plasmaRankingCRProcess {
if (!(acc_dir.exists())) acc_dir.mkdirs();
if (!(to_file.getParentFile().exists())) to_file.getParentFile().mkdirs();
serverFileUtils.moveAll(from_dir, ready_dir);
long start = System.currentTimeMillis();
int files = ready_dir.list().length;
accumulate(ready_dir, tmp_dir, err_dir, acc_dir, to_file);
long seconds = java.lang.Math.max(1, (System.currentTimeMillis() - start) / 1000);
System.out.println("Finished accumulate for " + files + " files in " + seconds + " seconds (" + (files / seconds) + " files/second)");
}
if ((args.length == 3) && (args[0].equals("-recycle"))) {
File root_path = new File(args[1]);
@ -282,12 +296,14 @@ public class plasmaRankingCRProcess {
if (!(acc_dir.exists())) return;
if (!(bkp_dir.exists())) bkp_dir.mkdirs();
String[] list = acc_dir.list();
long start = System.currentTimeMillis();
int files = list.length;
long d;
File f;
for (int i = 0; i < list.length; i++) {
f = new File(acc_dir, list[i]);
try {
d = (System.currentTimeMillis() - (new kelondroAttrSeq(f)).created()) / 3600000;
d = (System.currentTimeMillis() - (new kelondroAttrSeq(f, false)).created()) / 3600000;
if (d > max_age_hours) {
// file is considered to be too old, it is not recycled
System.out.println("file " + f.getName() + " is old (" + d + " hours) and not recycled, only moved to backup");
@ -304,13 +320,18 @@ public class plasmaRankingCRProcess {
f.delete();
}
}
long seconds = java.lang.Math.max(1, (System.currentTimeMillis() - start) / 1000);
System.out.println("Finished recycling of " + files + " files in " + seconds + " seconds (" + (files / seconds) + " files/second)");
}
if ((args.length == 2) && (args[0].equals("-genrci"))) {
File root_path = new File(args[1]);
File cr_file = new File(root_path, "DATA/RANKING/GLOBAL/020_con0/CRG-a-acc.cr.gz");
File rci_file = new File(root_path, "DATA/RANKING/GLOBAL/030_rci0/RCI-0.rci.gz");
rci_file.getParentFile().mkdirs();
genrci(cr_file, rci_file);
long start = System.currentTimeMillis();
int count = genrci(cr_file, rci_file);
long seconds = java.lang.Math.max(1, (System.currentTimeMillis() - start) / 1000);
System.out.println("Finished RCI generation: " + count + " citation references in " + seconds + " seconds (" + (count / seconds) + " CR-records/second)");
}
} catch (IOException e) {
e.printStackTrace();

@ -81,26 +81,27 @@ public final class serverCodings {
public char encodeBase64Byte(byte b) {
return alpha[b];
return (char) alpha[b];
}
// returns the entry of the ahpla table at index b
// NOTE(review): presumably ahpla is the reverse mapping of the alpha table - confirm at its definition
// there is no range check here; b is used directly as the array index
public byte decodeBase64Byte(char b) {
return ahpla[b];
}
public String encodeBase64LongSmart(long c, int length) {
if (c >= maxBase64(length)) {
StringBuffer s = new StringBuffer(length);
s.setLength(length);
while (length > 0) {
s.insert(0,alpha[0x3F]);
length--;
s.setCharAt(--length, alpha[0]);
}
return s.toString();
} else {
return encodeBase64Long(c, length);
}
}
/*
public String encodeBase64Long(long c, int length) {
if (length < 0) length = 0;
StringBuffer s = new StringBuffer(length); //String s = "";
@ -118,6 +119,17 @@ public final class serverCodings {
while (s.length() < length) s.insert(0,alpha[0]); //s = alpha[0] + s;
return s.toString();
}
*/
/**
 * Encodes a number as a fixed-width string over the alpha table.
 * Every character carries six bits of c; the least significant six bits
 * go into the last character. The result always has exactly
 * <code>length</code> characters: bits of c beyond 6 * length are dropped,
 * and for non-negative c the unused leading positions hold alpha[0].
 *
 * @param c      the value to encode
 * @param length the number of characters to produce; a negative value is
 *               treated as 0, as the previous implementation did
 * @return the encoded string of exactly <code>length</code> characters
 */
public String encodeBase64Long(long c, int length) {
    // without this clamp a negative length makes the StringBuffer
    // constructor throw a NegativeArraySizeException
    if (length < 0) length = 0;
    StringBuffer s = new StringBuffer(length);
    s.setLength(length);
    // fill from the right: lowest six bits first, then shift the next six down
    while (length > 0) {
        s.setCharAt(--length, alpha[(byte) (c & 0x3F)]);
        c >>= 6;
    }
    return s.toString();
}
public long decodeBase64Long(String s) {
while (s.endsWith("=")) s = s.substring(0, s.length() - 1);

@ -51,7 +51,7 @@ public class serverMemory {
private static final Runtime runtime = Runtime.getRuntime();
public static long free() {
// memory that is free without increasing of total memory takenn from os
// memory that is free without increasing of total memory taken from os
return runtime.freeMemory();
}

Loading…
Cancel
Save