first try to implement an rci-computation from cr-files

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1103 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 20 years ago
parent 23dc904e0e
commit a12759c1bf

@ -1,2 +1,3 @@
cd `dirname $0`/..
java -classpath source:classes de.anomic.plasma.plasmaRankingCRFile -accumulate .
java -classpath source:classes de.anomic.plasma.plasmaRankingCRProcess -accumulate .
java -classpath source:classes de.anomic.plasma.plasmaRankingCRProcess -recycle . 168

@ -0,0 +1,2 @@
cd `dirname $0`/..
java -Xms300m -Xmx900m -classpath source:classes de.anomic.plasma.plasmaRankingCRProcess -genrci .

@ -1,2 +0,0 @@
cd `dirname $0`/..
java -classpath source:classes de.anomic.plasma.plasmaRankingCRFile -recycle . 168

@ -80,7 +80,7 @@ public class kelondroAttrSeq {
this.structure = null;
this.created = 0;
this.name = "";
this.entries = readPropFile(file);
this.entries = readAttrFile(file);
}
public kelondroAttrSeq(String name, String struct) {
@ -95,14 +95,21 @@ public class kelondroAttrSeq {
this.theLogger = newLogger;
}
/**
 * Reports an informational message, prefixed with "ATTRSEQ INFO for file" and this
 * object's file. The message goes to the configured logger when one is set and to
 * standard error otherwise.
 *
 * @param message the text to report
 */
public void logInfo(String message) {
    final String line = "ATTRSEQ INFO for file " + this.file + ": " + message;
    if (this.theLogger != null) {
        this.theLogger.info(line);
        return;
    }
    // no logger configured: fall back to the error stream
    System.err.println(line);
}
public void logWarning(String message) {
if (this.theLogger == null)
System.err.println("KELONDRO WARNING for file " + this.file + ": " + message);
System.err.println("ATTRSEQ WARNING for file " + this.file + ": " + message);
else
this.theLogger.warning("KELONDRO WARNING for file " + this.file + ": " + message);
this.theLogger.warning("ATTRSEQ WARNING for file " + this.file + ": " + message);
}
private TreeMap readPropFile(File file) throws IOException {
private TreeMap readAttrFile(File file) throws IOException {
TreeMap entries = new TreeMap();
BufferedReader br = null;
int p;
@ -173,6 +180,10 @@ public class kelondroAttrSeq {
return entries.keySet().iterator();
}
/**
 * Creates an entry for the given pivot that starts with an empty attribute map
 * and an empty sequence.
 *
 * @param pivot the pivot of the new entry
 * @return the newly created entry
 */
public Entry newEntry(String pivot) {
    HashMap emptyProps = new HashMap();
    TreeSet emptySeq = new TreeSet();
    return new Entry(pivot, emptyProps, emptySeq);
}
/**
 * Creates an entry from a pivot, an attribute map and a sequence.
 *
 * @param pivot the pivot of the new entry
 * @param props the attribute map handed to the entry
 * @param seq   the sequence handed to the entry
 * @return the newly created entry
 */
public Entry newEntry(String pivot, HashMap props, TreeSet seq) {
    Entry created = new Entry(pivot, props, seq);
    return created;
}
@ -334,6 +345,10 @@ public class kelondroAttrSeq {
this.seq = seq;
}
/**
 * Adds one element to this entry's sequence.
 *
 * @param s the element to add
 */
public void addSeq(String s) {
    seq.add(s);
}
public String toString() {
// creates only the attribute field and the sequence, not the pivot
StringBuffer sb = new StringBuffer(70);
@ -377,28 +392,4 @@ public class kelondroAttrSeq {
}
}
/*
Class-A File format:
UDate : latest update timestamp of the URL (as virtual date, hours since epoch)
VDate : last visit timestamp of the URL (as virtual date, hours since epoch)
LCount : count of links to local resources
GCount : count of links to global resources
ICount : count of links to images (in document)
DCount : count of links to other documents
TLength: length of the plain text content (bytes)
WACount: total number of all words in content
WUCount: number of unique words in content (removed doubles)
Flags : Flags (0=update, 1=popularity, 2=attention, 3=vote)
Class-a File format is an extension of Class-A plus the following attributes
FUDate : first update timestamp of the URL
FDDate : first update timestamp of the domain
LUDate : latest update timestamp of the URL
UCount : Update Counter (of 'latest update timestamp')
PCount : Popularity Counter (proxy clicks)
ACount : Attention Counter (search result clicks)
VCount : Votes
Vita : Vitality (normed number of updates per time)
*/
}

@ -1,4 +1,4 @@
// plasmaCRFile.java
// plasmaCRProcess.java
// -----------------------
// part of YaCy
// (C) by Michael Peter Christen; mc@anomic.de
@ -54,7 +54,7 @@ import de.anomic.server.serverCodings;
import de.anomic.server.serverFileUtils;
import de.anomic.tools.bitfield;
public class plasmaRankingCRFile {
public class plasmaRankingCRProcess {
/*
header.append("# Name=YaCy " + ((type.equals("crl")) ? "Local" : "Global") + " Citation Reference Ticket"); header.append((char) 13); header.append((char) 10);
@ -202,8 +202,52 @@ public class plasmaRankingCRFile {
}
public static long crFileCreated(File f) throws IOException {
return (new kelondroAttrSeq(f)).created();
/**
 * Builds or updates a reverse citation index (RCI) file from a citation reference (CR) file.
 *
 * For every referee entry of the CR file, each anchor in the entry's sequence is reduced
 * to a domain key. The referee is added to the sequence of the RCI entry stored under that
 * key, and the RCI entry's "UDate" attribute is raised to the CR entry's "UDate" when the
 * latter is larger. The updated index is then written to rci_out.
 *
 * If cr_in does not exist the method returns without doing anything. If rci_out does not
 * exist, an empty index with the RCI structure is written first and then loaded.
 *
 * @param cr_in   the CR file to read
 * @param rci_out the RCI file to create or update
 * @throws IOException propagated from reading or writing the attribute-sequence files
 */
public static void genrci(File cr_in, File rci_out) throws IOException {
    if (!cr_in.exists()) return;
    kelondroAttrSeq cr = new kelondroAttrSeq(cr_in);

    // make sure an index file exists, then always load the index from disk
    if (!rci_out.exists()) {
        kelondroAttrSeq fresh = new kelondroAttrSeq("Global Ranking Reverse Citation Index",
            "<AnchorDom-6>,'='," +
            "<UDate-3>," +
            "'|',*<Referee-12>");
        fresh.toFile(rci_out);
    }
    kelondroAttrSeq rci = new kelondroAttrSeq(rci_out);

    // loop over all referees
    Iterator referees = cr.keys();
    while (referees.hasNext()) {
        String referee = (String) referees.next();
        kelondroAttrSeq.Entry cr_entry = cr.getEntry(referee);
        long cr_UDate = cr_entry.getAttr("UDate", 0);

        // loop over all anchors
        Iterator anchors = cr_entry.getSeq().iterator();
        while (anchors.hasNext()) {
            String anchor = (String) anchors.next();

            // An anchor shorter than 6 characters cannot yield a domain key; substring(6)
            // would throw an unchecked exception and abort the run before anything is
            // written. Skip such malformed anchors instead.
            if (anchor.length() < 6) continue;

            // a 6-character anchor is used as the key directly; otherwise the key is
            // everything from index 6 onward
            // NOTE(review): presumably 12-character anchors carry the domain part in their
            // last 6 characters - confirm against the CR file format
            String anchorDom = (anchor.length() == 6) ? anchor : anchor.substring(6);

            // update domain-specific entry
            kelondroAttrSeq.Entry rci_entry = rci.removeEntry(anchorDom);
            if (rci_entry == null) rci_entry = rci.newEntry(anchorDom);
            rci_entry.addSeq(referee);

            // keep the most recent update date seen for this domain
            long rci_UDate = rci_entry.getAttr("UDate", 0);
            if (cr_UDate > rci_UDate) rci_entry.setAttr("UDate", cr_UDate);

            // insert entry
            rci.addEntry(rci_entry);
        }
    }

    // finished. write to file
    rci.toFile(rci_out);
}
public static void main(String[] args) {
@ -243,7 +287,7 @@ public class plasmaRankingCRFile {
for (int i = 0; i < list.length; i++) {
f = new File(acc_dir, list[i]);
try {
d = (System.currentTimeMillis() - crFileCreated(f)) / 3600000;
d = (System.currentTimeMillis() - (new kelondroAttrSeq(f)).created()) / 3600000;
if (d > max_age_hours) {
// file is considered to be too old, it is not recycled
System.out.println("file " + f.getName() + " is old (" + d + " hours) and not recycled, only moved to backup");
@ -261,6 +305,13 @@ public class plasmaRankingCRFile {
}
}
}
if ((args.length == 2) && (args[0].equals("-genrci"))) {
File root_path = new File(args[1]);
File cr_file = new File(root_path, "DATA/RANKING/GLOBAL/020_con0/CRG-a-acc.cr.gz");
File rci_file = new File(root_path, "DATA/RANKING/GLOBAL/030_rci0/RCI-0.rci.gz");
rci_file.getParentFile().mkdirs();
genrci(cr_file, rci_file);
}
} catch (IOException e) {
e.printStackTrace();
}
Loading…
Cancel
Save