From a12759c1bf8829659622f59bfa7cde2e77aa2f0c Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 17 Nov 2005 16:17:56 +0000 Subject: [PATCH] first try to implement a rci-computation from cr-files git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1103 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- bin/cr_accumulate | 3 +- bin/cr_genrci | 2 + bin/cr_recycle | 2 - .../de/anomic/kelondro/kelondroAttrSeq.java | 47 ++++++-------- ...RFile.java => plasmaRankingCRProcess.java} | 61 +++++++++++++++++-- 5 files changed, 79 insertions(+), 36 deletions(-) create mode 100755 bin/cr_genrci delete mode 100755 bin/cr_recycle rename source/de/anomic/plasma/{plasmaRankingCRFile.java => plasmaRankingCRProcess.java} (85%) diff --git a/bin/cr_accumulate b/bin/cr_accumulate index f3609905c..968dea72b 100755 --- a/bin/cr_accumulate +++ b/bin/cr_accumulate @@ -1,2 +1,3 @@ cd `dirname $0`/.. -java -classpath source:classes de.anomic.plasma.plasmaRankingCRFile -accumulate . +java -classpath source:classes de.anomic.plasma.plasmaRankingCRProcess -accumulate . +java -classpath source:classes de.anomic.plasma.plasmaRankingCRProcess -recycle . 168 diff --git a/bin/cr_genrci b/bin/cr_genrci new file mode 100755 index 000000000..33614f4b0 --- /dev/null +++ b/bin/cr_genrci @@ -0,0 +1,2 @@ +cd `dirname $0`/.. +java -Xms300m -Xmx900m -classpath source:classes de.anomic.plasma.plasmaRankingCRProcess -genrci . diff --git a/bin/cr_recycle b/bin/cr_recycle deleted file mode 100755 index f9e002ba6..000000000 --- a/bin/cr_recycle +++ /dev/null @@ -1,2 +0,0 @@ -cd `dirname $0`/.. -java -classpath source:classes de.anomic.plasma.plasmaRankingCRFile -recycle . 168 diff --git a/source/de/anomic/kelondro/kelondroAttrSeq.java b/source/de/anomic/kelondro/kelondroAttrSeq.java index 13d56c361..2b321429c 100644 --- a/source/de/anomic/kelondro/kelondroAttrSeq.java +++ b/source/de/anomic/kelondro/kelondroAttrSeq.java @@ -80,7 +80,7 @@ public class kelondroAttrSeq { this.structure = null; this.created = 0; this.name = ""; - this.entries = readPropFile(file); + this.entries = readAttrFile(file); } public kelondroAttrSeq(String name, String struct) { @@ -95,14 +95,21 @@ public class kelondroAttrSeq { this.theLogger = newLogger; } + public void logInfo(String message) { + if (this.theLogger == null) + System.err.println("ATTRSEQ INFO for file " + this.file + ": " + message); + else + this.theLogger.info("ATTRSEQ INFO for file " + this.file + ": " + message); + } + public void logWarning(String message) { if (this.theLogger == null) - System.err.println("KELONDRO WARNING for file " + this.file + ": " + message); + System.err.println("ATTRSEQ WARNING for file " + this.file + ": " + message); else - this.theLogger.warning("KELONDRO WARNING for file " + this.file + ": " + message); + this.theLogger.warning("ATTRSEQ WARNING for file " + this.file + ": " + message); } - private TreeMap readPropFile(File file) throws IOException { + private TreeMap readAttrFile(File file) throws IOException { TreeMap entries = new TreeMap(); BufferedReader br = null; int p; @@ -173,6 +180,10 @@ public class kelondroAttrSeq { return entries.keySet().iterator(); } + public Entry newEntry(String pivot) { + return new Entry(pivot, new HashMap(), new TreeSet()); + } + public Entry newEntry(String pivot, HashMap props, TreeSet seq) { return new Entry(pivot, props, seq); } @@ -334,6 +345,10 @@ public class kelondroAttrSeq { this.seq = seq; } + public void addSeq(String s) { + this.seq.add(s); + } + public String toString() { // creates only the attribute field and the sequence, not the pivot StringBuffer sb = new StringBuffer(70); @@ -377,28 +392,4 @@ public class kelondroAttrSeq { } } - /* - Class-A File format: - - UDate : latest update timestamp of the URL (as virtual date, hours since epoch) - VDate : last visit timestamp of the URL (as virtual date, hours since epoch) - LCount : count of links to local resources - GCount : count of links to global resources - ICount : count of links to images (in document) - DCount : count of links to other documents - TLength: length of the plain text content (bytes) - WACount: total number of all words in content - WUCount: number of unique words in content (removed doubles) - Flags : Flags (0=update, 1=popularity, 2=attention, 3=vote) - - Class-a File format is an extension of Class-A plus the following attributes - FUDate : first update timestamp of the URL - FDDate : first update timestamp of the domain - LUDate : latest update timestamp of the URL - UCount : Update Counter (of 'latest update timestamp') - PCount : Popularity Counter (proxy clicks) - ACount : Attention Counter (search result clicks) - VCount : Votes - Vita : Vitality (normed number of updates per time) - */ } diff --git a/source/de/anomic/plasma/plasmaRankingCRFile.java b/source/de/anomic/plasma/plasmaRankingCRProcess.java similarity index 85% rename from source/de/anomic/plasma/plasmaRankingCRFile.java rename to source/de/anomic/plasma/plasmaRankingCRProcess.java index 7078cb7fc..3284101ef 100644 --- a/source/de/anomic/plasma/plasmaRankingCRFile.java +++ b/source/de/anomic/plasma/plasmaRankingCRProcess.java @@ -1,4 +1,4 @@ -// plasmaCRFile.java +// plasmaCRProcess.java // ----------------------- // part of YaCy // (C) by Michael Peter Christen; mc@anomic.de @@ -54,7 +54,7 @@ import de.anomic.server.serverCodings; import de.anomic.server.serverFileUtils; import de.anomic.tools.bitfield; -public class plasmaRankingCRFile { +public class plasmaRankingCRProcess { /* header.append("# Name=YaCy " + ((type.equals("crl")) ? "Local" : "Global") + " Citation Reference Ticket"); header.append((char) 13); header.append((char) 10); @@ -202,8 +202,52 @@ public class plasmaRankingCRFile { } - public static long crFileCreated(File f) throws IOException { - return (new kelondroAttrSeq(f)).created(); + public static void genrci(File cr_in, File rci_out) throws IOException { + if (!(cr_in.exists())) return; + kelondroAttrSeq cr = new kelondroAttrSeq(cr_in); + kelondroAttrSeq rci; + if (!(rci_out.exists())) { + rci = new kelondroAttrSeq("Global Ranking Reverse Citation Index", + ",'='," + + "," + + "'|',*"); + rci.toFile(rci_out); + } + rci = new kelondroAttrSeq(rci_out); + + // loop over all referees + Iterator i = cr.keys(); + String referee, anchor, anchorDom; + kelondroAttrSeq.Entry cr_entry, rci_entry; + long cr_UDate, rci_UDate; + while (i.hasNext()) { + referee = (String) i.next(); + cr_entry = cr.getEntry(referee); + cr_UDate = cr_entry.getAttr("UDate", 0); + + // loop over all anchors + Iterator j = cr_entry.getSeq().iterator(); + while (j.hasNext()) { + // get domain of anchors + anchor = (String) j.next(); + if (anchor.length() == 6) anchorDom = anchor; else anchorDom = anchor.substring(6); + + // update domain-specific entry + rci_entry = rci.removeEntry(anchorDom); + if (rci_entry == null) rci_entry = rci.newEntry(anchorDom); + rci_entry.addSeq(referee); + + // update Update-Date + rci_UDate = rci_entry.getAttr("UDate", 0); + if (cr_UDate > rci_UDate) rci_entry.setAttr("UDate", cr_UDate); + + // insert entry + rci.addEntry(rci_entry); + } + } + + // finished. write to file + rci.toFile(rci_out); } public static void main(String[] args) { @@ -243,7 +287,7 @@ public class plasmaRankingCRFile { for (int i = 0; i < list.length; i++) { f = new File(acc_dir, list[i]); try { - d = (System.currentTimeMillis() - crFileCreated(f)) / 3600000; + d = (System.currentTimeMillis() - (new kelondroAttrSeq(f)).created()) / 3600000; if (d > max_age_hours) { // file is considered to be too old, it is not recycled System.out.println("file " + f.getName() + " is old (" + d + " hours) and not recycled, only moved to backup"); @@ -261,6 +305,13 @@ public class plasmaRankingCRFile { } } } + if ((args.length == 2) && (args[0].equals("-genrci"))) { + File root_path = new File(args[1]); + File cr_file = new File(root_path, "DATA/RANKING/GLOBAL/020_con0/CRG-a-acc.cr.gz"); + File rci_file = new File(root_path, "DATA/RANKING/GLOBAL/030_rci0/RCI-0.rci.gz"); + rci_file.getParentFile().mkdirs(); + genrci(cr_file, rci_file); + } } catch (IOException e) { e.printStackTrace(); }