enhanced CR-file handling and added first RCI-evaluation tests

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1110 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 20 years ago
parent cfdc0c89c3
commit 0ec54d9c5f

@ -0,0 +1,2 @@
#!/bin/sh
# Starts the RCI evaluation (plasmaRankingRCIEvaluation -rcieval) with the
# parent directory of this script's directory as working directory and root path.
# The quoting keeps the cd working when the path contains blanks; if the cd
# fails we stop here instead of starting java from the wrong directory.
cd "$(dirname "$0")/.." || exit 1
java -server -Xms400m -Xmx400m -classpath source:classes de.anomic.plasma.plasmaRankingRCIEvaluation -rcieval .

@ -150,7 +150,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
public static String urlNormalform(URL url) {
boolean defaultPort = false;
serverLog.logFinest("htmlFilter", "urlNormalform: '" + url.toString() + "'");
//serverLog.logFinest("htmlFilter", "urlNormalform: '" + url.toString() + "'");
if (url.getProtocol().equals("http")) {
if (url.getPort() < 0 || url.getPort() == 80) { defaultPort = true; }
} else if (url.getProtocol().equals("ftp")) {

@ -231,7 +231,7 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt
// load the yellow-list
f = switchboard.getConfig("proxyYellowList", null);
if (f != null) {
yellowList = serverFileUtils.loadSet(f);
yellowList = serverFileUtils.loadList(f);
this.theLogger.logConfig("loaded yellow-list from file " + f + ", " + yellowList.size() + " entries");
} else {
yellowList = new HashSet();

@ -50,11 +50,10 @@ import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.Map;
import java.util.HashMap;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.Iterator;
import java.util.Map;
import java.util.ArrayList;
import java.util.StringTokenizer;
import java.util.zip.GZIPInputStream;
@ -186,11 +185,11 @@ public class kelondroAttrSeq {
return entries.keySet().iterator();
}
public Entry newEntry(String pivot) {
return new Entry(pivot, new HashMap(), new TreeSet());
public Entry newEntry(String pivot, boolean tree) {
return new Entry(pivot, new HashMap(), (tree) ? (Map) new TreeMap() : (Map) new HashMap());
}
public Entry newEntry(String pivot, HashMap props, TreeSet seq) {
public Entry newEntry(String pivot, HashMap props, Map seq) {
return new Entry(pivot, props, seq);
}
@ -210,7 +209,7 @@ public class kelondroAttrSeq {
public Entry getEntry(String pivot) {
Object e = entries.get(pivot);
if (e == null) return null;
if (e instanceof String) return new Entry(pivot, (String) e);
if (e instanceof String) return new Entry(pivot, (String) e, false);
if (e instanceof Entry) return (Entry) e;
return null;
}
@ -218,7 +217,7 @@ public class kelondroAttrSeq {
public Entry removeEntry(String pivot) {
Object e = entries.remove(pivot);
if (e == null) return null;
if (e instanceof String) return new Entry(pivot, (String) e);
if (e instanceof String) return new Entry(pivot, (String) e, false);
if (e instanceof Entry) return (Entry) e;
return null;
}
@ -229,8 +228,8 @@ public class kelondroAttrSeq {
protected int pivot_len = -1;
protected String[] prop_names = null;
protected int[] prop_len = null, prop_pos = null;
protected String seq_name = null;
protected int seq_len = -1;
protected String[] seq_names = null;
protected int[] seq_len = null, seq_pos = null;
// example:
//# Structure=<pivot-12>,'=',<UDate-3>,<VDate-3>,<LCount-2>,<GCount-2>,<ICount-2>,<DCount-2>,<TLength-3>,<WACount-3>,<WUCount-3>,<Flags-1>,'|',*<Anchor-12>
@ -273,17 +272,30 @@ public class kelondroAttrSeq {
}
// parse sequence definition:
a = atom(seqs);
if (a == null) return;
seq_name = (String) a[0];
seq_len = ((Integer) a[1]).intValue();
if (seqs.startsWith("*")) seqs = seqs.substring(1);
l = new ArrayList();
st = new StringTokenizer(seqs, ",");
while (st.hasMoreTokens()) {
a = atom(st.nextToken());
if (a == null) break;
l.add(a);
}
seq_names = new String[l.size()];
seq_len = new int[l.size()];
seq_pos = new int[l.size()];
p = 0;
for (int i = 0; i < l.size(); i++) {
a = (Object[]) l.get(i);
seq_names[i] = (String) a[0];
seq_len[i] = ((Integer) a[1]).intValue();
seq_pos[i] = p;
p += seq_len[i];
}
}
private Object[] atom(String a) {
if (a.startsWith("<")) {
a = a.substring(1);
} else if (a.startsWith("*<")) {
a = a.substring(2);
} else return null;
if (a.endsWith(">")) {
a = a.substring(0, a.length() - 1);
@ -307,8 +319,12 @@ public class kelondroAttrSeq {
sb.append('<'); sb.append(prop_names[i]); sb.append('-'); sb.append(Integer.toString(prop_len[i])); sb.append(">,");
}
}
sb.append("'|',");
sb.append("*<"); sb.append(seq_name); sb.append('-'); sb.append(Integer.toString(seq_len)); sb.append('>');
sb.append("'|'");
if (seq_names.length > 0) {
for (int i = 0; i < seq_names.length; i++) {
sb.append(",<"); sb.append(seq_names[i]); sb.append('-'); sb.append(Integer.toString(seq_len[i])); sb.append('>');
}
}
return sb.toString();
}
}
@ -316,25 +332,33 @@ public class kelondroAttrSeq {
public class Entry {
String pivot;
HashMap attrs;
TreeSet seq;
Map seq;
public Entry(String pivot, HashMap attrs, TreeSet seq) {
public Entry(String pivot, HashMap attrs, Map seq) {
this.pivot = pivot;
this.attrs = attrs;
this.seq = seq;
}
public Entry(String pivot, String attrseq) {
public Entry(String pivot, String attrseq, boolean tree) {
this.pivot = pivot;
attrs = new HashMap();
seq = new TreeSet();
seq = (tree) ? (Map) new TreeMap() : (Map) new HashMap();
for (int i = 0; i < structure.prop_names.length; i++) {
attrs.put(structure.prop_names[i], new Long(serverCodings.enhancedCoder.decodeBase64Long(attrseq.substring(structure.prop_pos[i], structure.prop_pos[i] + structure.prop_len[i]))));
}
int p = attrseq.indexOf('|');
for (int i = p + 1; i < attrseq.length(); i = i + structure.seq_len) {
seq.add(attrseq.substring(i, i + structure.seq_len));
int p = attrseq.indexOf('|') + 1;
long[] seqattrs = new long[structure.seq_names.length - 1];
String seqname;
while (p < attrseq.length()) {
seqname = attrseq.substring(p, p + structure.seq_len[0]);
p += structure.seq_len[0];
for (int i = 1; i < structure.seq_names.length; i++) {
seqattrs[i - 1] = serverCodings.enhancedCoder.decodeBase64Long(attrseq.substring(p, p + structure.seq_len[i]));
p += structure.seq_len[i];
}
seq.put(seqname, seqattrs);
}
}
@ -351,30 +375,37 @@ public class kelondroAttrSeq {
attrs.put(key, new Long(attr));
}
public TreeSet getSeq() {
public Map getSeq() {
return seq;
}
public void setSeq(TreeSet seq) {
public void setSeq(Map seq) {
this.seq = seq;
}
public void addSeq(String s) {
this.seq.add(s);
public void addSeq(String s, long[] seqattrs) {
this.seq.put(s, seqattrs);
}
public String toString() {
// creates only the attribute field and the sequence, not the pivot
StringBuffer sb = new StringBuffer(100 + structure.seq_len * seq.size());
StringBuffer sb = new StringBuffer(100 + structure.seq_len[0] * seq.size());
Long val;
for (int i = 0; i < structure.prop_names.length; i++) {
val = (Long) attrs.get(structure.prop_names[i]);
sb.append(serverCodings.enhancedCoder.encodeBase64LongSmart((val == null) ? 0 : val.longValue(), structure.prop_len[i]));
}
sb.append('|');
Iterator q = seq.iterator();
Iterator q = seq.entrySet().iterator();
Map.Entry entry;
long[] seqattrs;
while (q.hasNext()) {
sb.append((String) q.next());
entry = (Map.Entry) q.next();
sb.append((String) entry.getKey());
seqattrs = (long[]) entry.getValue();
for (int i = 1; i < structure.seq_names.length; i++) {
sb.append(serverCodings.enhancedCoder.encodeBase64Long(seqattrs[i - 1], structure.seq_len[i]));
}
}
return sb.toString();
}

@ -48,6 +48,7 @@ package de.anomic.plasma;
import java.io.File;
import java.io.IOException;
import java.util.Iterator;
import java.util.Map;
import de.anomic.kelondro.kelondroAttrSeq;
import de.anomic.server.serverCodings;
@ -230,16 +231,18 @@ public class plasmaRankingCRProcess {
cr_UDate = cr_entry.getAttr("UDate", 0);
// loop over all anchors
Iterator j = cr_entry.getSeq().iterator();
Iterator j = cr_entry.getSeq().entrySet().iterator();
Map.Entry entry;
while (j.hasNext()) {
// get domain of anchors
anchor = (String) j.next();
entry = (Map.Entry) j.next();
anchor = (String) entry.getKey();
if (anchor.length() == 6) anchorDom = anchor; else anchorDom = anchor.substring(6);
// update domain-specific entry
rci_entry = rci.getEntry(anchorDom);
if (rci_entry == null) rci_entry = rci.newEntry(anchorDom);
rci_entry.addSeq(referee);
if (rci_entry == null) rci_entry = rci.newEntry(anchorDom, false);
rci_entry.addSeq(referee, null);
// update Update-Date
rci_UDate = rci_entry.getAttr("UDate", 0);

@ -47,6 +47,7 @@ package de.anomic.plasma;
import java.io.IOException;
import java.io.File;
import java.util.Random;
import java.util.StringTokenizer;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacySeedDB;
@ -75,6 +76,19 @@ public final class plasmaRankingDistribution {
private String address[]; // of fixed other peer
private static Random random = new Random(System.currentTimeMillis());
// Creates a distribution whose fixed target peers are given as one
// comma-separated string (for example the value of a configuration property).
//   log        - logger kept for this distribution
//   sourcePath - directory kept as the source path
//   method     - distribution method key
//   percentage - percentage value kept for this distribution
//   addresses  - comma-separated peer addresses such as "host1:8080,host2:8000";
//                null or an empty string results in an empty address array
public plasmaRankingDistribution(serverLog log, File sourcePath, int method, int percentage, String addresses) {
    this.log = log;
    this.sourcePath = sourcePath;
    this.method = method;
    this.percentage = percentage;
    // a missing target setting means "no fixed peers", not a NullPointerException
    StringTokenizer st = new StringTokenizer((addresses == null) ? "" : addresses, ",");
    // countTokens() sizes the array directly, so the string is tokenized only once
    this.address = new String[st.countTokens()];
    int c = 0;
    while (st.hasMoreTokens()) {
        // tolerate blanks around the separators, e.g. "a:8080, b:8000"
        this.address[c++] = st.nextToken().trim();
    }
}
public plasmaRankingDistribution(serverLog log, File sourcePath, int method, int percentage, String address[]) {
this.log = log;
this.sourcePath = sourcePath;

@ -0,0 +1,145 @@
// plasmaRCIEvaluation.java
// -----------------------
// part of YaCy
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2005
// Created 18.11.2005
//
// $LastChangedDate: 2005-10-22 15:28:04 +0200 (Sat, 22 Oct 2005) $
// $LastChangedRevision: 968 $
// $LastChangedBy: theli $
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this softare or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follows this copyright notice here, but changes must not be
// done inside the copyright notive above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
package de.anomic.plasma;
import java.io.File;
import java.io.IOException;
import java.util.Iterator;
import java.util.HashMap;
import de.anomic.kelondro.kelondroAttrSeq;
import de.anomic.server.serverCodings;
import de.anomic.server.serverFileUtils;
import de.anomic.tools.bitfield;
public class plasmaRankingRCIEvaluation {
// Builds a histogram over the sequence sizes of all entries of an RCI file.
// Returns a table in which index n holds the number of entries whose sequence
// has exactly n elements; the table length is the largest size found plus one.
// Returns null if the given file does not exist.
public static int[] rcieval(File rci_file) throws IOException {
    if (!rci_file.exists()) {
        return null;
    }
    final kelondroAttrSeq rci = new kelondroAttrSeq(rci_file, false);

    // first pass: tally how many entries share each sequence size
    final HashMap histogram = new HashMap();
    int largest = 0;
    for (Iterator keys = rci.keys(); keys.hasNext();) {
        final String pivot = (String) keys.next();
        final kelondroAttrSeq.Entry current = rci.getEntry(pivot);
        final int refs = current.getSeq().size();
        if (refs > largest) {
            largest = refs;
        }
        final Integer bucket = new Integer(refs);
        final Integer seen = (Integer) histogram.get(bucket);
        final int updated = (seen == null) ? 1 : seen.intValue() + 1;
        histogram.put(bucket, new Integer(updated));
    }

    // second pass: flatten the tally into an array, sizes that never occurred count as 0
    final int[] table = new int[largest + 1];
    for (int refs = 0; refs < table.length; refs++) {
        final Integer seen = (Integer) histogram.get(new Integer(refs));
        table[refs] = (seen == null) ? 0 : seen.intValue();
    }
    return table;
}
// Adds up all elements of the given array.
// The total is accumulated as a long, so adding many int values
// does not wrap around the int range. An empty array yields 0.
public static long sum(int[] c) {
    long total = 0L;
    int idx = c.length;
    while (--idx >= 0) {
        total += c[idx];
    }
    return total;
}
// Computes partition positions over a count table.
//   counts - table of counts (as produced by rcieval)
//   parts  - number of positions to compute
// The limit starts at half of the total of all counts. Whenever the running
// sum exceeds the limit, the index before the current one is stored in the
// next free slot (slots are filled from the last one downwards), the limit is
// recomputed as (2 * limit - s + counts[i]) / 2 and the running sum restarts
// at counts[i]. Slot 0 is always set to counts.length - 1 at the end; slots
// that are never reached keep the value 0.
// For parts == 0 an empty array is returned.
public static int[] interval(int[] counts, int parts) {
    final int[] pos = new int[parts];
    // without slots there is nothing to fill; writing pos[0] below would fail
    if (parts == 0) return pos;

    // total of all counts, accumulated as long so it cannot wrap around
    long total = 0;
    for (int i = 0; i < counts.length; i++) total += counts[i];

    long limit = total / 2;
    // the running sum is compared against a long limit and may grow up to it,
    // so it must be a long as well; an int would wrap for large tables
    long s = 0;
    int p = parts - 1;
    for (int i = 0; i < counts.length; i++) {
        s += counts[i];
        if ((s > limit) && (p >= 0)) {
            pos[p--] = i - 1;
            limit = (2 * limit - s + counts[i]) / 2;
            s = counts[i];
        }
    }
    pos[0] = counts.length - 1;
    return pos;
}
// Command-line entry point.
// Expected arguments: -rcieval ROOT_PATH
// Evaluates the file DATA/RANKING/GLOBAL/030_rci0/RCI-0.rci.gz below ROOT_PATH,
// prints the evaluation time and then the partition position table for 16 parts.
public static void main(String[] args) {
    if ((args.length != 2) || (!args[0].equals("-rcieval"))) {
        // tell the caller what went wrong instead of silently doing nothing
        System.err.println("usage: java de.anomic.plasma.plasmaRankingRCIEvaluation -rcieval ROOT_PATH");
        return;
    }
    try {
        File root_path = new File(args[1]);
        File rci_file = new File(root_path, "DATA/RANKING/GLOBAL/030_rci0/RCI-0.rci.gz");
        long start = System.currentTimeMillis();
        int count[] = rcieval(rci_file);
        if (count == null) {
            // rcieval returns null when the file does not exist;
            // passing that on to interval() would end in a NullPointerException
            System.err.println("RCI file not found: " + rci_file);
            return;
        }
        long seconds = java.lang.Math.max(1, (System.currentTimeMillis() - start) / 1000);
        System.out.println("Finished RCI evaluation in " + seconds + " seconds");

        /*
        System.out.println("count table:");
        for (int i = 0; i < count.length; i++) {
            System.out.println(i + " references: " + count[i] + " times");
        }
        */

        int[] pos = interval(count, 16);
        System.out.println("partition position table:");
        for (int i = 0; i < pos.length; i++) {
            System.out.println("position " + i + ": " + pos[i]);
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
}

@ -431,8 +431,18 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
cleanProfiles();
// init ranking transmission
rankingOwnDistribution = new plasmaRankingDistribution(log, new File(rankingPath, plasmaRankingDistribution.CR_OWN), plasmaRankingDistribution.METHOD_ANYSENIOR, 0, null);
rankingOtherDistribution = new plasmaRankingDistribution(log, new File(rankingPath, plasmaRankingDistribution.CR_OTHER), plasmaRankingDistribution.METHOD_MIXEDSENIOR, 30, new String[]{"kaskelix.de:8080", "yacy.dyndns.org:8000", "suma-lab.de:8080"});
/*
CRDist0Path = GLOBAL/010_owncr
CRDist0Method = 1
CRDist0Percent = 0
CRDist0Target =
CRDist1Path = GLOBAL/014_othercr/1
CRDist1Method = 9
CRDist1Percent = 30
CRDist1Target = kaskelix.de:8080,yacy.dyndns.org:8000,suma-lab.de:8080
**/
rankingOwnDistribution = new plasmaRankingDistribution(log, new File(rankingPath, getConfig("CRDist0Path", plasmaRankingDistribution.CR_OWN)), (int) getConfigLong("CRDist0Method", plasmaRankingDistribution.METHOD_ANYSENIOR), (int) getConfigLong("CRDist0Percent", 0), getConfig("CRDist0Target", ""));
rankingOtherDistribution = new plasmaRankingDistribution(log, new File(rankingPath, getConfig("CRDist1Path", plasmaRankingDistribution.CR_OTHER)), (int) getConfigLong("CRDist1Method", plasmaRankingDistribution.METHOD_MIXEDSENIOR), (int) getConfigLong("CRDist1Percent", 30), getConfig("CRDist1Target", "kaskelix.de:8080,yacy.dyndns.org:8000,suma-lab.de:8080"));
// init facility DB
/*
@ -1411,7 +1421,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
private void flushCitationReference(StringBuffer cr, String type) {
if (cr.length() < 12) return;
String filename = type.toUpperCase() + "-A-" + new serverDate().toShortString(true) + "." + cr.substring(0, 12) + ".cr.gz";
File path = new File(rankingPath, (type.equals("crl") ? "LOCAL/010_cr/" : "GLOBAL/010_owncr/"));
File path = new File(rankingPath, (type.equals("crl") ? "LOCAL/010_cr/" : getConfig("CRDist0Path", plasmaRankingDistribution.CR_OWN)));
path.mkdirs();
File file = new File(path, filename);

@ -53,9 +53,12 @@ import java.io.InputStreamReader;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.PrintWriter;
import java.util.StringTokenizer;
import java.util.zip.GZIPOutputStream;
import java.util.HashSet;
import java.util.Set;
import java.util.Map;
import java.util.HashSet;
import java.util.TreeSet;
import java.util.Properties;
import java.util.Hashtable;
import java.util.Iterator;
@ -173,7 +176,7 @@ public final class serverFileUtils {
copy(new ByteArrayInputStream(source), dest);
}
public static HashSet loadSet(String filename) {
public static HashSet loadList(String filename) {
HashSet set = new HashSet();
BufferedReader br = null;
try {
@ -226,6 +229,44 @@ public final class serverFileUtils {
tf.renameTo(file);
}
// Loads a set of fixed-length strings from a file that stores them back to
// back without separators (the layout saveSet writes when sep is null).
//   file      - file to read; its content is obtained through read(file)
//   chunksize - length of one element in bytes, must be positive
//   tree      - true for a TreeSet result, false for a HashSet
// Bytes after the last complete chunk are ignored.
// Throws IllegalArgumentException if chunksize is not positive.
public static Set loadSet(File file, int chunksize, boolean tree) throws IOException {
    if (chunksize <= 0) throw new IllegalArgumentException("chunksize must be positive: " + chunksize);
    Set set = (tree) ? (Set) new TreeSet() : (Set) new HashSet();
    byte[] b = read(file);
    // advance by a whole chunk; stepping one byte at a time would add
    // overlapping fragments of neighbouring elements to the set
    for (int i = 0; (i + chunksize) <= b.length; i += chunksize) {
        set.add(new String(b, i, chunksize));
    }
    return set;
}
// Loads a set of strings from a file whose elements are divided by separators.
//   file - file to read; its content is obtained through read(file) and
//          converted to text with the platform default charset
//   sep  - delimiter characters handed to StringTokenizer (every character
//          of this string acts as a separator)
//   tree - true for a TreeSet result, false for a HashSet
public static Set loadSet(File file, String sep, boolean tree) throws IOException {
    final Set result;
    if (tree) {
        result = new TreeSet();
    } else {
        result = new HashSet();
    }
    final String content = new String(read(file));
    for (StringTokenizer tokens = new StringTokenizer(content, sep); tokens.hasMoreTokens();) {
        result.add(tokens.nextToken());
    }
    return result;
}
// Writes all elements of a set to a file, in the iteration order of the set.
//   file - target file; it is replaced by the newly written content
//   set  - elements to write, each converted with toString()
//   sep  - separator written between two elements; null writes the elements
//          back to back
// The content is first written to a temporary file next to the target and
// then renamed, so a failed write leaves the old file untouched.
// Throws IOException if writing fails or the temporary file cannot be renamed.
public static void saveSet(File file, Set set, String sep) throws IOException {
    File tf = new File(file.toString() + "." + (System.currentTimeMillis() % 1000));
    BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(tf));
    boolean written = false;
    try {
        byte[] sepBytes = (sep == null) ? null : sep.getBytes();
        boolean first = true;
        Iterator i = set.iterator();
        while (i.hasNext()) {
            // the separator goes between elements only, never before the first one
            if ((!first) && (sepBytes != null)) bos.write(sepBytes);
            bos.write(i.next().toString().getBytes());
            first = false;
        }
        bos.flush();
        written = true;
    } finally {
        // always release the file handle, also when a write failed
        try {
            bos.close();
        } finally {
            // do not leave a half-written temporary file behind
            if (!written) tf.delete();
        }
    }
    file.delete();
    // a failed rename would otherwise lose the new content without any notice
    if (!tf.renameTo(file)) throw new IOException("cannot rename " + tf + " to " + file);
}
public static void moveAll(File from_dir, File to_dir) {
if (!(from_dir.isDirectory())) return;
if (!(to_dir.isDirectory())) return;

@ -571,8 +571,8 @@ indexer.slots = 100
# was set by the client.
useYacyReferer = true
#allow only 443(https-port) for https-proxy?
#if you want to tunnel other protokols, set to false
# allow only 443(https-port) for https-proxy?
# if you want to tunnel other protocols, set this to false
secureHttps = true
# specifies if the httpdFileHandler should cache
@ -600,6 +600,22 @@ indexTransfer.timeout = 120000
indexDistribution.maxOpenFiles = 800
indexTransfer.maxOpenFiles = 800
# Distribution of Citation-Reference (CR-) files
# The distribution is done in two steps:
# first step to anonymize the records
# second step to forward to collecting peer
# to anonymize the data even against the intermediate peer
# a specific percentage is also sent again to other peers.
# for key-numbers please see de.anomic.plasma.plasmaRankingDistribution
CRDist0Path = GLOBAL/010_owncr
CRDist0Method = 1
CRDist0Percent = 0
CRDist0Target =
CRDist1Path = GLOBAL/014_othercr/1
CRDist1Method = 9
CRDist1Percent = 30
CRDist1Target = kaskelix.de:8080,yacy.dyndns.org:8000,suma-lab.de:8080
#
storagePeerHash =

Loading…
Cancel
Save