this was never used or extended in recent years. The resulting YBR ranking criterion is still a good idea and will be used in the future. Possible generation methods for YBR ranking are: - "trust-rank", using the link structure that can be discovered in a single crawl (idea from FSCONS) - "block-rank", calculated from the local link structure - a distributed "block-rank", using the XML API to the link structure of other peers git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7349 6c8d7289-2bf4-0310-a012-ef5d649a1542 pull/1/head
parent
3d945bb442
commit
a9f754c45f
@ -1,12 +0,0 @@
|
|||||||
version=#[version]#
|
|
||||||
uptime=#[uptime]#
|
|
||||||
response=#[response]#
|
|
||||||
#(process)#
|
|
||||||
access=#[access]#
|
|
||||||
address=#[address]#
|
|
||||||
protocol=#[protocol]#
|
|
||||||
path=#[path]#
|
|
||||||
maxsize=#[maxsize]#
|
|
||||||
::
|
|
||||||
tt=#[tt]#
|
|
||||||
#(/process)#
|
|
@ -1,151 +0,0 @@
|
|||||||
// transfer.java
|
|
||||||
// -----------------------
|
|
||||||
// part of YaCy caching proxy
|
|
||||||
// (C) by Michael Peter Christen; mc@yacy.net
|
|
||||||
// first published on http://www.anomic.de
|
|
||||||
// Frankfurt, Germany, 2004
|
|
||||||
// created 07.11.2005
|
|
||||||
//
|
|
||||||
// $LastChangedDate$
|
|
||||||
// $LastChangedRevision$
|
|
||||||
// $LastChangedBy$
|
|
||||||
//
|
|
||||||
// This program is free software; you can redistribute it and/or modify
|
|
||||||
// it under the terms of the GNU General Public License as published by
|
|
||||||
// the Free Software Foundation; either version 2 of the License, or
|
|
||||||
// (at your option) any later version.
|
|
||||||
//
|
|
||||||
// This program is distributed in the hope that it will be useful,
|
|
||||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
// GNU General Public License for more details.
|
|
||||||
//
|
|
||||||
// You should have received a copy of the GNU General Public License
|
|
||||||
// along with this program; if not, write to the Free Software
|
|
||||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
||||||
|
|
||||||
import java.io.File;
|
|
||||||
import java.io.IOException;
|
|
||||||
|
|
||||||
import net.yacy.cora.protocol.HeaderFramework;
|
|
||||||
import net.yacy.cora.protocol.RequestHeader;
|
|
||||||
import net.yacy.kelondro.order.Base64Order;
|
|
||||||
import net.yacy.kelondro.order.Digest;
|
|
||||||
import net.yacy.kelondro.util.FileUtils;
|
|
||||||
|
|
||||||
import de.anomic.search.Switchboard;
|
|
||||||
import de.anomic.search.blockrank.CRDistribution;
|
|
||||||
import de.anomic.server.serverObjects;
|
|
||||||
import de.anomic.server.serverSwitch;
|
|
||||||
import de.anomic.yacy.yacyNetwork;
|
|
||||||
import de.anomic.yacy.yacySeed;
|
|
||||||
|
|
||||||
public final class transfer {
|
|
||||||
|
|
||||||
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
|
|
||||||
final Switchboard sb = (Switchboard) env;
|
|
||||||
final serverObjects prop = new serverObjects();
|
|
||||||
if ((post == null) || (env == null)) return prop;
|
|
||||||
if (!yacyNetwork.authentifyRequest(post, env)) return prop;
|
|
||||||
|
|
||||||
final String process = post.get("process", ""); // permission or store
|
|
||||||
//String key = post.get("key", ""); // a transmission key from the client
|
|
||||||
final String otherpeer = post.get("iam", ""); // identification of the client (a peer-hash)
|
|
||||||
final String purpose = post.get("purpose", ""); // declares how the file shall be treated
|
|
||||||
final String filename = post.get("filename", ""); // a name of a file without path
|
|
||||||
//long filesize = Long.parseLong((String) post.get("filesize", "")); // the size of the file
|
|
||||||
|
|
||||||
prop.put("process", "0");
|
|
||||||
prop.put("response", "denied"); // reject is default and is overwritten if ok
|
|
||||||
prop.put("process_access", "");
|
|
||||||
prop.put("process_address", "");
|
|
||||||
prop.put("process_protocol", "");
|
|
||||||
prop.put("process_path", "");
|
|
||||||
prop.put("process_maxsize", "0");
|
|
||||||
|
|
||||||
if (sb.isRobinsonMode() || !sb.rankingOn) {
|
|
||||||
// in a robinson environment, do not answer. We do not do any transfer in a robinson cluster.
|
|
||||||
return prop;
|
|
||||||
}
|
|
||||||
|
|
||||||
final yacySeed otherseed = sb.peers.get(otherpeer);
|
|
||||||
if ((otherseed == null) || (filename.indexOf("..") >= 0)) {
|
|
||||||
// reject unknown peers: this does not appear fair, but anonymous senders are dangerous
|
|
||||||
// reject paths that contain '..' because they are dangerous
|
|
||||||
if (sb.getLog().isFine()) {
|
|
||||||
if (otherseed == null) sb.getLog().logFine("RankingTransmission: rejected unknown peer '" + otherpeer + "', current IP " + header.get(HeaderFramework.CONNECTION_PROP_CLIENTIP, "unknown"));
|
|
||||||
if (filename.indexOf("..") >= 0) sb.getLog().logFine("RankingTransmission: rejected wrong path '" + filename + "' from peer " + (otherseed == null ? "null" : otherseed.getName() + "/" + otherseed.getPublicAddress()) + ", current IP " + header.get(HeaderFramework.CONNECTION_PROP_CLIENTIP, "unknown"));
|
|
||||||
}
|
|
||||||
return prop;
|
|
||||||
}
|
|
||||||
|
|
||||||
final String otherpeerName = otherseed.hash + ":" + otherseed.getName();
|
|
||||||
|
|
||||||
if (process.equals("permission")) {
|
|
||||||
prop.put("process", "0");
|
|
||||||
if (((purpose.equals("crcon")) && (filename.startsWith("CRG")) && (filename.endsWith(".cr.gz"))) || ((filename.startsWith("domlist")) && (filename.endsWith(".txt.gz") || filename.endsWith(".zip")))) {
|
|
||||||
// consolidation of cr files
|
|
||||||
//System.out.println("yacy/transfer:post=" + post.toString());
|
|
||||||
//String cansendprotocol = (String) post.get("can-send-protocol", "http");
|
|
||||||
final String access = Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(otherpeer + ":" + filename)) + ":" + Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(Long.toString(System.currentTimeMillis())));
|
|
||||||
prop.put("response", "ok");
|
|
||||||
prop.put("process_access", access);
|
|
||||||
prop.put("process_address", sb.peers.mySeed().getPublicAddress());
|
|
||||||
prop.put("process_protocol", "http");
|
|
||||||
prop.put("process_path", ""); // currently empty; the store process will find a path
|
|
||||||
prop.put("process_maxsize", "-1"); // if response is too big we return the size of the file
|
|
||||||
sb.rankingPermissions.put(Digest.encodeMD5Hex(Base64Order.standardCoder.encodeString(access)), filename);
|
|
||||||
if (sb.getLog().isFine()) sb.getLog().logFine("RankingTransmission: granted peer " + otherpeerName + " to send CR file " + filename);
|
|
||||||
}
|
|
||||||
return prop;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (process.equals("store")) {
|
|
||||||
prop.put("process", "1");
|
|
||||||
if (purpose.equals("crcon")) {
|
|
||||||
final String fileString = post.get("filename$file");
|
|
||||||
final String accesscode = post.get("access", ""); // one-time authentication
|
|
||||||
final String md5 = post.get("md5", ""); // one-time authentication
|
|
||||||
//java.util.HashMap perm = sb.rankingPermissions;
|
|
||||||
//System.out.println("PERMISSIONDEBUG: accesscode=" + accesscode + ", permissions=" + perm.toString());
|
|
||||||
final String grantedFile = sb.rankingPermissions.get(accesscode);
|
|
||||||
prop.put("process_tt", "");
|
|
||||||
if ((grantedFile == null) || (!(grantedFile.equals(filename)))) {
|
|
||||||
// fraud-access of this interface
|
|
||||||
prop.put("response", "denied");
|
|
||||||
if (sb.getLog().isFine()) sb.getLog().logFine("RankingTransmission: denied " + otherpeerName + " to send CR file " + filename + ": wrong access code");
|
|
||||||
} else {
|
|
||||||
sb.rankingPermissions.remove(accesscode); // not needed any more
|
|
||||||
final File path = new File(sb.rankingPath, CRDistribution.CR_OTHER);
|
|
||||||
path.mkdirs();
|
|
||||||
final File file = new File(path, filename);
|
|
||||||
try {
|
|
||||||
if (file.getCanonicalPath().startsWith(path.getCanonicalPath())){
|
|
||||||
FileUtils.copy(fileString.getBytes(), file);
|
|
||||||
final String md5t = Digest.encodeMD5Hex(file);
|
|
||||||
if (md5t.equals(md5)) {
|
|
||||||
prop.put("response", "ok");
|
|
||||||
if (sb.getLog().isFine()) sb.getLog().logFine("RankingTransmission: received from peer " + otherpeerName + " CR file " + filename);
|
|
||||||
} else {
|
|
||||||
prop.put("response", "transfer failure");
|
|
||||||
if (sb.getLog().isFine()) sb.getLog().logFine("RankingTransmission: transfer failure from peer " + otherpeerName + " for CR file " + filename);
|
|
||||||
}
|
|
||||||
}else{
|
|
||||||
//exploit?
|
|
||||||
prop.put("response", "io error");
|
|
||||||
return prop;
|
|
||||||
}
|
|
||||||
} catch (final IOException e) {
|
|
||||||
prop.put("response", "io error");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return prop;
|
|
||||||
}
|
|
||||||
|
|
||||||
// wrong access
|
|
||||||
if (sb.getLog().isFine()) sb.getLog().logFine("RankingTransmission: rejected unknown process " + process + ":" + purpose + " from peer " + otherpeerName);
|
|
||||||
return prop;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -1,196 +0,0 @@
|
|||||||
// plasmaRankingDistribution.java
|
|
||||||
// -------------------------------------------
|
|
||||||
// (C) by Michael Peter Christen; mc@yacy.net
|
|
||||||
// first published on http://www.anomic.de
|
|
||||||
// Frankfurt, Germany, 2005
|
|
||||||
// created 9.11.2005
|
|
||||||
//
|
|
||||||
// $LastChangedDate$
|
|
||||||
// $LastChangedRevision$
|
|
||||||
// $LastChangedBy$
|
|
||||||
//
|
|
||||||
// This program is free software; you can redistribute it and/or modify
|
|
||||||
// it under the terms of the GNU General Public License as published by
|
|
||||||
// the Free Software Foundation; either version 2 of the License, or
|
|
||||||
// (at your option) any later version.
|
|
||||||
//
|
|
||||||
// This program is distributed in the hope that it will be useful,
|
|
||||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
// GNU General Public License for more details.
|
|
||||||
//
|
|
||||||
// You should have received a copy of the GNU General Public License
|
|
||||||
// along with this program; if not, write to the Free Software
|
|
||||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
||||||
|
|
||||||
package de.anomic.search.blockrank;
|
|
||||||
|
|
||||||
import java.io.File;
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.util.Random;
|
|
||||||
import java.util.StringTokenizer;
|
|
||||||
|
|
||||||
import net.yacy.kelondro.logging.Log;
|
|
||||||
import net.yacy.kelondro.util.FileUtils;
|
|
||||||
|
|
||||||
import de.anomic.yacy.yacyClient;
|
|
||||||
import de.anomic.yacy.yacySeed;
|
|
||||||
import de.anomic.yacy.yacySeedDB;
|
|
||||||
import de.anomic.yacy.yacyVersion;
|
|
||||||
|
|
||||||
public final class CRDistribution {
|
|
||||||
|
|
||||||
public static final String CR_OWN = "GLOBAL/010_owncr";
|
|
||||||
public static final String CR_OTHER = "GLOBAL/014_othercr/";
|
|
||||||
|
|
||||||
public static final int METHOD_NONE = 0;
|
|
||||||
public static final int METHOD_ANYSENIOR = 1;
|
|
||||||
public static final int METHOD_ANYPRINCIPAL = 2;
|
|
||||||
public static final int METHOD_MIXEDSENIOR = 9;
|
|
||||||
public static final int METHOD_MIXEDPRINCIPAL = 10;
|
|
||||||
public static final int METHOD_FIXEDADDRESS = 99;
|
|
||||||
|
|
||||||
private final Log log;
|
|
||||||
private final File sourcePath; // where to load CR-files
|
|
||||||
private int method; // of peer selection
|
|
||||||
private int percentage; // to select any other peer
|
|
||||||
private String address[]; // of fixed other peer
|
|
||||||
private final yacySeedDB seedDB;
|
|
||||||
private static Random random = new Random(System.currentTimeMillis());
|
|
||||||
|
|
||||||
public CRDistribution(final Log log, final yacySeedDB seedDB, final File sourcePath, final int method, final int percentage, final String addresses) {
|
|
||||||
this.log = log;
|
|
||||||
this.seedDB = seedDB;
|
|
||||||
this.sourcePath = sourcePath;
|
|
||||||
this.method = method;
|
|
||||||
this.percentage = percentage;
|
|
||||||
StringTokenizer st = new StringTokenizer(addresses, ",");
|
|
||||||
int c = 0; while (st.hasMoreTokens()) {st.nextToken(); c++;}
|
|
||||||
st = new StringTokenizer(addresses, ",");
|
|
||||||
this.address = new String[c];
|
|
||||||
c = 0;
|
|
||||||
while (st.hasMoreTokens()) {this.address[c++] = st.nextToken();}
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setMethod(final int method, final int percentage, final String address[]) {
|
|
||||||
this.method = method;
|
|
||||||
this.percentage = percentage;
|
|
||||||
this.address = address;
|
|
||||||
}
|
|
||||||
|
|
||||||
public int size() {
|
|
||||||
if ((sourcePath.exists()) && (sourcePath.isDirectory()))
|
|
||||||
return sourcePath.list().length;
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean transferRanking(int count) throws InterruptedException {
|
|
||||||
|
|
||||||
if (method == METHOD_NONE) {
|
|
||||||
log.logFine("no ranking distribution: no transfer method given");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
if (seedDB == null) {
|
|
||||||
log.logFine("no ranking distribution: seedDB == null");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
if (seedDB.mySeed() == null) {
|
|
||||||
log.logFine("no ranking distribution: mySeed == null");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
if (seedDB.mySeed().isVirgin()) {
|
|
||||||
log.logFine("no ranking distribution: status is virgin");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
final String[] outfiles = sourcePath.list();
|
|
||||||
|
|
||||||
if (outfiles == null) {
|
|
||||||
log.logFine("no ranking distribution: source path does not exist");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
if (outfiles.length == 0) {
|
|
||||||
log.logFine("no ranking distribution: source path does not contain any file");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (outfiles.length < count) count = outfiles.length;
|
|
||||||
File crfile = null;
|
|
||||||
|
|
||||||
for (int i = 0; i < count; i++) {
|
|
||||||
// check for interruption
|
|
||||||
if (Thread.currentThread().isInterrupted()) throw new InterruptedException("Shutdown in progress");
|
|
||||||
|
|
||||||
// getting the next file to transfer
|
|
||||||
crfile = new File(sourcePath, outfiles[i]);
|
|
||||||
|
|
||||||
if ((method == METHOD_ANYSENIOR) || (method == METHOD_ANYPRINCIPAL)) {
|
|
||||||
transferRankingAnySeed(crfile, 5);
|
|
||||||
}
|
|
||||||
if (method == METHOD_FIXEDADDRESS) {
|
|
||||||
transferRankingAddress(crfile);
|
|
||||||
}
|
|
||||||
if ((method == METHOD_MIXEDSENIOR) || (method == METHOD_MIXEDPRINCIPAL)) {
|
|
||||||
if (random.nextInt(100) > percentage) {
|
|
||||||
if (!(transferRankingAddress(crfile))) transferRankingAnySeed(crfile, 5);
|
|
||||||
} else {
|
|
||||||
if (!(transferRankingAnySeed(crfile, 5))) transferRankingAddress(crfile);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
log.logFine("no ranking distribution: no target available");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
private boolean transferRankingAnySeed(final File crfile, final int trycount) throws InterruptedException {
|
|
||||||
yacySeed target = null;
|
|
||||||
for (int j = 0; j < trycount; j++) {
|
|
||||||
// check for interruption
|
|
||||||
if (Thread.currentThread().isInterrupted()) throw new InterruptedException("Shutdown in progress");
|
|
||||||
|
|
||||||
// get next target
|
|
||||||
target = seedDB.anySeedVersion(yacyVersion.YACY_ACCEPTS_RANKING_TRANSMISSION);
|
|
||||||
|
|
||||||
if (target == null) continue;
|
|
||||||
final String targetaddress = target.getPublicAddress();
|
|
||||||
if (transferRankingAddress(crfile, targetaddress)) return true;
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
private boolean transferRankingAddress(final File crfile) throws InterruptedException {
|
|
||||||
// try all addresses
|
|
||||||
for (int i = 0; i < this.address.length; i++) {
|
|
||||||
// check for interruption
|
|
||||||
if (Thread.currentThread().isInterrupted()) throw new InterruptedException("Shutdown in progress");
|
|
||||||
|
|
||||||
// try to transfer ranking address using the next address
|
|
||||||
if (transferRankingAddress(crfile, this.address[i])) return true;
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
private boolean transferRankingAddress(final File crfile, final String address) {
|
|
||||||
// do the transfer
|
|
||||||
final long starttime = System.currentTimeMillis();
|
|
||||||
String result = "unknown";
|
|
||||||
try {
|
|
||||||
final byte[] b = FileUtils.read(crfile);
|
|
||||||
result = yacyClient.transfer(address, crfile.getName(), b);
|
|
||||||
if (result == null) {
|
|
||||||
log.logInfo("RankingDistribution - transmitted file " + crfile + " to " + address + " successfully in " + ((System.currentTimeMillis() - starttime) / 1000) + " seconds");
|
|
||||||
FileUtils.deletedelete(crfile); // the file is not needed any more locally
|
|
||||||
} else {
|
|
||||||
log.logInfo("RankingDistribution - error transmitting file " + crfile + " to " + address + ": " + result);
|
|
||||||
}
|
|
||||||
} catch (final IOException e) {
|
|
||||||
log.logInfo("RankingDistribution - could not read file " + crfile + ": " + e.getMessage());
|
|
||||||
result = "input file error: " + e.getMessage();
|
|
||||||
}
|
|
||||||
|
|
||||||
// show success
|
|
||||||
return result == null;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -1,586 +0,0 @@
|
|||||||
// plasmaCRProcess.java
|
|
||||||
// -----------------------
|
|
||||||
// part of YaCy
|
|
||||||
// (C) by Michael Peter Christen; mc@yacy.net
|
|
||||||
// first published on http://www.anomic.de
|
|
||||||
// Frankfurt, Germany, 2005
|
|
||||||
// Created 15.11.2005
|
|
||||||
//
|
|
||||||
// $LastChangedDate$
|
|
||||||
// $LastChangedRevision$
|
|
||||||
// $LastChangedBy$
|
|
||||||
//
|
|
||||||
// This program is free software; you can redistribute it and/or modify
|
|
||||||
// it under the terms of the GNU General Public License as published by
|
|
||||||
// the Free Software Foundation; either version 2 of the License, or
|
|
||||||
// (at your option) any later version.
|
|
||||||
//
|
|
||||||
// This program is distributed in the hope that it will be useful,
|
|
||||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
// GNU General Public License for more details.
|
|
||||||
//
|
|
||||||
// You should have received a copy of the GNU General Public License
|
|
||||||
// along with this program; if not, write to the Free Software
|
|
||||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
||||||
|
|
||||||
package de.anomic.search.blockrank;
|
|
||||||
|
|
||||||
import java.io.File;
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.util.Date;
|
|
||||||
import java.util.Iterator;
|
|
||||||
|
|
||||||
import net.yacy.kelondro.data.word.WordReference;
|
|
||||||
import net.yacy.kelondro.index.Index;
|
|
||||||
import net.yacy.kelondro.index.Row;
|
|
||||||
import net.yacy.kelondro.index.RowSet;
|
|
||||||
import net.yacy.kelondro.index.RowSpaceExceededException;
|
|
||||||
import net.yacy.kelondro.logging.Log;
|
|
||||||
import net.yacy.kelondro.order.Base64Order;
|
|
||||||
import net.yacy.kelondro.order.Bitfield;
|
|
||||||
import net.yacy.kelondro.order.CloneableIterator;
|
|
||||||
import net.yacy.kelondro.order.MicroDate;
|
|
||||||
import net.yacy.kelondro.rwi.IndexCell;
|
|
||||||
import net.yacy.kelondro.rwi.ReferenceContainer;
|
|
||||||
import net.yacy.kelondro.table.Table;
|
|
||||||
import net.yacy.kelondro.util.AttrSeq;
|
|
||||||
import net.yacy.kelondro.util.DateFormatter;
|
|
||||||
import net.yacy.kelondro.util.FileUtils;
|
|
||||||
import net.yacy.kelondro.util.MemoryControl;
|
|
||||||
|
|
||||||
import de.anomic.search.Segment;
|
|
||||||
|
|
||||||
public class CRProcess {
|
|
||||||
|
|
||||||
/*
|
|
||||||
header.append("# Name=YaCy " + ((type.equals("crl")) ? "Local" : "Global") + " Citation Reference Ticket"); header.append((char) 13); header.append((char) 10);
|
|
||||||
header.append("# Created=" + System.currentTimeMillis()); header.append((char) 13); header.append((char) 10);
|
|
||||||
header.append("# Structure=<Referee-12>,'=',<UDate-3>,<VDate-3>,<LCount-2>,<GCount-2>,<ICount-2>,<DCount-2>,<TLength-3>,<WACount-3>,<WUCount-3>,<Flags-1>,'|',*<Anchor-" + ((type.equals("crl")) ? "6" : "12") + ">"); header.append((char) 13); header.append((char) 10);
|
|
||||||
header.append("# ---"); header.append((char) 13); header.append((char) 10);
|
|
||||||
*/
|
|
||||||
|
|
||||||
/*
|
|
||||||
private static final int Col_Referee = 0;
|
|
||||||
private static final int Col_UDate = 1;
|
|
||||||
private static final int Col_VDate = 2;
|
|
||||||
private static final int Col_LCount = 3;
|
|
||||||
private static final int Col_GCount = 4;
|
|
||||||
private static final int Col_ICount = 5;
|
|
||||||
private static final int Col_DCount = 6;
|
|
||||||
private static final int Col_TLength = 7;
|
|
||||||
private static final int Col_WACount = 8;
|
|
||||||
private static final int Col_WUCount = 9;
|
|
||||||
*/
|
|
||||||
// Column positions (0-based) of the accumulator attributes within CRG_accrow.
private static final int Col_Flags = 10;
|
|
||||||
private static final int Col_FUDate = 11;
|
|
||||||
private static final int Col_FDDate = 12;
|
|
||||||
private static final int Col_LUDate = 13;
|
|
||||||
private static final int Col_UCount = 14;
|
|
||||||
private static final int Col_PCount = 15;
|
|
||||||
private static final int Col_ACount = 16;
|
|
||||||
private static final int Col_VCount = 17;
|
|
||||||
private static final int Col_Vita = 18;
|
|
||||||
|
|
||||||
// Row definition of the accumulator table: a 12-byte referee key followed by
// 18 base64-encoded cardinal columns; the Col_* constants index into this list.
public static final Row CRG_accrow = new Row(
|
|
||||||
"byte[] Referee-12," +
|
|
||||||
"Cardinal UDate-3 {b64e}, Cardinal VDate-3 {b64e}, " +
|
|
||||||
"Cardinal LCount-2 {b64e}, Cardinal GCount-2 {b64e}, Cardinal ICount-2 {b64e}, Cardinal DCount-2 {b64e}, Cardinal TLength-3 {b64e}, " +
|
|
||||||
"Cardinal WACount-3 {b64e}, Cardinal WUCount-3 {b64e}, Cardinal Flags-1 {b64e}, " +
|
|
||||||
"Cardinal FUDate-3 {b64e}, Cardinal FDDate-3 {b64e}, Cardinal LUDate-3 {b64e}, " +
|
|
||||||
"Cardinal UCount-2 {b64e}, Cardinal PCount-2 {b64e}, Cardinal ACount-2 {b64e}, Cardinal VCount-2 {b64e}, Cardinal Vita-2 {b64e}",
|
|
||||||
Base64Order.enhancedCoder);
|
|
||||||
// Row definition with a single 12-byte anchor column.
public static final Row CRG_colrow = new Row("byte[] Anchor-12", Base64Order.enhancedCoder);
|
|
||||||
// File name used for the accumulator table (see its use with CRG_accrow in accumulate()).
public static final String CRG_accname = "CRG-a-attr";
|
|
||||||
public static final String CRG_seqname = "CRG-a-coli";
|
|
||||||
// Row definition with a single 6-byte referee-domain column.
public static final Row RCI_coli = new Row("byte[] RefereeDom-6", Base64Order.enhancedCoder);
|
|
||||||
public static final String RCI_colname = "RCI-a-coli";
|
|
||||||
|
|
||||||
private static boolean accumulate_upd(final File f, final AttrSeq acc) {
|
|
||||||
// open file
|
|
||||||
AttrSeq source_cr = null;
|
|
||||||
try {
|
|
||||||
source_cr = new AttrSeq(f, false);
|
|
||||||
} catch (final IOException e) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// put elements in accumulator file
|
|
||||||
final Iterator<String> el = source_cr.keys();
|
|
||||||
String key;
|
|
||||||
AttrSeq.Entry new_entry, acc_entry;
|
|
||||||
int FUDate, FDDate, LUDate, UCount, PCount, ACount, VCount, Vita;
|
|
||||||
Bitfield acc_flags, new_flags;
|
|
||||||
while (el.hasNext()) {
|
|
||||||
key = el.next();
|
|
||||||
new_entry = source_cr.getEntry(key);
|
|
||||||
new_flags = new Bitfield(Base64Order.enhancedCoder.encodeLong(new_entry.getAttr("Flags", 0), 1).getBytes());
|
|
||||||
// enrich information with additional values
|
|
||||||
if ((acc_entry = acc.getEntry(key)) != null) {
|
|
||||||
FUDate = (int) acc_entry.getAttr("FUDate", 0);
|
|
||||||
FDDate = (int) acc_entry.getAttr("FDDate", 0);
|
|
||||||
LUDate = (int) acc_entry.getAttr("LUDate", 0);
|
|
||||||
UCount = (int) acc_entry.getAttr("UCount", 0);
|
|
||||||
PCount = (int) acc_entry.getAttr("PCount", 0);
|
|
||||||
ACount = (int) acc_entry.getAttr("ACount", 0);
|
|
||||||
VCount = (int) acc_entry.getAttr("VCount", 0);
|
|
||||||
Vita = (int) acc_entry.getAttr("Vita", 0);
|
|
||||||
|
|
||||||
// update counters and dates
|
|
||||||
acc_entry.setSeq(new_entry.getSeqSet()); // need to be checked
|
|
||||||
|
|
||||||
UCount++; // increase update counter
|
|
||||||
PCount += (new_flags.get(1)) ? 1 : 0;
|
|
||||||
ACount += (new_flags.get(2)) ? 1 : 0;
|
|
||||||
VCount += (new_flags.get(3)) ? 1 : 0;
|
|
||||||
|
|
||||||
// 'OR' the flags
|
|
||||||
acc_flags = new Bitfield(Base64Order.enhancedCoder.encodeLong(acc_entry.getAttr("Flags", 0), 1).getBytes());
|
|
||||||
for (int i = 0; i < 6; i++) {
|
|
||||||
if (new_flags.get(i)) acc_flags.set(i, true);
|
|
||||||
}
|
|
||||||
acc_entry.setAttr("Flags", (int) Base64Order.enhancedCoder.decodeLong(acc_flags.exportB64()));
|
|
||||||
} else {
|
|
||||||
// initialize counters and dates
|
|
||||||
acc_entry = acc.newEntry(key, new_entry.getAttrs(), new_entry.getSeqSet());
|
|
||||||
FUDate = MicroDate.microDateHoursInt(System.currentTimeMillis()); // first update date
|
|
||||||
FDDate = MicroDate.microDateHoursInt(System.currentTimeMillis()); // very difficult to compute; this is only a quick-hack
|
|
||||||
LUDate = (int) new_entry.getAttr("VDate", 0);
|
|
||||||
UCount = 0;
|
|
||||||
PCount = (new_flags.get(1)) ? 1 : 0;
|
|
||||||
ACount = (new_flags.get(2)) ? 1 : 0;
|
|
||||||
VCount = (new_flags.get(3)) ? 1 : 0;
|
|
||||||
Vita = 0;
|
|
||||||
}
|
|
||||||
// make plausibility check?
|
|
||||||
|
|
||||||
// insert into accumulator
|
|
||||||
acc_entry.setAttr("FUDate", FUDate);
|
|
||||||
acc_entry.setAttr("FDDate", FDDate);
|
|
||||||
acc_entry.setAttr("LUDate", LUDate);
|
|
||||||
acc_entry.setAttr("UCount", UCount);
|
|
||||||
acc_entry.setAttr("PCount", PCount);
|
|
||||||
acc_entry.setAttr("ACount", ACount);
|
|
||||||
acc_entry.setAttr("VCount", VCount);
|
|
||||||
acc_entry.setAttr("Vita", Vita);
|
|
||||||
acc.putEntrySmall(acc_entry);
|
|
||||||
}
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static boolean accumulate_upd(final File f, final Index acc) throws IOException, RowSpaceExceededException {
|
|
||||||
// open file
|
|
||||||
AttrSeq source_cr = null;
|
|
||||||
try {
|
|
||||||
source_cr = new AttrSeq(f, false);
|
|
||||||
} catch (final IOException e) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// put elements in accumulator file
|
|
||||||
final Iterator<String> el = source_cr.keys();
|
|
||||||
String key;
|
|
||||||
AttrSeq.Entry new_entry;
|
|
||||||
Row.Entry acc_entry;
|
|
||||||
int FUDate, FDDate, LUDate, UCount, PCount, ACount, VCount, Vita;
|
|
||||||
Bitfield acc_flags, new_flags;
|
|
||||||
while (el.hasNext()) {
|
|
||||||
key = el.next();
|
|
||||||
new_entry = source_cr.getEntry(key);
|
|
||||||
new_flags = new Bitfield(Base64Order.enhancedCoder.encodeLong(new_entry.getAttr("Flags", 0), 1).getBytes());
|
|
||||||
// enrich information with additional values
|
|
||||||
if ((acc_entry = acc.get(key.getBytes())) != null) {
|
|
||||||
FUDate = (int) acc_entry.getColLong(Col_FUDate);
|
|
||||||
FDDate = (int) acc_entry.getColLong(Col_FDDate);
|
|
||||||
LUDate = (int) acc_entry.getColLong(Col_LUDate);
|
|
||||||
UCount = (int) acc_entry.getColLong(Col_UCount);
|
|
||||||
PCount = (int) acc_entry.getColLong(Col_PCount);
|
|
||||||
ACount = (int) acc_entry.getColLong(Col_ACount);
|
|
||||||
VCount = (int) acc_entry.getColLong(Col_VCount);
|
|
||||||
Vita = (int) acc_entry.getColLong(Col_Vita);
|
|
||||||
|
|
||||||
// update counters and dates
|
|
||||||
//seq.add(key.getBytes(), new_entry.getSeqCollection());
|
|
||||||
|
|
||||||
UCount++; // increase update counter
|
|
||||||
PCount += (new_flags.get(1)) ? 1 : 0;
|
|
||||||
ACount += (new_flags.get(2)) ? 1 : 0;
|
|
||||||
VCount += (new_flags.get(3)) ? 1 : 0;
|
|
||||||
|
|
||||||
// 'OR' the flags
|
|
||||||
acc_flags = new Bitfield(Base64Order.enhancedCoder.encodeLong(acc_entry.getColLong(Col_Flags), 1).getBytes());
|
|
||||||
for (int i = 0; i < 6; i++) {
|
|
||||||
if (new_flags.get(i)) acc_flags.set(i, true);
|
|
||||||
}
|
|
||||||
acc_entry.setCol(Col_Flags, (int) Base64Order.enhancedCoder.decodeLong(acc_flags.exportB64()));
|
|
||||||
} else {
|
|
||||||
// initialize counters and dates
|
|
||||||
acc_entry = acc.row().newEntry();
|
|
||||||
acc_entry.setCol(0, key, null);
|
|
||||||
for (int i = 1; i < acc.row().columns(); i++) {
|
|
||||||
acc_entry.setCol(i, new_entry.getAttr(acc.row().column(i).nickname, 0));
|
|
||||||
}
|
|
||||||
//seq.put(key.getBytes(), new_entry.getSeqCollection());
|
|
||||||
FUDate = MicroDate.microDateHoursInt(System.currentTimeMillis()); // first update date
|
|
||||||
FDDate = MicroDate.microDateHoursInt(System.currentTimeMillis()); // very difficult to compute; this is only a quick-hack
|
|
||||||
LUDate = (int) new_entry.getAttr("VDate", 0);
|
|
||||||
UCount = 0;
|
|
||||||
PCount = (new_flags.get(1)) ? 1 : 0;
|
|
||||||
ACount = (new_flags.get(2)) ? 1 : 0;
|
|
||||||
VCount = (new_flags.get(3)) ? 1 : 0;
|
|
||||||
Vita = 0;
|
|
||||||
}
|
|
||||||
// make plausibility check?
|
|
||||||
|
|
||||||
// insert into accumulator
|
|
||||||
acc_entry.setCol(Col_FUDate, FUDate);
|
|
||||||
acc_entry.setCol(Col_FDDate, FDDate);
|
|
||||||
acc_entry.setCol(Col_LUDate, LUDate);
|
|
||||||
acc_entry.setCol(Col_UCount, UCount);
|
|
||||||
acc_entry.setCol(Col_PCount, PCount);
|
|
||||||
acc_entry.setCol(Col_ACount, ACount);
|
|
||||||
acc_entry.setCol(Col_VCount, VCount);
|
|
||||||
acc_entry.setCol(Col_Vita, Vita);
|
|
||||||
acc.put(acc_entry);
|
|
||||||
}
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static void accumulate(
|
|
||||||
final File from_dir,
|
|
||||||
final File tmp_dir,
|
|
||||||
final File err_dir,
|
|
||||||
final File bkp_dir,
|
|
||||||
final File to_file,
|
|
||||||
int max_files,
|
|
||||||
final boolean newdb) throws IOException, RowSpaceExceededException {
|
|
||||||
if (!(from_dir.isDirectory())) {
|
|
||||||
System.out.println("source path " + from_dir + " is not a directory.");
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
if (!(tmp_dir.isDirectory())) {
|
|
||||||
System.out.println("temporary path " + tmp_dir + " is not a directory.");
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
if (!(err_dir.isDirectory())) {
|
|
||||||
System.out.println("error path " + err_dir + " is not a directory.");
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
if (!(bkp_dir.isDirectory())) {
|
|
||||||
System.out.println("back-up path " + bkp_dir + " is not a directory.");
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
// open target file
|
|
||||||
AttrSeq acc = null;
|
|
||||||
Index newacc = null;
|
|
||||||
IndexCell<WordReference> newseq = null;
|
|
||||||
if (newdb) {
|
|
||||||
final File path = to_file.getParentFile(); // path to storage place
|
|
||||||
newacc = new Table(new File(path, CRG_accname), CRG_accrow, 0, 0, true, false);
|
|
||||||
newseq = new IndexCell<WordReference>(
|
|
||||||
path,
|
|
||||||
"index",
|
|
||||||
Segment.wordReferenceFactory,
|
|
||||||
Base64Order.enhancedCoder,
|
|
||||||
CRG_colrow,
|
|
||||||
10000, 1000000000L, 20, null, 1000000);
|
|
||||||
} else {
|
|
||||||
if (!(to_file.exists())) {
|
|
||||||
acc = new AttrSeq("Global Ranking Accumulator File",
|
|
||||||
"<Referee-12>,'='," +
|
|
||||||
"<UDate-3>,<VDate-3>,<LCount-2>,<GCount-2>,<ICount-2>,<DCount-2>,<TLength-3>,<WACount-3>,<WUCount-3>,<Flags-1>," +
|
|
||||||
"<FUDate-3>,<FDDate-3>,<LUDate-3>,<UCount-2>,<PCount-2>,<ACount-2>,<VCount-2>,<Vita-2>," +
|
|
||||||
"'|',*<Anchor-12>", false);
|
|
||||||
acc.toFile(to_file);
|
|
||||||
}
|
|
||||||
acc = new AttrSeq(to_file, false);
|
|
||||||
}
|
|
||||||
// collect source files
|
|
||||||
File source_file = null;
|
|
||||||
final String[] files = from_dir.list();
|
|
||||||
if (files.length < max_files) max_files = files.length;
|
|
||||||
for (int i = 0; i < max_files; i++) {
|
|
||||||
// open file
|
|
||||||
source_file = new File(from_dir, files[i]);
|
|
||||||
if (newdb) {
|
|
||||||
/*
|
|
||||||
if (accumulate_upd(source_file, newacc, newseq)) {
|
|
||||||
// move CR file to temporary folder
|
|
||||||
source_file.renameTo(new File(tmp_dir, files[i]));
|
|
||||||
} else {
|
|
||||||
// error case: the CR-file is not valid; move to error path
|
|
||||||
source_file.renameTo(new File(err_dir, files[i]));
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
} else {
|
|
||||||
if (accumulate_upd(source_file, acc)) {
|
|
||||||
// move CR file to temporary folder
|
|
||||||
source_file.renameTo(new File(tmp_dir, files[i]));
|
|
||||||
} else {
|
|
||||||
// error case: the CR-file is not valid; move to error path
|
|
||||||
source_file.renameTo(new File(err_dir, files[i]));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
try {
|
|
||||||
if (newdb) {
|
|
||||||
newacc.close();
|
|
||||||
newseq.close();
|
|
||||||
} else {
|
|
||||||
// save accumulator to temporary file
|
|
||||||
File tmp_file;
|
|
||||||
if (to_file.toString().endsWith(".gz")) {
|
|
||||||
tmp_file = new File(to_file.toString() + "." + (System.currentTimeMillis() % 1000) + ".prt.gz");
|
|
||||||
} else {
|
|
||||||
tmp_file = new File(to_file.toString() + "." + (System.currentTimeMillis() % 1000) + ".prt");
|
|
||||||
}
|
|
||||||
// store the file
|
|
||||||
acc.toFile(tmp_file);
|
|
||||||
// since this was successful, we remove the old file and move the new file to it
|
|
||||||
FileUtils.deletedelete(to_file);
|
|
||||||
tmp_file.renameTo(to_file);
|
|
||||||
}
|
|
||||||
FileUtils.moveAll(tmp_dir, bkp_dir);
|
|
||||||
} catch (final IOException e) {
|
|
||||||
// move previously processed files back
|
|
||||||
Log.logException(e);
|
|
||||||
FileUtils.moveAll(tmp_dir, from_dir);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
public static int genrci(File cr_in, final File rci_out) throws IOException {
|
|
||||||
if (!(cr_in.exists())) return 0;
|
|
||||||
AttrSeq cr = new AttrSeq(cr_in, false);
|
|
||||||
//if (rci_out.exists()) FileUtils.deletedelete(rci_out); // we want only fresh rci here (during testing)
|
|
||||||
if (!(rci_out.exists())) {
|
|
||||||
final AttrSeq rcix = new AttrSeq("Global Ranking Reverse Citation Index",
|
|
||||||
"<AnchorDom-6>,'='," +
|
|
||||||
"<UDate-3>," +
|
|
||||||
"'|',*<Referee-12>", false);
|
|
||||||
rcix.toFile(rci_out);
|
|
||||||
}
|
|
||||||
final AttrSeq rci = new AttrSeq(rci_out, false);
|
|
||||||
|
|
||||||
// loop over all referees
|
|
||||||
int count = 0;
|
|
||||||
final int size = cr.size();
|
|
||||||
final long start = System.currentTimeMillis();
|
|
||||||
long l;
|
|
||||||
final Iterator<String> i = cr.keys();
|
|
||||||
String referee, anchor, anchorDom;
|
|
||||||
AttrSeq.Entry cr_entry, rci_entry;
|
|
||||||
long cr_UDate, rci_UDate;
|
|
||||||
while (i.hasNext()) {
|
|
||||||
referee = i.next();
|
|
||||||
cr_entry = cr.getEntry(referee);
|
|
||||||
cr_UDate = cr_entry.getAttr("UDate", 0);
|
|
||||||
|
|
||||||
// loop over all anchors
|
|
||||||
final Iterator<String> j = cr_entry.getSeqSet().iterator();
|
|
||||||
while (j.hasNext()) {
|
|
||||||
// get domain of anchors
|
|
||||||
anchor = j.next();
|
|
||||||
if (anchor.length() == 6) anchorDom = anchor; else anchorDom = anchor.substring(6);
|
|
||||||
|
|
||||||
// update domain-specific entry
|
|
||||||
rci_entry = rci.getEntry(anchorDom);
|
|
||||||
if (rci_entry == null) rci_entry = rci.newEntry(anchorDom, false);
|
|
||||||
rci_entry.addSeq(referee);
|
|
||||||
|
|
||||||
// update Update-Date
|
|
||||||
rci_UDate = rci_entry.getAttr("UDate", 0);
|
|
||||||
if (cr_UDate > rci_UDate) rci_entry.setAttr("UDate", cr_UDate);
|
|
||||||
|
|
||||||
// insert entry
|
|
||||||
rci.putEntry(rci_entry);
|
|
||||||
}
|
|
||||||
count++;
|
|
||||||
if ((count % 1000) == 0) {
|
|
||||||
l = java.lang.Math.max(1, (System.currentTimeMillis() - start) / 1000);
|
|
||||||
System.out.println("processed " + count + " citations, " + (count / l) + " per second, rci.size = " + rci.size() + ", " + ((size - count) / (count / l)) + " seconds remaining; mem = " + MemoryControl.available());
|
|
||||||
}
|
|
||||||
i.remove();
|
|
||||||
}
|
|
||||||
|
|
||||||
// finished. write to file
|
|
||||||
cr = null;
|
|
||||||
cr_in = null;
|
|
||||||
rci.toFile(rci_out);
|
|
||||||
return count;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static int genrcix(final File cr_path_in, final File rci_path_out) throws IOException, RowSpaceExceededException {
|
|
||||||
//kelondroFlexTable acc = new kelondroFlexTable(cr_path_in, CRG_accname, kelondroBase64Order.enhancedCoder, 128 * 1024 * 1024, -1, CRG_accrow, true);
|
|
||||||
final IndexCell<WordReference> seq = new IndexCell<WordReference>(
|
|
||||||
cr_path_in, "index", Segment.wordReferenceFactory, Base64Order.enhancedCoder, CRG_colrow, 10000, 1000000000L, 20, null, 1000000);
|
|
||||||
final IndexCell<WordReference> rci = new IndexCell<WordReference>(
|
|
||||||
rci_path_out, "index", Segment.wordReferenceFactory, Base64Order.enhancedCoder, RCI_coli, 10000, 1000000000L, 20, null, 1000000);
|
|
||||||
|
|
||||||
// loop over all referees
|
|
||||||
int count = 0;
|
|
||||||
final int size = seq.size();
|
|
||||||
final long start = System.currentTimeMillis();
|
|
||||||
long l;
|
|
||||||
final CloneableIterator<ReferenceContainer<WordReference>> i = seq.references(null, false);
|
|
||||||
ReferenceContainer<WordReference> keycollection;
|
|
||||||
String referee, refereeDom, anchor, anchorDom;
|
|
||||||
RowSet rci_entry;
|
|
||||||
CloneableIterator<Row.Entry> cr_entry;
|
|
||||||
while (i.hasNext()) {
|
|
||||||
keycollection = i.next();
|
|
||||||
referee = new String(keycollection.getTermHash());
|
|
||||||
if (referee.length() == 6) refereeDom = referee; else refereeDom = referee.substring(6);
|
|
||||||
cr_entry = keycollection.rows();
|
|
||||||
|
|
||||||
// loop over all anchors
|
|
||||||
Row.Entry entry;
|
|
||||||
while (cr_entry.hasNext()) {
|
|
||||||
entry = cr_entry.next();
|
|
||||||
anchor = entry.getColString(0, null);
|
|
||||||
if (anchor.length() == 6) anchorDom = anchor; else anchorDom = anchor.substring(6);
|
|
||||||
|
|
||||||
// update domain-specific entry
|
|
||||||
rci_entry = rci.get(anchorDom.getBytes(), null);
|
|
||||||
if (rci_entry == null) rci_entry = new RowSet(RCI_coli, 0);
|
|
||||||
rci_entry.add(refereeDom.getBytes());
|
|
||||||
|
|
||||||
// insert entry
|
|
||||||
//rci.put(anchorDom.getBytes(), rci_entry);
|
|
||||||
}
|
|
||||||
count++;
|
|
||||||
if ((count % 1000) == 0) {
|
|
||||||
l = java.lang.Math.max(1, (System.currentTimeMillis() - start) / 1000);
|
|
||||||
System.out.println("processed " + count + " citations, " + (count / l) + " per second, rci.size = " + rci.size() + ", " + ((size - count) / (count / l) / 60) + " minutes remaining; mem = " + MemoryControl.free());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// finished. write to file
|
|
||||||
seq.close();
|
|
||||||
rci.close();
|
|
||||||
return count;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static void main(final String[] args) {
|
|
||||||
// java -classpath source de.anomic.plasma.kelondroPropFile -transcode DATA/RANKING/GLOBAL/CRG-test-unsorted-original.cr DATA/RANKING/GLOBAL/CRG-test-generated.cr
|
|
||||||
try {
|
|
||||||
if ((args.length == 5) && (args[0].equals("-accumulate"))) {
|
|
||||||
accumulate(new File(args[1]), new File(args[2]), new File(args[3]), new File(args[4]), new File(args[5]), Integer.parseInt(args[6]), true);
|
|
||||||
}
|
|
||||||
if ((args.length == 2) && (args[0].equals("-accumulate"))) {
|
|
||||||
final File root_path = new File(args[1]);
|
|
||||||
final File from_dir = new File(root_path, "DATA/RANKING/GLOBAL/014_othercr");
|
|
||||||
final File ready_dir = new File(root_path, "DATA/RANKING/GLOBAL/015_ready");
|
|
||||||
final File tmp_dir = new File(root_path, "DATA/RANKING/GLOBAL/016_tmp");
|
|
||||||
final File err_dir = new File(root_path, "DATA/RANKING/GLOBAL/017_err");
|
|
||||||
final File acc_dir = new File(root_path, "DATA/RANKING/GLOBAL/018_acc");
|
|
||||||
final String filename = "CRG-a-" + DateFormatter.formatShortMilliSecond(new Date()) + ".cr.gz";
|
|
||||||
final File to_file = new File(root_path, "DATA/RANKING/GLOBAL/020_con0/" + filename);
|
|
||||||
if (!(ready_dir.exists())) ready_dir.mkdirs();
|
|
||||||
if (!(tmp_dir.exists())) tmp_dir.mkdirs();
|
|
||||||
if (!(err_dir.exists())) err_dir.mkdirs();
|
|
||||||
if (!(acc_dir.exists())) acc_dir.mkdirs();
|
|
||||||
if (!(to_file.getParentFile().exists())) to_file.getParentFile().mkdirs();
|
|
||||||
FileUtils.moveAll(from_dir, ready_dir);
|
|
||||||
final long start = System.currentTimeMillis();
|
|
||||||
final int files = ready_dir.list().length;
|
|
||||||
accumulate(ready_dir, tmp_dir, err_dir, acc_dir, to_file, 1000, true);
|
|
||||||
final long seconds = java.lang.Math.max(1, (System.currentTimeMillis() - start) / 1000);
|
|
||||||
System.out.println("Finished accumulate for " + files + " files in " + seconds + " seconds (" + (files / seconds) + " files/second)");
|
|
||||||
}
|
|
||||||
if ((args.length == 3) && (args[0].equals("-recycle"))) {
|
|
||||||
final File root_path = new File(args[1]);
|
|
||||||
final int max_age_hours = Integer.parseInt(args[2]);
|
|
||||||
final File own_dir = new File(root_path, "DATA/RANKING/GLOBAL/010_owncr");
|
|
||||||
final File acc_dir = new File(root_path, "DATA/RANKING/GLOBAL/018_acc");
|
|
||||||
final File bkp_dir = new File(root_path, "DATA/RANKING/GLOBAL/019_bkp");
|
|
||||||
if (!(own_dir.exists())) return;
|
|
||||||
if (!(acc_dir.exists())) return;
|
|
||||||
if (!(bkp_dir.exists())) bkp_dir.mkdirs();
|
|
||||||
final String[] list = acc_dir.list();
|
|
||||||
final long start = System.currentTimeMillis();
|
|
||||||
final int files = list.length;
|
|
||||||
long d;
|
|
||||||
File f;
|
|
||||||
for (int i = 0; i < list.length; i++) {
|
|
||||||
f = new File(acc_dir, list[i]);
|
|
||||||
try {
|
|
||||||
d = (System.currentTimeMillis() - (new AttrSeq(f, false)).created()) / 3600000;
|
|
||||||
if (d > max_age_hours) {
|
|
||||||
// file is considered to be too old, it is not recycled
|
|
||||||
System.out.println("file " + f.getName() + " is old (" + d + " hours) and not recycled, only moved to backup");
|
|
||||||
f.renameTo(new File(bkp_dir, list[i]));
|
|
||||||
} else {
|
|
||||||
// file is fresh, it is duplicated and moved to be transferred to other peers again
|
|
||||||
System.out.println("file " + f.getName() + " is fresh (" + d + " hours old), recycled and moved to backup");
|
|
||||||
FileUtils.copy(f, new File(own_dir, list[i]));
|
|
||||||
f.renameTo(new File(bkp_dir, list[i]));
|
|
||||||
}
|
|
||||||
} catch (final IOException e) {
|
|
||||||
// there is something wrong with this file; delete it
|
|
||||||
System.out.println("file " + f.getName() + " is corrupted and deleted");
|
|
||||||
FileUtils.deletedelete(f);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
final long seconds = java.lang.Math.max(1, (System.currentTimeMillis() - start) / 1000);
|
|
||||||
System.out.println("Finished recycling of " + files + " files in " + seconds + " seconds (" + (files / seconds) + " files/second)");
|
|
||||||
}
|
|
||||||
if ((args.length == 2) && (args[0].equals("-genrci"))) {
|
|
||||||
final File root_path = new File(args[1]);
|
|
||||||
final File cr_filedir = new File(root_path, "DATA/RANKING/GLOBAL/020_con0");
|
|
||||||
final File rci_filedir = new File(root_path, "DATA/RANKING/GLOBAL/030_rci0");
|
|
||||||
rci_filedir.mkdirs();
|
|
||||||
final long start = System.currentTimeMillis();
|
|
||||||
final int count = genrcix(cr_filedir, rci_filedir);
|
|
||||||
final long seconds = java.lang.Math.max(1, (System.currentTimeMillis() - start) / 1000);
|
|
||||||
System.out.println("Completed RCI generation: " + count + " citation references in " + seconds + " seconds (" + (count / seconds) + " CR-records/second)");
|
|
||||||
}
|
|
||||||
/*
|
|
||||||
if ((args.length == 2) && (args[0].equals("-genrci"))) {
|
|
||||||
File root_path = new File(args[1]);
|
|
||||||
File cr_filedir = new File(root_path, "DATA/RANKING/GLOBAL/020_con0");
|
|
||||||
File rci_file = new File(root_path, "DATA/RANKING/GLOBAL/030_rci0/RCI-0.rci.gz");
|
|
||||||
rci_file.getParentFile().mkdirs();
|
|
||||||
String[] cr_filenames = cr_filedir.list();
|
|
||||||
for (int i = 0; i < cr_filenames.length; i++) {
|
|
||||||
long start = System.currentTimeMillis();
|
|
||||||
int count = genrci(new File(cr_filedir, cr_filenames[i]), rci_file);
|
|
||||||
long seconds = java.lang.Math.max(1, (System.currentTimeMillis() - start) / 1000);
|
|
||||||
System.out.println("Completed RCI generation for input file " + cr_filenames[i] + ": " + count + " citation references in " + seconds + " seconds (" + (count / seconds) + " CR-records/second)");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
} catch (final Exception e) {
|
|
||||||
Log.logException(e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
Class-A File format:
|
|
||||||
|
|
||||||
UDate : latest update timestamp of the URL (as virtual date, hours since epoch)
|
|
||||||
VDate : last visit timestamp of the URL (as virtual date, hours since epoch)
|
|
||||||
LCount : count of links to local resources
|
|
||||||
GCount : count of links to global resources
|
|
||||||
ICount : count of links to images (in document)
|
|
||||||
DCount : count of links to other documents
|
|
||||||
TLength: length of the plain text content (bytes)
|
|
||||||
WACount: total number of all words in content
|
|
||||||
WUCount: number of unique words in content (removed doubles)
|
|
||||||
Flags : Flags (0=update, 1=popularity, 2=attention, 3=vote)
|
|
||||||
|
|
||||||
Class-a File format is an extension of Class-A plus the following attributes
|
|
||||||
FUDate : first update timestamp of the URL
|
|
||||||
FDDate : first update timestamp of the domain
|
|
||||||
LUDate : latest update timestamp of the URL
|
|
||||||
UCount : Update Counter (of 'latest update timestamp')
|
|
||||||
PCount : Popularity Counter (proxy clicks)
|
|
||||||
ACount : Attention Counter (search result clicks)
|
|
||||||
VCount : Votes
|
|
||||||
Vita : Vitality (normed number of updates per time)
|
|
||||||
*/
|
|
||||||
}
|
|
@ -1,238 +0,0 @@
|
|||||||
// plasmaRCIEvaluation.java
|
|
||||||
// -----------------------
|
|
||||||
// part of YaCy
|
|
||||||
// (C) by Michael Peter Christen; mc@yacy.net
|
|
||||||
// first published on http://www.anomic.de
|
|
||||||
// Frankfurt, Germany, 2005
|
|
||||||
// Created 18.11.2005
|
|
||||||
//
|
|
||||||
// $LastChangedDate$
|
|
||||||
// $LastChangedRevision$
|
|
||||||
// $LastChangedBy$
|
|
||||||
//
|
|
||||||
// This program is free software; you can redistribute it and/or modify
|
|
||||||
// it under the terms of the GNU General Public License as published by
|
|
||||||
// the Free Software Foundation; either version 2 of the License, or
|
|
||||||
// (at your option) any later version.
|
|
||||||
//
|
|
||||||
// This program is distributed in the hope that it will be useful,
|
|
||||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
// GNU General Public License for more details.
|
|
||||||
//
|
|
||||||
// You should have received a copy of the GNU General Public License
|
|
||||||
// along with this program; if not, write to the Free Software
|
|
||||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
||||||
|
|
||||||
package de.anomic.search.blockrank;
|
|
||||||
|
|
||||||
import java.io.File;
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.net.MalformedURLException;
|
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.Iterator;
|
|
||||||
import java.util.TreeSet;
|
|
||||||
|
|
||||||
import net.yacy.kelondro.data.meta.DigestURI;
|
|
||||||
import net.yacy.kelondro.logging.Log;
|
|
||||||
import net.yacy.kelondro.order.Base64Order;
|
|
||||||
import net.yacy.kelondro.order.Digest;
|
|
||||||
import net.yacy.kelondro.util.AttrSeq;
|
|
||||||
import net.yacy.kelondro.util.FileUtils;
|
|
||||||
|
|
||||||
import de.anomic.search.RankingProcess;
|
|
||||||
|
|
||||||
public class RCIEvaluation {
|
|
||||||
|
|
||||||
public static int[] rcieval(final AttrSeq rci) {
|
|
||||||
// collect information about which entry has how many references
|
|
||||||
// the output is a reference-count:occurrences relation
|
|
||||||
final HashMap<Integer, Integer> counts = new HashMap<Integer, Integer>();
|
|
||||||
final Iterator<String> i = rci.keys();
|
|
||||||
String key;
|
|
||||||
AttrSeq.Entry entry;
|
|
||||||
Integer count_key, count_count;
|
|
||||||
int c, maxcount = 0;
|
|
||||||
while (i.hasNext()) {
|
|
||||||
key = i.next();
|
|
||||||
entry = rci.getEntry(key);
|
|
||||||
c = entry.getSeqSet().size();
|
|
||||||
if (c > maxcount) maxcount = c;
|
|
||||||
count_key = Integer.valueOf(c);
|
|
||||||
count_count = counts.get(count_key);
|
|
||||||
if (count_count == null) {
|
|
||||||
count_count = 1;
|
|
||||||
} else {
|
|
||||||
count_count = Integer.valueOf(count_count.intValue() + 1);
|
|
||||||
}
|
|
||||||
counts.put(count_key, count_count);
|
|
||||||
}
|
|
||||||
final int[] ctable = new int[maxcount + 1];
|
|
||||||
for (int j = 0; j <= maxcount; j++) {
|
|
||||||
count_count = counts.get(Integer.valueOf(j));
|
|
||||||
if (count_count == null) {
|
|
||||||
ctable[j] = 0;
|
|
||||||
} else {
|
|
||||||
ctable[j] = count_count.intValue();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return ctable;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static long sum(final int[] c) {
|
|
||||||
long s = 0;
|
|
||||||
for (int i = 0; i < c.length; i++) s += c[i];
|
|
||||||
return s;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static int[] interval(final int[] counts, final int parts) {
|
|
||||||
long limit = sum(counts) / 2;
|
|
||||||
final int[] partition = new int[parts];
|
|
||||||
int s = 0, p = parts - 1;
|
|
||||||
for (int i = 1; i < counts.length; i++) {
|
|
||||||
s += counts[i];
|
|
||||||
if ((s > limit) && (p >= 0)) {
|
|
||||||
partition[p--] = i;
|
|
||||||
limit = (2 * limit - s) / 2;
|
|
||||||
s = 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
partition[0] = counts.length - 1;
|
|
||||||
for (int i = 1; i < 10; i++) partition[i] = (partition[i - 1] + 4 * partition[i]) / 5;
|
|
||||||
return partition;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static void checkPartitionTable0(final int[] counts, final int[] partition) {
|
|
||||||
int sumsum = 0;
|
|
||||||
int sum;
|
|
||||||
int j = 0;
|
|
||||||
for (int i = partition.length - 1; i >= 0; i--) {
|
|
||||||
sum = 0;
|
|
||||||
while (j <= partition[i]) {
|
|
||||||
sum += counts[j++];
|
|
||||||
}
|
|
||||||
System.out.println("sum of YBR-" + i + " entries: " + sum);
|
|
||||||
sumsum += sum;
|
|
||||||
}
|
|
||||||
System.out.println("complete sum = " + sumsum);
|
|
||||||
}
|
|
||||||
|
|
||||||
public static void checkPartitionTable1(final int[] counts, final int[] partition) {
|
|
||||||
int sumsum = 0;
|
|
||||||
final int[] sum = new int[partition.length];
|
|
||||||
for (int i = 0; i < partition.length; i++) sum[i] = 0;
|
|
||||||
for (int i = 0; i < counts.length; i++) sum[orderIntoYBI(partition, i)] += counts[i];
|
|
||||||
for (int i = partition.length - 1; i >= 0; i--) {
|
|
||||||
System.out.println("sum of YBR-" + i + " entries: " + sum[i]);
|
|
||||||
sumsum += sum[i];
|
|
||||||
}
|
|
||||||
System.out.println("complete sum = " + sumsum);
|
|
||||||
}
|
|
||||||
|
|
||||||
public static int orderIntoYBI(final int[] partition, final int count) {
|
|
||||||
for (int i = 0; i < partition.length - 1; i++) {
|
|
||||||
if ((count >= (partition[i + 1] + 1)) && (count <= partition[i])) return i;
|
|
||||||
}
|
|
||||||
return partition.length - 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
@SuppressWarnings("unchecked")
|
|
||||||
public static TreeSet<byte[]>[] genRankingTable(final AttrSeq rci, final int[] partition) {
|
|
||||||
final TreeSet<byte[]>[] ranked = new TreeSet[partition.length];
|
|
||||||
for (int i = 0; i < partition.length; i++) ranked[i] = new TreeSet<byte[]>(Base64Order.enhancedCoder);
|
|
||||||
final Iterator<String> i = rci.keys();
|
|
||||||
String key;
|
|
||||||
AttrSeq.Entry entry;
|
|
||||||
while (i.hasNext()) {
|
|
||||||
key = i.next();
|
|
||||||
entry = rci.getEntry(key);
|
|
||||||
ranked[orderIntoYBI(partition, entry.getSeqSet().size())].add(key.getBytes());
|
|
||||||
}
|
|
||||||
return ranked;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static HashMap<String, String> genReverseDomHash(final File domlist) {
|
|
||||||
final HashSet<String> domset = FileUtils.loadList(domlist);
|
|
||||||
final HashMap<String, String> dommap = new HashMap<String, String>();
|
|
||||||
final Iterator<String> i = domset.iterator();
|
|
||||||
String dom;
|
|
||||||
while (i.hasNext()) {
|
|
||||||
dom = i.next();
|
|
||||||
if (dom.startsWith("www.")) dom = dom.substring(4);
|
|
||||||
try {
|
|
||||||
dommap.put(new String((new DigestURI("http://" + dom)).hash(), 6, 6), dom);
|
|
||||||
dommap.put(new String((new DigestURI("http://www." + dom)).hash(), 6, 6), "www." + dom);
|
|
||||||
} catch (final MalformedURLException e) {}
|
|
||||||
}
|
|
||||||
return dommap;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static void storeRankingTable(final TreeSet<byte[]>[] ranking, final File tablePath) throws IOException {
|
|
||||||
String filename;
|
|
||||||
if (!(tablePath.exists())) tablePath.mkdirs();
|
|
||||||
for (int i = 0; i < ranking.length - 1; i++) {
|
|
||||||
filename = "YBR-4-" + Digest.encodeHex(i, 2) + ".idx";
|
|
||||||
FileUtils.saveSet(new File(tablePath, filename), "plain", ranking[i], "");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public static void main(final String[] args) {
|
|
||||||
try {
|
|
||||||
if ((args.length == 2) && (args[0].equals("-genybr"))) {
|
|
||||||
final File root_path = new File(args[1]);
|
|
||||||
final File rci_file = new File(root_path, "DATA/RANKING/GLOBAL/030_rci0/RCI-0.rci.gz");
|
|
||||||
final long start = System.currentTimeMillis();
|
|
||||||
if (!(rci_file.exists())) return;
|
|
||||||
|
|
||||||
// create partition table
|
|
||||||
final AttrSeq rci = new AttrSeq(rci_file, false);
|
|
||||||
final int counts[] = rcieval(rci);
|
|
||||||
final int[] partition = interval(counts, 16);
|
|
||||||
|
|
||||||
// check the table
|
|
||||||
System.out.println("partition position table:");
|
|
||||||
for (int i = 0; i < partition.length - 1; i++) {
|
|
||||||
System.out.println("YBR-" + i + ": " + (partition[i + 1] + 1) + " - " + partition[i] + " references");
|
|
||||||
}
|
|
||||||
System.out.println("YBR-" + (partition.length - 1) + ": 0 - " + partition[partition.length - 1] + " references");
|
|
||||||
checkPartitionTable0(counts, partition);
|
|
||||||
checkPartitionTable1(counts, partition);
|
|
||||||
int sum = 0;
|
|
||||||
for (int i = 0; i < counts.length; i++) sum += counts[i];
|
|
||||||
System.out.println("sum of all references: " + sum);
|
|
||||||
|
|
||||||
// create ranking
|
|
||||||
final TreeSet<byte[]>[] ranked = genRankingTable(rci, partition);
|
|
||||||
storeRankingTable(ranked, new File(root_path, "ranking/YBR"));
|
|
||||||
final long seconds = java.lang.Math.max(1, (System.currentTimeMillis() - start) / 1000);
|
|
||||||
System.out.println("Finished YBR generation in " + seconds + " seconds.");
|
|
||||||
}
|
|
||||||
if ((args.length == 2) && (args[0].equals("-rcieval"))) {
|
|
||||||
final File root_path = new File(args[1]);
|
|
||||||
|
|
||||||
// load a partition table
|
|
||||||
RankingProcess.loadYBR(new File(root_path, "ranking/YBR"), 16);
|
|
||||||
|
|
||||||
// load domain list and generate hash index for domains
|
|
||||||
final HashMap<String, String> dommap = genReverseDomHash(new File(root_path, "domlist.txt"));
|
|
||||||
|
|
||||||
// print out the table
|
|
||||||
String hash, dom;
|
|
||||||
for (int i = 0; i < 9; i++) {
|
|
||||||
System.out.print("YBR-" + i + ": ");
|
|
||||||
for (int j = 0; j < RankingProcess.ybrTables[i].size(); j++) {
|
|
||||||
hash = new String(RankingProcess.ybrTables[i].get(j));
|
|
||||||
dom = dommap.get(hash);
|
|
||||||
if (dom == null) System.out.print("[" + hash + "], "); else System.out.print(dom + ", ");
|
|
||||||
}
|
|
||||||
System.out.println();
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
} catch (final IOException e) {
|
|
||||||
Log.logException(e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -1,460 +0,0 @@
|
|||||||
// kelondroAttrSeq.java
|
|
||||||
// -----------------------
|
|
||||||
// part of YaCy
|
|
||||||
// (C) by Michael Peter Christen; mc@yacy.net
|
|
||||||
// first published on http://www.anomic.de
|
|
||||||
// Frankfurt, Germany, 2005
|
|
||||||
// Created 15.11.2005
|
|
||||||
//
|
|
||||||
// $LastChangedDate: 2005-10-22 15:28:04 +0200 (Sat, 22 Oct 2005) $
|
|
||||||
// $LastChangedRevision: 968 $
|
|
||||||
// $LastChangedBy: theli $
|
|
||||||
//
|
|
||||||
// This program is free software; you can redistribute it and/or modify
|
|
||||||
// it under the terms of the GNU General Public License as published by
|
|
||||||
// the Free Software Foundation; either version 2 of the License, or
|
|
||||||
// (at your option) any later version.
|
|
||||||
//
|
|
||||||
// This program is distributed in the hope that it will be useful,
|
|
||||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
||||||
// GNU General Public License for more details.
|
|
||||||
//
|
|
||||||
// You should have received a copy of the GNU General Public License
|
|
||||||
// along with this program; if not, write to the Free Software
|
|
||||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
||||||
|
|
||||||
package net.yacy.kelondro.util;
|
|
||||||
|
|
||||||
import java.io.BufferedReader;
|
|
||||||
import java.io.File;
|
|
||||||
import java.io.FileInputStream;
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.InputStreamReader;
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.Iterator;
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.Set;
|
|
||||||
import java.util.StringTokenizer;
|
|
||||||
import java.util.TreeMap;
|
|
||||||
import java.util.TreeSet;
|
|
||||||
import java.util.logging.Logger;
|
|
||||||
import java.util.zip.GZIPInputStream;
|
|
||||||
|
|
||||||
import net.yacy.kelondro.index.Column;
|
|
||||||
import net.yacy.kelondro.index.Row;
|
|
||||||
import net.yacy.kelondro.index.RowCollection;
|
|
||||||
import net.yacy.kelondro.index.RowSpaceExceededException;
|
|
||||||
import net.yacy.kelondro.logging.Log;
|
|
||||||
import net.yacy.kelondro.order.Base64Order;
|
|
||||||
|
|
||||||
|
|
||||||
public class AttrSeq {
|
|
||||||
|
|
||||||
// class objects
|
|
||||||
private final File file;
|
|
||||||
private final Map<String, Object> entries; // value may be of type String or of type Entry
|
|
||||||
protected Structure structure;
|
|
||||||
private String name;
|
|
||||||
private long created;
|
|
||||||
|
|
||||||
// optional logger
|
|
||||||
protected Logger theLogger = null;
|
|
||||||
|
|
||||||
/**
 * Loads an attribute sequence from a file.
 *
 * @param file the file to read; name, creation time and structure are taken
 *             from its header lines
 * @param tree if true the entries are kept in a TreeMap, otherwise in a HashMap
 * @throws IOException if the file cannot be read or lacks a required header
 */
public AttrSeq(final File file, final boolean tree) throws IOException {
    this.file = file;
    if (tree) {
        this.entries = new TreeMap<String, Object>();
    } else {
        this.entries = new HashMap<String, Object>();
    }
    // defaults that are replaced by the header values while the file is read
    this.name = "";
    this.created = -1;
    this.structure = null;
    readAttrFile(file);
}
|
|
||||||
|
|
||||||
/**
 * Creates a new, empty attribute sequence that is not bound to a file.
 * The creation time is set to the current time.
 *
 * @param name   the name that is written to the "Name" header
 * @param struct the structure definition string
 * @param tree   if true the entries are kept in a TreeMap, otherwise in a HashMap
 */
public AttrSeq(final String name, final String struct, final boolean tree) {
    this.file = null;
    this.structure = new Structure(struct);
    this.created = System.currentTimeMillis();
    this.name = name;
    if (tree) {
        this.entries = new TreeMap<String, Object>();
    } else {
        this.entries = new HashMap<String, Object>();
    }
}
|
|
||||||
|
|
||||||
public void setLogger(final Logger newLogger) {
|
|
||||||
this.theLogger = newLogger;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void logInfo(final String message) {
|
|
||||||
if (this.theLogger == null)
|
|
||||||
System.err.println("ATTRSEQ INFO for file " + this.file + ": " + message);
|
|
||||||
else
|
|
||||||
this.theLogger.info("ATTRSEQ INFO for file " + this.file + ": " + message);
|
|
||||||
}
|
|
||||||
|
|
||||||
public void logWarning(final String message) {
|
|
||||||
if (this.theLogger == null)
|
|
||||||
System.err.println("ATTRSEQ WARNING for file " + this.file + ": " + message);
|
|
||||||
else
|
|
||||||
this.theLogger.warning("ATTRSEQ WARNING for file " + this.file + ": " + message);
|
|
||||||
}
|
|
||||||
|
|
||||||
private void readAttrFile(final File loadfile) throws IOException {
|
|
||||||
BufferedReader br = null;
|
|
||||||
int p;
|
|
||||||
if (loadfile.toString().endsWith(".gz")) {
|
|
||||||
br = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(loadfile))));
|
|
||||||
} else {
|
|
||||||
br = new BufferedReader(new InputStreamReader(new FileInputStream(loadfile)));
|
|
||||||
}
|
|
||||||
String line, key, oldvalue, newvalue;
|
|
||||||
while ((line = br.readLine()) != null) {
|
|
||||||
line = line.trim();
|
|
||||||
if (line.length() == 0) continue;
|
|
||||||
if (line.charAt(0) == '#') {
|
|
||||||
if (line.startsWith("# Structure=")) {
|
|
||||||
structure = new Structure(line.substring(12));
|
|
||||||
}
|
|
||||||
if (line.startsWith("# Name=")) {
|
|
||||||
name = line.substring(7);
|
|
||||||
}
|
|
||||||
if (line.startsWith("# Created=")) {
|
|
||||||
created = Long.parseLong(line.substring(10));
|
|
||||||
}
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
if ((p = line.indexOf('=')) > 0) {
|
|
||||||
key = line.substring(0, p).trim();
|
|
||||||
newvalue = line.substring(p + 1).trim();
|
|
||||||
oldvalue = (String) entries.get(key);
|
|
||||||
if (oldvalue != null) {
|
|
||||||
if (newvalue.equals(oldvalue)) {
|
|
||||||
//logWarning("key " + key + ": double occurrence. values are equal. second appearance is ignored");
|
|
||||||
} else {
|
|
||||||
if (newvalue.length() < oldvalue.length()) {
|
|
||||||
if (oldvalue.substring(0, newvalue.length()).equals(newvalue)) {
|
|
||||||
logWarning("key " + key + ": double occurrence. new value is subset of old value. second appearance is ignored");
|
|
||||||
} else {
|
|
||||||
logWarning("key " + key + ": double occurrence. new value is shorter than old value, but not a subsequence. old = " + oldvalue + ", new = " + newvalue);
|
|
||||||
}
|
|
||||||
} else if (newvalue.length() > oldvalue.length()) {
|
|
||||||
if (newvalue.substring(0, oldvalue.length()).equals(oldvalue)) {
|
|
||||||
logWarning("key " + key + ": double occurrence. old value is subset of new value. first appearance is ignored");
|
|
||||||
} else {
|
|
||||||
logWarning("key " + key + ": double occurrence. old value is shorter than new value, but not a subsequence. old = " + oldvalue + ", new = " + newvalue);
|
|
||||||
}
|
|
||||||
entries.put(key, newvalue);
|
|
||||||
} else {
|
|
||||||
logWarning("key " + key + ": double occurrence. old and new value have equal length but are not equal. old = " + oldvalue + ", new = " + newvalue);
|
|
||||||
//entries.put(key, newvalue);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
entries.put(key, newvalue);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
br.close();
|
|
||||||
if (structure == null) throw new IOException("file contains no structure tag");
|
|
||||||
if (name == null) throw new IOException("file contains no name tag");
|
|
||||||
if (created == -1) throw new IOException("file contains no created tag");
|
|
||||||
}
|
|
||||||
|
|
||||||
public int size() {
|
|
||||||
return entries.size();
|
|
||||||
}
|
|
||||||
|
|
||||||
public long created() {
|
|
||||||
return this.created;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void toFile(final File out) throws IOException {
|
|
||||||
// generate header
|
|
||||||
final StringBuilder sb = new StringBuilder(2000);
|
|
||||||
sb.append("# Name="); sb.append(this.name); sb.append((char) 13); sb.append((char) 10);
|
|
||||||
sb.append("# Created="); sb.append(this.created); sb.append((char) 13); sb.append((char) 10);
|
|
||||||
sb.append("# Structure="); sb.append(this.structure.toString()); sb.append((char) 13); sb.append((char) 10);
|
|
||||||
sb.append("# ---"); sb.append((char) 13); sb.append((char) 10);
|
|
||||||
String k;
|
|
||||||
Object v;
|
|
||||||
for (final Map.Entry<String, Object> entry : entries.entrySet()) {
|
|
||||||
k = entry.getKey();
|
|
||||||
v = entry.getValue();
|
|
||||||
sb.append(k); sb.append('=');
|
|
||||||
if (v instanceof String) sb.append((String) v);
|
|
||||||
if (v instanceof Entry) sb.append(((Entry) v).toString());
|
|
||||||
sb.append((char) 13); sb.append((char) 10);
|
|
||||||
}
|
|
||||||
if (out.toString().endsWith(".gz")) {
|
|
||||||
FileUtils.writeAndGZip((new String(sb)).getBytes(), out);
|
|
||||||
} else {
|
|
||||||
FileUtils.copy((new String(sb)).getBytes(), out);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public Iterator<String> keys() {
|
|
||||||
return entries.keySet().iterator();
|
|
||||||
}
|
|
||||||
|
|
||||||
public Entry newEntry(final String pivot, final boolean tree) {
|
|
||||||
return new Entry(pivot, new HashMap<String, Long>(), (tree) ? (Set<String>) new TreeSet<String>() : (Set<String>) new HashSet<String>());
|
|
||||||
}
|
|
||||||
|
|
||||||
public Entry newEntry(final String pivot, final Map<String, Long> props, final Set<String> seq) {
|
|
||||||
return new Entry(pivot, props, seq);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
public void putEntry(String pivot, String attrseq) {
|
|
||||||
entries.put(pivot, attrseq);
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
|
|
||||||
public void putEntry(final Entry entry) {
|
|
||||||
if (shortmem())
|
|
||||||
entries.put(entry.pivot, entry.toString());
|
|
||||||
else
|
|
||||||
entries.put(entry.pivot, entry);
|
|
||||||
}
|
|
||||||
|
|
||||||
public void putEntrySmall(final Entry entry) {
|
|
||||||
entries.put(entry.pivot, entry.toString());
|
|
||||||
}
|
|
||||||
|
|
||||||
public Entry getEntry(final String pivot) {
|
|
||||||
final Object e = entries.get(pivot);
|
|
||||||
if (e == null) return null;
|
|
||||||
if (e instanceof String) return new Entry(pivot, (String) e, false);
|
|
||||||
if (e instanceof Entry) return (Entry) e;
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
public Entry removeEntry(final String pivot) {
|
|
||||||
final Object e = entries.remove(pivot);
|
|
||||||
if (e == null) return null;
|
|
||||||
if (e instanceof String) return new Entry(pivot, (String) e, false);
|
|
||||||
if (e instanceof Entry) return (Entry) e;
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static class Structure {
|
|
||||||
|
|
||||||
protected String pivot_name = null;
|
|
||||||
protected int pivot_len = -1;
|
|
||||||
protected String[] prop_names = null;
|
|
||||||
protected int[] prop_len = null, prop_pos = null;
|
|
||||||
protected String[] seq_names = null;
|
|
||||||
protected int[] seq_len = null, seq_pos = null;
|
|
||||||
protected Row seqrow;
|
|
||||||
// example:
|
|
||||||
//# Structure=<pivot-12>,'=',<UDate-3>,<VDate-3>,<LCount-2>,<GCount-2>,<ICount-2>,<DCount-2>,<TLength-3>,<WACount-3>,<WUCount-3>,<Flags-1>,'|',*<Anchor-12>
|
|
||||||
|
|
||||||
public Structure(String structure) {
|
|
||||||
// parse a structure string
|
|
||||||
|
|
||||||
// parse pivot definition:
|
|
||||||
int p = structure.indexOf(",'='");
|
|
||||||
if (p < 0) return;
|
|
||||||
final String pivot = structure.substring(0, p);
|
|
||||||
structure = structure.substring(p + 5);
|
|
||||||
Column a = new Column(pivot);
|
|
||||||
pivot_name = a.nickname;
|
|
||||||
pivot_len = a.cellwidth;
|
|
||||||
|
|
||||||
// parse property part definition:
|
|
||||||
p = structure.indexOf(",'|'");
|
|
||||||
if (p < 0) return;
|
|
||||||
ArrayList<Column> l = new ArrayList<Column>();
|
|
||||||
final String attr = structure.substring(0, p);
|
|
||||||
String seqs = structure.substring(p + 5);
|
|
||||||
StringTokenizer st = new StringTokenizer(attr, ",");
|
|
||||||
while (st.hasMoreTokens()) {
|
|
||||||
a = new Column(st.nextToken());
|
|
||||||
l.add(a);
|
|
||||||
}
|
|
||||||
prop_names = new String[l.size()];
|
|
||||||
prop_len = new int[l.size()];
|
|
||||||
prop_pos = new int[l.size()];
|
|
||||||
p = 0;
|
|
||||||
for (int i = 0; i < l.size(); i++) {
|
|
||||||
a = l.get(i);
|
|
||||||
prop_names[i] = a.nickname;
|
|
||||||
prop_len[i] = a.cellwidth;
|
|
||||||
prop_pos[i] = p;
|
|
||||||
p += prop_len[i];
|
|
||||||
}
|
|
||||||
|
|
||||||
// parse sequence definition:
|
|
||||||
if (seqs.length() > 0 && seqs.charAt(0) == '*') seqs = seqs.substring(1);
|
|
||||||
l = new ArrayList<Column>();
|
|
||||||
st = new StringTokenizer(seqs, ",");
|
|
||||||
while (st.hasMoreTokens()) {
|
|
||||||
a = new Column(st.nextToken());
|
|
||||||
l.add(a);
|
|
||||||
}
|
|
||||||
seq_names = new String[l.size()];
|
|
||||||
seq_len = new int[l.size()];
|
|
||||||
seq_pos = new int[l.size()];
|
|
||||||
p = 0;
|
|
||||||
for (int i = 0; i < l.size(); i++) {
|
|
||||||
a = l.get(i);
|
|
||||||
seq_names[i] = a.nickname;
|
|
||||||
seq_len[i] = a.cellwidth;
|
|
||||||
seq_pos[i] = p;
|
|
||||||
p += seq_len[i];
|
|
||||||
}
|
|
||||||
|
|
||||||
// generate rowdef for seq row definition
|
|
||||||
final StringBuilder rowdef = new StringBuilder();
|
|
||||||
rowdef.append("byte[] ");
|
|
||||||
rowdef.append(seq_names[0]);
|
|
||||||
rowdef.append('-');
|
|
||||||
rowdef.append(seq_len[0]);
|
|
||||||
|
|
||||||
for (int i = 1; i < seq_names.length; i++) {
|
|
||||||
rowdef.append(", byte[] ");
|
|
||||||
rowdef.append(seq_names[i]);
|
|
||||||
rowdef.append('-');
|
|
||||||
rowdef.append(seq_len[i]);
|
|
||||||
}
|
|
||||||
seqrow = new Row(new String(rowdef), null);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public String toString() {
|
|
||||||
final StringBuilder sb = new StringBuilder(100);
|
|
||||||
sb.append('<'); sb.append(pivot_name); sb.append('-'); sb.append(Integer.toString(pivot_len)); sb.append(">,'=',");
|
|
||||||
if (prop_names.length > 0) {
|
|
||||||
for (int i = 0; i < prop_names.length; i++) {
|
|
||||||
sb.append('<'); sb.append(prop_names[i]); sb.append('-'); sb.append(Integer.toString(prop_len[i])); sb.append(">,");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
sb.append("'|'");
|
|
||||||
if (seq_names.length > 0) {
|
|
||||||
for (int i = 0; i < seq_names.length; i++) {
|
|
||||||
sb.append(",<"); sb.append(seq_names[i]); sb.append('-'); sb.append(Integer.toString(seq_len[i])); sb.append('>');
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return new String(sb);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public class Entry {
|
|
||||||
String pivot;
|
|
||||||
Map<String, Long> attrs;
|
|
||||||
Set<String> seq;
|
|
||||||
|
|
||||||
public Entry(final String pivot, final Map<String, Long> attrs, final Set<String> seq) {
|
|
||||||
this.pivot = pivot;
|
|
||||||
this.attrs = attrs;
|
|
||||||
this.seq = seq;
|
|
||||||
}
|
|
||||||
|
|
||||||
public Entry(final String pivot, final String attrseq, final boolean tree) {
|
|
||||||
this.pivot = pivot;
|
|
||||||
attrs = new HashMap<String, Long>();
|
|
||||||
seq = (tree) ? (Set<String>) new TreeSet<String>() : (Set<String>) new HashSet<String>();
|
|
||||||
for (int i = 0; i < structure.prop_names.length; i++) {
|
|
||||||
attrs.put(structure.prop_names[i], Long.valueOf(Base64Order.enhancedCoder.decodeLong(attrseq.substring(structure.prop_pos[i], structure.prop_pos[i] + structure.prop_len[i]))));
|
|
||||||
}
|
|
||||||
|
|
||||||
int p = attrseq.indexOf('|') + 1;
|
|
||||||
//long[] seqattrs = new long[structure.seq_names.length - 1];
|
|
||||||
String seqname;
|
|
||||||
while (p + structure.seq_len[0] <= attrseq.length()) {
|
|
||||||
seqname = attrseq.substring(p, p + structure.seq_len[0]);
|
|
||||||
p += structure.seq_len[0];
|
|
||||||
for (int i = 1; i < structure.seq_names.length; i++) {
|
|
||||||
//seqattrs[i - 1] = kelondroBase64Order.enhancedCoder.decodeLong(attrseq.substring(p, p + structure.seq_len[i]));
|
|
||||||
p += structure.seq_len[i];
|
|
||||||
}
|
|
||||||
seq.add(seqname/*, seqattrs*/);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public Map<String, Long> getAttrs() {
|
|
||||||
return attrs;
|
|
||||||
}
|
|
||||||
|
|
||||||
public long getAttr(final String key, final long dflt) {
|
|
||||||
final Long i = attrs.get(key);
|
|
||||||
if (i == null) return dflt;
|
|
||||||
return i.longValue();
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setAttr(final String key, final long attr) {
|
|
||||||
attrs.put(key, Long.valueOf(attr));
|
|
||||||
}
|
|
||||||
|
|
||||||
public Set<String> getSeqSet() {
|
|
||||||
return seq;
|
|
||||||
}
|
|
||||||
|
|
||||||
public RowCollection getSeqCollection() throws RowSpaceExceededException {
|
|
||||||
final RowCollection collection = new RowCollection(structure.seqrow, seq.size());
|
|
||||||
final Iterator<String> i = seq.iterator();
|
|
||||||
while (i.hasNext()) {
|
|
||||||
collection.addUnique(structure.seqrow.newEntry(i.next().getBytes()));
|
|
||||||
}
|
|
||||||
return collection;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setSeq(final Set<String> seq) {
|
|
||||||
this.seq = seq;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void addSeq(final String s/*, long[] seqattrs*/) {
|
|
||||||
this.seq.add(s/*, seqattrs*/);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public String toString() {
|
|
||||||
// creates only the attribute field and the sequence, not the pivot
|
|
||||||
final StringBuilder sb = new StringBuilder(100 + structure.seq_len[0] * seq.size());
|
|
||||||
Long val;
|
|
||||||
for (int i = 0; i < structure.prop_names.length; i++) {
|
|
||||||
val = attrs.get(structure.prop_names[i]);
|
|
||||||
sb.append(Base64Order.enhancedCoder.encodeLongSmart((val == null) ? 0 : val.longValue(), structure.prop_len[i]));
|
|
||||||
}
|
|
||||||
sb.append('|');
|
|
||||||
final Iterator<String> q = seq.iterator();
|
|
||||||
//long[] seqattrs;
|
|
||||||
while (q.hasNext()) {
|
|
||||||
sb.append(q.next());
|
|
||||||
//seqattrs = (long[]) entry.getValue();
|
|
||||||
/*
|
|
||||||
for (int i = 1; i < structure.seq_names.length; i++) {
|
|
||||||
sb.append(kelondroBase64Order.enhancedCoder.encodeLong(seqattrs[i - 1], structure.seq_len[i]));
|
|
||||||
}
|
|
||||||
*/
|
|
||||||
}
|
|
||||||
return new String(sb);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private static boolean shortmem() {
|
|
||||||
return (MemoryControl.available() < 20000000L);
|
|
||||||
}
|
|
||||||
|
|
||||||
public static void transcode(final File from_file, final File to_file) throws IOException {
|
|
||||||
final AttrSeq crp = new AttrSeq(from_file, true);
|
|
||||||
//crp.toFile(new File(args[1]));
|
|
||||||
final AttrSeq cro = new AttrSeq(crp.name + "/Transcoded from " + crp.file.getName(), crp.structure.toString(), true);
|
|
||||||
final Iterator<String> i = crp.entries.keySet().iterator();
|
|
||||||
while (i.hasNext()) {
|
|
||||||
cro.putEntry(crp.getEntry(i.next()));
|
|
||||||
}
|
|
||||||
cro.toFile(to_file);
|
|
||||||
}
|
|
||||||
|
|
||||||
public static void main(final String[] args) {
|
|
||||||
// java -classpath source de.anomic.kelondro.kelondroPropFile -transcode DATA/RANKING/GLOBAL/CRG-test-unsorted-original.cr DATA/RANKING/GLOBAL/CRG-test-generated.cr
|
|
||||||
try {
|
|
||||||
if ((args.length == 3) && ("-transcode".equals(args[0]))) {
|
|
||||||
transcode(new File(args[1]), new File(args[2]));
|
|
||||||
}
|
|
||||||
} catch (final IOException e) {
|
|
||||||
Log.logException(e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
Loading…
Reference in new issue