You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
yacy_search_server/source/de/anomic/yacy/yacySeed.java

861 lines
36 KiB

// yacySeed.java
// -------------------------------------
// (C) by Michael Peter Christen; mc@yacy.net
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
//
// YACY stands for Yet Another CYberspace
//
// the yacySeed Object is the object that bundles and carries all information about
// a single peer in the yacy space.
// The yacySeed object is carried along peers using a string representation, that can
// be compressed and/or scrambled, depending on the purpose of the process.
//
// the yacy status
// any value that is defined here will be overwritten each time the proxy is started
// to prevent that the system gets confused, it should be set to "" which means
// undefined. Other status' that can be reached at run-time are
// junior - a peer that has no public socket, thus cannot be reached on demand
// senior - a peer that has a public socket and serves search queries
// principal - a peer that acts like a senior and additionally serves as gateway for network definition
package de.anomic.yacy;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.net.InetAddress;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Pattern;
import net.yacy.cora.date.AbstractFormatter;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.protocol.Domains;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.index.HandleSet;
import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.order.Digest;
import net.yacy.kelondro.util.MapTools;
import de.anomic.tools.bitfield;
import de.anomic.tools.crypt;
replaced old DHT transmission method with new method. Many things have changed! some of them: - after a index selection is made, the index is splitted into its vertical components - from differrent index selctions the splitted components can be accumulated before they are placed into the transmission queue - each splitted chunk gets its own transmission thread - multiple transmission threads are started concurrently - the process can be monitored with the blocking queue servlet To implement that, a new package de.anomic.yacy.dht was created. Some old files have been removed. The new index distribution model using a vertical DHT was implemented. An abstraction of this model is implemented in the new dht package as interface. The freeworld network has now a configuration of two vertial partitions; sixteen partitions are planned and will be configured if the process is bug-free. This modification has three main targets: - enhance the DHT transmission speed - with a vertical DHT, a search will speed up. With two partitions, two times. With sixteen, sixteen times. - the vertical DHT will apply a semi-dht for URLs, and peers will receive a fraction of the overall URLs they received before. with two partitions, the fractions will be halve. With sixteen partitions, a 1/16 of the previous number of URLs. BE CAREFULL, THIS IS A MAJOR CODE CHANGE, POSSIBLY FULL OF BUGS AND HARMFUL THINGS. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5586 6c8d7289-2bf4-0310-a012-ef5d649a1542
16 years ago
import de.anomic.yacy.dht.FlatWordPartitionScheme;
public class yacySeed implements Cloneable {
public static final int maxsize = 16000;
/**
* <b>substance</b> "sI" (send index/words)
*/
public static final String INDEX_OUT = "sI";
/**
* <b>substance</b> "rI" (received index/words)
*/
public static final String INDEX_IN = "rI";
/**
* <b>substance</b> "sU" (send URLs)
*/
public static final String URL_OUT = "sU";
/**
* <b>substance</b> "rU" (received URLs)
*/
public static final String URL_IN = "rU";
/**
* <b>substance</b> "virgin"
*/
public static final String PEERTYPE_VIRGIN = "virgin";
/**
* <b>substance</b> "junior"
*/
public static final String PEERTYPE_JUNIOR = "junior";
/**
* <b>substance</b> "senior"
*/
public static final String PEERTYPE_SENIOR = "senior";
/**
* <b>substance</b> "principal"
*/
public static final String PEERTYPE_PRINCIPAL = "principal";
/**
* <b>substance</b> "PeerType"
*/
public static final String PEERTYPE = "PeerType";
/** static/dynamic (if the IP changes often for any reason) */
private static final String IPTYPE = "IPType";
private static final String FLAGS = "Flags";
private static final String FLAGSZERO = "____";
/** the applications version */
public static final String VERSION = "Version";
public static final String YOURTYPE = "yourtype";
public static final String LASTSEEN = "LastSeen";
private static final String USPEED = "USpeed";
/** the name of the peer (user-set) */
public static final String NAME = "Name";
private static final String HASH = "Hash";
/** Birthday - first startup */
private static final String BDATE = "BDate";
/** UTC-Offset */
public static final String UTC = "UTC";
private static final String PEERTAGS = "Tags";
/** the speed of indexing (pages/minute) of the peer */
public static final String ISPEED = "ISpeed";
/** the speed of retrieval (queries/minute) of the peer */
public static final String RSPEED = "RSpeed";
/** the number of minutes that the peer is up in minutes/day (moving average MA30) */
public static final String UPTIME = "Uptime";
/** the number of links that the peer has stored (LURL's) */
public static final String LCOUNT = "LCount";
/** the number of links that the peer has noticed, but not loaded (NURL's) */
public static final String NCOUNT = "NCount";
/** the number of links that the peer provides for remote crawls (ZURL's) */
public static final String RCOUNT = "RCount";
/** the number of different words the peer has indexed */
public static final String ICOUNT = "ICount";
/** the number of seeds that the peer has stored */
public static final String SCOUNT = "SCount";
/** the number of clients that the peer connects (connects/hour as double) */
public static final String CCOUNT = "CCount";
public static final String IP = "IP";
public static final String PORT = "Port";
public static final String SEEDLISTURL = "seedURL";
/** zero-value */
private static final String ZERO = "0";
/** bit index passed to setFlag/getFlag for the direct-connect flag stored under {@link #FLAGS} */
private static final int FLAG_DIRECT_CONNECT = 0;
/** bit index passed to setFlag/getFlag for the accept-remote-crawl flag stored under {@link #FLAGS} */
private static final int FLAG_ACCEPT_REMOTE_CRAWL = 1;
/** bit index passed to setFlag/getFlag for the accept-remote-index flag stored under {@link #FLAGS} */
private static final int FLAG_ACCEPT_REMOTE_INDEX = 2;
// NOTE(review): presumably the default network unit and group names; they are not used in this part of the file
public static final String DFLT_NETWORK_UNIT = "freeworld";
public static final String DFLT_NETWORK_GROUP = "";
/** shared pseudo-random source, seeded with the current time when the class is initialized; used by bestGap */
private static final Random random = new Random(System.currentTimeMillis());
// instance variables
/** the peer-hash */
public String hash;
/** a set of identity founding values, eg. IP, name of the peer, YaCy-version, ...*/
private final ConcurrentHashMap<String, String> dna;
protected int selectscore = -1; // only for debugging
/** optional IP assigned by setAlternativeAddress; getClusterAddress uses it instead of the IP stored in the DNA */
private String alternativeIP = null;
/**
 * Creates a seed from a given hash and a pre-defined DNA map.
 * The map is used directly (it is not copied). A flags entry that is missing or
 * not exactly four characters long is replaced by the empty flag set, and the
 * stored peer name is passed through {@code checkPeerName}.
 * @param theHash the peer hash, must not be null
 * @param theDna the map that becomes the DNA of this seed
 */
public yacySeed(final String theHash, final ConcurrentHashMap<String, String> theDna) {
    assert theHash != null;
    this.hash = theHash;
    this.dna = theDna;

    // the flags entry must have exactly four characters, otherwise start with empty flags
    final String storedFlags = theDna.get(yacySeed.FLAGS);
    final boolean flagsUsable = storedFlags != null && storedFlags.length() == 4;
    if (!flagsUsable) {
        theDna.put(yacySeed.FLAGS, yacySeed.FLAGSZERO);
    }

    // always store a name; a missing one becomes the "&empty;" placeholder
    final String rawName = get(yacySeed.NAME, "&empty;");
    theDna.put(yacySeed.NAME, checkPeerName(rawName));
}
private yacySeed(final String theHash) {
this.dna = new ConcurrentHashMap<String, String>();
// settings that can only be computed by originating peer:
// at first startup -
this.hash = theHash; // the hash key of the peer - very important. should be static somehow, even after restart
this.dna.put(yacySeed.NAME, "&empty;");
this.dna.put(yacySeed.BDATE, "&empty;");
this.dna.put(yacySeed.UTC, "+0000");
// later during operation -
this.dna.put(yacySeed.ISPEED, yacySeed.ZERO);
this.dna.put(yacySeed.RSPEED, yacySeed.ZERO);
this.dna.put(yacySeed.UPTIME, yacySeed.ZERO);
this.dna.put(yacySeed.LCOUNT, yacySeed.ZERO);
this.dna.put(yacySeed.NCOUNT, yacySeed.ZERO);
this.dna.put(yacySeed.RCOUNT, yacySeed.ZERO);
this.dna.put(yacySeed.ICOUNT, yacySeed.ZERO);
this.dna.put(yacySeed.SCOUNT, yacySeed.ZERO);
this.dna.put(yacySeed.CCOUNT, yacySeed.ZERO);
this.dna.put(yacySeed.VERSION, yacySeed.ZERO);
// settings that is created during the 'hello' phase - in first contact
this.dna.put(yacySeed.IP, ""); // 123.234.345.456
this.dna.put(yacySeed.PORT, "&empty;");
this.dna.put(yacySeed.PEERTYPE, yacySeed.PEERTYPE_VIRGIN); // virgin/junior/senior/principal
this.dna.put(yacySeed.IPTYPE, "&empty;");
// settings that can only be computed by visiting peer
this.dna.put(yacySeed.LASTSEEN, GenericFormatter.SHORT_SECOND_FORMATTER.format(new Date(System.currentTimeMillis() /*- DateFormatter.UTCDiff()*/))); // for last-seen date
this.dna.put(yacySeed.USPEED, yacySeed.ZERO); // the computated uplink speed of the peer
// settings that are needed to organize the seed round-trip
this.dna.put(yacySeed.FLAGS, yacySeed.FLAGSZERO);
setFlagDirectConnect(false);
setFlagAcceptRemoteCrawl(true);
setFlagAcceptRemoteIndex(true);
setUnusedFlags();
// index transfer
this.dna.put(yacySeed.INDEX_OUT, yacySeed.ZERO); // send index
this.dna.put(yacySeed.INDEX_IN, yacySeed.ZERO); // received index
this.dna.put(yacySeed.URL_OUT, yacySeed.ZERO); // send URLs
this.dna.put(yacySeed.URL_IN, yacySeed.ZERO); // received URLs
}
/** matches every "<" character */
final static Pattern ltp = Pattern.compile("<");
/** matches every ">" character */
final static Pattern gtp = Pattern.compile(">");
/**
 * Sanitizes a peer name as protection against usage as XSS hack:
 * every "<" and every ">" is replaced by an underscore.
 * @param name the peer name to clean, must not be null
 * @return the name without "<" and ">"
 */
private static String checkPeerName(String name) {
    final String withoutOpening = ltp.matcher(name).replaceAll("_");
    return gtp.matcher(withoutOpening).replaceAll("_");
}
/**
 * Tests whether a name carries the static fragments of a generated default peer name:
 * more than ten characters, a first and a last character that are not greater than '9',
 * and the string 'dpn' somewhere after the first position.
 * @see #makeDefaultPeerName()
 * @param name the peer name to check for default peer name compliance, may be null
 * @return whether the given peer name may be a default generated peer name
 */
public static boolean isDefaultPeerName(final String name) {
    if (name == null || name.length() <= 10) {
        return false;
    }
    final char head = name.charAt(0);
    final char tail = name.charAt(name.length() - 1);
    if (head > '9' || tail > '9') {
        return false;
    }
    return name.indexOf("dpn") > 0;
}
/**
 * Used when doing routing within a cluster: assigns an address that is used
 * instead of the address stored in the seed DNA. Only the part before the first
 * ':' is kept as alternative IP; a null argument leaves the seed unchanged.
 * @param ipport an address of the form "ip" or "ip:port", may be null
 */
public void setAlternativeAddress(final String ipport) {
    if (ipport == null) {
        return;
    }
    final int colon = ipport.indexOf(':');
    this.alternativeIP = (colon < 0) ? ipport : ipport.substring(0, colon);
}
/**
 * Reads the IP of the peer from the DNA.
 * @return the stored IP, or "localhost" if no IP or an empty IP is stored
 */
public final String getIP() {
    final String stored = get(yacySeed.IP, "localhost");
    if (stored == null || stored.length() == 0) {
        return "localhost";
    }
    return stored;
}
/**
 * Reads the peer type from the DNA.
 * @return the peer type, or the empty string if none is stored
 */
public final String getPeerType() {
    return get(yacySeed.PEERTYPE, "");
}
/**
 * Reads the peer type from the DNA.
 * @return the peer type, or "virgin" if none is stored
 */
public final String orVirgin() {
    return get(yacySeed.PEERTYPE, yacySeed.PEERTYPE_VIRGIN);
}
/**
 * Reads the peer type from the DNA.
 * @return the peer type, or "junior" if none is stored
 */
public final String orJunior() {
    return get(yacySeed.PEERTYPE, yacySeed.PEERTYPE_JUNIOR);
}
/**
 * Reads the peer type from the DNA.
 * @return the peer type, or "senior" if none is stored
 */
public final String orSenior() {
    return get(yacySeed.PEERTYPE, yacySeed.PEERTYPE_SENIOR);
}
/**
 * Reads the peer type from the DNA.
 * @return the peer type, or "principal" if none is stored
 */
public final String orPrincipal() {
    return get(yacySeed.PEERTYPE, yacySeed.PEERTYPE_PRINCIPAL);
}
/**
 * Looks up a value in the peer's DNA (its set of peer defining values, e.g. IP, name, version, ...).
 * @param key the key for the value to fetch
 * @param dflt the value to return if nothing is stored under the key
 * @return the stored value, or <code>dflt</code> if there is none
 */
public final String get(final String key, final String dflt) {
    final Object stored = this.dna.get(key);
    return (stored == null) ? dflt : (String) stored;
}
/**
 * Looks up a value in the peer's DNA and converts it to a long.
 * @param key the key for the value to fetch
 * @param dflt the value to return if nothing usable is stored under the key
 * @return the parsed value; <code>dflt</code> if the key is missing, the stored string
 *         is not a valid long, or the stored object has an unexpected type
 */
public final long getLong(final String key, final long dflt) {
    final Object stored = this.dna.get(key);
    if (stored instanceof String) {
        try {
            return Long.parseLong((String) stored);
        } catch (final NumberFormatException e) {
            return dflt;
        }
    }
    // the map is declared with String values; numeric objects are still accepted
    if (stored instanceof Long) {
        return ((Long) stored).longValue();
    }
    if (stored instanceof Integer) {
        return ((Integer) stored).intValue();
    }
    return dflt;
}
/** Stores the given IP in the DNA. */
public final void setIP(final String ip) {
    this.dna.put(yacySeed.IP, ip);
}
/** Stores the given port (as string) in the DNA. */
public final void setPort(final String port) {
    this.dna.put(yacySeed.PORT, port);
}
/** Stores the given peer type in the DNA. */
public final void setType(final String type) {
    this.dna.put(yacySeed.PEERTYPE, type);
}
/** Sets the peer type to "junior". */
public final void setJunior() {
    this.dna.put(yacySeed.PEERTYPE, yacySeed.PEERTYPE_JUNIOR);
}
/** Sets the peer type to "senior". */
public final void setSenior() {
    this.dna.put(yacySeed.PEERTYPE, yacySeed.PEERTYPE_SENIOR);
}
/** Sets the peer type to "principal". */
public final void setPrincipal() {
    this.dna.put(yacySeed.PEERTYPE, yacySeed.PEERTYPE_PRINCIPAL);
}
/**
 * Stores a value in the peer's DNA while holding the monitor of the DNA map.
 * @param key the key to store the value under
 * @param value the value to store
 */
public final void put(final String key, final String value) {
    final ConcurrentHashMap<String, String> target = this.dna;
    synchronized (target) {
        target.put(key, value);
    }
}
/**
 * Gives access to the DNA of this peer. The returned object is the internal map
 * itself, not a copy.
 * @return the DNA-map of this peer
 */
public final Map<String, String> getMap() {
    final Map<String, String> dnaMap = this.dna;
    return dnaMap;
}
/**
 * Reads the peer name from the DNA and passes it through {@code checkPeerName}.
 * @return the sanitized peer name; the sanitized "&empty;" placeholder if no name is stored
 */
public final String getName() {
    final String storedName = get(yacySeed.NAME, "&empty;");
    return checkPeerName(storedName);
}
/**
 * Converts the hash of this peer with {@link #b64Hash2hexHash(String)}.
 * @return the hexadecimal representation of the peer hash
 */
public final String getHexHash() {
    final String ownHash = this.hash;
    return b64Hash2hexHash(ownHash);
}
/**
 * Adds <code>count</code> to the "sI" (send index/words) counter in the DNA.
 * A missing or non-numeric stored value counts as zero, so a damaged counter
 * no longer raises a NumberFormatException.
 * @param count the number to add to the counter
 */
public final void incSI(final int count) {
    // getLong falls back to the given default for missing or unparsable values
    final long current = getLong(yacySeed.INDEX_OUT, 0L);
    this.dna.put(yacySeed.INDEX_OUT, Long.toString(current + count));
}
/**
 * Adds <code>count</code> to the "rI" (received index/words) counter in the DNA.
 * A missing or non-numeric stored value counts as zero, so a damaged counter
 * no longer raises a NumberFormatException.
 * @param count the number to add to the counter
 */
public final void incRI(final int count) {
    // getLong falls back to the given default for missing or unparsable values
    final long current = getLong(yacySeed.INDEX_IN, 0L);
    this.dna.put(yacySeed.INDEX_IN, Long.toString(current + count));
}
/**
 * Adds <code>count</code> to the "sU" (send URLs) counter in the DNA.
 * A missing or non-numeric stored value counts as zero, so a damaged counter
 * no longer raises a NumberFormatException.
 * @param count the number to add to the counter
 */
public final void incSU(final int count) {
    // getLong falls back to the given default for missing or unparsable values
    final long current = getLong(yacySeed.URL_OUT, 0L);
    this.dna.put(yacySeed.URL_OUT, Long.toString(current + count));
}
/**
 * Adds <code>count</code> to the "rU" (received URLs) counter in the DNA.
 * A missing or non-numeric stored value counts as zero, so a damaged counter
 * no longer raises a NumberFormatException.
 * @param count the number to add to the counter
 */
public final void incRU(final int count) {
    // getLong falls back to the given default for missing or unparsable values
    final long current = getLong(yacySeed.URL_IN, 0L);
    this.dna.put(yacySeed.URL_IN, Long.toString(current + count));
}
/**
 * Sets the four transfer counters (sent/received index, sent/received URLs) back to zero.
 */
public final void resetCounters() {
    final String[] counterKeys = {
        yacySeed.INDEX_OUT, yacySeed.INDEX_IN, yacySeed.URL_OUT, yacySeed.URL_IN
    };
    for (final String counterKey : counterKeys) {
        this.dna.put(counterKey, yacySeed.ZERO);
    }
}
/**
 * Converts a base64 hash into its octal representation.
 * <code>12 * 6 bit = 72 bit = 24</code> characters octal-hash
 * <p>Octal hashes are used for cache-dumps that are DHT-ready</p>
 * <p>
 * Reason: the natural order of octal hashes is the same as the b64-order of b64Hashes.
 * A hexhash cannot be used in such cases, and b64Hashes are not appropriate for file names.
 * </p>
 * @param b64Hash a base64 hash
 * @return the octal representation of the given base64 hash
 */
public static String b64Hash2octalHash(final String b64Hash) {
// decode with the enhanced base64 coder, then encode the result as octal
return Digest.encodeOctal(Base64Order.enhancedCoder.decode(b64Hash));
}
/**
 * Converts a base64 hash into its hexadecimal representation.
 * <code>12 * 6 bit = 72 bit = 18</code> characters hex-hash
 * @param b64Hash a base64 hash
 * @return the hexadecimal representation of the given base64 hash
 */
public static String b64Hash2hexHash(final String b64Hash) {
// the hash string represents 12 * 6 bit = 72 bits. This is too much for a long integer,
// so the conversion goes through the decoded form instead of a numeric value.
return Digest.encodeHex(Base64Order.enhancedCoder.decode(b64Hash));
}
/**
 * Converts a hexadecimal hash into its base64 representation; this is the
 * counterpart of {@link #b64Hash2hexHash(String)}.
 * @param hexHash a hexadecimal hash
 * @return the base64 representation of the given hex hash
 */
public static String hexHash2b64Hash(final String hexHash) {
// decode the hex string, then encode the result with the enhanced base64 coder
return Base64Order.enhancedCoder.encode(Digest.decodeHex(hexHash));
}
/**
 * Reads the YaCy version of this peer from the DNA.
 * The version follows this pattern: <code>MAJORVERSION . MINORVERSION 0 SVN REVISION</code>
 * @return the version as a float, or <code>0</code> if no valid value is stored
 */
public final float getVersion() {
    final String stored = get(yacySeed.VERSION, yacySeed.ZERO);
    try {
        return Float.parseFloat(stored);
    } catch (final NumberFormatException e) {
        return 0f;
    }
}
/**
 * Builds the public address of the peer from the IP and port stored in the DNA.
 * <p>
 * The IP falls back to "localhost" only if it is missing, empty or the "&empty;"
 * placeholder that this class uses for unset values. Short but valid addresses
 * such as "1.2.3.4" are used as they are; the earlier check for fewer than eight
 * characters replaced them with "localhost".
 * </p>
 * @return the public address of the peer as IP:port string or <code>null</code> if no port
 *         or a port string of less than two characters is stored
 */
public final String getPublicAddress() {
    String ip = this.getIP();
    if (ip == null || ip.length() == 0 || "&empty;".equals(ip)) {
        ip = "localhost";
    }
    final String port = this.dna.get(yacySeed.PORT);
    if ((port == null) || (port.length() < 2)) {
        return null;
    }
    return ip + ":" + port;
}
/**
 * Builds the address to use for this peer inside a cluster. If an alternative IP
 * was assigned with {@link #setAlternativeAddress(String)}, it is combined with the
 * port stored in the DNA; otherwise the public address is returned.
 * @see #getPublicAddress()
 * @return the alternative IP:port if an alternative IP is present (<code>null</code> if the
 *         stored port is missing or shorter than two characters), else the public address
 */
public final String getClusterAddress() {
    final String clusterIP = this.alternativeIP;
    if (clusterIP == null) {
        return getPublicAddress();
    }
    final String port = this.dna.get(yacySeed.PORT);
    final boolean portUsable = (port != null) && (port.length() >= 2);
    if (!portUsable) {
        return null;
    }
    return clusterIP + ":" + port;
}
/**
 * Resolves the IP of this peer (see {@link #getIP()}) with <code>Domains.dnsResolve</code>.
 * @return the IP address of the peer represented by this yacySeed object as {@link InetAddress}
 */
public final InetAddress getInetAddress() {
    final String host = this.getIP();
    return Domains.dnsResolve(host);
}
/**
 * Reads the port of this peer from the DNA.
 * Single-digit ports are accepted (it is possible to use port 0-9).
 * @return the port number of this seed, or <code>-1</code> if no port is stored or the
 *         stored value is not a number (for example the "&empty;" placeholder of a
 *         freshly created seed)
 */
public final int getPort() {
    final String port = this.dna.get(yacySeed.PORT);
    if (port == null) {
        return -1;
    }
    try {
        return Integer.parseInt(port);
    } catch (final NumberFormatException e) {
        // a non-numeric entry is treated like a missing port instead of crashing the caller
        return -1;
    }
}
/**
 * Puts the current time into the last-seen field, formatted with the short second
 * formatter. The system time is used as it is; no additional UTC offset is applied here.
 */
public final void setLastSeenUTC() {
    final Date now = new Date(System.currentTimeMillis());
    final String formatted = GenericFormatter.SHORT_SECOND_FORMATTER.format(now);
    this.dna.put(yacySeed.LASTSEEN, formatted);
}
/**
 * Parses the last-seen field with the short second formatter. The parsed time is
 * returned unchanged; no additional UTC offset is applied here.
 * @return the last seen time in milliseconds; if the field cannot be parsed, the
 *         current time minus one day, to make the seed look old
 */
public final long getLastSeenUTC() {
    final String stamp = get(yacySeed.LASTSEEN, "20040101000000");
    try {
        return GenericFormatter.SHORT_SECOND_FORMATTER.parse(stamp).getTime();
    } catch (final java.text.ParseException e) {
        // in case of an error make seed look old!!!
        return System.currentTimeMillis() - AbstractFormatter.dayMillis;
    } catch (final java.lang.NumberFormatException e) {
        return System.currentTimeMillis() - AbstractFormatter.dayMillis;
    }
}
/**
 * Reads the raw last-seen field from the DNA.
 * @see #getLastSeenUTC()
 * @return the last seen value as string in the format YearMonthDayHoursMinutesSeconds,
 *         or <code>20040101000000</code> if not present
 */
public final String getLastSeenString() {
    final String fallback = "20040101000000";
    return get(yacySeed.LASTSEEN, fallback);
}
/**
 * Computes the age of the seed from its birthday field.
 * @return the age of the seed in number of days, or <code>-1</code> if the birthday
 *         cannot be parsed
 */
public final int getAge() {
    final String birthday = get(yacySeed.BDATE, "20040101000000");
    try {
        final long bornAt = GenericFormatter.SHORT_SECOND_FORMATTER.parse(birthday).getTime();
        final long ageMillis = System.currentTimeMillis() - bornAt;
        // milliseconds -> seconds -> minutes -> hours -> days
        return (int) (ageMillis / 1000 / 60 / 60 / 24);
    } catch (final java.text.ParseException e) {
        return -1;
    } catch (final java.lang.NumberFormatException e) {
        return -1;
    }
}
/**
 * Stores the given tags in the DNA as a single string with "|" as separator.
 * @param keys the tags of this peer
 */
public void setPeerTags(final Set<String> keys) {
    final String joined = MapTools.set2string(keys, "|", false);
    this.dna.put(PEERTAGS, joined);
}
/**
 * Reads the tags of this peer from the DNA; the stored string is split at "|".
 * If no tags are stored, the string "*" is split instead.
 * @return the tags of this peer
 */
public Set<String> getPeerTags() {
    final String stored = get(PEERTAGS, "*");
    return MapTools.string2set(stored, "|");
}
/**
 * Checks whether one of the peer tags matches the search.
 * @param searchHashes the word hashes of the search
 * @return <code>true</code> if the stored tag string is "*" or if the word hash of at
 *         least one tag is contained in <code>searchHashes</code>; <code>false</code>
 *         otherwise, also if no tags are stored
 */
public boolean matchPeerTags(final HandleSet searchHashes) {
    final String tagList = get(PEERTAGS, "");
    if ("*".equals(tagList)) {
        return true;
    }
    for (final String tag : MapTools.string2set(tagList, "|")) {
        if (searchHashes.has(Word.word2hash(tag))) {
            return true;
        }
    }
    return false;
}
/**
 * Reads the indexing speed (pages/minute) of the peer from the DNA.
 * @return the stored value, or 0 if it is not a valid integer
 */
public int getPPM() {
    final String stored = get(yacySeed.ISPEED, yacySeed.ZERO);
    try {
        return Integer.parseInt(stored);
    } catch (final NumberFormatException e) {
        return 0;
    }
}
/**
 * Reads the retrieval speed (queries/minute) of the peer from the DNA.
 * @return the stored value, or 0 if it is not a valid number
 */
public double getQPM() {
    final String stored = get(yacySeed.RSPEED, yacySeed.ZERO);
    try {
        return Double.parseDouble(stored);
    } catch (final NumberFormatException e) {
        return 0d;
    }
}
/**
 * Reads the number of links that the peer has stored from the DNA.
 * No try/catch is needed here: getLong already returns the default for values
 * that are missing or not a valid number.
 * @return the stored link count, or 0 if none or an invalid value is stored
 */
public final long getLinkCount() {
    return getLong(yacySeed.LCOUNT, 0);
}
/**
 * Reads the number of different words the peer has indexed from the DNA.
 * No try/catch is needed here: getLong already returns the default for values
 * that are missing or not a valid number.
 * @return the stored word count, or 0 if none or an invalid value is stored
 */
public final long getWordCount() {
    return getLong(yacySeed.ICOUNT, 0);
}
/**
 * Reads one bit of the flags field.
 * @param flag the index of the bit
 * @return the value of the bit as reported by the bitfield built from the stored flags
 */
private boolean getFlag(final int flag) {
    final String stored = get(yacySeed.FLAGS, yacySeed.FLAGSZERO);
    final bitfield bits = new bitfield(stored.getBytes());
    return bits.get(flag);
}
/**
 * Writes one bit of the flags field and stores the result in the DNA.
 * Stored flags that are not exactly four characters long are discarded and
 * replaced by the empty flag set before the bit is written.
 * @param flag the index of the bit
 * @param value the new value of the bit
 */
private void setFlag(final int flag, final boolean value) {
    final String stored = get(yacySeed.FLAGS, yacySeed.FLAGSZERO);
    final String usable = (stored.length() == 4) ? stored : yacySeed.FLAGSZERO;
    final bitfield bits = new bitfield(usable.getBytes());
    bits.set(flag, value);
    this.dna.put(yacySeed.FLAGS, new String(bits.getBytes()));
}
/** Sets the direct-connect flag (bit 0 of the flags field). */
public final void setFlagDirectConnect(final boolean value) {
    setFlag(FLAG_DIRECT_CONNECT, value);
}
/** Sets the accept-remote-crawl flag (bit 1 of the flags field). */
public final void setFlagAcceptRemoteCrawl(final boolean value) {
    setFlag(FLAG_ACCEPT_REMOTE_CRAWL, value);
}
/** Sets the accept-remote-index flag (bit 2 of the flags field). */
public final void setFlagAcceptRemoteIndex(final boolean value) {
    setFlag(FLAG_ACCEPT_REMOTE_INDEX, value);
}
/** @return the direct-connect flag (bit 0 of the flags field) */
public final boolean getFlagDirectConnect() {
    return getFlag(FLAG_DIRECT_CONNECT);
}
/** @return the accept-remote-crawl flag (bit 1 of the flags field); the peer version is not considered */
public final boolean getFlagAcceptRemoteCrawl() {
    return getFlag(FLAG_ACCEPT_REMOTE_CRAWL);
}
/** @return the accept-remote-index flag (bit 2 of the flags field); the peer version is not considered */
public final boolean getFlagAcceptRemoteIndex() {
    return getFlag(FLAG_ACCEPT_REMOTE_INDEX);
}
/** Switches on the bits 4 to 23 of the flags field. */
public final void setUnusedFlags() {
    int bit = 4;
    while (bit < 24) {
        setFlag(bit, true);
        bit++;
    }
}
/** @return whether the stored peer type (empty string if missing) equals the given type */
public final boolean isType(final String type) {
    final String stored = get(yacySeed.PEERTYPE, "");
    return stored.equals(type);
}
/** @return whether the stored peer type is "virgin" */
public final boolean isVirgin() {
    return isType(yacySeed.PEERTYPE_VIRGIN);
}
/** @return whether the stored peer type is "junior" */
public final boolean isJunior() {
    return isType(yacySeed.PEERTYPE_JUNIOR);
}
/** @return whether the stored peer type is "senior" */
public final boolean isSenior() {
    return isType(yacySeed.PEERTYPE_SENIOR);
}
/** @return whether the stored peer type is "principal" */
public final boolean isPrincipal() {
    return isType(yacySeed.PEERTYPE_PRINCIPAL);
}
/** @return whether the peer is a virgin or a junior */
public final boolean isPotential() {
    if (isVirgin()) {
        return true;
    }
    return isJunior();
}
/** @return whether the peer is a senior or a principal */
public final boolean isActive() {
    if (isSenior()) {
        return true;
    }
    return isPrincipal();
}
/** @return whether the peer is a senior or a principal (same test as {@link #isActive()}) */
public final boolean isOnline() {
    return isActive();
}
/**
 * Tests a given type string instead of the type stored in this seed.
 * @param type the peer type to test, must not be null
 * @return whether the given type is "senior" or "principal"
 */
public final boolean isOnline(final String type) {
    if (type.equals(yacySeed.PEERTYPE_SENIOR)) {
        return true;
    }
    return type.equals(yacySeed.PEERTYPE_PRINCIPAL);
}
/**
 * Draws a number from the given random source and reduces it to the magnitude of its
 * remainder modulo <code>n</code>.
 * <p>
 * The remainder is taken before the absolute value. <code>Math.abs(Long.MIN_VALUE)</code>
 * is still negative, so taking the absolute value first could produce a negative result.
 * For every other drawn value the result is the same as before.
 * </p>
 * @param random the random source to draw from
 * @param n the modulus; zero causes an ArithmeticException
 * @return a value whose magnitude is smaller than the magnitude of <code>n</code>;
 *         never negative
 */
public long nextLong(Random random, long n) {
    // |r % n| < |n|, so Math.abs cannot overflow here
    return Math.abs(random.nextLong() % n);
}
/**
 * Chooses a hash for a new peer so that it falls into one of the gaps between the
 * hashes of the connected peers.
 * <p>
 * Without a seed database, or with two or fewer connected peers, a random hash is
 * returned. Otherwise a gap is picked (the biggest one is preferred, smaller ones are
 * taken by chance), a position inside that gap is computed, and only the first two
 * bytes of the hash for that position are combined with ten random bytes. A hash that
 * is already known to the database is replaced by a completely random one.
 * </p>
 * <p>
 * A gap so small that one eighth of it is zero no longer causes a division by zero,
 * and the random offset can no longer become negative.
 * </p>
 * @param seedDB the seed database, may be null
 * @return the hash for the new peer
 */
private static byte[] bestGap(final yacySeedDB seedDB) {
    final byte[] randomHash = randomHash();
    if ((seedDB == null) || (seedDB.sizeConnected() <= 2)) {
        // use random hash
        return randomHash;
    }
    // find gaps
    final TreeMap<Long, String> gaps = hashGaps(seedDB);
    // take one gap; prefer biggest but take also another smaller by chance
    String interval = null;
    while (!gaps.isEmpty()) {
        interval = gaps.remove(gaps.lastKey());
        if (random.nextBoolean()) break;
    }
    if (interval == null) return randomHash;
    // find dht position and size of gap; the interval is the concatenation of two hashes,
    // the first one ends after 12 characters
    final long left = FlatWordPartitionScheme.std.dhtPosition(interval.substring(0, 12).getBytes(), null);
    final long right = FlatWordPartitionScheme.std.dhtPosition(interval.substring(12).getBytes(), null);
    final long gap8 = FlatWordPartitionScheme.dhtDistance(left, right) >> 3; // 1/8 of a gap
    // offset into the gap: between 1/8 and 7/8 of it. A zero gap8 gets no random part
    // (the modulus would be zero); the remainder is taken before Math.abs because
    // Math.abs(Long.MIN_VALUE) is negative.
    long gapx = gap8;
    if (gap8 != 0) {
        gapx += Math.abs(random.nextLong() % (6 * gap8));
    }
    // wrap around at the end of the position range
    final long gappos = (Long.MAX_VALUE - left >= gapx) ? left + gapx : (left - Long.MAX_VALUE) + gapx;
    final byte[] computedHash = FlatWordPartitionScheme.positionToHash(gappos);
    // the computed hash is the perfect position (modulo gap4 population and gap alternatives)
    // this is too tight. The hash must be more randomized. We take only (!) the first two bytes
    // of the computed hash and add random bytes at the remaining positions.
    byte[] combined = new byte[12];
    System.arraycopy(computedHash, 0, combined, 0, 2);
    System.arraycopy(randomHash, 2, combined, 2, 10);
    // finally check if the hash is already known
    while (seedDB.hasConnected(combined) || seedDB.hasDisconnected(combined) || seedDB.hasPotential(combined)) {
        // if we are lucky then this loop will never run
        combined = randomHash();
    }
    return combined;
}
private static TreeMap<Long, String> hashGaps(final yacySeedDB seedDB) {
final TreeMap<Long, String>gaps = new TreeMap<Long, String>();
if (seedDB == null) return gaps;
final Iterator<yacySeed> i = seedDB.seedsConnected(true, false, null, (float) 0.0);
long l;
yacySeed s0 = null, s1, first = null;
while (i.hasNext()) {
s1 = i.next();
if (s0 == null) {
s0 = s1;
first = s0;
continue;
}
replaced old DHT transmission method with new method. Many things have changed! some of them: - after a index selection is made, the index is splitted into its vertical components - from differrent index selctions the splitted components can be accumulated before they are placed into the transmission queue - each splitted chunk gets its own transmission thread - multiple transmission threads are started concurrently - the process can be monitored with the blocking queue servlet To implement that, a new package de.anomic.yacy.dht was created. Some old files have been removed. The new index distribution model using a vertical DHT was implemented. An abstraction of this model is implemented in the new dht package as interface. The freeworld network has now a configuration of two vertial partitions; sixteen partitions are planned and will be configured if the process is bug-free. This modification has three main targets: - enhance the DHT transmission speed - with a vertical DHT, a search will speed up. With two partitions, two times. With sixteen, sixteen times. - the vertical DHT will apply a semi-dht for URLs, and peers will receive a fraction of the overall URLs they received before. with two partitions, the fractions will be halve. With sixteen partitions, a 1/16 of the previous number of URLs. BE CAREFULL, THIS IS A MAJOR CODE CHANGE, POSSIBLY FULL OF BUGS AND HARMFUL THINGS. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5586 6c8d7289-2bf4-0310-a012-ef5d649a1542
16 years ago
l = FlatWordPartitionScheme.dhtDistance(
FlatWordPartitionScheme.std.dhtPosition(s0.hash.getBytes(), null),
FlatWordPartitionScheme.std.dhtPosition(s1.hash.getBytes(), null));
gaps.put(l, s0.hash + s1.hash);
s0 = s1;
}
// compute also the last gap
if ((first != null) && (s0 != null)) {
replaced old DHT transmission method with new method. Many things have changed! some of them: - after a index selection is made, the index is splitted into its vertical components - from differrent index selctions the splitted components can be accumulated before they are placed into the transmission queue - each splitted chunk gets its own transmission thread - multiple transmission threads are started concurrently - the process can be monitored with the blocking queue servlet To implement that, a new package de.anomic.yacy.dht was created. Some old files have been removed. The new index distribution model using a vertical DHT was implemented. An abstraction of this model is implemented in the new dht package as interface. The freeworld network has now a configuration of two vertial partitions; sixteen partitions are planned and will be configured if the process is bug-free. This modification has three main targets: - enhance the DHT transmission speed - with a vertical DHT, a search will speed up. With two partitions, two times. With sixteen, sixteen times. - the vertical DHT will apply a semi-dht for URLs, and peers will receive a fraction of the overall URLs they received before. with two partitions, the fractions will be halve. With sixteen partitions, a 1/16 of the previous number of URLs. BE CAREFULL, THIS IS A MAJOR CODE CHANGE, POSSIBLY FULL OF BUGS AND HARMFUL THINGS. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5586 6c8d7289-2bf4-0310-a012-ef5d649a1542
16 years ago
l = FlatWordPartitionScheme.dhtDistance(
FlatWordPartitionScheme.std.dhtPosition(s0.hash.getBytes(), null),
FlatWordPartitionScheme.std.dhtPosition(first.hash.getBytes(), null));
gaps.put(l, s0.hash + first.hash);
}
return gaps;
}
public static yacySeed genLocalSeed(final yacySeedDB db) {
major step forward to network switching (target is easy switch to intranet or other networks .. and back) This change is inspired by the need to see a network connected to the index it creates in a indexing team. It is not possible to divide the network and the index. Therefore all control files for the network was moved to the network within the INDEX/<network-name> subfolder. The remaining YACYDB is superfluous and can be deleted. The yacyDB and yacyNews data structures are now part of plasmaWordIndex. Therefore all methods, using static access to yacySeedDB had to be rewritten. A special problem had been all the port forwarding methods which had been tightly mixed with seed construction. It was not possible to move the port forwarding functions to the place, meaning and usage of plasmaWordIndex. Therefore the port forwarding had been deleted (I guess nobody used it and it can be simulated by methods outside of YaCy). The mySeed.txt is automatically moved to the current network position. A new effect causes that every network will create a different local seed file, which is ok, since the seed identifies the peer only against the network (it is the purpose of the seed hash to give a peer a location within the DHT). No other functional change has been made. The next steps to enable network switcing are: - shift of crawler tables from PLASMADB into the network (crawls are also network-specific) - possibly shift of plasmaWordIndex code into yacy package (index management is network-specific) - servlet to switch networks git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4765 6c8d7289-2bf4-0310-a012-ef5d649a1542
17 years ago
return genLocalSeed(db, 0, null); // an anonymous peer
}
/**
 * Creates a fresh seed object for the local peer.
 * The seed hash is taken from bestGap(db); the remaining properties are
 * initialized with defaults derived from the arguments and the current time.
 *
 * @param db   the seed database handed to bestGap to choose the hash
 * @param port the port of the peer; values <= 0 are replaced by 8080
 * @param name the peer name; null is replaced by "anonymous"
 * @return the new seed with peer type PEERTYPE_VIRGIN
 */
public static yacySeed genLocalSeed(final yacySeedDB db, final int port, final String name) {
major step forward to network switching (target is easy switch to intranet or other networks .. and back) This change is inspired by the need to see a network connected to the index it creates in a indexing team. It is not possible to divide the network and the index. Therefore all control files for the network was moved to the network within the INDEX/<network-name> subfolder. The remaining YACYDB is superfluous and can be deleted. The yacyDB and yacyNews data structures are now part of plasmaWordIndex. Therefore all methods, using static access to yacySeedDB had to be rewritten. A special problem had been all the port forwarding methods which had been tightly mixed with seed construction. It was not possible to move the port forwarding functions to the place, meaning and usage of plasmaWordIndex. Therefore the port forwarding had been deleted (I guess nobody used it and it can be simulated by methods outside of YaCy). The mySeed.txt is automatically moved to the current network position. A new effect causes that every network will create a different local seed file, which is ok, since the seed identifies the peer only against the network (it is the purpose of the seed hash to give a peer a location within the DHT). No other functional change has been made. The next steps to enable network switcing are: - shift of crawler tables from PLASMADB into the network (crawls are also network-specific) - possibly shift of plasmaWordIndex code into yacy package (index management is network-specific) - servlet to switch networks git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4765 6c8d7289-2bf4-0310-a012-ef5d649a1542
17 years ago
// generate a seed for the local peer
// this is the birthplace of a seed, that then will start to travel to other peers
// the hash of the new seed is whatever bestGap(db) returns, converted to a String
final String hashs = new String(bestGap(db));
yacyCore.log.logInfo("init: OWN SEED = " + hashs);
final yacySeed newSeed = new yacySeed(hashs);
// now calculate other information about the host
major step forward to network switching (target is easy switch to intranet or other networks .. and back) This change is inspired by the need to see a network connected to the index it creates in a indexing team. It is not possible to divide the network and the index. Therefore all control files for the network was moved to the network within the INDEX/<network-name> subfolder. The remaining YACYDB is superfluous and can be deleted. The yacyDB and yacyNews data structures are now part of plasmaWordIndex. Therefore all methods, using static access to yacySeedDB had to be rewritten. A special problem had been all the port forwarding methods which had been tightly mixed with seed construction. It was not possible to move the port forwarding functions to the place, meaning and usage of plasmaWordIndex. Therefore the port forwarding had been deleted (I guess nobody used it and it can be simulated by methods outside of YaCy). The mySeed.txt is automatically moved to the current network position. A new effect causes that every network will create a different local seed file, which is ok, since the seed identifies the peer only against the network (it is the purpose of the seed hash to give a peer a location within the DHT). No other functional change has been made. The next steps to enable network switcing are: - shift of crawler tables from PLASMADB into the network (crawls are also network-specific) - possibly shift of plasmaWordIndex code into yacy package (index management is network-specific) - servlet to switch networks git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4765 6c8d7289-2bf4-0310-a012-ef5d649a1542
17 years ago
// a missing name becomes "anonymous", a non-positive port becomes 8080
newSeed.dna.put(yacySeed.NAME, (name) == null ? "anonymous" : name);
newSeed.dna.put(yacySeed.PORT, Integer.toString((port <= 0) ? 8080 : port));
// the birth date is the current system time in the short second format
newSeed.dna.put(yacySeed.BDATE, GenericFormatter.SHORT_SECOND_FORMATTER.format(new Date(System.currentTimeMillis() /*- DateFormatter.UTCDiff()*/)) );
newSeed.dna.put(yacySeed.LASTSEEN, newSeed.dna.get(yacySeed.BDATE)); // just as initial setting
newSeed.dna.put(yacySeed.UTC, GenericFormatter.UTCDiffString());
// every newly generated seed starts as a virgin peer
newSeed.dna.put(yacySeed.PEERTYPE, yacySeed.PEERTYPE_VIRGIN);
return newSeed;
}
//public static String randomHash() { return "zLXFf5lTteUv"; } // only for debugging
/**
 * Produces a random 12-character hash.
 * Two random long values are each written as a decimal string, MD5-digested,
 * encoded with the enhanced Base64 coder and cut to their first six characters;
 * both halves are then concatenated.
 *
 * @return the bytes of the 12-character hash string
 */
public static byte[] randomHash() {
    final StringBuilder hashChars = new StringBuilder(12);
    for (int half = 0; half < 2; half++) {
        // each half consumes exactly one value from the random source
        final String randomValue = Long.toString(random.nextLong());
        final String encoded = Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(randomValue));
        hashChars.append(encoded.substring(0, 6));
    }
    return hashChars.toString().getBytes();
}
/**
 * Converts the external (encoded) string representation of a seed into a seed object.
 *
 * @param seedStr the encoded seed string
 * @param key     the key handed to crypt.simpleDecode
 * @param ownSeed passed on to isProper; true when the seed is the peer's own seed
 * @return the decoded seed, never null
 * @throws IOException if the string is null or empty, cannot be decoded,
 *                     contains no hash, or the seed fails the isProper check
 */
public static yacySeed genRemoteSeed(final String seedStr, final String key, final boolean ownSeed) throws IOException {
    // yacyCore.log.logFinest("genRemoteSeed: seedStr=" + seedStr + " key=" + key);

    // syntax check of the transported string
    if (seedStr == null) {
        throw new IOException("seedStr == null");
    }
    if (seedStr.length() == 0) {
        throw new IOException("seedStr.length() == 0");
    }

    // decode and check the decoded form
    final String decoded = crypt.simpleDecode(seedStr, key);
    if (decoded == null) {
        throw new IOException("seed == null");
    }
    if (decoded.length() == 0) {
        throw new IOException("seed.length() == 0");
    }

    // the hash travels inside the property map; take it out before building the seed
    final ConcurrentHashMap<String, String> properties = MapTools.string2map(decoded, ",");
    final String peerHash = properties.remove(yacySeed.HASH);
    if (peerHash == null) {
        throw new IOException("hash == null");
    }
    final yacySeed parsedSeed = new yacySeed(peerHash, properties);

    // semantic check of the content
    final String problem = parsedSeed.isProper(ownSeed);
    if (problem != null) {
        throw new IOException("seed is not proper (" + problem + "): " + parsedSeed);
    }

    // seed ok
    return parsedSeed;
}
// TODO: add here IP ranges to accept also intranet networks
/**
 * Checks if everything is ok with this seed: hash, peer name, peer type,
 * IP (optional) and seed list URL.
 * <p>
 * Side effect: the stored peer name is replaced by the result of checkPeerName.
 *
 * @param checkOwnIP true if this is the peer's own seed (e.g. read from the own
 *                   seed file); the IP check is omitted in that case
 * @return null if the seed is proper, otherwise a string naming the first problem found
 */
public final String isProper(final boolean checkOwnIP) {
    // check hash
    if (this.hash == null) return "hash is null";
    if (this.hash.length() != Word.commonHashLength) return "wrong hash length (" + this.hash.length() + ")";

    // name
    final String peerName = this.dna.get(yacySeed.NAME);
    if (peerName == null) return "no peer name given";
    this.dna.put(yacySeed.NAME, checkPeerName(peerName));

    // type
    final String peerType = this.getPeerType();
    if ((peerType == null) ||
        !(peerType.equals(yacySeed.PEERTYPE_VIRGIN) || peerType.equals(yacySeed.PEERTYPE_JUNIOR)
          || peerType.equals(yacySeed.PEERTYPE_SENIOR) || peerType.equals(yacySeed.PEERTYPE_PRINCIPAL)))
        return "invalid peerType '" + peerType + "'";

    // check IP
    if (!checkOwnIP) {
        // checking of IP is omitted if we read the own seed file
        final String ipCheck = isProperIP(this.getIP());
        if (ipCheck != null) return ipCheck;
    }

    // seedURL
    final String seedURL = this.dna.get(SEEDLISTURL);
    if (seedURL != null && seedURL.length() > 0) {
        if (!seedURL.startsWith("http://") && !seedURL.startsWith("https://")) return "wrong protocol for seedURL";
        try {
            final URL url = new URL(seedURL);
            String host = url.getHost();
            // URL.getHost() returns IPv6 literals enclosed in square brackets;
            // strip them, otherwise the IPv6 loopback test below can never match
            if (host.length() > 1 && host.charAt(0) == '[' && host.charAt(host.length() - 1) == ']') {
                host = host.substring(1, host.length() - 1);
            }
            // host names are case-insensitive, and "::1" is the compressed IPv6 loopback form
            if (host.equalsIgnoreCase("localhost") || host.startsWith("127.")
                || host.startsWith("0:0:0:0:0:0:0:1") || host.equals("::1")) return "seedURL in localhost rejected";
        } catch (final MalformedURLException e) {
            return "seedURL malformed";
        }
    }
    return null;
}
/**
 * Checks whether an IP string is acceptable for a peer.
 *
 * @param ipString the IP (or host) string to test
 * @return null if ipString is proper, a string with the cause otherwise
 */
public static final String isProperIP(final String ipString) {
    if (ipString == null) {
        return ipString + " -> IP is null";
    }

    // an empty string passes this length test and is left to the resolver below
    final int length = ipString.length();
    if (length > 0 && length < 8) {
        return ipString + " -> IP is too short: ";
    }

    final InetAddress resolved = Domains.dnsResolve(ipString);
    if (resolved == null) {
        return ipString + " -> IP is not proper"; //this does not work with staticIP
    }

    final boolean isLocalhost = ipString.equals("localhost")
            || ipString.startsWith("127.")
            || ipString.startsWith("0:0:0:0:0:0:0:1");
    return isLocalhost ? ipString + " - IP for localhost rejected" : null;
}
/**
 * Generates the string representation of this seed: all dna properties
 * plus the hash, rendered by MapTools.map2string with "," as separator.
 *
 * @return the string form of the seed
 */
public final String toString() {
    // work on a copy so that the hash entry does not end up in the dna itself
    final HashMap<String, String> seedProperties = new HashMap<String, String>();
    seedProperties.putAll(this.dna);
    // set hash into seed code structure
    seedProperties.put(yacySeed.HASH, this.hash);
    final String rendered = MapTools.map2string(seedProperties, ",", true);
    return rendered;
}
/**
 * Generates the encoded seed string using a default encoding:
 * the seed is encoded with both the 'z' and the 'b' method and the
 * shorter result is chosen ('z' wins on equal length).
 *
 * @param key the key passed on to the encoder
 * @return the shorter of the two encoded strings
 */
public final String genSeedStr(final String key) {
    final String encodedZ = this.genSeedStr('z', key);
    final String encodedB = this.genSeedStr('b', key);
    // the compressed string may be longer than the uncompressed one if there is
    // too much overhead for compression meta-info, so simply take the shorter string
    return (encodedB.length() < encodedZ.length()) ? encodedB : encodedZ;
}
/**
 * Generates the encoded seed string with an explicit encoding method.
 *
 * @param method the method character handed to crypt.simpleEncode
 * @param key    the key handed to crypt.simpleEncode
 * @return the result of encoding this seed's string representation
 */
public final String genSeedStr(final char method, final String key) {
    final String plainSeed = this.toString();
    return crypt.simpleEncode(plainSeed, key, method);
}
/**
 * Writes this seed to a file, encoded with method 'p' and without a key.
 *
 * @param f the file to write; an existing file is overwritten
 * @throws IOException if the file cannot be opened or written
 */
public final void save(final File f) throws IOException {
    final String out = this.genSeedStr('p', null);
    final FileWriter fw = new FileWriter(f);
    try {
        fw.write(out, 0, out.length());
    } finally {
        // close the writer even if the write fails, so the file handle is not leaked
        fw.close();
    }
}
/**
 * Reads a seed from a file, as written by save.
 * The seed is parsed as own seed (no IP check) and its IP is reset to the
 * empty string, which marks the own IP as unknown.
 *
 * @param f the seed file
 * @return the loaded seed, never null
 * @throws IOException if the file cannot be read or does not contain a proper seed
 */
public static yacySeed load(final File f) throws IOException {
    // the file length in bytes is an upper bound for the number of chars to expect
    final char[] b = new char[(int) f.length()];
    int filled = 0;
    final FileReader fr = new FileReader(f);
    try {
        // a single read() may deliver fewer chars than requested; read until EOF or buffer full
        int n;
        while (filled < b.length && (n = fr.read(b, filled, b.length - filled)) > 0) {
            filled += n;
        }
    } finally {
        // close the reader even if reading fails, so the file handle is not leaked
        fr.close();
    }
    // use only the chars actually read; unfilled slots would otherwise become trailing NUL chars
    final yacySeed mySeed = genRemoteSeed(new String(b, 0, filled), null, true);
    assert mySeed != null; // in case of an error, an IOException is thrown
    mySeed.dna.put(yacySeed.IP, ""); // set own IP as unknown
    return mySeed;
}
/**
 * Creates a copy of this seed with the same hash and its own copy of the dna map,
 * so that property changes on the copy do not affect this seed.
 *
 * @return the copied seed
 */
public final yacySeed clone() {
    final ConcurrentHashMap<String, String> dnaCopy = new ConcurrentHashMap<String, String>();
    dnaCopy.putAll(this.dna);
    final yacySeed copy = new yacySeed(this.hash, dnaCopy);
    return copy;
}
}