Fixes for several DHT misbehaviours

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@524 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 20 years ago
parent 162b0f744e
commit 7db543a9fa

@ -48,6 +48,7 @@ import java.net.MalformedURLException;
import java.net.URL; import java.net.URL;
import java.util.Enumeration; import java.util.Enumeration;
import java.util.HashSet; import java.util.HashSet;
import java.util.HashMap;
import java.util.Iterator; import java.util.Iterator;
import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.htmlFilter.htmlFilterContentScraper;
@ -206,7 +207,33 @@ public class IndexControl_p {
String result; String result;
long starttime = System.currentTimeMillis(); long starttime = System.currentTimeMillis();
indexes[0] = switchboard.wordIndex.getEntity(keyhash, true); indexes[0] = switchboard.wordIndex.getEntity(keyhash, true);
result = yacyClient.transferIndex(yacyCore.seedDB.getConnected(post.get("hostHash", "")), indexes, switchboard.urlPool.loadedURL); // built urlCache
Enumeration urlEnum = indexes[0].elements(true);
HashMap knownURLs = new HashMap();
HashSet unknownURLEntries = new HashSet();
plasmaWordIndexEntry indexEntry;
plasmaCrawlLURL.Entry lurl;
while (urlEnum.hasMoreElements()) {
indexEntry = (plasmaWordIndexEntry) urlEnum.nextElement();
lurl = switchboard.urlPool.loadedURL.getEntry(indexEntry.getUrlHash());
if (lurl == null) {
unknownURLEntries.add(indexEntry.getUrlHash());
} else {
if (lurl.toString() == null) {
switchboard.urlPool.loadedURL.remove(indexEntry.getUrlHash());
unknownURLEntries.add(indexEntry.getUrlHash());
} else {
knownURLs.put(indexEntry.getUrlHash(), lurl);
}
}
}
// now delete all entries that have no url entry
Iterator hashIter = unknownURLEntries.iterator();
while (hashIter.hasNext()) try {
indexes[0].removeEntry((String) hashIter.next(), false);
} catch (IOException e) {}
// use whats remaining
result = yacyClient.transferIndex(yacyCore.seedDB.getConnected(post.get("hostHash", "")), indexes, knownURLs);
prop.put("result", (result == null) ? ("Successfully transferred " + indexes[0].size() + " words in " + ((System.currentTimeMillis() - starttime) / 1000) + " seconds") : result); prop.put("result", (result == null) ? ("Successfully transferred " + indexes[0].size() + " words in " + ((System.currentTimeMillis() - starttime) / 1000) + " seconds") : result);
try {indexes[0].close();} catch (IOException e) {} try {indexes[0].close();} catch (IOException e) {}
} }

@ -537,7 +537,9 @@ public class plasmaCrawlLURL extends plasmaURL {
",url=" + crypt.simpleEncode(url.toString()) + ",url=" + crypt.simpleEncode(url.toString()) +
",descr=" + crypt.simpleEncode(descr); ",descr=" + crypt.simpleEncode(descr);
} catch (Exception e) { } catch (Exception e) {
serverLog.logFailure("plasmaLURL.corePropList", e.getMessage()); //serverLog.logFailure("plasmaLURL.corePropList", e.getMessage());
//if (moddate == null) serverLog.logFailure("plasmaLURL.corePropList", "moddate=null");
//if (loaddate == null) serverLog.logFailure("plasmaLURL.corePropList", "loaddate=null");
//e.printStackTrace(); //e.printStackTrace();
return null; return null;
} }

@ -382,7 +382,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
indexDistribution = new plasmaWordIndexDistribution(urlPool, wordIndex, log, indexDistribution = new plasmaWordIndexDistribution(urlPool, wordIndex, log,
getConfig("allowDistributeIndex", "false").equals("true"), getConfig("allowDistributeIndex", "false").equals("true"),
getConfig("allowDistributeIndexWhileCrawling","false").equals("true")); getConfig("allowDistributeIndexWhileCrawling","false").equals("true"));
indexDistribution.setCounts(100, 1, 3, 8000); indexDistribution.setCounts(150, 1, 3, 10000);
deployThread("20_dhtdistribution", "DHT Distribution", "selection, transfer and deletion of index entries that are not searched on your peer, but on others", null, deployThread("20_dhtdistribution", "DHT Distribution", "selection, transfer and deletion of index entries that are not searched on your peer, but on others", null,
new serverInstantThread(indexDistribution, "job", null), 12000); new serverInstantThread(indexDistribution, "job", null), 12000);

@ -6,6 +6,8 @@ import java.io.IOException;
import java.util.Enumeration; import java.util.Enumeration;
import java.util.Vector; import java.util.Vector;
import java.util.Iterator; import java.util.Iterator;
import java.util.HashSet;
import java.util.HashMap;
import de.anomic.yacy.yacyCore; import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacySeed; import de.anomic.yacy.yacySeed;
@ -133,13 +135,18 @@ public class plasmaWordIndexDistribution {
// collect index // collect index
String startPointHash = yacyCore.seedDB.mySeed.hash; String startPointHash = yacyCore.seedDB.mySeed.hash;
//String startPointHash = serverCodings.encodeMD5B64("" + System.currentTimeMillis(), true).substring(0, yacySeedDB.commonHashLength); //String startPointHash = serverCodings.encodeMD5B64("" + System.currentTimeMillis(), true).substring(0, yacySeedDB.commonHashLength);
plasmaWordIndexEntity[] indexEntities = selectTransferIndexes(startPointHash, indexCount); Object[] selectResult = selectTransferIndexes(startPointHash, indexCount);
plasmaWordIndexEntity[] indexEntities = (plasmaWordIndexEntity[]) selectResult[0];
HashMap urlCache = (HashMap) selectResult[1]; // String (url-hash) / plasmaCrawlLURL.Entry
if ((indexEntities == null) || (indexEntities.length == 0)) { if ((indexEntities == null) || (indexEntities.length == 0)) {
log.logDebug("No index available for index transfer, hash start-point " + startPointHash); log.logDebug("No index available for index transfer, hash start-point " + startPointHash);
return -1; return -1;
} }
// count the indexes again, can be smaller as expected // count the indexes again, can be smaller as expected
indexCount = 0; for (int i = 0; i < indexEntities.length; i++) indexCount += indexEntities[i].size(); indexCount = 0;
for (int i = 0; i < indexEntities.length; i++) {
indexCount += indexEntities[i].size();
}
// find start point for DHT-selection // find start point for DHT-selection
String keyhash = indexEntities[indexEntities.length - 1].wordHash(); // DHT targets must have greater hashes String keyhash = indexEntities[indexEntities.length - 1].wordHash(); // DHT targets must have greater hashes
@ -157,7 +164,7 @@ public class plasmaWordIndexDistribution {
} }
seed = (yacySeed) e.nextElement(); seed = (yacySeed) e.nextElement();
if (seed != null) { if (seed != null) {
error = yacyClient.transferIndex(seed, indexEntities, urlPool.loadedURL); error = yacyClient.transferIndex(seed, indexEntities, urlCache);
if (error == null) { if (error == null) {
log.logInfo("Index transfer of " + indexCount + " words [" + indexEntities[0].wordHash() + " .. " + indexEntities[indexEntities.length-1].wordHash() + "] to peer " + seed.getName() + ":" + seed.hash + " successfull"); log.logInfo("Index transfer of " + indexCount + " words [" + indexEntities[0].wordHash() + " .. " + indexEntities[indexEntities.length-1].wordHash() + "] to peer " + seed.getName() + ":" + seed.hash + " successfull");
peerNames += ", " + seed.getName(); peerNames += ", " + seed.getName();
@ -176,7 +183,7 @@ public class plasmaWordIndexDistribution {
if (delete) { if (delete) {
try { try {
if (deleteTransferIndexes(indexEntities)) { if (deleteTransferIndexes(indexEntities)) {
log.logDebug("Deleted all transferred whole-word indexes locally"); log.logDebug("Deleted all " + indexEntities.length + " transferred whole-word indexes locally");
return indexCount; return indexCount;
} else { } else {
log.logError("Deleted not all transferred whole-word indexes"); log.logError("Deleted not all transferred whole-word indexes");
@ -200,14 +207,19 @@ public class plasmaWordIndexDistribution {
} }
} }
private plasmaWordIndexEntity[] selectTransferIndexes(String hash, int count) { private Object[] /* of {plasmaWordIndexEntity[], HashMap(String, plasmaCrawlLURL.Entry)}*/
selectTransferIndexes(String hash, int count) {
Vector tmpEntities = new Vector(); Vector tmpEntities = new Vector();
String nexthash = ""; String nexthash = "";
try { try {
Iterator wordHashIterator = wordIndex.wordHashes(hash, true, true); Iterator wordHashIterator = wordIndex.wordHashes(hash, true, true);
plasmaWordIndexEntity indexEntity, tmpEntity; plasmaWordIndexEntity indexEntity, tmpEntity;
Enumeration urlEnum; Enumeration urlEnum;
Iterator hashIter;
plasmaWordIndexEntry indexEntry; plasmaWordIndexEntry indexEntry;
plasmaCrawlLURL.Entry lurl;
HashSet unknownURLEntries;
HashMap knownURLs = new HashMap();
while ((count > 0) && (wordHashIterator.hasNext()) && while ((count > 0) && (wordHashIterator.hasNext()) &&
((nexthash = (String) wordHashIterator.next()) != null) && (nexthash.trim().length() > 0)) { ((nexthash = (String) wordHashIterator.next()) != null) && (nexthash.trim().length() > 0)) {
indexEntity = wordIndex.getEntity(nexthash, true); indexEntity = wordIndex.getEntity(nexthash, true);
@ -215,20 +227,60 @@ public class plasmaWordIndexDistribution {
indexEntity.deleteComplete(); indexEntity.deleteComplete();
} else if (indexEntity.size() <= count) { } else if (indexEntity.size() <= count) {
// take the whole entity // take the whole entity
// first check if we know all urls
urlEnum = indexEntity.elements(true);
unknownURLEntries = new HashSet();
while (urlEnum.hasMoreElements()) {
indexEntry = (plasmaWordIndexEntry) urlEnum.nextElement();
lurl = urlPool.loadedURL.getEntry(indexEntry.getUrlHash());
if ((lurl == null) || (lurl.toString() == null)) {
unknownURLEntries.add(indexEntry.getUrlHash());
} else {
if (lurl.toString() == null) {
urlPool.loadedURL.remove(indexEntry.getUrlHash());
unknownURLEntries.add(indexEntry.getUrlHash());
} else {
knownURLs.put(indexEntry.getUrlHash(), lurl);
}
}
}
// now delete all entries that have no url entry
hashIter = unknownURLEntries.iterator();
while (hashIter.hasNext()) {
indexEntity.removeEntry((String) hashIter.next(), false);
}
// use whats remaining
tmpEntities.add(indexEntity); tmpEntities.add(indexEntity);
log.logDebug("Selected whole index (" + indexEntity.size() + " URLs) for word " + indexEntity.wordHash()); log.logDebug("Selected whole index (" + indexEntity.size() + " URLs, " + unknownURLEntries.size() + " not bound) for word " + indexEntity.wordHash());
count -= indexEntity.size(); count -= indexEntity.size();
} else { } else {
// make an on-the-fly entity and insert values // make an on-the-fly entity and insert values
tmpEntity = new plasmaWordIndexEntity(indexEntity.wordHash()); tmpEntity = new plasmaWordIndexEntity(indexEntity.wordHash());
urlEnum = indexEntity.elements(true); urlEnum = indexEntity.elements(true);
unknownURLEntries = new HashSet();
while ((urlEnum.hasMoreElements()) && (count > 0)) { while ((urlEnum.hasMoreElements()) && (count > 0)) {
indexEntry = (plasmaWordIndexEntry) urlEnum.nextElement(); indexEntry = (plasmaWordIndexEntry) urlEnum.nextElement();
lurl = urlPool.loadedURL.getEntry(indexEntry.getUrlHash());
if (lurl == null) {
unknownURLEntries.add(indexEntry.getUrlHash());
} else {
if (lurl.toString() == null) {
urlPool.loadedURL.remove(indexEntry.getUrlHash());
unknownURLEntries.add(indexEntry.getUrlHash());
} else {
knownURLs.put(indexEntry.getUrlHash(), lurl);
tmpEntity.addEntry(indexEntry); tmpEntity.addEntry(indexEntry);
count--; count--;
} }
urlEnum = null; }
log.logDebug("Selected partial index (" + tmpEntity.size() + " from " + indexEntity.size() +" URLs) for word " + tmpEntity.wordHash()); }
// now delete all entries that have no url entry
hashIter = unknownURLEntries.iterator();
while (hashIter.hasNext()) {
indexEntity.removeEntry((String) hashIter.next(), true);
}
// use whats remaining
log.logDebug("Selected partial index (" + tmpEntity.size() + " from " + indexEntity.size() +" URLs, " + unknownURLEntries.size() + " not bound) for word " + tmpEntity.wordHash());
tmpEntities.add(tmpEntity); tmpEntities.add(tmpEntity);
indexEntity.close(); // important: is not closed elswhere and cannot be deleted afterwards indexEntity.close(); // important: is not closed elswhere and cannot be deleted afterwards
indexEntity = null; indexEntity = null;
@ -238,15 +290,15 @@ public class plasmaWordIndexDistribution {
// transfer to array // transfer to array
plasmaWordIndexEntity[] indexEntities = new plasmaWordIndexEntity[tmpEntities.size()]; plasmaWordIndexEntity[] indexEntities = new plasmaWordIndexEntity[tmpEntities.size()];
for (int i = 0; i < tmpEntities.size(); i++) indexEntities[i] = (plasmaWordIndexEntity) tmpEntities.elementAt(i); for (int i = 0; i < tmpEntities.size(); i++) indexEntities[i] = (plasmaWordIndexEntity) tmpEntities.elementAt(i);
return indexEntities; return new Object[]{indexEntities, knownURLs};
} catch (IOException e) { } catch (IOException e) {
log.logError("selectTransferIndexes IO-Error (hash=" + nexthash + "): " + e.getMessage()); log.logError("selectTransferIndexes IO-Error (hash=" + nexthash + "): " + e.getMessage());
e.printStackTrace(); e.printStackTrace();
return new plasmaWordIndexEntity[0]; return new Object[]{new plasmaWordIndexEntity[0], new HashMap()};
} catch (kelondroException e) { } catch (kelondroException e) {
log.logError("selectTransferIndexes database corrupted: " + e.getMessage()); log.logError("selectTransferIndexes database corrupted: " + e.getMessage());
e.printStackTrace(); e.printStackTrace();
return new plasmaWordIndexEntity[0]; return new Object[]{new plasmaWordIndexEntity[0], new HashMap()};
} }
} }

@ -451,38 +451,6 @@ public class yacyClient {
} }
} }
/*
public static HashMap crawlOrder(yacySeed targetSeed, URL url, URL referrer, int depth) {
// this post a message to the remote message board
if (targetSeed == null) return null;
if (yacyCore.seedDB.mySeed == null) return null;
if (yacyCore.seedDB.mySeed == targetSeed) return null;
// construct request
String key = crypt.randomSalt();
String address = targetSeed.getAddress();
if (address == null) return null;
try {
return nxTools.table(httpc.wget(
new URL("http://" + address + "/yacy/crawlOrder.html?"+
"key=" + key +
"&process=crawl" +
"&youare=" + targetSeed.hash +
"&iam=" + yacyCore.seedDB.mySeed.hash +
"&url=" + crypt.simpleEncode(url.toString()) +
"&referrer=" + crypt.simpleEncode((referrer == null) ? "" : referrer.toString()) +
"&depth=" + depth +
"&ttl=0"
),
10000, null, null, yacyCore.seedDB.sb.remoteProxyHost, yacyCore.seedDB.sb.remoteProxyPort));
} catch (Exception e) {
// most probably a network time-out exception
yacyCore.log.logError("yacyClient.crawlOrder error: peer=" + targetSeed.getName() + ", error=" + e.getMessage());
return null;
}
}
*/
public static HashMap crawlOrder(yacySeed targetSeed, URL url, URL referrer) { public static HashMap crawlOrder(yacySeed targetSeed, URL url, URL referrer) {
// this post a message to the remote message board // this post a message to the remote message board
if (targetSeed == null) return null; if (targetSeed == null) return null;
@ -578,8 +546,8 @@ public class yacyClient {
httpHeader requestHeader) throws IOException { httpHeader requestHeader) throws IOException {
*/ */
public static String transferIndex(yacySeed targetSeed, plasmaWordIndexEntity[] indexes, plasmaCrawlLURL urlDB) { public static String transferIndex(yacySeed targetSeed, plasmaWordIndexEntity[] indexes, HashMap urlCache) {
HashMap in = transferRWI(targetSeed, indexes, urlDB); HashMap in = transferRWI(targetSeed, indexes);
if (in == null) return "no_connection_1"; if (in == null) return "no_connection_1";
String result = (String) in.get("result"); String result = (String) in.get("result");
if (result == null) return "no_result_1"; if (result == null) return "no_result_1";
@ -592,7 +560,6 @@ public class yacyClient {
//System.out.println("DEBUG yacyClient.transferIndex: " + uhs.length + " urls unknown"); //System.out.println("DEBUG yacyClient.transferIndex: " + uhs.length + " urls unknown");
if (uhs.length == 0) return null; // all url's known if (uhs.length == 0) return null; // all url's known
// extract the urlCache from the result // extract the urlCache from the result
HashMap urlCache = (HashMap) in.get("$URLCACHE$");
plasmaCrawlLURL.Entry[] urls = new plasmaCrawlLURL.Entry[uhs.length]; plasmaCrawlLURL.Entry[] urls = new plasmaCrawlLURL.Entry[uhs.length];
for (int i = 0; i < uhs.length; i++) { for (int i = 0; i < uhs.length; i++) {
urls[i] = (plasmaCrawlLURL.Entry) urlCache.get(uhs[i]); urls[i] = (plasmaCrawlLURL.Entry) urlCache.get(uhs[i]);
@ -608,7 +575,7 @@ public class yacyClient {
return null; return null;
} }
private static HashMap transferRWI(yacySeed targetSeed, plasmaWordIndexEntity[] indexes, plasmaCrawlLURL urlDB) { private static HashMap transferRWI(yacySeed targetSeed, plasmaWordIndexEntity[] indexes) {
String address = targetSeed.getAddress(); String address = targetSeed.getAddress();
if (address == null) return null; if (address == null) return null;
// prepare post values // prepare post values
@ -622,48 +589,21 @@ public class yacyClient {
String entrypost = ""; String entrypost = "";
Enumeration eenum; Enumeration eenum;
plasmaWordIndexEntry entry; plasmaWordIndexEntry entry;
HashMap urlCache = new HashMap();
plasmaCrawlLURL.Entry urlentry;
HashSet unknownURLs = new HashSet();
for (int i = 0; i < indexes.length; i++) { for (int i = 0; i < indexes.length; i++) {
eenum = indexes[i].elements(true); eenum = indexes[i].elements(true);
while (eenum.hasMoreElements()) { while (eenum.hasMoreElements()) {
entry = (plasmaWordIndexEntry) eenum.nextElement(); entry = (plasmaWordIndexEntry) eenum.nextElement();
// check if an LURL-Entry exists
if (urlCache.containsKey(entry.getUrlHash())) {
// easy case: the url is known and in the cache
entrypost += indexes[i].wordHash() + entry.toExternalForm() + serverCore.crlfString;
indexcount++;
} else if (unknownURLs.contains(entry.getUrlHash())) {
// in this case, we do nothing
} else {
// try to get the entry from the urlDB
if ((urlDB.exists(entry.getUrlHash())) &&
((urlentry = urlDB.getEntry(entry.getUrlHash())) != null)) {
// good case: store the urlentry to the cache
urlCache.put(entry.getUrlHash(), urlentry);
// add index to list
entrypost += indexes[i].wordHash() + entry.toExternalForm() + serverCore.crlfString; entrypost += indexes[i].wordHash() + entry.toExternalForm() + serverCore.crlfString;
indexcount++; indexcount++;
} else {
// this is bad: the url is unknown. We put the link to a set and delete then later
unknownURLs.add(entry.getUrlHash());
}
}
} }
} }
// we loop again and delete all links where the url is unknown if (indexcount == 0) {
Iterator it; // nothing to do but everything ok
String urlhash; HashMap result = new HashMap();
for (int i = 0; i < indexes.length; i++) { result.put("result", "ok");
it = unknownURLs.iterator(); result.put("unknownURL", "");
while (it.hasNext()) { return result;
urlhash = (String) it.next();
try {
if (indexes[i].contains(urlhash)) indexes[i].removeEntry(urlhash, true);
} catch (IOException e) {}
}
} }
post.put("entryc", Integer.toString(indexcount)); post.put("entryc", Integer.toString(indexcount));
@ -677,8 +617,6 @@ public class yacyClient {
} }
HashMap result = nxTools.table(v); HashMap result = nxTools.table(v);
result.put("$URLCACHE$", urlCache);
result.put("$UNKNOWNC$", Integer.toString(unknownURLs.size()));
return result; return result;
} catch (Exception e) { } catch (Exception e) {
yacyCore.log.logError("yacyClient.transferRWI error:" + e.getMessage()); yacyCore.log.logError("yacyClient.transferRWI error:" + e.getMessage());

Loading…
Cancel
Save