diff --git a/htroot/IndexControl_p.java b/htroot/IndexControl_p.java index 6c3bc806a..3184049bf 100644 --- a/htroot/IndexControl_p.java +++ b/htroot/IndexControl_p.java @@ -48,6 +48,7 @@ import java.net.MalformedURLException; import java.net.URL; import java.util.Enumeration; import java.util.HashSet; +import java.util.HashMap; import java.util.Iterator; import de.anomic.htmlFilter.htmlFilterContentScraper; @@ -206,7 +207,33 @@ public class IndexControl_p { String result; long starttime = System.currentTimeMillis(); indexes[0] = switchboard.wordIndex.getEntity(keyhash, true); - result = yacyClient.transferIndex(yacyCore.seedDB.getConnected(post.get("hostHash", "")), indexes, switchboard.urlPool.loadedURL); + // built urlCache + Enumeration urlEnum = indexes[0].elements(true); + HashMap knownURLs = new HashMap(); + HashSet unknownURLEntries = new HashSet(); + plasmaWordIndexEntry indexEntry; + plasmaCrawlLURL.Entry lurl; + while (urlEnum.hasMoreElements()) { + indexEntry = (plasmaWordIndexEntry) urlEnum.nextElement(); + lurl = switchboard.urlPool.loadedURL.getEntry(indexEntry.getUrlHash()); + if (lurl == null) { + unknownURLEntries.add(indexEntry.getUrlHash()); + } else { + if (lurl.toString() == null) { + switchboard.urlPool.loadedURL.remove(indexEntry.getUrlHash()); + unknownURLEntries.add(indexEntry.getUrlHash()); + } else { + knownURLs.put(indexEntry.getUrlHash(), lurl); + } + } + } + // now delete all entries that have no url entry + Iterator hashIter = unknownURLEntries.iterator(); + while (hashIter.hasNext()) try { + indexes[0].removeEntry((String) hashIter.next(), false); + } catch (IOException e) {} + // use whats remaining + result = yacyClient.transferIndex(yacyCore.seedDB.getConnected(post.get("hostHash", "")), indexes, knownURLs); prop.put("result", (result == null) ? ("Successfully transferred " + indexes[0].size() + " words in " + ((System.currentTimeMillis() - starttime) / 1000) + " seconds") : result); try {indexes[0].close();} catch (IOException e) {} } diff --git a/source/de/anomic/plasma/plasmaCrawlLURL.java b/source/de/anomic/plasma/plasmaCrawlLURL.java index f67f3f679..f232a36c8 100644 --- a/source/de/anomic/plasma/plasmaCrawlLURL.java +++ b/source/de/anomic/plasma/plasmaCrawlLURL.java @@ -537,7 +537,9 @@ public class plasmaCrawlLURL extends plasmaURL { ",url=" + crypt.simpleEncode(url.toString()) + ",descr=" + crypt.simpleEncode(descr); } catch (Exception e) { - serverLog.logFailure("plasmaLURL.corePropList", e.getMessage()); + //serverLog.logFailure("plasmaLURL.corePropList", e.getMessage()); + //if (moddate == null) serverLog.logFailure("plasmaLURL.corePropList", "moddate=null"); + //if (loaddate == null) serverLog.logFailure("plasmaLURL.corePropList", "loaddate=null"); //e.printStackTrace(); return null; } diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 881c91766..87dac0c31 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -382,7 +382,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser indexDistribution = new plasmaWordIndexDistribution(urlPool, wordIndex, log, getConfig("allowDistributeIndex", "false").equals("true"), getConfig("allowDistributeIndexWhileCrawling","false").equals("true")); - indexDistribution.setCounts(100, 1, 3, 8000); + indexDistribution.setCounts(150, 1, 3, 10000); deployThread("20_dhtdistribution", "DHT Distribution", "selection, transfer and deletion of index entries that are not searched on your peer, but on others", null, new serverInstantThread(indexDistribution, "job", null), 12000); diff --git a/source/de/anomic/plasma/plasmaWordIndexDistribution.java b/source/de/anomic/plasma/plasmaWordIndexDistribution.java index 90188b37f..ed7bb515b 100644 --- a/source/de/anomic/plasma/plasmaWordIndexDistribution.java +++ b/source/de/anomic/plasma/plasmaWordIndexDistribution.java @@ -6,6 +6,8 @@ import java.io.IOException; import java.util.Enumeration; import java.util.Vector; import java.util.Iterator; +import java.util.HashSet; +import java.util.HashMap; import de.anomic.yacy.yacyCore; import de.anomic.yacy.yacySeed; @@ -133,13 +135,18 @@ public class plasmaWordIndexDistribution { // collect index String startPointHash = yacyCore.seedDB.mySeed.hash; //String startPointHash = serverCodings.encodeMD5B64("" + System.currentTimeMillis(), true).substring(0, yacySeedDB.commonHashLength); - plasmaWordIndexEntity[] indexEntities = selectTransferIndexes(startPointHash, indexCount); + Object[] selectResult = selectTransferIndexes(startPointHash, indexCount); + plasmaWordIndexEntity[] indexEntities = (plasmaWordIndexEntity[]) selectResult[0]; + HashMap urlCache = (HashMap) selectResult[1]; // String (url-hash) / plasmaCrawlLURL.Entry if ((indexEntities == null) || (indexEntities.length == 0)) { log.logDebug("No index available for index transfer, hash start-point " + startPointHash); return -1; } // count the indexes again, can be smaller as expected - indexCount = 0; for (int i = 0; i < indexEntities.length; i++) indexCount += indexEntities[i].size(); + indexCount = 0; + for (int i = 0; i < indexEntities.length; i++) { + indexCount += indexEntities[i].size(); + } // find start point for DHT-selection String keyhash = indexEntities[indexEntities.length - 1].wordHash(); // DHT targets must have greater hashes @@ -157,7 +164,7 @@ public class plasmaWordIndexDistribution { } seed = (yacySeed) e.nextElement(); if (seed != null) { - error = yacyClient.transferIndex(seed, indexEntities, urlPool.loadedURL); + error = yacyClient.transferIndex(seed, indexEntities, urlCache); if (error == null) { log.logInfo("Index transfer of " + indexCount + " words [" + indexEntities[0].wordHash() + " .. " + indexEntities[indexEntities.length-1].wordHash() + "] to peer " + seed.getName() + ":" + seed.hash + " successfull"); peerNames += ", " + seed.getName(); @@ -176,7 +183,7 @@ public class plasmaWordIndexDistribution { if (delete) { try { if (deleteTransferIndexes(indexEntities)) { - log.logDebug("Deleted all transferred whole-word indexes locally"); + log.logDebug("Deleted all " + indexEntities.length + " transferred whole-word indexes locally"); return indexCount; } else { log.logError("Deleted not all transferred whole-word indexes"); @@ -200,14 +207,19 @@ public class plasmaWordIndexDistribution { } } - private plasmaWordIndexEntity[] selectTransferIndexes(String hash, int count) { + private Object[] /* of {plasmaWordIndexEntity[], HashMap(String, plasmaCrawlLURL.Entry)}*/ + selectTransferIndexes(String hash, int count) { Vector tmpEntities = new Vector(); String nexthash = ""; try { Iterator wordHashIterator = wordIndex.wordHashes(hash, true, true); plasmaWordIndexEntity indexEntity, tmpEntity; Enumeration urlEnum; + Iterator hashIter; plasmaWordIndexEntry indexEntry; + plasmaCrawlLURL.Entry lurl; + HashSet unknownURLEntries; + HashMap knownURLs = new HashMap(); while ((count > 0) && (wordHashIterator.hasNext()) && ((nexthash = (String) wordHashIterator.next()) != null) && (nexthash.trim().length() > 0)) { indexEntity = wordIndex.getEntity(nexthash, true); @@ -215,20 +227,60 @@ public class plasmaWordIndexDistribution { indexEntity.deleteComplete(); } else if (indexEntity.size() <= count) { // take the whole entity + // fist check if we know all urls + urlEnum = indexEntity.elements(true); + unknownURLEntries = new HashSet(); + while (urlEnum.hasMoreElements()) { + indexEntry = (plasmaWordIndexEntry) urlEnum.nextElement(); + lurl = urlPool.loadedURL.getEntry(indexEntry.getUrlHash()); + if ((lurl == null) || (lurl.toString() == null)) { + unknownURLEntries.add(indexEntry.getUrlHash()); + } else { + if (lurl.toString() == null) { + urlPool.loadedURL.remove(indexEntry.getUrlHash()); + unknownURLEntries.add(indexEntry.getUrlHash()); + } else { + knownURLs.put(indexEntry.getUrlHash(), lurl); + } + } + } + // now delete all entries that have no url entry + hashIter = unknownURLEntries.iterator(); + while (hashIter.hasNext()) { + indexEntity.removeEntry((String) hashIter.next(), false); + } + // use whats remaining tmpEntities.add(indexEntity); - log.logDebug("Selected whole index (" + indexEntity.size() + " URLs) for word " + indexEntity.wordHash()); + log.logDebug("Selected whole index (" + indexEntity.size() + " URLs, " + unknownURLEntries.size() + " not bound) for word " + indexEntity.wordHash()); count -= indexEntity.size(); } else { // make an on-the-fly entity and insert values tmpEntity = new plasmaWordIndexEntity(indexEntity.wordHash()); urlEnum = indexEntity.elements(true); + unknownURLEntries = new HashSet(); while ((urlEnum.hasMoreElements()) && (count > 0)) { indexEntry = (plasmaWordIndexEntry) urlEnum.nextElement(); - tmpEntity.addEntry(indexEntry); - count--; + lurl = urlPool.loadedURL.getEntry(indexEntry.getUrlHash()); + if (lurl == null) { + unknownURLEntries.add(indexEntry.getUrlHash()); + } else { + if (lurl.toString() == null) { + urlPool.loadedURL.remove(indexEntry.getUrlHash()); + unknownURLEntries.add(indexEntry.getUrlHash()); + } else { + knownURLs.put(indexEntry.getUrlHash(), lurl); + tmpEntity.addEntry(indexEntry); + count--; + } + } + } + // now delete all entries that have no url entry + hashIter = unknownURLEntries.iterator(); + while (hashIter.hasNext()) { + indexEntity.removeEntry((String) hashIter.next(), true); } - urlEnum = null; - log.logDebug("Selected partial index (" + tmpEntity.size() + " from " + indexEntity.size() +" URLs) for word " + tmpEntity.wordHash()); + // use whats remaining + log.logDebug("Selected partial index (" + tmpEntity.size() + " from " + indexEntity.size() +" URLs, " + unknownURLEntries.size() + " not bound) for word " + tmpEntity.wordHash()); tmpEntities.add(tmpEntity); indexEntity.close(); // important: is not closed elswhere and cannot be deleted afterwards indexEntity = null; @@ -238,15 +290,15 @@ public class plasmaWordIndexDistribution { // transfer to array plasmaWordIndexEntity[] indexEntities = new plasmaWordIndexEntity[tmpEntities.size()]; for (int i = 0; i < tmpEntities.size(); i++) indexEntities[i] = (plasmaWordIndexEntity) tmpEntities.elementAt(i); - return indexEntities; + return new Object[]{indexEntities, knownURLs}; } catch (IOException e) { log.logError("selectTransferIndexes IO-Error (hash=" + nexthash + "): " + e.getMessage()); e.printStackTrace(); - return new plasmaWordIndexEntity[0]; + return new Object[]{new plasmaWordIndexEntity[0], new HashMap()}; } catch (kelondroException e) { log.logError("selectTransferIndexes database corrupted: " + e.getMessage()); e.printStackTrace(); - return new plasmaWordIndexEntity[0]; + return new Object[]{new plasmaWordIndexEntity[0], new HashMap()}; } } diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index 6fc172c90..1876c2d12 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -451,38 +451,6 @@ public class yacyClient { } } - /* - public static HashMap crawlOrder(yacySeed targetSeed, URL url, URL referrer, int depth) { - // this post a message to the remote message board - if (targetSeed == null) return null; - if (yacyCore.seedDB.mySeed == null) return null; - if (yacyCore.seedDB.mySeed == targetSeed) return null; - - // construct request - String key = crypt.randomSalt(); - String address = targetSeed.getAddress(); - if (address == null) return null; - try { - return nxTools.table(httpc.wget( - new URL("http://" + address + "/yacy/crawlOrder.html?"+ - "key=" + key + - "&process=crawl" + - "&youare=" + targetSeed.hash + - "&iam=" + yacyCore.seedDB.mySeed.hash + - "&url=" + crypt.simpleEncode(url.toString()) + - "&referrer=" + crypt.simpleEncode((referrer == null) ? "" : referrer.toString()) + - "&depth=" + depth + - "&ttl=0" - ), - 10000, null, null, yacyCore.seedDB.sb.remoteProxyHost, yacyCore.seedDB.sb.remoteProxyPort)); - } catch (Exception e) { - // most probably a network time-out exception - yacyCore.log.logError("yacyClient.crawlOrder error: peer=" + targetSeed.getName() + ", error=" + e.getMessage()); - return null; - } - } - */ - public static HashMap crawlOrder(yacySeed targetSeed, URL url, URL referrer) { // this post a message to the remote message board if (targetSeed == null) return null; @@ -578,8 +546,8 @@ public class yacyClient { httpHeader requestHeader) throws IOException { */ - public static String transferIndex(yacySeed targetSeed, plasmaWordIndexEntity[] indexes, plasmaCrawlLURL urlDB) { - HashMap in = transferRWI(targetSeed, indexes, urlDB); + public static String transferIndex(yacySeed targetSeed, plasmaWordIndexEntity[] indexes, HashMap urlCache) { + HashMap in = transferRWI(targetSeed, indexes); if (in == null) return "no_connection_1"; String result = (String) in.get("result"); if (result == null) return "no_result_1"; @@ -592,7 +560,6 @@ public class yacyClient { //System.out.println("DEBUG yacyClient.transferIndex: " + uhs.length + " urls unknown"); if (uhs.length == 0) return null; // all url's known // extract the urlCache from the result - HashMap urlCache = (HashMap) in.get("$URLCACHE$"); plasmaCrawlLURL.Entry[] urls = new plasmaCrawlLURL.Entry[uhs.length]; for (int i = 0; i < uhs.length; i++) { urls[i] = (plasmaCrawlLURL.Entry) urlCache.get(uhs[i]); @@ -608,7 +575,7 @@ public class yacyClient { return null; } - private static HashMap transferRWI(yacySeed targetSeed, plasmaWordIndexEntity[] indexes, plasmaCrawlLURL urlDB) { + private static HashMap transferRWI(yacySeed targetSeed, plasmaWordIndexEntity[] indexes) { String address = targetSeed.getAddress(); if (address == null) return null; // prepare post values @@ -622,48 +589,21 @@ public class yacyClient { String entrypost = ""; Enumeration eenum; plasmaWordIndexEntry entry; - HashMap urlCache = new HashMap(); - plasmaCrawlLURL.Entry urlentry; - HashSet unknownURLs = new HashSet(); for (int i = 0; i < indexes.length; i++) { eenum = indexes[i].elements(true); while (eenum.hasMoreElements()) { entry = (plasmaWordIndexEntry) eenum.nextElement(); - // check if an LURL-Entry exists - if (urlCache.containsKey(entry.getUrlHash())) { - // easy case: the url is known and in the cache - entrypost += indexes[i].wordHash() + entry.toExternalForm() + serverCore.crlfString; - indexcount++; - } else if (unknownURLs.contains(entry.getUrlHash())) { - // in this case, we do nothing - } else { - // try to get the entry from the urlDB - if ((urlDB.exists(entry.getUrlHash())) && - ((urlentry = urlDB.getEntry(entry.getUrlHash())) != null)) { - // good case: store the urlentry to the cache - urlCache.put(entry.getUrlHash(), urlentry); - // add index to list - entrypost += indexes[i].wordHash() + entry.toExternalForm() + serverCore.crlfString; - indexcount++; - } else { - // this is bad: the url is unknown. We put the link to a set and delete then later - unknownURLs.add(entry.getUrlHash()); - } - } + entrypost += indexes[i].wordHash() + entry.toExternalForm() + serverCore.crlfString; + indexcount++; } } - // we loop again and delete all links where the url is unknown - Iterator it; - String urlhash; - for (int i = 0; i < indexes.length; i++) { - it = unknownURLs.iterator(); - while (it.hasNext()) { - urlhash = (String) it.next(); - try { - if (indexes[i].contains(urlhash)) indexes[i].removeEntry(urlhash, true); - } catch (IOException e) {} - } + if (indexcount == 0) { + // nothing to do but everything ok + HashMap result = new HashMap(); + result.put("result", "ok"); + result.put("unknownURL", ""); + return result; } post.put("entryc", Integer.toString(indexcount)); @@ -677,8 +617,6 @@ public class yacyClient { } HashMap result = nxTools.table(v); - result.put("$URLCACHE$", urlCache); - result.put("$UNKNOWNC$", Integer.toString(unknownURLs.size())); return result; } catch (Exception e) { yacyCore.log.logError("yacyClient.transferRWI error:" + e.getMessage());