From d56f06401e30c0550c6e4f39d1e6a26b20843fda Mon Sep 17 00:00:00 2001 From: hermens Date: Mon, 7 Aug 2006 11:42:00 +0000 Subject: [PATCH] - Cache known URLs during indexReceive to avoid getting blocked during loadedURL.exists() whenever possible - Small logging updates git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2359 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/yacy/transferRWI.java | 45 ++++++++++--------- htroot/yacy/transferURL.java | 4 +- .../de/anomic/plasma/plasmaDHTTransfer.java | 2 + 3 files changed, 30 insertions(+), 21 deletions(-) diff --git a/htroot/yacy/transferRWI.java b/htroot/yacy/transferRWI.java index a60db4c8e..576c154bb 100644 --- a/htroot/yacy/transferRWI.java +++ b/htroot/yacy/transferRWI.java @@ -136,8 +136,11 @@ public final class transferRWI { indexEntry iEntry; int wordhashesSize = v.size(); final HashSet unknownURL = new HashSet(); + final HashSet knownURL = new HashSet(); String[] wordhashes = new String[v.size()]; int received = 0; + int blocked = 0; + int receivedURL = 0; for (int i = 0; i < wordhashesSize; i++) { serverCore.checkInterruption(); @@ -147,29 +150,31 @@ public final class transferRWI { wordHash = estring.substring(0, p); wordhashes[received] = wordHash; iEntry = new indexURLEntry(estring.substring(p)); - sb.wordIndex.addEntry(wordHash, iEntry, System.currentTimeMillis(), true); - serverCore.checkInterruption(); - urlHash = iEntry.urlHash(); - try { - if ((!(unknownURL.contains(urlHash))) && - (!(sb.urlPool.loadedURL.exists(urlHash)))) { - if ((blockBlacklist) && (plasmaSwitchboard.urlBlacklist.hashInBlacklistedCache(urlHash))) { - int deleted = sb.wordIndex.tryRemoveURLs(urlHash); - yacyCore.log.logFine("transferRWI: blocked blacklisted URLHash '" + urlHash + "' from peer " + otherPeerName + "; deleted " + deleted + " URL entries from RWIs"); - //TODO: set to logFine if it works. - } - else { - unknownURL.add(urlHash); + if ((blockBlacklist) && (plasmaSwitchboard.urlBlacklist.hashInBlacklistedCache(urlHash))) { + //int deleted = sb.wordIndex.tryRemoveURLs(urlHash); + yacyCore.log.logFine("transferRWI: blocked blacklisted URLHash '" + urlHash + "' from peer " + otherPeerName + "; deleted 1 URL entries from RWIs"); + blocked++; + } else { + sb.wordIndex.addEntry(wordHash, iEntry, System.currentTimeMillis(), true); + serverCore.checkInterruption(); + + if (!(knownURL.contains(urlHash)||unknownURL.contains(urlHash))) { + try { + if (sb.urlPool.loadedURL.exists(urlHash)) { + knownURL.add(urlHash); + } else { + unknownURL.add(urlHash); + } + } catch (Exception ex) { + sb.getLog().logWarning( + "transferRWI: DB-Error while trying to determine if URL with hash '" + + urlHash + "' is known.", ex); } + receivedURL++; } - } catch (Exception ex) { - sb.getLog().logWarning( - "transferRWI: DB-Error while trying to determine if URL with hash '" + - urlHash + "' is known.", ex); - unknownURL.add(urlHash); + received++; } - received++; } } yacyCore.seedDB.mySeed.incRI(received); @@ -185,7 +190,7 @@ public final class transferRWI { sb.getLog().logInfo("Received 0 RWIs from " + otherPeerName + ", processed in " + (System.currentTimeMillis() - startProcess) + " milliseconds, requesting " + unknownURL.size() + " URLs"); } else { final double avdist = (yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, wordhashes[0]) + yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, wordhashes[received - 1])) / 2.0; - sb.getLog().logInfo("Received " + received + " Words [" + wordhashes[0] + " .. " + wordhashes[received - 1] + "]/" + avdist + " from " + otherPeerName + ", processed in " + (System.currentTimeMillis() - startProcess) + " milliseconds, requesting " + unknownURL.size() + " URLs"); + sb.getLog().logInfo("Received " + received + " Entries " + wordc + " Words [" + wordhashes[0] + " .. " + wordhashes[received - 1] + "]/" + avdist + " from " + otherPeerName + ", processed in " + (System.currentTimeMillis() - startProcess) + " milliseconds, requesting " + unknownURL.size() + "/" + receivedURL + " URLs, blocked " + blocked + " RWIs"); } result = "ok"; diff --git a/htroot/yacy/transferURL.java b/htroot/yacy/transferURL.java index 6ab05512a..9661f5529 100644 --- a/htroot/yacy/transferURL.java +++ b/htroot/yacy/transferURL.java @@ -83,6 +83,7 @@ public final class transferURL { if (granted) { int received = 0; + int blocked = 0; final int sizeBefore = sb.urlPool.loadedURL.size(); // read the urls from the other properties and store String urls; @@ -100,6 +101,7 @@ public final class transferURL { int deleted = sb.wordIndex.tryRemoveURLs(lEntry.hash()); yacyCore.log.logFine("transferURL: blocked blacklisted URL '" + lEntry.url() + "' from peer " + otherPeerName + "; deleted " + deleted + " URL entries from RWIs"); lEntry = null; + blocked++; } else { lEntry.store(); sb.urlPool.loadedURL.stackEntry(lEntry, iam, iam, 3); @@ -121,7 +123,7 @@ public final class transferURL { // return rewrite properties final int more = sb.urlPool.loadedURL.size() - sizeBefore; doublevalues = Integer.toString(received - more); - sb.getLog().logInfo("Received " + received + " URLs from peer " + otherPeerName + " in " + (System.currentTimeMillis() - start) + " ms."); + sb.getLog().logInfo("Received " + received + " URLs from peer " + otherPeerName + " in " + (System.currentTimeMillis() - start) + " ms, Blocked " + blocked + " URLs"); if ((received - more) > 0) sb.getLog().logSevere("Received " + doublevalues + " double URLs from peer " + otherPeerName); result = "ok"; } else { diff --git a/source/de/anomic/plasma/plasmaDHTTransfer.java b/source/de/anomic/plasma/plasmaDHTTransfer.java index c1364678b..46d41b3de 100644 --- a/source/de/anomic/plasma/plasmaDHTTransfer.java +++ b/source/de/anomic/plasma/plasmaDHTTransfer.java @@ -160,7 +160,9 @@ public class plasmaDHTTransfer extends Thread { this.payloadSize = ((Integer)result.get("payloadSize")).intValue(); this.log.logInfo("Index transfer of " + this.dhtChunk.indexCount() + + " entries " + this.dhtChunk.containerSize() + " words [" + this.dhtChunk.firstContainer().getWordHash() + " .. " + this.dhtChunk.lastContainer().getWordHash() + "]" + + " and " + this.dhtChunk.urlCacheMap().size() + " URLs" + " to peer " + this.seed.getName() + ":" + this.seed.hash + " in " + (this.transferTime / 1000) + " seconds successful (" + (1000 * this.dhtChunk.indexCount() / (this.transferTime + 1)) +