apply blacklist on rwis during dht receive

very experimental!

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1865 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent 915812f597
commit f188611fc6

@ -126,19 +126,18 @@ public final class transferRWI {
wordhashes[received] = wordHash; wordhashes[received] = wordHash;
entry = new plasmaWordIndexEntry(estring.substring(p)); entry = new plasmaWordIndexEntry(estring.substring(p));
sb.wordIndex.addEntry(wordHash, entry, System.currentTimeMillis(), true); sb.wordIndex.addEntry(wordHash, entry, System.currentTimeMillis(), true);
//sb.wordIndex.addEntries(plasmaWordIndexEntryContainer.instantContainer(wordHash, System.currentTimeMillis(), entry), System.currentTimeMillis(), true);
serverCore.checkInterruption(); serverCore.checkInterruption();
urlHash = entry.getUrlHash(); urlHash = entry.getUrlHash();
try { try {
if ( if ((!(unknownURL.contains(urlHash))) &&
(!(unknownURL.contains(urlHash))) && (!(sb.urlPool.loadedURL.exists(urlHash)))) {
(!(sb.urlPool.loadedURL.exists(urlHash)))
) {
unknownURL.add(urlHash); unknownURL.add(urlHash);
} }
} catch (Exception ex) { } catch (Exception ex) {
sb.getLog().logWarning("transferRWI: DB-Error while trying to determine if URL with hash '" + urlHash + "' is known.",ex); sb.getLog().logWarning(
"transferRWI: DB-Error while trying to determine if URL with hash '" +
urlHash + "' is known.", ex);
unknownURL.add(urlHash); unknownURL.add(urlHash);
} }
received++; received++;

@ -95,15 +95,16 @@ public final class transferURL {
} else { } else {
lEntry = sb.urlPool.loadedURL.newEntry(urls, true); lEntry = sb.urlPool.loadedURL.newEntry(urls, true);
if ((lEntry != null) && (lEntry.url() != null)) { if ((lEntry != null) && (lEntry.url() != null)) {
if ( if ((blockBlacklist) &&
(blockBlacklist) && (plasmaSwitchboard.urlBlacklist.isListed( lEntry.url().getHost().toLowerCase(), lEntry.url().getPath()))) {
(plasmaSwitchboard.urlBlacklist.isListed(lEntry.url().getHost().toLowerCase(), lEntry.url().getPath())) int deleted = sb.wordIndex.tryRemoveURLs(lEntry.hash());
){ yacyCore.log.logFine("transferURL: blocked blacklisted URL '" + lEntry.url() + "' from peer " + otherPeerName + "; deleted " + deleted + " URL entries from RWIs");
yacyCore.log.logFine("transferURL: blocked blacklisted URL '" + lEntry.url() + "' from peer " + otherPeerName);
lEntry = null; lEntry = null;
} else { } else {
sb.urlPool.loadedURL.addEntry(lEntry, iam, iam, 3); sb.urlPool.loadedURL.addEntry(lEntry, iam, iam, 3);
yacyCore.log.logFine("transferURL: received URL '" + lEntry.url() + "' from peer " + otherPeerName); yacyCore.log.logFine("transferURL: received URL '"
+ lEntry.url() + "' from peer "
+ otherPeerName);
received++; received++;
} }
} else { } else {

@ -358,6 +358,14 @@ public final class plasmaWordIndex {
return removed; return removed;
} }
/**
 * Tries to delete the index entries that reference the given URL hash.
 *
 * <p>The work is delegated to {@code ramCache.tryRemoveURLs(urlHash)}, so only
 * entries that are fresh enough to still be found in the RAM cache can be removed.
 *
 * @param urlHash hash of the URL whose index entries should be removed
 * @return the number of deletions that were possible
 */
public synchronized int tryRemoveURLs(String urlHash) {
    final int deletions = ramCache.tryRemoveURLs(urlHash);
    return deletions;
}
public static final int RL_RAMCACHE = 0; public static final int RL_RAMCACHE = 0;
public static final int RL_FILECACHE = 1; public static final int RL_FILECACHE = 1;
public static final int RL_ASSORTMENTS = 2; public static final int RL_ASSORTMENTS = 2;

@ -61,7 +61,8 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
// environment constants // environment constants
private static final String indexArrayFileName = "indexDump1.array"; private static final String indexArrayFileName = "indexDump1.array";
public static final int ramCacheReferenceLimit = 50; public static final int ramCacheReferenceLimit = 50;
public static final long ramCacheAgeLimit = 60 * 60 * 2 * 1000; // milliseconds; 2 Hours public static final long ramCacheMaxAge = 1000 * 60 * 60 * 2; // milliseconds; 2 Hours
public static final long ramCacheMinAge = 1000 * 60 * 2; // milliseconds; 2 Minutes (Karenz for DHT Receive)
// class variables // class variables
private final File databaseRoot; private final File databaseRoot;
@ -257,12 +258,13 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
String hash = null; String hash = null;
int count = hashScore.getMaxScore(); int count = hashScore.getMaxScore();
if ((count > ramCacheReferenceLimit) && if ((count > ramCacheReferenceLimit) &&
((hash = (String) hashScore.getMaxObject()) != null)) { ((hash = (String) hashScore.getMaxObject()) != null) &&
// flush high-score entries (System.currentTimeMillis() - longEmit(hashDate.getScore(hash)) > ramCacheMinAge)) {
// flush high-score entries, but not if they are too 'young'
return hash; return hash;
} }
long oldestTime = longEmit(hashDate.getMinScore()); long oldestTime = longEmit(hashDate.getMinScore());
if (((System.currentTimeMillis() - oldestTime) > ramCacheAgeLimit) && if (((System.currentTimeMillis() - oldestTime) > ramCacheMaxAge) &&
((hash = (String) hashDate.getMinObject()) != null)) { ((hash = (String) hashDate.getMinObject()) != null)) {
// flush out-dated entries // flush out-dated entries
return hash; return hash;
@ -271,6 +273,10 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
if (Runtime.getRuntime().freeMemory() < 10000000) { if (Runtime.getRuntime().freeMemory() < 10000000) {
// low-memory case // low-memory case
hash = (String) hashScore.getMaxObject(); // flush high-score entries (saves RAM) hash = (String) hashScore.getMaxObject(); // flush high-score entries (saves RAM)
if (System.currentTimeMillis() - longEmit(hashDate.getScore(hash)) < ramCacheMinAge) {
// too young, take it from the oldest entries
hash = (String) hashDate.getMinObject();
}
} else { } else {
// not-efficient-so-far case // not-efficient-so-far case
hash = (String) hashDate.getMinObject(); // flush oldest entries (makes indexing faster) hash = (String) hashDate.getMinObject(); // flush oldest entries (makes indexing faster)
@ -335,6 +341,30 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
return count; return count;
} }
/**
 * Tries to delete the entries for the given URL hash from recently cached word containers.
 *
 * <p>Walks the word hashes returned by {@code hashDate.scores(false)} and stops at the first
 * one whose cache age exceeds {@code ramCacheMinAge}. This presumably relies on the iterator
 * delivering the most recently updated word hashes first (TODO confirm the meaning of the
 * {@code false} argument). Only really fresh entries can therefore be removed.
 *
 * @param urlHash hash of the URL whose entries should be removed
 * @return the number of word containers from which an entry was removed
 */
public synchronized int tryRemoveURLs(String urlHash) {
    int delCount = 0;
    final Iterator i = hashDate.scores(false);
    while (i.hasNext()) {
        final String wordHash = (String) i.next();

        // check time: as soon as an entry is older than the minimum age, give up
        final long t = longEmit(hashDate.getScore(wordHash));
        if (System.currentTimeMillis() - t > ramCacheMinAge) return delCount;

        // get container; if the cache holds none for this word hash, skip it
        // instead of failing with a NullPointerException
        final plasmaWordIndexEntryContainer c = (plasmaWordIndexEntryContainer) cache.get(wordHash);
        if (c == null) continue;

        if (c.remove(urlHash) != null) {
            // store the modified container again and count the deletion
            cache.put(wordHash, c);
            delCount++;
        }
    }
    return delCount;
}
public int addEntries(plasmaWordIndexEntryContainer container, long updateTime, boolean highPriority) { public int addEntries(plasmaWordIndexEntryContainer container, long updateTime, boolean highPriority) {
// this puts the entries into the cache, not into the assortment directly // this puts the entries into the cache, not into the assortment directly

Loading…
Cancel
Save