apply blacklist on rwis during dht receive

very experimental!

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1865 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent 915812f597
commit f188611fc6

@ -126,19 +126,18 @@ public final class transferRWI {
wordhashes[received] = wordHash;
entry = new plasmaWordIndexEntry(estring.substring(p));
sb.wordIndex.addEntry(wordHash, entry, System.currentTimeMillis(), true);
//sb.wordIndex.addEntries(plasmaWordIndexEntryContainer.instantContainer(wordHash, System.currentTimeMillis(), entry), System.currentTimeMillis(), true);
serverCore.checkInterruption();
urlHash = entry.getUrlHash();
try {
if (
(!(unknownURL.contains(urlHash))) &&
(!(sb.urlPool.loadedURL.exists(urlHash)))
) {
if ((!(unknownURL.contains(urlHash))) &&
(!(sb.urlPool.loadedURL.exists(urlHash)))) {
unknownURL.add(urlHash);
}
} catch (Exception ex) {
sb.getLog().logWarning("transferRWI: DB-Error while trying to determine if URL with hash '" + urlHash + "' is known.",ex);
sb.getLog().logWarning(
"transferRWI: DB-Error while trying to determine if URL with hash '" +
urlHash + "' is known.", ex);
unknownURL.add(urlHash);
}
received++;

@ -95,15 +95,16 @@ public final class transferURL {
} else {
lEntry = sb.urlPool.loadedURL.newEntry(urls, true);
if ((lEntry != null) && (lEntry.url() != null)) {
if (
(blockBlacklist) &&
(plasmaSwitchboard.urlBlacklist.isListed(lEntry.url().getHost().toLowerCase(), lEntry.url().getPath()))
){
yacyCore.log.logFine("transferURL: blocked blacklisted URL '" + lEntry.url() + "' from peer " + otherPeerName);
if ((blockBlacklist) &&
(plasmaSwitchboard.urlBlacklist.isListed( lEntry.url().getHost().toLowerCase(), lEntry.url().getPath()))) {
int deleted = sb.wordIndex.tryRemoveURLs(lEntry.hash());
yacyCore.log.logFine("transferURL: blocked blacklisted URL '" + lEntry.url() + "' from peer " + otherPeerName + "; deleted " + deleted + " URL entries from RWIs");
lEntry = null;
} else {
sb.urlPool.loadedURL.addEntry(lEntry, iam, iam, 3);
yacyCore.log.logFine("transferURL: received URL '" + lEntry.url() + "' from peer " + otherPeerName);
yacyCore.log.logFine("transferURL: received URL '"
+ lEntry.url() + "' from peer "
+ otherPeerName);
received++;
}
} else {

@ -358,6 +358,14 @@ public final class plasmaWordIndex {
return removed;
}
/**
 * Attempts to delete the index references that carry the given URL hash
 * from the RAM cache.
 *
 * According to the original note on this method, this can only succeed
 * for entries that are really fresh and therefore can still be found in
 * the RAM cache. The work itself is delegated to
 * {@code ramCache.tryRemoveURLs(urlHash)}.
 *
 * @param urlHash hash of the URL whose references shall be removed
 * @return the number of deletions that were possible, as reported by the
 *         RAM cache
 */
public synchronized int tryRemoveURLs(String urlHash) {
    final int deletions = ramCache.tryRemoveURLs(urlHash);
    return deletions;
}
public static final int RL_RAMCACHE = 0;
public static final int RL_FILECACHE = 1;
public static final int RL_ASSORTMENTS = 2;

@ -61,7 +61,8 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
// environment constants
private static final String indexArrayFileName = "indexDump1.array";
public static final int ramCacheReferenceLimit = 50;
public static final long ramCacheAgeLimit = 60 * 60 * 2 * 1000; // milliseconds; 2 Hours
public static final long ramCacheMaxAge = 1000 * 60 * 60 * 2; // milliseconds; 2 Hours
public static final long ramCacheMinAge = 1000 * 60 * 2; // milliseconds; 2 Minutes (Karenz for DHT Receive)
// class variables
private final File databaseRoot;
@ -257,12 +258,13 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
String hash = null;
int count = hashScore.getMaxScore();
if ((count > ramCacheReferenceLimit) &&
((hash = (String) hashScore.getMaxObject()) != null)) {
// flush high-score entries
((hash = (String) hashScore.getMaxObject()) != null) &&
(System.currentTimeMillis() - longEmit(hashDate.getScore(hash)) > ramCacheMinAge)) {
// flush high-score entries, but not if they are too 'young'
return hash;
}
long oldestTime = longEmit(hashDate.getMinScore());
if (((System.currentTimeMillis() - oldestTime) > ramCacheAgeLimit) &&
if (((System.currentTimeMillis() - oldestTime) > ramCacheMaxAge) &&
((hash = (String) hashDate.getMinObject()) != null)) {
// flush out-dated entries
return hash;
@ -271,6 +273,10 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
if (Runtime.getRuntime().freeMemory() < 10000000) {
// low-memory case
hash = (String) hashScore.getMaxObject(); // flush high-score entries (saves RAM)
if (System.currentTimeMillis() - longEmit(hashDate.getScore(hash)) < ramCacheMinAge) {
// too young, take it from the oldest entries
hash = (String) hashDate.getMinObject();
}
} else {
// not-efficient-so-far case
hash = (String) hashDate.getMinObject(); // flush oldest entries (makes indexing faster)
@ -335,6 +341,30 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
return count;
}
/**
 * Tries to delete the references to the given URL hash from the word
 * containers held in this cache.
 *
 * Only entries whose date score is at most {@code ramCacheMinAge}
 * milliseconds old are examined; the scan stops at the first word hash
 * that is older than that. For every container from which the URL hash
 * could be removed, the container is written back to the cache and the
 * deletion counter is incremented.
 *
 * @param urlHash hash of the URL whose references shall be removed
 * @return the number of containers from which the URL hash was removed
 */
public synchronized int tryRemoveURLs(String urlHash) {
    // take the clock once so that every entry is judged against the same instant
    final long now = System.currentTimeMillis();
    // NOTE(review): scores(false) is presumably ordered youngest-first, which
    // is what makes the early exit below valid -- confirm against the score cluster
    final Iterator i = hashDate.scores(false);
    String wordHash;
    long t;
    plasmaWordIndexEntryContainer c;
    int delCount = 0;
    while (i.hasNext()) {
        wordHash = (String) i.next();
        // check time: once an entry is older than the minimum age, stop searching
        t = longEmit(hashDate.getScore(wordHash));
        if (now - t > ramCacheMinAge) break;
        // get container; guard against a word hash that is known to hashDate
        // but has no container in the cache, instead of failing with a
        // NullPointerException in the middle of a transfer
        c = (plasmaWordIndexEntryContainer) cache.get(wordHash);
        if (c == null) continue;
        if (c.remove(urlHash) != null) {
            cache.put(wordHash, c);
            delCount++;
        }
    }
    return delCount;
}
public int addEntries(plasmaWordIndexEntryContainer container, long updateTime, boolean highPriority) {
// this puts the entries into the cache, not into the assortment directly

Loading…
Cancel
Save