|
|
|
@ -388,8 +388,9 @@ public final class plasmaCrawlLURL {
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public Object next() throws RuntimeException {
|
|
|
|
|
kelondroRow.Entry e = (kelondroRow.Entry) i.next();
|
|
|
|
|
if (e == null) return null;
|
|
|
|
|
kelondroRow.Entry e = null;
|
|
|
|
|
if (i.hasNext()) { e = (kelondroRow.Entry) i.next(); }
|
|
|
|
|
if (e == null) { return null; }
|
|
|
|
|
return new indexURLEntryNew(e, null);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
@ -499,7 +500,7 @@ public final class plasmaCrawlLURL {
|
|
|
|
|
public void run() {
|
|
|
|
|
try {
|
|
|
|
|
serverLog.logInfo("URLDBCLEANER", "UrldbCleaner-Thread startet");
|
|
|
|
|
Iterator eiter = entries(true, false, null);
|
|
|
|
|
final Iterator eiter = entries(true, false, null);
|
|
|
|
|
while (eiter.hasNext() && run) {
|
|
|
|
|
synchronized (this) {
|
|
|
|
|
if (this.pause) {
|
|
|
|
@ -512,19 +513,17 @@ public final class plasmaCrawlLURL {
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if (eiter.hasNext()) {
|
|
|
|
|
indexURLEntry entry = (indexURLEntry) eiter.next();
|
|
|
|
|
final indexURLEntry entry = (indexURLEntry) eiter.next();
|
|
|
|
|
if (entry == null) {
|
|
|
|
|
serverLog.logFine("URLDBCLEANER", "entry == null");
|
|
|
|
|
} else if (entry.hash() == null) {
|
|
|
|
|
serverLog.logFine("URLDBCLEANER", ++blacklistedUrls + " blacklisted (" + ((double) blacklistedUrls / totalSearchedUrls) * 100 + "%): " + "hash == null");
|
|
|
|
|
} else {
|
|
|
|
|
indexURLEntry.Components comp = entry.comp();
|
|
|
|
|
final indexURLEntry.Components comp = entry.comp();
|
|
|
|
|
totalSearchedUrls++;
|
|
|
|
|
if (entry.hash() == null) {
|
|
|
|
|
serverLog.logFine("URLDBCLEANER", ++blacklistedUrls + " blacklisted (" + ((double)blacklistedUrls/totalSearchedUrls)*100 + "%): " + " hash == null");
|
|
|
|
|
} else if (comp.url() == null) {
|
|
|
|
|
if (comp.url() == null) {
|
|
|
|
|
serverLog.logFine("URLDBCLEANER", ++blacklistedUrls + " blacklisted (" + ((double) blacklistedUrls / totalSearchedUrls) * 100 + "%): " + entry.hash() + "URL == null");
|
|
|
|
|
remove(entry.hash());
|
|
|
|
|
lastHash = entry.hash();
|
|
|
|
|
} else if (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_CRAWLER, comp.url()) ||
|
|
|
|
|
plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_DHT, comp.url())) {
|
|
|
|
|
lastBlacklistedUrl = comp.url().toNormalform();
|
|
|
|
@ -535,11 +534,8 @@ public final class plasmaCrawlLURL {
|
|
|
|
|
serverLog.logInfo("URLDBCLEANER", "Deleted " + blacklistedUrls + " URLs until now. Last deleted URL-Hash: " + lastBlacklistedUrl);
|
|
|
|
|
}
|
|
|
|
|
lastUrl = comp.url().toNormalform();
|
|
|
|
|
lastHash = entry.hash();
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
serverLog.logFine("URLDBCLEANER", "Iterator == null");
|
|
|
|
|
lastHash = entry.hash();
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
} catch (RuntimeException e) {
|
|
|
|
|