From 1f4412a1467c34594c7da66e79aa8b6fdfe4357a Mon Sep 17 00:00:00 2001 From: orbiter Date: Mon, 20 Mar 2006 22:31:59 +0000 Subject: [PATCH] adapted isListed to the new behavior as discussed (url, getFile) git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1940 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/IndexControl_p.java | 2 +- htroot/yacy/transferURL.java | 2 +- source/de/anomic/plasma/plasmaCrawlLURL.java | 2 +- source/de/anomic/plasma/plasmaCrawlStacker.java | 3 +-- source/de/anomic/plasma/plasmaSwitchboard.java | 2 +- source/de/anomic/plasma/plasmaURLPattern.java | 5 +++++ source/de/anomic/plasma/plasmaWordIndex.java | 2 +- source/de/anomic/yacy/yacyClient.java | 2 +- 8 files changed, 12 insertions(+), 8 deletions(-) diff --git a/htroot/IndexControl_p.java b/htroot/IndexControl_p.java index 43d789f70..1c5ec3b3c 100644 --- a/htroot/IndexControl_p.java +++ b/htroot/IndexControl_p.java @@ -460,7 +460,7 @@ public class IndexControl_p { } else { url = new URL(us); - if (plasmaSwitchboard.urlBlacklist.isListed(url.getHost().toLowerCase(), url.getPath())) { + if (plasmaSwitchboard.urlBlacklist.isListed(url)) { result.append(""); } else { result.append(""); diff --git a/htroot/yacy/transferURL.java b/htroot/yacy/transferURL.java index 96f8b7c82..b27261fc9 100644 --- a/htroot/yacy/transferURL.java +++ b/htroot/yacy/transferURL.java @@ -96,7 +96,7 @@ public final class transferURL { lEntry = sb.urlPool.loadedURL.newEntry(urls, true); if ((lEntry != null) && (lEntry.url() != null)) { if ((blockBlacklist) && - (plasmaSwitchboard.urlBlacklist.isListed( lEntry.url().getHost().toLowerCase(), lEntry.url().getPath()))) { + (plasmaSwitchboard.urlBlacklist.isListed(lEntry.url()))) { int deleted = sb.wordIndex.tryRemoveURLs(lEntry.hash()); yacyCore.log.logFine("transferURL: blocked blacklisted URL '" + lEntry.url() + "' from peer " + otherPeerName + "; deleted " + deleted + " URL entries from RWIs"); lEntry = null; diff --git 
a/source/de/anomic/plasma/plasmaCrawlLURL.java b/source/de/anomic/plasma/plasmaCrawlLURL.java index 35ed0cace..f868c3040 100644 --- a/source/de/anomic/plasma/plasmaCrawlLURL.java +++ b/source/de/anomic/plasma/plasmaCrawlLURL.java @@ -826,7 +826,7 @@ public final class plasmaCrawlLURL extends plasmaURL { plasmaCrawlLURL.Entry entry = (plasmaCrawlLURL.Entry) eiter.next(); totalSearchedUrls++; - if (plasmaSwitchboard.urlBlacklist.isListed(entry.url().getHost().toLowerCase(),entry.url().getPath())==true) { + if (plasmaSwitchboard.urlBlacklist.isListed(entry.url())==true) { lastBlacklistedUrl = entry.url().toString(); lastBlacklistedHash = entry.hash(); serverLog.logFine("URLDBCLEANER", ++blacklistedUrls + " blacklisted (" + ((double)blacklistedUrls/totalSearchedUrls)*100 + "%): " + entry.hash() + " " + entry.url()); diff --git a/source/de/anomic/plasma/plasmaCrawlStacker.java b/source/de/anomic/plasma/plasmaCrawlStacker.java index 779872af0..9b182a731 100644 --- a/source/de/anomic/plasma/plasmaCrawlStacker.java +++ b/source/de/anomic/plasma/plasmaCrawlStacker.java @@ -271,8 +271,7 @@ public final class plasmaCrawlStacker { } // check blacklist - String hostlow = nexturl.getHost().toLowerCase(); - if (plasmaSwitchboard.urlBlacklist.isListed(hostlow, nexturl.getPath())) { + if (plasmaSwitchboard.urlBlacklist.isListed(nexturl)) { reason = "denied_(url_in_blacklist)"; this.log.logFine("URL '" + nexturlString + "' is in blacklist. 
" + "Stack processing time: " + (System.currentTimeMillis()-startTime)); diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index cb48340fe..59ae31abf 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -187,7 +187,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser public plasmaCrawlStacker sbStackCrawlThread; public messageBoard messageDB; public wikiBoard wikiDB; - public blogBoard blogDB; + public blogBoard blogDB; public static plasmaCrawlRobotsTxt robots; public plasmaCrawlProfile profiles; public plasmaCrawlProfile.entry defaultProxyProfile; diff --git a/source/de/anomic/plasma/plasmaURLPattern.java b/source/de/anomic/plasma/plasmaURLPattern.java index 540cf9bc1..515db7340 100644 --- a/source/de/anomic/plasma/plasmaURLPattern.java +++ b/source/de/anomic/plasma/plasmaURLPattern.java @@ -42,6 +42,7 @@ package de.anomic.plasma; import java.io.File; +import java.net.URL; import java.util.HashMap; import de.anomic.kelondro.kelondroMSetTools; @@ -84,6 +85,10 @@ public class plasmaURLPattern { hostpaths.put(host.toLowerCase(), path); } + public boolean isListed(URL url) { + return isListed(url.getHost().toLowerCase(), url.getFile()); + } + public boolean isListed(String hostlow, String path) { if (path.length() > 0 && path.charAt(0) == '/') path = path.substring(1); String pp = ""; // path-pattern diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index 0f4d49a63..757e6bef7 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -559,7 +559,7 @@ public final class plasmaWordIndex { // "+entry.getUrlHash()); try { url = lurl.getEntry(entry.getUrlHash(), null).url(); - if ((url == null) || (plasmaSwitchboard.urlBlacklist.isListed(url.getHost().toLowerCase(), url.getPath()) == true)) { + if ((url == null) 
|| (plasmaSwitchboard.urlBlacklist.isListed(url) == true)) { urlHashs.add(entry.getUrlHash()); } } catch (IOException e) { diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index b6ad658c7..21768a45a 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -465,7 +465,7 @@ public final class yacyClient { for (int n = 0; n < results; n++) { // get one single search result urlEntry = urlManager.newEntry((String) result.get("resource" + n), true); - if (urlEntry != null && blacklist.isListed(urlEntry.url().getHost().toLowerCase(), urlEntry.url().getPath())) { continue; } // block with backlist + if (urlEntry != null && blacklist.isListed(urlEntry.url())) { continue; } // block with backlist urlEntry.store(); int urlLength = urlEntry.url().toString().length(); int urlComps = htmlFilterContentScraper.urlComps(urlEntry.url().toString()).length;