Blacklist checks are now always enabled; in media searches (e.g. image search), images matching blacklist entries are no longer shown to the user

pull/1/head
Roland 'Quix0r' Haeder 13 years ago
parent 3e5ac15c71
commit a3083d13bf
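
The change itself is mechanical: the loader's internal load path already took a checkBlacklist flag, and this commit switches every call site to pass true, so the blacklist is consulted before any resource is fetched. A minimal sketch of that gating pattern, using simplified stand-in names (UrlBlacklist, fetch) rather than the real YaCy classes shown in the diff below:

import java.io.IOException;

// Sketch only: UrlBlacklist and fetch() are simplified stand-ins for
// Switchboard.urlBlacklist and the actual retrieval code in the diff below.
final class LoaderGateSketch {

    interface UrlBlacklist {
        boolean isListed(String blacklistType, String host, String file);
    }

    private final UrlBlacklist blacklist;

    LoaderGateSketch(final UrlBlacklist blacklist) {
        this.blacklist = blacklist;
    }

    // Before this commit some call sites passed checkBlacklist = false and
    // skipped the check; after it, every caller passes true.
    byte[] load(final String blacklistType, final String host, final String file,
                final boolean checkBlacklist) throws IOException {
        if (checkBlacklist && this.blacklist.isListed(blacklistType, host.toLowerCase(), file)) {
            throw new IOException("Rejecting URL '" + host + file + "'. URL is in blacklist.");
        }
        return fetch(host, file);
    }

    private byte[] fetch(final String host, final String file) {
        return new byte[0]; // placeholder for the actual cache/network retrieval
    }
}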

@@ -64,6 +64,7 @@ import de.anomic.crawler.retrieval.HTTPLoader;
 import de.anomic.crawler.retrieval.Request;
 import de.anomic.crawler.retrieval.Response;
 import de.anomic.crawler.retrieval.SMBLoader;
+import de.anomic.crawler.ZURL.FailCategory;
 import de.anomic.http.client.Cache;
@@ -137,7 +138,7 @@ public final class LoaderDispatcher {
     public void load(final DigestURI url, final CacheStrategy cacheStratgy, final int maxFileSize, final File targetFile) throws IOException {
-        final byte[] b = load(request(url, false, true), cacheStratgy, maxFileSize, false).getContent();
+        final byte[] b = load(request(url, false, true), cacheStratgy, maxFileSize, true).getContent();
         if (b == null) throw new IOException("load == null");
         final File tmp = new File(targetFile.getAbsolutePath() + ".tmp");
@@ -190,6 +191,12 @@ public final class LoaderDispatcher {
         final String protocol = url.getProtocol();
         final String host = url.getHost();

+        // check if url is in blacklist
+        if (checkBlacklist && Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, host.toLowerCase(), url.getFile())) {
+            this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
+            throw new IOException("DISPATCHER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.");
+        }
+
         // check if we have the page in the cache
         final CrawlProfile crawlProfile = this.sb.crawler.getActive(UTF8.getBytes(request.profileHandle()));
         if (crawlProfile != null && cacheStrategy != CacheStrategy.NOCACHE) {
@@ -314,7 +321,7 @@ public final class LoaderDispatcher {
      */
     public byte[] loadContent(final Request request, final CacheStrategy cacheStrategy) throws IOException {
         // try to download the resource using the loader
-        final Response entry = load(request, cacheStrategy, false);
+        final Response entry = load(request, cacheStrategy, true);
         if (entry == null) return null; // not found in web

         // read resource body (if it is there)
@@ -324,7 +331,7 @@ public final class LoaderDispatcher {
     public Document[] loadDocuments(final Request request, final CacheStrategy cacheStrategy, final int timeout, final int maxFileSize) throws IOException, Parser.Failure {

         // load resource
-        final Response response = load(request, cacheStrategy, maxFileSize, false);
+        final Response response = load(request, cacheStrategy, maxFileSize, true);
         final DigestURI url = request.url();
         if (response == null) throw new IOException("no Response for url " + url);
@@ -337,7 +344,7 @@ public final class LoaderDispatcher {
     public ContentScraper parseResource(final DigestURI location, final CacheStrategy cachePolicy) throws IOException {
         // load page
-        final Response r = this.load(request(location, true, false), cachePolicy, false);
+        final Response r = this.load(request(location, true, false), cachePolicy, true);
         final byte[] page = (r == null) ? null : r.getContent();
         if (page == null) throw new IOException("no response from url " + location.toString());
@@ -356,7 +363,7 @@ public final class LoaderDispatcher {
      * @throws IOException
      */
     public final Map<MultiProtocolURI, String> loadLinks(final DigestURI url, final CacheStrategy cacheStrategy) throws IOException {
-        final Response response = load(request(url, true, false), cacheStrategy, Integer.MAX_VALUE, false);
+        final Response response = load(request(url, true, false), cacheStrategy, Integer.MAX_VALUE, true);
         if (response == null) throw new IOException("response == null");
         final ResponseHeader responseHeader = response.getResponseHeader();
         if (response.getContent() == null) throw new IOException("resource == null");
@@ -417,4 +424,4 @@ public final class LoaderDispatcher {
             } catch (final MalformedURLException e) {} catch (final IOException e) {}
         }
     }
-}
\ No newline at end of file
+}
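
For context, isListed matches a URL against blacklist entries by a lowercased host plus the file part of the URL, as in the host.toLowerCase()/url.getFile() arguments above. A small standalone illustration of that decomposition, using java.net.URI in place of YaCy's DigestURI (whether getFile() also carries the query string is an assumption here):

import java.net.URI;

// Illustration only: java.net.URI stands in for YaCy's DigestURI.
public final class BlacklistKeyDemo {
    public static void main(final String[] args) throws Exception {
        final URI url = new URI("http://Example.COM/images/pic.png?size=large");
        final String host = url.getHost().toLowerCase(); // "example.com"
        final String file = url.getRawPath()
                + (url.getRawQuery() == null ? "" : "?" + url.getRawQuery());
        // these two values correspond to the host/file pair passed to isListed()
        System.out.println(host + " | " + file);
    }
}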

@@ -27,6 +27,7 @@ package net.yacy.search.snippet;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Comparator;
+import java.util.Date;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
@@ -48,7 +49,10 @@ import net.yacy.kelondro.index.RowSpaceExceededException;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.kelondro.order.Base64Order;
 import net.yacy.kelondro.util.ByteArray;
+import net.yacy.repository.Blacklist;
 import net.yacy.search.Switchboard;
+import de.anomic.crawler.retrieval.Request;
+import de.anomic.crawler.ZURL.FailCategory;
@@ -165,6 +169,7 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaSnippet> {
             entry = i.next();
             url = new DigestURI(entry.getKey());
             desc = entry.getValue();
+            if (isUrlBlacklisted(url, Blacklist.BLACKLIST_SEARCH)) continue;
             final int ranking = removeAppearanceHashes(url.toNormalform(false, false), queryhashes).size() +
                     removeAppearanceHashes(desc, queryhashes).size();
             if (ranking < 2 * queryhashes.size()) {
@@ -189,6 +194,7 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaSnippet> {
             ientry = i.next();
             url = new DigestURI(ientry.url());
             final String u = url.toString();
+            if (isUrlBlacklisted(url, Blacklist.BLACKLIST_SEARCH)) continue;
             if (u.indexOf(".ico",0) >= 0 || u.indexOf("favicon",0) >= 0) continue;
             if (ientry.height() > 0 && ientry.height() < 32) continue;
             if (ientry.width() > 0 && ientry.width() < 32) continue;
@@ -230,4 +236,27 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaSnippet> {
         return remaininghashes;
     }
+
+    /**
+     * Checks whether the given URL is in the blacklist of the given blacklist type
+     *
+     * @param url The URL to check
+     * @param blacklistType Type of blacklist (see class Blacklist, BLACKLIST_FOO)
+     * @return isBlacklisted Whether the given URL is blacklisted
+     */
+    private static boolean isUrlBlacklisted(final DigestURI url, final String blacklistType) {
+        // Default is not blacklisted
+        boolean isBlacklisted = false;
+
+        // check if url is in blacklist
+        if (Switchboard.urlBlacklist.isListed(blacklistType, url.getHost().toLowerCase(), url.getFile())) {
+            Switchboard.getSwitchboard().crawlQueues.errorURL.push(new Request(url, null), Switchboard.getSwitchboard().peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
+            Log.logFine("snippet fetch", "MEDIA-SNIPPET Rejecting URL '" + url.toString() + "'. URL is in blacklist.");
+            isBlacklisted = true;
+        }
+
+        // Return result
+        return isBlacklisted;
+    }
 }
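
The helper added to MediaSnippet above is used as an early continue in both media loops. The same filter pattern, restated as a hedged standalone sketch (MediaEntry and the substring test are illustrative stand-ins, not YaCy code):

import java.util.ArrayList;
import java.util.List;

// Sketch of the filter pattern from the loops above; MediaEntry and the
// substring check replace YaCy's entry types and Switchboard.urlBlacklist.
final class MediaFilterSketch {

    static final class MediaEntry {
        final String url;
        final String description;
        MediaEntry(final String url, final String description) {
            this.url = url;
            this.description = description;
        }
    }

    static boolean isUrlBlacklisted(final String url) {
        // stand-in for isUrlBlacklisted(url, Blacklist.BLACKLIST_SEARCH)
        return url.contains("blocked.example");
    }

    static List<MediaEntry> filter(final List<MediaEntry> entries) {
        final List<MediaEntry> kept = new ArrayList<MediaEntry>();
        for (final MediaEntry entry : entries) {
            if (isUrlBlacklisted(entry.url)) continue; // drop blacklisted media
            kept.add(entry);
        }
        return kept;
    }
}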
