Merge commit '6d4e08ed06c5cd28c45981b2ebe31c7f7ec6fd83' into quix0r

Conflicts:
	source/de/anomic/crawler/CrawlQueues.java
pull/1/head
Michael Christen 13 years ago
commit 216a287a85

@ -706,5 +706,4 @@ public class CrawlQueues {
}
}
}

@ -157,7 +157,16 @@ public final class Records {
* @throws IOException
*/
/**
 * Computes the number of records currently stored in the backing file,
 * derived as file length divided by the fixed record size.
 *
 * @return the record count, or 0 if the file handle is already gone
 * @throws IOException if the file length cannot be read
 */
private final long filesize() throws IOException {
    // NOTE: a stale pre-image diff line ("return raf.length() / recordsize;")
    // preceded this body and made it unreachable; it has been removed.
    long records = 0;
    try {
        records = raf.length() / recordsize;
    } catch (NullPointerException e) {
        // This may happen on shutdown while still something is moving on;
        // treat a vanished file handle as "no records" instead of crashing.
        Log.logException(e);
    }
    return records;
}
/**

@ -29,7 +29,6 @@ import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
@ -38,6 +37,8 @@ import java.util.Map.Entry;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
@ -91,22 +92,22 @@ public class Blacklist {
}));
public static final String BLACKLIST_TYPES_STRING = "proxy,crawler,dht,search,surftips,news";
// Root directory where blacklist files are stored.
private File blacklistRootPath = null;
// Concurrent maps so readers and writers from multiple threads are safe without external locking.
// (The stale pre-image HashMap-typed declarations from the diff have been removed —
// declaring the same final fields twice does not compile.)
private final ConcurrentMap<String, HandleSet> cachedUrlHashs;
private final ConcurrentMap<String, ConcurrentMap<String, List<String>>> hostpaths_matchable; // key=host, value=path; mapped url is http://host/path; path does not start with '/' here
private final ConcurrentMap<String, ConcurrentMap<String, List<String>>> hostpaths_notmatchable; // key=host, value=path; mapped url is http://host/path; path does not start with '/' here
/**
 * Creates a blacklist rooted at the given directory and initializes one
 * (empty) entry container per known blacklist type.
 *
 * @param rootPath directory holding the blacklist files
 */
public Blacklist(final File rootPath) {
    setRootPath(rootPath);
    // prepare the data structure
    // (the stale pre-image HashMap assignments from the diff have been removed —
    // a final field may only be assigned once; the value generics are fixed to
    // ConcurrentMap so they match the field declarations)
    this.hostpaths_matchable = new ConcurrentHashMap<String, ConcurrentMap<String, List<String>>>();
    this.hostpaths_notmatchable = new ConcurrentHashMap<String, ConcurrentMap<String, List<String>>>();
    this.cachedUrlHashs = new ConcurrentHashMap<String, HandleSet>();
    for (final String blacklistType : BLACKLIST_TYPES) {
        this.hostpaths_matchable.put(blacklistType, new ConcurrentHashMap<String, List<String>>());
        this.hostpaths_notmatchable.put(blacklistType, new ConcurrentHashMap<String, List<String>>());
        this.cachedUrlHashs.put(blacklistType, new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0));
    }
}

@ -64,6 +64,7 @@ import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.crawler.retrieval.Request;
import de.anomic.crawler.retrieval.Response;
import de.anomic.crawler.retrieval.SMBLoader;
import de.anomic.crawler.ZURL.FailCategory;
import de.anomic.http.client.Cache;
public final class LoaderDispatcher {
@ -137,7 +138,7 @@ public final class LoaderDispatcher {
public void load(final DigestURI url, final CacheStrategy cacheStratgy, final int maxFileSize, final File targetFile) throws IOException {
final byte[] b = load(request(url, false, true), cacheStratgy, maxFileSize, false).getContent();
final byte[] b = load(request(url, false, true), cacheStratgy, maxFileSize, true).getContent();
if (b == null) throw new IOException("load == null");
final File tmp = new File(targetFile.getAbsolutePath() + ".tmp");
@ -190,6 +191,12 @@ public final class LoaderDispatcher {
final String protocol = url.getProtocol();
final String host = url.getHost();
// check if url is in blacklist
if (checkBlacklist && Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, host.toLowerCase(), url.getFile())) {
this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
throw new IOException("DISPATCHER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.");
}
// check if we have the page in the cache
final CrawlProfile crawlProfile = this.sb.crawler.getActive(UTF8.getBytes(request.profileHandle()));
if (crawlProfile != null && cacheStrategy != CacheStrategy.NOCACHE) {
@ -324,7 +331,7 @@ public final class LoaderDispatcher {
*/
public byte[] loadContent(final Request request, final CacheStrategy cacheStrategy) throws IOException {
// try to download the resource using the loader
final Response entry = load(request, cacheStrategy, false);
final Response entry = load(request, cacheStrategy, true);
if (entry == null) return null; // not found in web
// read resource body (if it is there)
@ -334,7 +341,7 @@ public final class LoaderDispatcher {
public Document[] loadDocuments(final Request request, final CacheStrategy cacheStrategy, final int timeout, final int maxFileSize) throws IOException, Parser.Failure {
// load resource
final Response response = load(request, cacheStrategy, maxFileSize, false);
final Response response = load(request, cacheStrategy, maxFileSize, true);
final DigestURI url = request.url();
if (response == null) throw new IOException("no Response for url " + url);
@ -347,7 +354,7 @@ public final class LoaderDispatcher {
public ContentScraper parseResource(final DigestURI location, final CacheStrategy cachePolicy) throws IOException {
// load page
final Response r = this.load(request(location, true, false), cachePolicy, false);
final Response r = this.load(request(location, true, false), cachePolicy, true);
final byte[] page = (r == null) ? null : r.getContent();
if (page == null) throw new IOException("no response from url " + location.toString());
@ -366,7 +373,7 @@ public final class LoaderDispatcher {
* @throws IOException
*/
public final Map<MultiProtocolURI, String> loadLinks(final DigestURI url, final CacheStrategy cacheStrategy) throws IOException {
final Response response = load(request(url, true, false), cacheStrategy, Integer.MAX_VALUE, false);
final Response response = load(request(url, true, false), cacheStrategy, Integer.MAX_VALUE, true);
if (response == null) throw new IOException("response == null");
final ResponseHeader responseHeader = response.getResponseHeader();
if (response.getContent() == null) throw new IOException("resource == null");

@ -27,6 +27,7 @@ package net.yacy.search.snippet;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
@ -48,7 +49,10 @@ import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.util.ByteArray;
import net.yacy.repository.Blacklist;
import net.yacy.search.Switchboard;
import de.anomic.crawler.retrieval.Request;
import de.anomic.crawler.ZURL.FailCategory;
public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaSnippet> {
@ -165,6 +169,7 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaS
entry = i.next();
url = new DigestURI(entry.getKey());
desc = entry.getValue();
if (isUrlBlacklisted(url, Blacklist.BLACKLIST_SEARCH)) continue;
final int ranking = removeAppearanceHashes(url.toNormalform(false, false), queryhashes).size() +
removeAppearanceHashes(desc, queryhashes).size();
if (ranking < 2 * queryhashes.size()) {
@ -189,6 +194,7 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaS
ientry = i.next();
url = new DigestURI(ientry.url());
final String u = url.toString();
if (isUrlBlacklisted(url, Blacklist.BLACKLIST_SEARCH)) continue;
if (u.indexOf(".ico",0) >= 0 || u.indexOf("favicon",0) >= 0) continue;
if (ientry.height() > 0 && ientry.height() < 32) continue;
if (ientry.width() > 0 && ientry.width() < 32) continue;
@ -230,4 +236,27 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaS
return remaininghashes;
}
/**
 * Checks whether the given URL appears in the blacklist of the given type.
 * A blacklisted URL is also pushed onto the error-URL queue and logged.
 *
 * @param url the URL to check
 * @param blacklistType type of blacklist (see class Blacklist, BLACKLIST_FOO)
 * @return true if the given URL is blacklisted, false otherwise
 */
private static boolean isUrlBlacklisted (DigestURI url, String blacklistType) {
    // not listed -> not blacklisted; nothing else to do
    if (!Switchboard.urlBlacklist.isListed(blacklistType, url.getHost().toLowerCase(), url.getFile())) {
        return false;
    }
    // record the rejection in the error queue and log it
    Switchboard.getSwitchboard().crawlQueues.errorURL.push(new Request(url, null), Switchboard.getSwitchboard().peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
    Log.logFine("snippet fetch", "MEDIA-SNIPPET Rejecting URL '" + url.toString() + "'. URL is in blacklist.");
    return true;
}
}

Loading…
Cancel
Save