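* fix system update when the update URLs are matched by the blacklist (for example by very general blacklist entries like *.de): loader calls now take a flag saying whether the blacklist is checked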

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7375 6c8d7289-2bf4-0310-a012-ef5d649a1542
14 years ago · 9d2159582f
parent 56264dcc17
commit 9d2159582f
16 changed files with 37 additions and 37 deletions
--- a/htroot/DictionaryLoader_p.java
+++ b/htroot/DictionaryLoader_p.java
@@ -63,7 +63,7 @@ public class DictionaryLoader_p {
        if (post.containsKey("geon0Load")) {
            // load from the net
            try {
-                Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON0.url), false, true), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
+                Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON0.url), false, true), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE, false);
                byte[] b = response.getContent();
                FileUtils.copy(b, LibraryProvider.Dictionary.GEON0.file());
                LibraryProvider.geoLoc.addLocalization(LibraryProvider.Dictionary.GEON0.nickname, new GeonamesLocalization(LibraryProvider.Dictionary.GEON0.file()));
@@ -103,7 +103,7 @@ public class DictionaryLoader_p {
        if (post.containsKey("geo1Load")) {
            // load from the net
            try {
-                Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEODB1.url), false, true), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
+                Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEODB1.url), false, true), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE, false);
                byte[] b = response.getContent();
                FileUtils.copy(b, LibraryProvider.Dictionary.GEODB1.file());
                LibraryProvider.geoLoc.removeLocalization(LibraryProvider.Dictionary.GEODB0.nickname);
--- a/htroot/Load_RSS_p.java
+++ b/htroot/Load_RSS_p.java
@@ -255,7 +255,7 @@ public class Load_RSS_p {
        RSSReader rss = null;
        if (url != null) try {
            prop.put("url", url.toNormalform(true, false));
-            Response response = sb.loader.load(sb.loader.request(url, true, false), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
+            Response response = sb.loader.load(sb.loader.request(url, true, false), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE, true);
            byte[] resource = response == null ? null : response.getContent();
            rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource);
        } catch (IOException e) {
--- a/htroot/ViewFile.java
+++ b/htroot/ViewFile.java
@@ -169,7 +169,7 @@ public class ViewFile {
        
        Response response = null;
        try {
-            response = sb.loader.load(sb.loader.request(url, true, false), authorized ? CrawlProfile.CacheStrategy.IFEXIST : CrawlProfile.CacheStrategy.CACHEONLY, Long.MAX_VALUE);
+            response = sb.loader.load(sb.loader.request(url, true, false), authorized ? CrawlProfile.CacheStrategy.IFEXIST : CrawlProfile.CacheStrategy.CACHEONLY, Long.MAX_VALUE, true);
        } catch (IOException e) {
            prop.put("error", "4");
            prop.put("error_errorText", "error loading resource: " + e.getMessage());
--- a/htroot/api/ymarks/get_treeview.java
+++ b/htroot/api/ymarks/get_treeview.java
@@ -186,7 +186,7 @@ public class get_treeview {
 	        	try {
 	                final DigestURI u = new DigestURI(post.get(ROOT).substring(2));
 	                Response response = null;
-        			response = sb.loader.load(sb.loader.request(u, true, false), CrawlProfile.CacheStrategy.IFEXIST, Long.MAX_VALUE);
+        			response = sb.loader.load(sb.loader.request(u, true, false), CrawlProfile.CacheStrategy.IFEXIST, Long.MAX_VALUE, true);
        			final Document document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse());
        			if(document != null) {
    	        		if(isWordCount)  {
--- a/htroot/api/ymarks/import_ymark.java
+++ b/htroot/api/ymarks/import_ymark.java
@@ -92,7 +92,7 @@ public class import_ymark {
 		try {
 			if(!bmk.containsKey(YMarkTables.BOOKMARK.TAGS.key()) || bmk.get(YMarkTables.BOOKMARK.TAGS.key()).equals(YMarkTables.BOOKMARK.TAGS.deflt())) {
 	            final DigestURI u = new DigestURI(bmk.get(YMarkTables.BOOKMARK.URL.key()));
-	            Response response = sb.loader.load(sb.loader.request(u, true, false), CrawlProfile.CacheStrategy.IFEXIST, Long.MAX_VALUE);
+	            Response response = sb.loader.load(sb.loader.request(u, true, false), CrawlProfile.CacheStrategy.IFEXIST, Long.MAX_VALUE, true);
 				final Document document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse());
 				if(document != null) {
 					bmk.put(YMarkTables.BOOKMARK.TAGS.key(), sb.tables.bookmarks.autoTag(document, bmk_user, 3));
--- a/source/de/anomic/crawler/CrawlQueues.java
+++ b/source/de/anomic/crawler/CrawlQueues.java
@@ -592,7 +592,7 @@ public class CrawlQueues {
                        final long maxFileSize = sb.getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
                        final Map<String, String> mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes());
                        CrawlProfile e = mp == null ? null : new CrawlProfile(mp);
-                        Response response = sb.loader.load(request, e == null ? CrawlProfile.CacheStrategy.IFEXIST : e.cacheStrategy(), maxFileSize);
+                        Response response = sb.loader.load(request, e == null ? CrawlProfile.CacheStrategy.IFEXIST : e.cacheStrategy(), maxFileSize, true);
                        if (response == null) {
                            request.setStatus("error", WorkflowJob.STATUS_FINISHED);
                            if (log.isFine()) log.logFine("problem loading " + request.url().toString() + ": no content (possibly caused by cache policy)");
--- a/source/de/anomic/crawler/RSSLoader.java
+++ b/source/de/anomic/crawler/RSSLoader.java
@@ -56,7 +56,7 @@ public class RSSLoader extends Thread {
    public void run() {
        RSSReader rss = null;
        try {
-            Response response = sb.loader.load(sb.loader.request(urlf, true, false), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
+            Response response = sb.loader.load(sb.loader.request(urlf, true, false), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE, true);
            byte[] resource = response == null ? null : response.getContent();
            rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource);
        } catch (MalformedURLException e) {
--- a/source/de/anomic/crawler/ZURL.java
+++ b/source/de/anomic/crawler/ZURL.java
@@ -175,7 +175,7 @@ public class ZURL implements Iterable<ZURL.Entry> {
    public ZURL.Entry get(final byte[] urlhash) {
        try {
            if (urlIndex == null) return null;
-            //System.out.println("*** DEBUG ZURL " + this.urlIndex.filename() + " get " + urlhash);
+            // System.out.println("*** DEBUG ZURL " + this.urlIndex.filename() + " get " + urlhash);
            final Row.Entry entry = urlIndex.get(urlhash);
            if (entry == null) return null;
            return new Entry(entry);
--- a/source/de/anomic/crawler/retrieval/HTTPLoader.java
+++ b/source/de/anomic/crawler/retrieval/HTTPLoader.java
@@ -68,14 +68,14 @@ public final class HTTPLoader {
        this.socketTimeout = (int) sb.getConfigLong("crawler.clientTimeout", 10000);
    }  
   
-    public Response load(final Request entry, long maxFileSize) throws IOException {
+    public Response load(final Request entry, long maxFileSize, boolean checkBlacklist) throws IOException {
        long start = System.currentTimeMillis();
-        Response doc = load(entry, DEFAULT_CRAWLING_RETRY_COUNT, maxFileSize);
+        Response doc = load(entry, DEFAULT_CRAWLING_RETRY_COUNT, maxFileSize, checkBlacklist);
        Latency.update(entry.url(), System.currentTimeMillis() - start);
        return doc;
    }
    
-    private Response load(final Request request, final int retryCount, final long maxFileSize) throws IOException {
+    private Response load(final Request request, final int retryCount, final long maxFileSize, final boolean checkBlacklist) throws IOException {

        if (retryCount < 0) {
            sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "redirection counter exceeded");
@@ -93,7 +93,7 @@ public final class HTTPLoader {
        
        // check if url is in blacklist
        final String hostlow = host.toLowerCase();
-        if (Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, hostlow, path)) {
+        if (checkBlacklist && Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, hostlow, path)) {
            sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "url in blacklist");
            throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.");
        }
@@ -164,7 +164,7 @@ public final class HTTPLoader {
                    
                    // retry crawling with new url
                    request.redirectURL(redirectionUrl);
-                    return load(request, retryCount - 1, maxFileSize);
+                    return load(request, retryCount - 1, maxFileSize, checkBlacklist);
                } else {
                	// no redirection url provided
                    sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "no redirection url provided");
--- a/source/de/anomic/http/server/HTTPDFileHandler.java
+++ b/source/de/anomic/http/server/HTTPDFileHandler.java
@@ -458,7 +458,7 @@ public final class HTTPDFileHandler {
                    for (int i = 0; i < list.length; i++) {
                        f = new File(targetFile, list[i]);
                        if (f.isDirectory()) {
-                            aBuffer.append("    <li><a href=\"" + path + list[i] + "/\">" + list[i] + "/</a><br></li>\n");
+                            aBuffer.append("    <li><a href=\"" + path + list[i] + "/\">" + list[i] + "/</a><br/></li>\n");
                        } else {
                            if (list[i].endsWith("html") || (list[i].endsWith("htm"))) {
                                scraper = ContentScraper.parseResource(f);
@@ -485,12 +485,12 @@ public final class HTTPDFileHandler {
                                size = (sz / 1024 / 1024) + " MB";
                            }
                            aBuffer.append("    <li>");
-                            if (headline != null && headline.length() > 0) aBuffer.append("<a href=\"" + list[i] + "\"><b>" + headline + "</b></a><br>");
-                            aBuffer.append("<a href=\"" + path + list[i] + "\">" + list[i] + "</a><br>");
-                            if (author != null && author.length() > 0) aBuffer.append("Author: " + author + "<br>");
-                            if (publisher != null && publisher.length() > 0) aBuffer.append("Publisher: " + publisher + "<br>");
-                            if (description != null && description.length() > 0) aBuffer.append("Description: " + description + "<br>");
-                            aBuffer.append(DateFormatter.formatShortDay(new Date(f.lastModified())) + ", " + size + ((images > 0) ? ", " + images + " images" : "") + ((links > 0) ? ", " + links + " links" : "") + "<br></li>\n");
+                            if (headline != null && headline.length() > 0) aBuffer.append("<a href=\"" + list[i] + "\"><b>" + headline + "</b></a><br/>");
+                            aBuffer.append("<a href=\"" + path + list[i] + "\">" + list[i] + "</a><br/>");
+                            if (author != null && author.length() > 0) aBuffer.append("Author: " + author + "<br/>");
+                            if (publisher != null && publisher.length() > 0) aBuffer.append("Publisher: " + publisher + "<br/>");
+                            if (description != null && description.length() > 0) aBuffer.append("Description: " + description + "<br/>");
+                            aBuffer.append(DateFormatter.formatShortDay(new Date(f.lastModified())) + ", " + size + ((images > 0) ? ", " + images + " images" : "") + ((links > 0) ? ", " + links + " links" : "") + "<br/></li>\n");
                        }
                    }
                    aBuffer.append("  </ul>\n</body>\n</html>\n");
--- a/source/de/anomic/search/Switchboard.java
+++ b/source/de/anomic/search/Switchboard.java
@@ -2015,7 +2015,7 @@ public final class Switchboard extends serverSwitch {
            @Override
            public void run() {
                try {
-                    final Response response = loader.load(request, CacheStrategy.IFFRESH, Long.MAX_VALUE);
+                    final Response response = loader.load(request, CacheStrategy.IFFRESH, Long.MAX_VALUE, true);
                    if (response == null) throw new IOException("response == null");
                    if (response.getContent() == null) throw new IOException("content == null");
                    if (response.getResponseHeader() == null) throw new IOException("header == null");
@@ -2364,7 +2364,7 @@ public final class Switchboard extends serverSwitch {
                // if we have an url then try to load the rss
                RSSReader rss = null;
                try {
-                    Response response = sb.loader.load(sb.loader.request(url, true, false), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
+                    Response response = sb.loader.load(sb.loader.request(url, true, false), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE, true);
                    byte[] resource = response == null ? null : response.getContent();
                    //System.out.println("BLEKKO: " + new String(resource));
                    rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource);
--- a/source/de/anomic/search/TextSnippet.java
+++ b/source/de/anomic/search/TextSnippet.java
@@ -176,7 +176,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
                return;
            } else {
                // try to load the resource from the cache
-                response = loader.load(loader.request(url, true, reindexing), noCacheUsage ? CrawlProfile.CacheStrategy.NOCACHE : cacheStrategy, Long.MAX_VALUE);
+                response = loader.load(loader.request(url, true, reindexing), noCacheUsage ? CrawlProfile.CacheStrategy.NOCACHE : cacheStrategy, Long.MAX_VALUE, true);
                if (response == null) {
                    // in case that we did not get any result we can still return a success when we are not allowed to go online
                    if (cacheStrategy.mustBeOffline()) {
--- a/source/de/anomic/yacy/graphics/OSMTile.java
+++ b/source/de/anomic/yacy/graphics/OSMTile.java
@@ -112,7 +112,7 @@ public class OSMTile {
            // download resource using the crawler and keep resource in memory if possible
            Response entry = null;
            try {
-                entry = Switchboard.getSwitchboard().loader.load(Switchboard.getSwitchboard().loader.request(tileURL, false, false), CrawlProfile.CacheStrategy.IFEXIST, Long.MAX_VALUE);
+                entry = Switchboard.getSwitchboard().loader.load(Switchboard.getSwitchboard().loader.request(tileURL, false, false), CrawlProfile.CacheStrategy.IFEXIST, Long.MAX_VALUE, true);
            } catch (IOException e) {
                Log.logWarning("OSMTile", "cannot load: " + e.getMessage());
                return null;
--- a/source/net/yacy/document/importer/OAIListFriendsLoader.java
+++ b/source/net/yacy/document/importer/OAIListFriendsLoader.java
@@ -81,7 +81,7 @@ public class OAIListFriendsLoader {
        Map<String, String> m;
        for (Map.Entry<String, File> oaiFriend: listFriends.entrySet()) try {
            if (!oaiFriend.getValue().exists()) {
-                Response response = loader == null ? null : loader.load(loader.request(new DigestURI(oaiFriend.getKey()), false, true), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
+                Response response = loader == null ? null : loader.load(loader.request(new DigestURI(oaiFriend.getKey()), false, true), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE, true);
                if (response != null) FileUtils.copy(response.getContent(), oaiFriend.getValue());
            }
            
--- a/source/net/yacy/document/importer/OAIPMHLoader.java
+++ b/source/net/yacy/document/importer/OAIPMHLoader.java
@@ -48,7 +48,7 @@ public class OAIPMHLoader {
        this.source = source;
        
        // load the file from the net
-        Response response = loader.load(loader.request(source, false, true), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
+        Response response = loader.load(loader.request(source, false, true), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE, true);
        byte[] b = response.getContent();
        this.resumptionToken = new ResumptionToken(source, b);
        //System.out.println("*** ResumptionToken = " + this.resumptionToken.toString());
--- a/source/net/yacy/repository/LoaderDispatcher.java
+++ b/source/net/yacy/repository/LoaderDispatcher.java
@@ -135,7 +135,7 @@ public final class LoaderDispatcher {

    public void load(final DigestURI url, CrawlProfile.CacheStrategy cacheStratgy, long maxFileSize, File targetFile) throws IOException {

-        byte[] b = load(request(url, false, true), cacheStratgy, maxFileSize).getContent();
+        byte[] b = load(request(url, false, true), cacheStratgy, maxFileSize, false).getContent();
        if (b == null) throw new IOException("load == null");
        File tmp = new File(targetFile.getAbsolutePath() + ".tmp");
        
@@ -146,7 +146,7 @@ public final class LoaderDispatcher {
        tmp.renameTo(targetFile);
    }
    
-    public Response load(final Request request, CrawlProfile.CacheStrategy cacheStrategy, long maxFileSize) throws IOException {
+    public Response load(final Request request, CrawlProfile.CacheStrategy cacheStrategy, long maxFileSize, boolean checkBlacklist) throws IOException {
        String url = request.url().toNormalform(true, false);
        Semaphore check = this.loaderSteering.get(url);
        if (check != null) {
@@ -158,7 +158,7 @@ public final class LoaderDispatcher {
        
        try {
            this.loaderSteering.put(url, new Semaphore(0));
-            Response response = loadInternal(request, cacheStrategy, maxFileSize);
+            Response response = loadInternal(request, cacheStrategy, maxFileSize, checkBlacklist);
            check = this.loaderSteering.remove(url);
            if (check != null) check.release(1000);
            return response;
@@ -177,7 +177,7 @@ public final class LoaderDispatcher {
     * @return the loaded entity in a Response object
     * @throws IOException
     */
-    private Response loadInternal(final Request request, CrawlProfile.CacheStrategy cacheStrategy, long maxFileSize) throws IOException {
+    private Response loadInternal(final Request request, CrawlProfile.CacheStrategy cacheStrategy, long maxFileSize, boolean checkBlacklist) throws IOException {
        // get the protocol of the next URL
        final DigestURI url = request.url();
        if (url.isFile() || url.isSMB()) cacheStrategy = CrawlProfile.CacheStrategy.NOCACHE; // load just from the file system
@@ -261,7 +261,7 @@ public final class LoaderDispatcher {
        
        // load resource from the internet
        Response response = null;
-        if ((protocol.equals("http") || (protocol.equals("https")))) response = httpLoader.load(request, maxFileSize);
+        if ((protocol.equals("http") || (protocol.equals("https")))) response = httpLoader.load(request, maxFileSize, checkBlacklist);
        if (protocol.equals("ftp")) response = ftpLoader.load(request, true);
        if (protocol.equals("smb")) response = smbLoader.load(request, true);
        if (protocol.equals("file")) response = fileLoader.load(request, true);
@@ -300,7 +300,7 @@ public final class LoaderDispatcher {
    public byte[] loadContent(final Request request, CrawlProfile.CacheStrategy cacheStrategy) throws IOException {
        // try to download the resource using the loader
        final long maxFileSize = sb.getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
-        final Response entry = load(request, cacheStrategy, maxFileSize);
+        final Response entry = load(request, cacheStrategy, maxFileSize, false);
        if (entry == null) return null; // not found in web
        
        // read resource body (if it is there)
@@ -310,7 +310,7 @@ public final class LoaderDispatcher {
    public Document[] loadDocuments(final Request request, final CrawlProfile.CacheStrategy cacheStrategy, final int timeout, long maxFileSize) throws IOException, Parser.Failure {

        // load resource
-        final Response response = load(request, cacheStrategy, maxFileSize);
+        final Response response = load(request, cacheStrategy, maxFileSize, false);
        final DigestURI url = request.url();
        if (response == null) throw new IOException("no Response for url " + url);

@@ -324,7 +324,7 @@ public final class LoaderDispatcher {
    public ContentScraper parseResource(final DigestURI location, CrawlProfile.CacheStrategy cachePolicy) throws IOException {
        // load page
        final long maxFileSize = this.sb.getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
-        Response r = this.load(request(location, true, false), cachePolicy, maxFileSize);
+        Response r = this.load(request(location, true, false), cachePolicy, maxFileSize, false);
        byte[] page = (r == null) ? null : r.getContent();
        if (page == null) throw new IOException("no response from url " + location.toString());
        
@@ -343,7 +343,7 @@ public final class LoaderDispatcher {
     * @throws IOException
     */
    public final Map<MultiProtocolURI, String> loadLinks(DigestURI url, CrawlProfile.CacheStrategy cacheStrategy) throws IOException {
-        Response response = load(request(url, true, false), cacheStrategy, Long.MAX_VALUE);
+        Response response = load(request(url, true, false), cacheStrategy, Long.MAX_VALUE, false);
        if (response == null) throw new IOException("response == null");
        ResponseHeader responseHeader = response.getResponseHeader();
        byte[] resource = response.getContent();
@@ -401,7 +401,7 @@ public final class LoaderDispatcher {
            if (this.cache != null && this.cache.exists()) return;
            try {
                // load from the net
-                Response response = load(request(new DigestURI(this.url), false, true), this.cacheStrategy, this.maxFileSize);
+                Response response = load(request(new DigestURI(this.url), false, true), this.cacheStrategy, this.maxFileSize, true);
                byte[] b = response.getContent();
                if (this.cache != null) FileUtils.copy(b, this.cache);
            } catch (MalformedURLException e) {} catch (IOException e) {}