From 9d2159582f5f2417787b79aa4a6c5057fe2e56ad Mon Sep 17 00:00:00 2001
From: f1ori
Date: Wed, 15 Dec 2010 19:20:00 +0000
Subject: [PATCH] * fix system update if urls are in blacklist (for example for very general blacklists like *.de)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7375 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 htroot/DictionaryLoader_p.java                       |  4 ++--
 htroot/Load_RSS_p.java                               |  2 +-
 htroot/ViewFile.java                                 |  2 +-
 htroot/api/ymarks/get_treeview.java                  |  2 +-
 htroot/api/ymarks/import_ymark.java                  |  2 +-
 source/de/anomic/crawler/CrawlQueues.java            |  2 +-
 source/de/anomic/crawler/RSSLoader.java              |  2 +-
 source/de/anomic/crawler/ZURL.java                   |  2 +-
 .../anomic/crawler/retrieval/HTTPLoader.java         | 10 +++++-----
 .../anomic/http/server/HTTPDFileHandler.java         | 14 ++++++-------
 source/de/anomic/search/Switchboard.java             |  4 ++--
 source/de/anomic/search/TextSnippet.java             |  2 +-
 source/de/anomic/yacy/graphics/OSMTile.java          |  2 +-
 .../importer/OAIListFriendsLoader.java               |  2 +-
 .../yacy/document/importer/OAIPMHLoader.java         |  2 +-
 .../net/yacy/repository/LoaderDispatcher.java        | 20 +++++++++----------
 16 files changed, 37 insertions(+), 37 deletions(-)

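The hunks below thread a new boolean checkBlacklist parameter through the loader call chain, so that internal downloads (dictionary files, OAI imports, and the release archive fetched by the system update) are no longer rejected when a very broad blacklist entry such as *.de matches their host; regular crawling keeps passing true and stays blacklist-protected. The following sketch is not part of the patch: it only illustrates, against the signatures introduced below, how a caller would opt out of the blacklist check for such an internal download. The helper method and the URL string are invented for the example; sb is assumed to be an initialized Switchboard.

    // Illustration only - not part of the patch.
    private static byte[] fetchIgnoringBlacklist(final Switchboard sb, final String urlString) throws IOException {
        final DigestURI url = new DigestURI(urlString);               // urlString is a placeholder
        final Request request = sb.loader.request(url, false, true);
        // last argument is the new checkBlacklist flag: false skips the Switchboard.urlBlacklist
        // test, so a catch-all pattern like *.de can no longer abort the download
        final Response response = sb.loader.load(request, CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE, false);
        return response == null ? null : response.getContent();
    }
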
diff --git a/htroot/DictionaryLoader_p.java b/htroot/DictionaryLoader_p.java
index 3cfff407e..0036ec087 100644
--- a/htroot/DictionaryLoader_p.java
+++ b/htroot/DictionaryLoader_p.java
@@ -63,7 +63,7 @@ public class DictionaryLoader_p {
         if (post.containsKey("geon0Load")) {
             // load from the net
             try {
-                Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON0.url), false, true), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
+                Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON0.url), false, true), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE, false);
                 byte[] b = response.getContent();
                 FileUtils.copy(b, LibraryProvider.Dictionary.GEON0.file());
                 LibraryProvider.geoLoc.addLocalization(LibraryProvider.Dictionary.GEON0.nickname, new GeonamesLocalization(LibraryProvider.Dictionary.GEON0.file()));
@@ -103,7 +103,7 @@ public class DictionaryLoader_p {
         if (post.containsKey("geo1Load")) {
             // load from the net
             try {
-                Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEODB1.url), false, true), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
+                Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEODB1.url), false, true), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE, false);
                 byte[] b = response.getContent();
                 FileUtils.copy(b, LibraryProvider.Dictionary.GEODB1.file());
                 LibraryProvider.geoLoc.removeLocalization(LibraryProvider.Dictionary.GEODB0.nickname);
diff --git a/htroot/Load_RSS_p.java b/htroot/Load_RSS_p.java
index 310cb40fe..03b5c94e0 100644
--- a/htroot/Load_RSS_p.java
+++ b/htroot/Load_RSS_p.java
@@ -255,7 +255,7 @@ public class Load_RSS_p {
         RSSReader rss = null;
         if (url != null) try {
             prop.put("url", url.toNormalform(true, false));
-            Response response = sb.loader.load(sb.loader.request(url, true, false), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
+            Response response = sb.loader.load(sb.loader.request(url, true, false), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE, true);
             byte[] resource = response == null ? null : response.getContent();
             rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource);
         } catch (IOException e) {
diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java
index 089bc169d..ab48a4ab2 100644
--- a/htroot/ViewFile.java
+++ b/htroot/ViewFile.java
@@ -169,7 +169,7 @@ public class ViewFile {

         Response response = null;
         try {
-            response = sb.loader.load(sb.loader.request(url, true, false), authorized ? CrawlProfile.CacheStrategy.IFEXIST : CrawlProfile.CacheStrategy.CACHEONLY, Long.MAX_VALUE);
+            response = sb.loader.load(sb.loader.request(url, true, false), authorized ? CrawlProfile.CacheStrategy.IFEXIST : CrawlProfile.CacheStrategy.CACHEONLY, Long.MAX_VALUE, true);
         } catch (IOException e) {
             prop.put("error", "4");
             prop.put("error_errorText", "error loading resource: " + e.getMessage());
diff --git a/htroot/api/ymarks/get_treeview.java b/htroot/api/ymarks/get_treeview.java
index e8e172899..0e6040fd3 100644
--- a/htroot/api/ymarks/get_treeview.java
+++ b/htroot/api/ymarks/get_treeview.java
@@ -186,7 +186,7 @@ public class get_treeview {
             try {
                 final DigestURI u = new DigestURI(post.get(ROOT).substring(2));
                 Response response = null;
-                response = sb.loader.load(sb.loader.request(u, true, false), CrawlProfile.CacheStrategy.IFEXIST, Long.MAX_VALUE);
+                response = sb.loader.load(sb.loader.request(u, true, false), CrawlProfile.CacheStrategy.IFEXIST, Long.MAX_VALUE, true);
                 final Document document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse());
                 if(document != null) {
                     if(isWordCount) {
diff --git a/htroot/api/ymarks/import_ymark.java b/htroot/api/ymarks/import_ymark.java
index 0294bd2cd..4fca4d7e7 100644
--- a/htroot/api/ymarks/import_ymark.java
+++ b/htroot/api/ymarks/import_ymark.java
@@ -92,7 +92,7 @@ public class import_ymark {
                 try {
                     if(!bmk.containsKey(YMarkTables.BOOKMARK.TAGS.key()) || bmk.get(YMarkTables.BOOKMARK.TAGS.key()).equals(YMarkTables.BOOKMARK.TAGS.deflt())) {
                         final DigestURI u = new DigestURI(bmk.get(YMarkTables.BOOKMARK.URL.key()));
-                        Response response = sb.loader.load(sb.loader.request(u, true, false), CrawlProfile.CacheStrategy.IFEXIST, Long.MAX_VALUE);
+                        Response response = sb.loader.load(sb.loader.request(u, true, false), CrawlProfile.CacheStrategy.IFEXIST, Long.MAX_VALUE, true);
                         final Document document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse());
                         if(document != null) {
                             bmk.put(YMarkTables.BOOKMARK.TAGS.key(), sb.tables.bookmarks.autoTag(document, bmk_user, 3));
diff --git a/source/de/anomic/crawler/CrawlQueues.java b/source/de/anomic/crawler/CrawlQueues.java
index fe40aab89..499121918 100644
--- a/source/de/anomic/crawler/CrawlQueues.java
+++ b/source/de/anomic/crawler/CrawlQueues.java
@@ -592,7 +592,7 @@ public class CrawlQueues {
             final long maxFileSize = sb.getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
             final Map mp = sb.crawler.profilesActiveCrawls.get(request.profileHandle().getBytes());
             CrawlProfile e = mp == null ? null : new CrawlProfile(mp);
-            Response response = sb.loader.load(request, e == null ? CrawlProfile.CacheStrategy.IFEXIST : e.cacheStrategy(), maxFileSize);
+            Response response = sb.loader.load(request, e == null ? CrawlProfile.CacheStrategy.IFEXIST : e.cacheStrategy(), maxFileSize, true);
             if (response == null) {
                 request.setStatus("error", WorkflowJob.STATUS_FINISHED);
                 if (log.isFine()) log.logFine("problem loading " + request.url().toString() + ": no content (possibly caused by cache policy)");
diff --git a/source/de/anomic/crawler/RSSLoader.java b/source/de/anomic/crawler/RSSLoader.java
index 5c02a3043..f1f8637eb 100644
--- a/source/de/anomic/crawler/RSSLoader.java
+++ b/source/de/anomic/crawler/RSSLoader.java
@@ -56,7 +56,7 @@ public class RSSLoader extends Thread {
     public void run() {
         RSSReader rss = null;
         try {
-            Response response = sb.loader.load(sb.loader.request(urlf, true, false), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
+            Response response = sb.loader.load(sb.loader.request(urlf, true, false), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE, true);
             byte[] resource = response == null ? null : response.getContent();
             rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource);
         } catch (MalformedURLException e) {
diff --git a/source/de/anomic/crawler/ZURL.java b/source/de/anomic/crawler/ZURL.java
index 11ce978c5..3766aef94 100755
--- a/source/de/anomic/crawler/ZURL.java
+++ b/source/de/anomic/crawler/ZURL.java
@@ -175,7 +175,7 @@ public class ZURL implements Iterable {
     public ZURL.Entry get(final byte[] urlhash) {
         try {
             if (urlIndex == null) return null;
-            //System.out.println("*** DEBUG ZURL " + this.urlIndex.filename() + " get " + urlhash);
+            // System.out.println("*** DEBUG ZURL " + this.urlIndex.filename() + " get " + urlhash);
             final Row.Entry entry = urlIndex.get(urlhash);
             if (entry == null) return null;
             return new Entry(entry);
diff --git a/source/de/anomic/crawler/retrieval/HTTPLoader.java b/source/de/anomic/crawler/retrieval/HTTPLoader.java
index 96b538816..36c784b4b 100644
--- a/source/de/anomic/crawler/retrieval/HTTPLoader.java
+++ b/source/de/anomic/crawler/retrieval/HTTPLoader.java
@@ -68,14 +68,14 @@ public final class HTTPLoader {
         this.socketTimeout = (int) sb.getConfigLong("crawler.clientTimeout", 10000);
     }

-    public Response load(final Request entry, long maxFileSize) throws IOException {
+    public Response load(final Request entry, long maxFileSize, boolean checkBlacklist) throws IOException {
         long start = System.currentTimeMillis();
-        Response doc = load(entry, DEFAULT_CRAWLING_RETRY_COUNT, maxFileSize);
+        Response doc = load(entry, DEFAULT_CRAWLING_RETRY_COUNT, maxFileSize, checkBlacklist);
         Latency.update(entry.url(), System.currentTimeMillis() - start);
         return doc;
     }

-    private Response load(final Request request, final int retryCount, final long maxFileSize) throws IOException {
+    private Response load(final Request request, final int retryCount, final long maxFileSize, final boolean checkBlacklist) throws IOException {

         if (retryCount < 0) {
             sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "redirection counter exceeded");
@@ -93,7 +93,7 @@ public final class HTTPLoader {

         // check if url is in blacklist
         final String hostlow = host.toLowerCase();
-        if (Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, hostlow, path)) {
+        if (checkBlacklist && Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, hostlow, path)) {
             sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "url in blacklist");
             throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.");
         }
@@ -164,7 +164,7 @@ public final class HTTPLoader {

                 // retry crawling with new url
                 request.redirectURL(redirectionUrl);
-                return load(request, retryCount - 1, maxFileSize);
+                return load(request, retryCount - 1, maxFileSize, checkBlacklist);
             } else {
                 // no redirection url provided
                 sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "no redirection url provided");
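With the gate above, HTTPLoader only consults Switchboard.urlBlacklist when checkBlacklist is true, and the recursive redirect handling forwards the same flag. The two lines below are not new code; they merely juxtapose, with simplified variable names, the flag values used at the call sites in this patch:

    // crawler path (CrawlQueues, above): blacklist still enforced
    Response crawled = sb.loader.load(request, cacheStrategy, maxFileSize, true);

    // internal download path (LoaderDispatcher, below): blacklist skipped
    Response fetched = sb.loader.load(request, cacheStrategy, maxFileSize, false);
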
  • " + list[i] + "/
  • \n"); + aBuffer.append("
  • " + list[i] + "/
  • \n"); } else { if (list[i].endsWith("html") || (list[i].endsWith("htm"))) { scraper = ContentScraper.parseResource(f); @@ -485,12 +485,12 @@ public final class HTTPDFileHandler { size = (sz / 1024 / 1024) + " MB"; } aBuffer.append("
  • "); - if (headline != null && headline.length() > 0) aBuffer.append("" + headline + "
    "); - aBuffer.append("" + list[i] + "
    "); - if (author != null && author.length() > 0) aBuffer.append("Author: " + author + "
    "); - if (publisher != null && publisher.length() > 0) aBuffer.append("Publisher: " + publisher + "
    "); - if (description != null && description.length() > 0) aBuffer.append("Description: " + description + "
    "); - aBuffer.append(DateFormatter.formatShortDay(new Date(f.lastModified())) + ", " + size + ((images > 0) ? ", " + images + " images" : "") + ((links > 0) ? ", " + links + " links" : "") + "
  • \n"); + if (headline != null && headline.length() > 0) aBuffer.append("" + headline + "
    "); + aBuffer.append("" + list[i] + "
    "); + if (author != null && author.length() > 0) aBuffer.append("Author: " + author + "
    "); + if (publisher != null && publisher.length() > 0) aBuffer.append("Publisher: " + publisher + "
    "); + if (description != null && description.length() > 0) aBuffer.append("Description: " + description + "
    "); + aBuffer.append(DateFormatter.formatShortDay(new Date(f.lastModified())) + ", " + size + ((images > 0) ? ", " + images + " images" : "") + ((links > 0) ? ", " + links + " links" : "") + "
    \n"); } } aBuffer.append(" \n\n\n"); diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java index 2f91f57cf..bcefcd0b0 100644 --- a/source/de/anomic/search/Switchboard.java +++ b/source/de/anomic/search/Switchboard.java @@ -2015,7 +2015,7 @@ public final class Switchboard extends serverSwitch { @Override public void run() { try { - final Response response = loader.load(request, CacheStrategy.IFFRESH, Long.MAX_VALUE); + final Response response = loader.load(request, CacheStrategy.IFFRESH, Long.MAX_VALUE, true); if (response == null) throw new IOException("response == null"); if (response.getContent() == null) throw new IOException("content == null"); if (response.getResponseHeader() == null) throw new IOException("header == null"); @@ -2364,7 +2364,7 @@ public final class Switchboard extends serverSwitch { // if we have an url then try to load the rss RSSReader rss = null; try { - Response response = sb.loader.load(sb.loader.request(url, true, false), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE); + Response response = sb.loader.load(sb.loader.request(url, true, false), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE, true); byte[] resource = response == null ? null : response.getContent(); //System.out.println("BLEKKO: " + new String(resource)); rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource); diff --git a/source/de/anomic/search/TextSnippet.java b/source/de/anomic/search/TextSnippet.java index e1c1cdf3b..51b5360c7 100644 --- a/source/de/anomic/search/TextSnippet.java +++ b/source/de/anomic/search/TextSnippet.java @@ -176,7 +176,7 @@ public class TextSnippet implements Comparable, Comparator m; for (Map.Entry oaiFriend: listFriends.entrySet()) try { if (!oaiFriend.getValue().exists()) { - Response response = loader == null ? null : loader.load(loader.request(new DigestURI(oaiFriend.getKey()), false, true), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE); + Response response = loader == null ? 
diff --git a/source/net/yacy/document/importer/OAIPMHLoader.java b/source/net/yacy/document/importer/OAIPMHLoader.java
index 23c181c32..d453da3d4 100644
--- a/source/net/yacy/document/importer/OAIPMHLoader.java
+++ b/source/net/yacy/document/importer/OAIPMHLoader.java
@@ -48,7 +48,7 @@ public class OAIPMHLoader {
         this.source = source;

         // load the file from the net
-        Response response = loader.load(loader.request(source, false, true), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
+        Response response = loader.load(loader.request(source, false, true), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE, true);
        byte[] b = response.getContent();
        this.resumptionToken = new ResumptionToken(source, b);
        //System.out.println("*** ResumptionToken = " + this.resumptionToken.toString());
diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java
index 6ad4ca8c3..1271e9f78 100644
--- a/source/net/yacy/repository/LoaderDispatcher.java
+++ b/source/net/yacy/repository/LoaderDispatcher.java
@@ -135,7 +135,7 @@ public final class LoaderDispatcher {

    public void load(final DigestURI url, CrawlProfile.CacheStrategy cacheStratgy, long maxFileSize, File targetFile) throws IOException {

-        byte[] b = load(request(url, false, true), cacheStratgy, maxFileSize).getContent();
+        byte[] b = load(request(url, false, true), cacheStratgy, maxFileSize, false).getContent();
        if (b == null) throw new IOException("load == null");
        File tmp = new File(targetFile.getAbsolutePath() + ".tmp");
@@ -146,7 +146,7 @@
        tmp.renameTo(targetFile);
    }

-    public Response load(final Request request, CrawlProfile.CacheStrategy cacheStrategy, long maxFileSize) throws IOException {
+    public Response load(final Request request, CrawlProfile.CacheStrategy cacheStrategy, long maxFileSize, boolean checkBlacklist) throws IOException {
        String url = request.url().toNormalform(true, false);
        Semaphore check = this.loaderSteering.get(url);
        if (check != null) {
@@ -158,7 +158,7 @@

        try {
            this.loaderSteering.put(url, new Semaphore(0));
-            Response response = loadInternal(request, cacheStrategy, maxFileSize);
+            Response response = loadInternal(request, cacheStrategy, maxFileSize, checkBlacklist);
            check = this.loaderSteering.remove(url);
            if (check != null) check.release(1000);
            return response;
@@ -177,7 +177,7 @@
     * @return the loaded entity in a Response object
     * @throws IOException
     */
-    private Response loadInternal(final Request request, CrawlProfile.CacheStrategy cacheStrategy, long maxFileSize) throws IOException {
+    private Response loadInternal(final Request request, CrawlProfile.CacheStrategy cacheStrategy, long maxFileSize, boolean checkBlacklist) throws IOException {
        // get the protocol of the next URL
        final DigestURI url = request.url();
        if (url.isFile() || url.isSMB()) cacheStrategy = CrawlProfile.CacheStrategy.NOCACHE; // load just from the file system
@@ -261,7 +261,7 @@

        // load resource from the internet
        Response response = null;
-        if ((protocol.equals("http") || (protocol.equals("https")))) response = httpLoader.load(request, maxFileSize);
+        if ((protocol.equals("http") || (protocol.equals("https")))) response = httpLoader.load(request, maxFileSize, checkBlacklist);
        if (protocol.equals("ftp")) response = ftpLoader.load(request, true);
        if (protocol.equals("smb")) response = smbLoader.load(request, true);
        if (protocol.equals("file")) response = fileLoader.load(request, true);
@@ -300,7 +300,7 @@
    public byte[] loadContent(final Request request, CrawlProfile.CacheStrategy cacheStrategy) throws IOException {
        // try to download the resource using the loader
        final long maxFileSize = sb.getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
-        final Response entry = load(request, cacheStrategy, maxFileSize);
+        final Response entry = load(request, cacheStrategy, maxFileSize, false);
        if (entry == null) return null; // not found in web

        // read resource body (if it is there)
@@ -310,7 +310,7 @@
    public Document[] loadDocuments(final Request request, final CrawlProfile.CacheStrategy cacheStrategy, final int timeout, long maxFileSize) throws IOException, Parser.Failure {

        // load resource
-        final Response response = load(request, cacheStrategy, maxFileSize);
+        final Response response = load(request, cacheStrategy, maxFileSize, false);
        final DigestURI url = request.url();
        if (response == null) throw new IOException("no Response for url " + url);
@@ -324,7 +324,7 @@
    public ContentScraper parseResource(final DigestURI location, CrawlProfile.CacheStrategy cachePolicy) throws IOException {
        // load page
        final long maxFileSize = this.sb.getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
-        Response r = this.load(request(location, true, false), cachePolicy, maxFileSize);
+        Response r = this.load(request(location, true, false), cachePolicy, maxFileSize, false);
        byte[] page = (r == null) ? null : r.getContent();
        if (page == null) throw new IOException("no response from url " + location.toString());
@@ -343,7 +343,7 @@
     * @throws IOException
     */
    public final Map loadLinks(DigestURI url, CrawlProfile.CacheStrategy cacheStrategy) throws IOException {
-        Response response = load(request(url, true, false), cacheStrategy, Long.MAX_VALUE);
+        Response response = load(request(url, true, false), cacheStrategy, Long.MAX_VALUE, false);
        if (response == null) throw new IOException("response == null");
        ResponseHeader responseHeader = response.getResponseHeader();
        byte[] resource = response.getContent();
@@ -401,7 +401,7 @@
            if (this.cache != null && this.cache.exists()) return;
            try {
                // load from the net
-                Response response = load(request(new DigestURI(this.url), false, true), this.cacheStrategy, this.maxFileSize);
+                Response response = load(request(new DigestURI(this.url), false, true), this.cacheStrategy, this.maxFileSize, true);
                byte[] b = response.getContent();
                if (this.cache != null) FileUtils.copy(b, this.cache);
            } catch (MalformedURLException e) {} catch (IOException e) {}
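
The overload load(url, cacheStrategy, maxFileSize, targetFile) at the top of the LoaderDispatcher diff is the download-to-file path used, among others, by the system update; it now forwards checkBlacklist = false, which is what makes the update work again behind a catch-all blacklist. A usage sketch, not taken from the patch: the mirror URL, the target path and the helper method are invented placeholders, and sb is assumed to be an initialized Switchboard.

    // Illustration only - not part of the patch.
    private static void downloadRelease(final Switchboard sb) throws IOException {
        final DigestURI releaseUrl = new DigestURI("http://mirror.example.de/yacy_release.tar.gz"); // placeholder
        final File target = new File("DATA/RELEASE/yacy_release.tar.gz");                           // placeholder
        // this overload calls load(request, cacheStrategy, maxFileSize, false) internally, so the
        // download succeeds even if mirror.example.de is covered by a blacklist entry like *.de
        sb.loader.load(releaseUrl, CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE, target);
    }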