From 161d2fd2efdad216672d4e7550cc6d89b852b330 Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 23 Jul 2009 21:31:51 +0000 Subject: [PATCH] redesign of access to the HTCache (now http.client.Cache): - better control of the cache by using combined request-header and content access methods - refactoring of many classes to comply with this new access method - make sure that the cache is always written if something was loaded - some redesign of the process how http response results are fed into the new indexing queue - introduction of a cache read policy: * never use the cache * use the cache if an entry exists * use the cache if the proxy freshness rule confirms * use only the cache and never go online - added configuration options for the crawl profiles to use the new cache policies. There is not yet an input during crawl start to set the policy but this will be added in another step. - set the default policies for the existing crawl profiles. If you want them to appear in your default profiles you must delete the crawl profiles database; otherwise the policy is 'proxy freshness rule' - enhanced some cache access methods in such a way that unnecessary retrievals are omitted (e.g. for size computation). That should reduce some IO but also a lot of CPU computation because sizes were computed after decompression of content after retrieval of the content from the disc. 
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6239 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/QuickCrawlLink_p.java | 3 +- htroot/ViewFile.java | 10 +- htroot/WatchCrawler_p.java | 9 +- source/de/anomic/crawler/CrawlProfile.java | 26 ++- source/de/anomic/crawler/CrawlQueues.java | 91 +++++------ source/de/anomic/crawler/CrawlStacker.java | 4 +- .../de/anomic/crawler/CrawlSwitchboard.java | 15 +- .../anomic/crawler/retrieval/FTPLoader.java | 34 ++-- .../anomic/crawler/retrieval/HTTPLoader.java | 23 ++- .../crawler/retrieval/LoaderDispatcher.java | 152 ++++++++++++++---- .../de/anomic/crawler/retrieval/Request.java | 6 +- .../de/anomic/crawler/retrieval/Response.java | 35 ++-- source/de/anomic/data/SitemapParser.java | 3 +- source/de/anomic/data/bookmarksDB.java | 18 ++- source/de/anomic/http/client/Cache.java | 68 ++++++-- .../anomic/http/client/MultiOutputStream.java | 17 +- .../anomic/http/server/HTTPDProxyHandler.java | 44 +++-- .../de/anomic/kelondro/blob/Compressor.java | 2 +- source/de/anomic/search/SnippetCache.java | 28 ++-- source/de/anomic/search/Switchboard.java | 143 +++++----------- source/de/anomic/ymage/ymageOSM.java | 4 +- 21 files changed, 415 insertions(+), 320 deletions(-) diff --git a/htroot/QuickCrawlLink_p.java b/htroot/QuickCrawlLink_p.java index 0ce24d830..504259e4a 100644 --- a/htroot/QuickCrawlLink_p.java +++ b/htroot/QuickCrawlLink_p.java @@ -147,7 +147,8 @@ public class QuickCrawlLink_p { remoteIndexing, xsstopw, xdstopw, - xpstopw + xpstopw, + CrawlProfile.CACHE_STRATEGY_IFFRESH ); } catch (final Exception e) { // mist diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java index 856d3a9bb..7cb38e957 100644 --- a/htroot/ViewFile.java +++ b/htroot/ViewFile.java @@ -146,15 +146,15 @@ public class ViewFile { ResponseHeader responseHeader = null; String resMime = null; // trying to load the resource body - resource = Cache.getResourceContentStream(url); + resource = Cache.getContentStream(url); resourceLength = 
Cache.getResourceContentLength(url); - responseHeader = Cache.loadResponseHeader(url); + responseHeader = Cache.getResponseHeader(url); // if the resource body was not cached we try to load it from web if (resource == null) { Response entry = null; try { - entry = sb.crawlQueues.loadResourceFromWeb(url, true, false); + entry = sb.loader.load(url, true, false); } catch (final Exception e) { prop.put("error", "4"); prop.putHTML("error_errorText", e.getMessage()); @@ -163,7 +163,7 @@ public class ViewFile { } if (entry != null) { - resource = Cache.getResourceContentStream(url); + resource = Cache.getContentStream(url); resourceLength = Cache.getResourceContentLength(url); } @@ -180,7 +180,7 @@ public class ViewFile { // try to load the metadata from cache try { - responseHeader = Cache.loadResponseHeader(url); + responseHeader = Cache.getResponseHeader(url); } catch (final Exception e) { /* ignore this */ } diff --git a/htroot/WatchCrawler_p.java b/htroot/WatchCrawler_p.java index f30fdd5b5..268bed847 100644 --- a/htroot/WatchCrawler_p.java +++ b/htroot/WatchCrawler_p.java @@ -224,7 +224,7 @@ public class WatchCrawler_p { crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages, crawlingQ, indexText, indexMedia, - storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw); + storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw, CrawlProfile.CACHE_STRATEGY_IFFRESH); final String reasonString = sb.crawlStacker.stackCrawl(new Request( sb.peers.mySeed().hash, url, @@ -350,7 +350,8 @@ public class WatchCrawler_p { storeHTCache, true, crawlOrder, - xsstopw, xdstopw, xpstopw); + xsstopw, xdstopw, xpstopw, + CrawlProfile.CACHE_STRATEGY_IFFRESH); // pause local crawl here sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); @@ -408,7 +409,9 @@ public class WatchCrawler_p { crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages, crawlingQ, indexText, indexMedia, - storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw); + storeHTCache, true, 
crawlOrder, + xsstopw, xdstopw, xpstopw, + CrawlProfile.CACHE_STRATEGY_IFFRESH); // create a new sitemap importer final SitemapImporter importerThread = new SitemapImporter(sb, sb.dbImportManager, new yacyURL(sitemapURLStr, null), pe); diff --git a/source/de/anomic/crawler/CrawlProfile.java b/source/de/anomic/crawler/CrawlProfile.java index 1fa458a5d..3d2491cd5 100644 --- a/source/de/anomic/crawler/CrawlProfile.java +++ b/source/de/anomic/crawler/CrawlProfile.java @@ -168,7 +168,8 @@ public class CrawlProfile { final boolean indexText, final boolean indexMedia, final boolean storeHTCache, final boolean storeTXCache, final boolean remoteIndexing, - final boolean xsstopw, final boolean xdstopw, final boolean xpstopw) { + final boolean xsstopw, final boolean xdstopw, final boolean xpstopw, + final int cacheStrategy) { final entry ne = new entry( name, startURL, @@ -179,7 +180,8 @@ public class CrawlProfile { indexText, indexMedia, storeHTCache, storeTXCache, remoteIndexing, - xsstopw, xdstopw, xpstopw); + xsstopw, xdstopw, xpstopw, + cacheStrategy); try { profileTable.put(ne.handle(), ne.map()); } catch (final kelondroException e) { @@ -247,6 +249,11 @@ public class CrawlProfile { } + public final static int CACHE_STRATEGY_NOCACHE = 0; + public final static int CACHE_STRATEGY_IFEXIST = 1; + public final static int CACHE_STRATEGY_IFFRESH = 2; + public final static int CACHE_STRATEGY_CACHEONLY = 3; + public static class entry { // this is a simple record structure that hold all properties of a single crawl start @@ -268,6 +275,7 @@ public class CrawlProfile { public static final String XSSTOPW = "xsstopw"; public static final String XDSTOPW = "xdstopw"; public static final String XPSTOPW = "xpstopw"; + public static final String CACHE_STRAGEGY = "cacheStrategy"; Map mem; private ConcurrentHashMap doms; @@ -284,7 +292,8 @@ public class CrawlProfile { final boolean indexText, final boolean indexMedia, final boolean storeHTCache, final boolean storeTXCache, final boolean 
remoteIndexing, - final boolean xsstopw, final boolean xdstopw, final boolean xpstopw) { + final boolean xsstopw, final boolean xdstopw, final boolean xpstopw, + final int cacheStrategy) { if (name == null || name.length() == 0) throw new NullPointerException("name must not be null"); final String handle = (startURL == null) ? Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(Long.toString(System.currentTimeMillis()))).substring(0, yacySeedDB.commonHashLength) : startURL.hash(); mem = new HashMap(); @@ -306,7 +315,7 @@ public class CrawlProfile { mem.put(XSSTOPW, Boolean.toString(xsstopw)); // exclude static stop-words mem.put(XDSTOPW, Boolean.toString(xdstopw)); // exclude dynamic stop-word mem.put(XPSTOPW, Boolean.toString(xpstopw)); // exclude parent stop-words - + mem.put(CACHE_STRAGEGY, Integer.toString(cacheStrategy)); doms = new ConcurrentHashMap(); } @@ -368,6 +377,15 @@ public class CrawlProfile { return 0; } } + public int cacheStrategy() { + final String r = mem.get(CACHE_STRAGEGY); + if (r == null) return CACHE_STRATEGY_IFFRESH; + try { + return Integer.parseInt(r); + } catch (final NumberFormatException e) { + return CACHE_STRATEGY_IFFRESH; + } + } public long recrawlIfOlder() { // returns a long (millis) that is the minimum age that // an entry must have to be re-crawled diff --git a/source/de/anomic/crawler/CrawlQueues.java b/source/de/anomic/crawler/CrawlQueues.java index 43d7e40d7..0c23af58f 100644 --- a/source/de/anomic/crawler/CrawlQueues.java +++ b/source/de/anomic/crawler/CrawlQueues.java @@ -38,7 +38,6 @@ import java.util.concurrent.ConcurrentHashMap; import de.anomic.content.RSSMessage; import de.anomic.crawler.retrieval.Request; -import de.anomic.crawler.retrieval.LoaderDispatcher; import de.anomic.crawler.retrieval.Response; import de.anomic.document.parser.xml.RSSFeed; import de.anomic.http.client.Client; @@ -59,7 +58,6 @@ public class CrawlQueues { protected Switchboard sb; protected Log log; protected Map workers; // mapping from url 
hash to Worker thread object - protected LoaderDispatcher loader; private final ArrayList remoteCrawlProviderHashes; public NoticedURL noticeURL; @@ -69,7 +67,6 @@ public class CrawlQueues { this.sb = sb; this.log = new Log("CRAWLER"); this.workers = new ConcurrentHashMap(); - this.loader = new LoaderDispatcher(sb, log); this.remoteCrawlProviderHashes = new ArrayList(); // start crawling management @@ -94,7 +91,7 @@ public class CrawlQueues { if (delegatedURL.exists(hash)) return "delegated"; if (errorURL.exists(hash)) return "errors"; for (final crawlWorker worker: workers.values()) { - if (worker.entry.url().hash().equals(hash)) return "worker"; + if (worker.request.url().hash().equals(hash)) return "worker"; } return null; } @@ -115,7 +112,7 @@ public class CrawlQueues { ee = errorURL.getEntry(urlhash); if (ee != null) return ee.url(); for (final crawlWorker w: workers.values()) { - if (w.entry.url().hash().equals(urlhash)) return w.entry.url(); + if (w.request.url().hash().equals(urlhash)) return w.request.url(); } return null; } @@ -170,15 +167,11 @@ public class CrawlQueues { synchronized (workers) { final Request[] e = new Request[workers.size()]; int i = 0; - for (final crawlWorker w: workers.values()) e[i++] = w.entry; + for (final crawlWorker w: workers.values()) e[i++] = w.request; return e; } } - public boolean isSupportedProtocol(final String protocol) { - return loader.isSupportedProtocol(protocol); - } - public int coreCrawlJobSize() { return noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE); } @@ -243,7 +236,7 @@ public class CrawlQueues { // check if the protocol is supported final yacyURL url = urlEntry.url(); final String urlProtocol = url.getProtocol(); - if (this.isSupportedProtocol(urlProtocol)) { + if (sb.loader.isSupportedProtocol(urlProtocol)) { if (this.log.isFine()) log.logFine(stats + ": URL=" + urlEntry.url() @@ -494,48 +487,20 @@ public class CrawlQueues { } } - public Response loadResourceFromWeb( - final yacyURL url, - final boolean 
forText, - final boolean global - ) throws IOException { - - final Request centry = new Request( - sb.peers.mySeed().hash, - url, - "", - "", - new Date(), - new Date(), - (forText) ? - ((global) ? - sb.crawler.defaultTextSnippetGlobalProfile.handle() : - sb.crawler.defaultTextSnippetLocalProfile.handle()) - : - ((global) ? - sb.crawler.defaultMediaSnippetGlobalProfile.handle() : - sb.crawler.defaultMediaSnippetLocalProfile.handle()), // crawl profile - 0, - 0, - 0); - - return loader.load(centry); - } - public int size() { return workers.size(); } protected final class crawlWorker extends Thread { - protected Request entry; + protected Request request; private final Integer code; private long start; public crawlWorker(final Request entry) { this.start = System.currentTimeMillis(); - this.entry = entry; - this.entry.setStatus("worker-initialized", serverProcessorJob.STATUS_INITIATED); + this.request = entry; + this.request.setStatus("worker-initialized", serverProcessorJob.STATUS_INITIATED); this.code = Integer.valueOf(entry.hashCode()); if (!workers.containsKey(code)) { workers.put(code, this); @@ -550,39 +515,57 @@ public class CrawlQueues { public void run() { try { // checking robots.txt for http(s) resources - this.entry.setStatus("worker-checkingrobots", serverProcessorJob.STATUS_STARTED); - if ((entry.url().getProtocol().equals("http") || entry.url().getProtocol().equals("https")) && sb.robots.isDisallowed(entry.url())) { - if (log.isFine()) log.logFine("Crawling of URL '" + entry.url().toString() + "' disallowed by robots.txt."); + this.request.setStatus("worker-checkingrobots", serverProcessorJob.STATUS_STARTED); + if ((request.url().getProtocol().equals("http") || request.url().getProtocol().equals("https")) && sb.robots.isDisallowed(request.url())) { + if (log.isFine()) log.logFine("Crawling of URL '" + request.url().toString() + "' disallowed by robots.txt."); final ZURL.Entry eentry = errorURL.newEntry( - this.entry, + this.request, 
sb.peers.mySeed().hash, new Date(), 1, "denied by robots.txt"); eentry.store(); errorURL.push(eentry); - this.entry.setStatus("worker-disallowed", serverProcessorJob.STATUS_FINISHED); + this.request.setStatus("worker-disallowed", serverProcessorJob.STATUS_FINISHED); } else { // starting a load from the internet - this.entry.setStatus("worker-loading", serverProcessorJob.STATUS_RUNNING); - final String result = loader.process(this.entry); + this.request.setStatus("worker-loading", serverProcessorJob.STATUS_RUNNING); + String result = null; + + // load a resource, store it to htcache and push queue entry to switchboard queue + // returns null if everything went fine, a fail reason string if a problem occurred + Response response; + try { + request.setStatus("loading", serverProcessorJob.STATUS_RUNNING); + response = sb.loader.load(request); + assert response != null; + request.setStatus("loaded", serverProcessorJob.STATUS_RUNNING); + final boolean stored = sb.toIndexer(response); + request.setStatus("enqueued-" + ((stored) ? "ok" : "fail"), serverProcessorJob.STATUS_FINISHED); + result = (stored) ? 
null : "not enqueued to indexer"; + } catch (IOException e) { + request.setStatus("error", serverProcessorJob.STATUS_FINISHED); + if (log.isFine()) log.logFine("problem loading " + request.url().toString() + ": " + e.getMessage()); + result = "load error - " + e.getMessage(); + } + if (result != null) { final ZURL.Entry eentry = errorURL.newEntry( - this.entry, + this.request, sb.peers.mySeed().hash, new Date(), 1, "cannot load: " + result); eentry.store(); errorURL.push(eentry); - this.entry.setStatus("worker-error", serverProcessorJob.STATUS_FINISHED); + this.request.setStatus("worker-error", serverProcessorJob.STATUS_FINISHED); } else { - this.entry.setStatus("worker-processed", serverProcessorJob.STATUS_FINISHED); + this.request.setStatus("worker-processed", serverProcessorJob.STATUS_FINISHED); } } } catch (final Exception e) { final ZURL.Entry eentry = errorURL.newEntry( - this.entry, + this.request, sb.peers.mySeed().hash, new Date(), 1, @@ -591,7 +574,7 @@ public class CrawlQueues { errorURL.push(eentry); e.printStackTrace(); Client.initConnectionManager(); - this.entry.setStatus("worker-exception", serverProcessorJob.STATUS_FINISHED); + this.request.setStatus("worker-exception", serverProcessorJob.STATUS_FINISHED); } finally { crawlWorker w = workers.remove(code); assert w != null; diff --git a/source/de/anomic/crawler/CrawlStacker.java b/source/de/anomic/crawler/CrawlStacker.java index 89fb638f4..3afc4fcd4 100644 --- a/source/de/anomic/crawler/CrawlStacker.java +++ b/source/de/anomic/crawler/CrawlStacker.java @@ -46,7 +46,7 @@ public final class CrawlStacker { private Log log = new Log("STACKCRAWL"); - private serverProcessor fastQueue, slowQueue; + private serverProcessor fastQueue, slowQueue; private long dnsHit, dnsMiss; private CrawlQueues nextQueue; private CrawlSwitchboard crawler; @@ -177,7 +177,7 @@ public final class CrawlStacker { // check if the protocol is supported final String urlProtocol = entry.url().getProtocol(); - if 
(!nextQueue.isSupportedProtocol(urlProtocol)) { + if (!Switchboard.getSwitchboard().loader.isSupportedProtocol(urlProtocol)) { this.log.logSevere("Unsupported protocol in URL '" + entry.url().toString() + "'. " + "Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms"); return "unsupported protocol"; diff --git a/source/de/anomic/crawler/CrawlSwitchboard.java b/source/de/anomic/crawler/CrawlSwitchboard.java index 92ac5babd..b2807ac9f 100644 --- a/source/de/anomic/crawler/CrawlSwitchboard.java +++ b/source/de/anomic/crawler/CrawlSwitchboard.java @@ -178,37 +178,38 @@ public final class CrawlSwitchboard { true /*getConfigBool(PROXY_INDEXING_LOCAL_TEXT, true)*/, true /*getConfigBool(PROXY_INDEXING_LOCAL_MEDIA, true)*/, true, true, - false /*getConfigBool(PROXY_INDEXING_REMOTE, false)*/, true, true, true); + false /*getConfigBool(PROXY_INDEXING_REMOTE, false)*/, true, true, true, + CrawlProfile.CACHE_STRATEGY_IFFRESH); } if (this.defaultRemoteProfile == null) { // generate new default entry for remote crawling defaultRemoteProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, - -1, -1, -1, true, true, true, false, true, false, true, true, false); + -1, -1, -1, true, true, true, false, true, false, true, true, false, CrawlProfile.CACHE_STRATEGY_IFFRESH); } if (this.defaultTextSnippetLocalProfile == null) { // generate new default entry for snippet fetch and optional crawling defaultTextSnippetLocalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, - this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, -1, true, false, false, false, false, false, true, true, false); + this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, -1, true, false, false, false, false, false, true, true, false, 
CrawlProfile.CACHE_STRATEGY_IFFRESH); } if (this.defaultTextSnippetGlobalProfile == null) { // generate new default entry for snippet fetch and optional crawling defaultTextSnippetGlobalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, - this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, -1, true, true, true, true, true, false, true, true, false); + this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, -1, true, true, true, true, true, false, true, true, false, CrawlProfile.CACHE_STRATEGY_CACHEONLY); } if (this.defaultMediaSnippetLocalProfile == null) { // generate new default entry for snippet fetch and optional crawling defaultMediaSnippetLocalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, - this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, false, false, false, false, true, true, false); + this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, false, false, false, false, true, true, false, CrawlProfile.CACHE_STRATEGY_IFEXIST); } if (this.defaultMediaSnippetGlobalProfile == null) { // generate new default entry for snippet fetch and optional crawling defaultMediaSnippetGlobalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, - this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, true, true, true, false, true, true, false); + this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, true, true, true, false, true, true, false, CrawlProfile.CACHE_STRATEGY_IFEXIST); } if 
(this.defaultSurrogateProfile == null) { // generate new default entry for surrogate parsing defaultSurrogateProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, - this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, -1, true, true, false, false, false, false, true, true, false); + this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, -1, true, true, false, false, false, false, true, true, false, CrawlProfile.CACHE_STRATEGY_NOCACHE); } } diff --git a/source/de/anomic/crawler/retrieval/FTPLoader.java b/source/de/anomic/crawler/retrieval/FTPLoader.java index 54092208b..85c917337 100644 --- a/source/de/anomic/crawler/retrieval/FTPLoader.java +++ b/source/de/anomic/crawler/retrieval/FTPLoader.java @@ -34,7 +34,6 @@ import java.util.Date; import de.anomic.crawler.Latency; import de.anomic.document.Parser; -import de.anomic.http.client.Cache; import de.anomic.http.metadata.HeaderFramework; import de.anomic.http.metadata.RequestHeader; import de.anomic.http.metadata.ResponseHeader; @@ -110,20 +109,22 @@ public class FTPLoader { // directory -> get list of files RequestHeader requestHeader = new RequestHeader(); if (request.referrerhash() != null) requestHeader.put(RequestHeader.REFERER, sb.getURL(request.referrerhash()).toNormalform(true, false)); - ResponseHeader responseHeader = new ResponseHeader(); - responseHeader.put(HeaderFramework.LAST_MODIFIED, DateFormatter.formatRFC1123(new Date())); - responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html"); - response = new Response( - request, - requestHeader, - responseHeader, - "OK", - sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle())); - Cache.storeMetadata(request.url(), responseHeader); byte[] dirList = generateDirlist(ftpClient, request, path); + if (dirList == null) { response = null; + } else { + ResponseHeader responseHeader = new ResponseHeader(); + 
responseHeader.put(HeaderFramework.LAST_MODIFIED, DateFormatter.formatRFC1123(new Date())); + responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html"); + response = new Response( + request, + requestHeader, + responseHeader, + "OK", + sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()), + dirList); } } else { // file -> download @@ -236,6 +237,9 @@ public class FTPLoader { // determine the file date final Date fileDate = ftpClient.entryDate(path); + // download the remote file + byte[] b = ftpClient.get(path); + // create a cache entry RequestHeader requestHeader = new RequestHeader(); if (request.referrerhash() != null) requestHeader.put(RequestHeader.REFERER, sb.getURL(request.referrerhash()).toNormalform(true, false)); @@ -247,12 +251,8 @@ public class FTPLoader { requestHeader, responseHeader, "OK", - sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle())); - Cache.storeMetadata(request.url(), responseHeader); - - // download the remote file - byte[] b = ftpClient.get(path); - response.setContent(b); + sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()), + b); } else { log.logInfo("REJECTED TOO BIG FILE with size " + size + " Bytes for URL " + request.url().toString()); sb.crawlQueues.errorURL.newEntry(request, this.sb.peers.mySeed().hash, new Date(), 1, "file size limit exceeded"); diff --git a/source/de/anomic/crawler/retrieval/HTTPLoader.java b/source/de/anomic/crawler/retrieval/HTTPLoader.java index c9e716f9d..1668c701f 100644 --- a/source/de/anomic/crawler/retrieval/HTTPLoader.java +++ b/source/de/anomic/crawler/retrieval/HTTPLoader.java @@ -32,7 +32,6 @@ import de.anomic.crawler.Latency; import de.anomic.data.Blacklist; import de.anomic.document.Parser; import de.anomic.http.client.Client; -import de.anomic.http.client.Cache; import de.anomic.http.metadata.HeaderFramework; import de.anomic.http.metadata.RequestHeader; import de.anomic.http.metadata.ResponseContainer; @@ -135,18 +134,6 @@ public final class 
HTTPLoader { if (res.getStatusCode() == 200 || res.getStatusCode() == 203) { // the transfer is ok - // create a new cache entry - response = new Response( - request, - requestHeader, - res.getResponseHeader(), - res.getStatusLine(), - sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()) - ); - Cache.storeMetadata(request.url(), res.getResponseHeader()); - - // request has been placed and result has been returned. work off response - // if the response has not the right file type then reject file supportError = Parser.supports(request.url(), res.getResponseHeader().mime()); if (supportError != null) { @@ -165,7 +152,15 @@ public final class HTTPLoader { throw new IOException("REJECTED URL " + request.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes. (GET)"); } - response.setContent(responseBody); + // create a new cache entry + response = new Response( + request, + requestHeader, + res.getResponseHeader(), + res.getStatusLine(), + sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()), + responseBody + ); return response; } else if (res.getStatusLine().startsWith("30")) { diff --git a/source/de/anomic/crawler/retrieval/LoaderDispatcher.java b/source/de/anomic/crawler/retrieval/LoaderDispatcher.java index 7290a19a6..6ba9e6627 100644 --- a/source/de/anomic/crawler/retrieval/LoaderDispatcher.java +++ b/source/de/anomic/crawler/retrieval/LoaderDispatcher.java @@ -28,14 +28,20 @@ package de.anomic.crawler.retrieval; import java.io.IOException; import java.util.Arrays; +import java.util.Date; import java.util.HashSet; import java.util.Iterator; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; +import de.anomic.crawler.CrawlProfile; +import de.anomic.http.client.Cache; +import de.anomic.http.metadata.HeaderFramework; +import de.anomic.http.metadata.RequestHeader; +import de.anomic.http.metadata.ResponseHeader; import de.anomic.search.Switchboard; import 
de.anomic.server.serverCore; -import de.anomic.server.serverProcessorJob; +import de.anomic.yacy.yacyURL; import de.anomic.yacy.logging.Log; public final class LoaderDispatcher { @@ -44,17 +50,17 @@ public final class LoaderDispatcher { private static final ConcurrentHashMap accessTime = new ConcurrentHashMap(); // to protect targets from DDoS private final Switchboard sb; - private final Log log; private final HashSet supportedProtocols; private final HTTPLoader httpLoader; private final FTPLoader ftpLoader; + private final Log log; - public LoaderDispatcher(final Switchboard sb, final Log log) { + public LoaderDispatcher(final Switchboard sb) { this.sb = sb; - this.log = log; this.supportedProtocols = new HashSet(Arrays.asList(new String[]{"http","https","ftp"})); // initiate loader objects + this.log = new Log("LOADER"); httpLoader = new HTTPLoader(sb, log); ftpLoader = new FTPLoader(sb, log); } @@ -69,17 +75,100 @@ public final class LoaderDispatcher { return (HashSet) this.supportedProtocols.clone(); } - public Response load(final Request entry) throws IOException { - // getting the protocol of the next URL - final String protocol = entry.url().getProtocol(); - final String host = entry.url().getHost(); + public Response load( + final yacyURL url, + final boolean forText, + final boolean global + ) throws IOException { + + final Request centry = new Request( + sb.peers.mySeed().hash, + url, + "", + "", + new Date(), + new Date(), + (forText) ? + ((global) ? + sb.crawler.defaultTextSnippetGlobalProfile.handle() : + sb.crawler.defaultTextSnippetLocalProfile.handle()) + : + ((global) ? 
+ sb.crawler.defaultMediaSnippetGlobalProfile.handle() : + sb.crawler.defaultMediaSnippetLocalProfile.handle()), // crawl profile + 0, + 0, + 0); + + return load(centry); + } + + public Response load(final Request request) throws IOException { + // get the protocol of the next URL + final String protocol = request.url().getProtocol(); + final String host = request.url().getHost(); // check if this loads a page from localhost, which must be prevented to protect the server // against attacks to the administration interface when localhost access is granted - if (serverCore.isLocalhost(host) && sb.getConfigBool("adminAccountForLocalhost", false)) throw new IOException("access to localhost not granted for url " + entry.url()); + if (serverCore.isLocalhost(host) && sb.getConfigBool("adminAccountForLocalhost", false)) throw new IOException("access to localhost not granted for url " + request.url()); + + // check if we have the page in the cache + + CrawlProfile.entry crawlProfile = sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()); + int cacheStrategy = CrawlProfile.CACHE_STRATEGY_NOCACHE; + if (crawlProfile != null && (cacheStrategy = crawlProfile.cacheStrategy()) != CrawlProfile.CACHE_STRATEGY_NOCACHE) { + // we have passed a first test if caching is allowed + // now see if there is a cache entry + + ResponseHeader cachedResponse = (request.url().isLocal()) ? null : Cache.getResponseHeader(request.url()); + byte[] content = (cachedResponse == null) ? 
null : Cache.getContent(request.url()); + if (cachedResponse != null && content != null) { + // yes we have the content + + // create request header values and a response object because we need that + // in case that we want to return the cached content in the next step + final RequestHeader requestHeader = new RequestHeader(); + requestHeader.put(HeaderFramework.USER_AGENT, HTTPLoader.crawlerUserAgent); + yacyURL refererURL = null; + if (request.referrerhash() != null) refererURL = sb.getURL(request.referrerhash()); + if (refererURL != null) requestHeader.put(RequestHeader.REFERER, refererURL.toNormalform(true, true)); + Response response = new Response( + request, + requestHeader, + cachedResponse, + "200", + crawlProfile, + content); + + // check which caching strategy shall be used + if (cacheStrategy == CrawlProfile.CACHE_STRATEGY_IFEXIST || cacheStrategy == CrawlProfile.CACHE_STRATEGY_CACHEONLY) { + // well, just take the cache and don't care about freshness of the content + log.logInfo("cache hit/useall for: " + request.url().toNormalform(true, false)); + return response; + } + + // now the cacheStrategy must be CACHE_STRATEGY_IFFRESH, that means we should do a proxy freshness test + assert cacheStrategy == CrawlProfile.CACHE_STRATEGY_IFFRESH : "cacheStrategy = " + cacheStrategy; + if (response.isFreshForProxy()) { + log.logInfo("cache hit/fresh for: " + request.url().toNormalform(true, false)); + return response; + } else { + log.logInfo("cache hit/stale for: " + request.url().toNormalform(true, false)); + } + } + } - // check access time - if (!entry.url().isLocal()) { + // check case where we want results from the cache exclusively, and never from the internet (offline mode) + if (cacheStrategy == CrawlProfile.CACHE_STRATEGY_CACHEONLY) { + // we had a chance to get the content from the cache .. its over. We don't have it. + return null; + } + + // now forget about the cache, nothing there. 
Try to load the content from the internet + + // check access time: this is a double-check (we checked possibly already in the balancer) + // to make shure that we don't DoS the target by mistake + if (!request.url().isLocal()) { final Long lastAccess = accessTime.get(host); long wait = 0; if (lastAccess != null) wait = Math.max(0, minDelay + lastAccess.longValue() - System.currentTimeMillis()); @@ -91,13 +180,26 @@ public final class LoaderDispatcher { try {Thread.sleep(untilTime - System.currentTimeMillis());} catch (final InterruptedException ee) {} } } + + // now it's for shure that we will access the target. Remember the access time accessTime.put(host, System.currentTimeMillis()); - // load resource - if ((protocol.equals("http") || (protocol.equals("https")))) return httpLoader.load(entry); - if (protocol.equals("ftp")) return ftpLoader.load(entry); + // load resource from the internet + Response response = null; + if ((protocol.equals("http") || (protocol.equals("https")))) response = httpLoader.load(request); + if (protocol.equals("ftp")) response = ftpLoader.load(request); + if (response != null) { + // we got something. 
Now check if we want to store that to the cache + String storeError = response.shallStoreCache(); + if (storeError == null) { + Cache.store(request.url(), response.getResponseHeader(), response.getContent()); + } else { + if (Cache.log.isFine()) Cache.log.logFine("no storage of url " + request.url() + ": " + storeError); + } + return response; + } - throw new IOException("Unsupported protocol '" + protocol + "' in url " + entry.url()); + throw new IOException("Unsupported protocol '" + protocol + "' in url " + request.url()); } public synchronized void cleanupAccessTimeTable(long timeout) { @@ -109,24 +211,4 @@ public final class LoaderDispatcher { if (System.currentTimeMillis() - e.getValue().longValue() > minDelay) i.remove(); } } - - public String process(final Request entry) { - // load a resource, store it to htcache and push queue entry to switchboard queue - // returns null if everything went fine, a fail reason string if a problem occurred - Response h; - try { - entry.setStatus("loading", serverProcessorJob.STATUS_RUNNING); - h = load(entry); - assert h != null; - entry.setStatus("loaded", serverProcessorJob.STATUS_RUNNING); - final boolean stored = sb.htEntryStoreProcess(h); - entry.setStatus("stored-" + ((stored) ? "ok" : "fail"), serverProcessorJob.STATUS_FINISHED); - return (stored) ? null : "not stored"; - } catch (IOException e) { - entry.setStatus("error", serverProcessorJob.STATUS_FINISHED); - if (log.isFine()) log.logFine("problem loading " + entry.url().toString() + ": " + e.getMessage()); - return "load error - " + e.getMessage(); - } - } - } \ No newline at end of file diff --git a/source/de/anomic/crawler/retrieval/Request.java b/source/de/anomic/crawler/retrieval/Request.java index c723bee3f..1da638b8a 100755 --- a/source/de/anomic/crawler/retrieval/Request.java +++ b/source/de/anomic/crawler/retrieval/Request.java @@ -1,4 +1,4 @@ -// CrawlEntry.java +// Request.java // (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. 
M., Germany // first published 14.03.2007 on http://yacy.net // @@ -69,7 +69,7 @@ public class Request extends serverProcessorJob { private long loaddate; // the time when the url was loaded private long serverdate; // the document date from the target server private long imsdate; // the time of a ifModifiedSince request - private String profileHandle; // the name of the prefetch profile + private String profileHandle; // the name of the fetch profile private int depth; // the prefetch depth so far, starts at 0 private int anchors; // number of anchors of the parent private int forkfactor; // sum of anchors of all ancestors @@ -80,7 +80,7 @@ public class Request extends serverProcessorJob { /** - * A HarvestRequest Entry is a object that is created to provide + * A Request Entry is a object that is created to provide * all information to load a specific resource. * * @param initiator the hash of the initiator peer diff --git a/source/de/anomic/crawler/retrieval/Response.java b/source/de/anomic/crawler/retrieval/Response.java index 37d9bb04f..747026e2a 100755 --- a/source/de/anomic/crawler/retrieval/Response.java +++ b/source/de/anomic/crawler/retrieval/Response.java @@ -145,7 +145,8 @@ public class Response { final RequestHeader requestHeader, final ResponseHeader responseHeader, final String responseStatus, - final CrawlProfile.entry profile) { + final CrawlProfile.entry profile, + final byte[] content) { this.request = request; // request and response headers may be zero in case that we process surrogates this.requestHeader = requestHeader; @@ -153,15 +154,26 @@ public class Response { this.responseStatus = responseStatus; this.profile = profile; this.status = QUEUE_STATE_FRESH; - - // to be defined later: - this.content = null; + this.content = content; + } + + public Response( + Request request, + final RequestHeader requestHeader, + final ResponseHeader responseHeader, + final String responseStatus, + final CrawlProfile.entry profile) { + this(request, 
requestHeader, responseHeader, responseStatus, profile, null); } public void updateStatus(final int newStatus) { this.status = newStatus; } + public ResponseHeader getResponseHeader() { + return this.responseHeader; + } + public int getStatus() { return this.status; } @@ -241,7 +253,7 @@ public class Response { * @return NULL if the answer is TRUE, in case of FALSE, the reason as * String is returned */ - public String shallStoreCacheForProxy() { + public String shallStoreCache() { // check profile (disabled: we will check this in the plasmaSwitchboard) // if (!this.profile.storeHTCache()) { return "storage_not_wanted"; } @@ -252,7 +264,7 @@ public class Response { // check storage size: all files will be handled in RAM before storage, so they must not exceed // a given size, which we consider as 1MB - if (this.size() > 1024L * 1024L) return "too_large_for_caching_" + this.size(); + if (this.size() > 10 * 1024L * 1024L) return "too_large_for_caching_" + this.size(); // check status code if (!validResponseStatus()) { @@ -265,10 +277,15 @@ public class Response { if (this.url().isPOST() && !this.profile.crawlingQ()) { return "dynamic_post"; } + if (this.url().isCGI()) { return "dynamic_cgi"; } + if (this.url().isLocal()) { + return "local_URL_no_cache_needed"; + } + if (requestHeader != null) { // -authorization cases in request // authorization makes pages very individual, and therefore we cannot use the @@ -338,7 +355,7 @@ public class Response { * * @return whether the file should be taken from the cache */ - public boolean shallUseCacheForProxy() { + public boolean isFreshForProxy() { // -CGI access in request // CGI access makes the page very individual, and therefore not usable @@ -488,7 +505,7 @@ public class Response { // check profile if (!profile().indexText() && !profile().indexMedia()) { - return "indexing not allowed - indexText and indexMedia not set (for proxy)"; + return "indexing not allowed - indexText and indexMedia not set (for proxy = " + 
profile.name()+ ")"; } // -CGI access in request @@ -629,7 +646,7 @@ public class Response { // check profile if (!profile().indexText() && !profile().indexMedia()) { - return "indexing not allowed - indexText and indexMedia not set (for crawler)"; + return "indexing not allowed - indexText and indexMedia not set (for crawler = " + profile.name()+ ")"; } // -CGI access in request diff --git a/source/de/anomic/data/SitemapParser.java b/source/de/anomic/data/SitemapParser.java index 2b2065365..6cc88b8ab 100644 --- a/source/de/anomic/data/SitemapParser.java +++ b/source/de/anomic/data/SitemapParser.java @@ -329,6 +329,7 @@ public class SitemapParser extends DefaultHandler { // remote Indexing disabled false, // exclude stop-words - true, true, true); + true, true, true, + CrawlProfile.CACHE_STRATEGY_IFFRESH); } } diff --git a/source/de/anomic/data/bookmarksDB.java b/source/de/anomic/data/bookmarksDB.java index ae705e2c8..3f460f0a4 100644 --- a/source/de/anomic/data/bookmarksDB.java +++ b/source/de/anomic/data/bookmarksDB.java @@ -170,9 +170,17 @@ public class bookmarksDB { Integer.parseInt(parser[5]), Integer.parseInt(parser[6]), Boolean.parseBoolean(parser[7]), Boolean.parseBoolean(parser[8]), Boolean.parseBoolean(parser[9]), Boolean.parseBoolean(parser[10]), Boolean.parseBoolean(parser[11]), - Boolean.parseBoolean(parser[12]) - ); + Boolean.parseBoolean(parser[12]), CrawlProfile.CACHE_STRATEGY_IFFRESH + ); } + if (parser.length == 14) { + folderReCrawl(Long.parseLong(parser[0]), parser[1], parser[2], Integer.parseInt(parser[3]), Long.parseLong(parser[4]), + Integer.parseInt(parser[5]), Integer.parseInt(parser[6]), Boolean.parseBoolean(parser[7]), + Boolean.parseBoolean(parser[8]), Boolean.parseBoolean(parser[9]), + Boolean.parseBoolean(parser[10]), Boolean.parseBoolean(parser[11]), + Boolean.parseBoolean(parser[12]), Integer.parseInt(parser[13]) + ); + } } } in.close(); @@ -204,9 +212,9 @@ public class bookmarksDB { return true; } - public void folderReCrawl (long 
schedule, String folder, String crawlingfilter, int newcrawlingdepth, long crawlingIfOlder, + public void folderReCrawl(long schedule, String folder, String crawlingfilter, int newcrawlingdepth, long crawlingIfOlder, int crawlingDomFilterDepth, int crawlingDomMaxPages, boolean crawlingQ, boolean indexText, boolean indexMedia, - boolean crawlOrder, boolean xsstopw, boolean storeHTCache) { + boolean crawlOrder, boolean xsstopw, boolean storeHTCache, int cacheStrategy) { Switchboard sb = Switchboard.getSwitchboard(); Iterator bit=getBookmarksIterator(folder, true); @@ -261,7 +269,7 @@ public class bookmarksDB { sb.crawler.profilesActiveCrawls.getRecrawlDate(crawlingIfOlder), crawlingDomFilterDepth, crawlingDomMaxPages, crawlingQ, indexText, indexMedia, - storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw); + storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw, cacheStrategy); sb.crawlStacker.enqueueEntry(new Request( sb.peers.mySeed().hash, crawlingStartURL, diff --git a/source/de/anomic/http/client/Cache.java b/source/de/anomic/http/client/Cache.java index ef3d8ef75..fe4cf02e6 100644 --- a/source/de/anomic/http/client/Cache.java +++ b/source/de/anomic/http/client/Cache.java @@ -63,7 +63,7 @@ public final class Cache { private static long maxCacheSize = 0l; private static File cachePath = null; private static String prefix; - private static final Log log = new Log("HTCACHE"); + public static final Log log = new Log("HTCACHE"); public static void init(final File htCachePath, String peerSalt, final long CacheSizeMax) { @@ -103,31 +103,39 @@ public final class Cache { fileDBunbuffered.setMaxSize(maxCacheSize); } + /** + * close the databases + */ public static void close() { responseHeaderDB.close(); fileDB.close(true); } - // Store to Cache - public static void storeMetadata(final yacyURL url, final ResponseHeader responseHeader) { - if (responseHeader != null) try { + public static void store(yacyURL url, final ResponseHeader responseHeader, byte[] file) { 
+ if (responseHeader != null && file != null) try { // store the response header into the header database final HashMap hm = new HashMap(); hm.putAll(responseHeader); hm.put("@@URL", url.toNormalform(true, false)); responseHeaderDB.put(url.hash(), hm); - } catch (final Exception e) { - log.logWarning("could not write ResourceInfo: " - + e.getClass() + ": " + e.getMessage()); + fileDB.put(url.hash().getBytes("UTF-8"), file); + if (log.isFine()) log.logFine("stored in cache: " + url.toNormalform(true, false)); + } catch (IOException e) { + e.printStackTrace(); } } - - public static void storeFile(yacyURL url, byte[] file) { + /** + * check if the responseHeaderDB and the fileDB has an entry for the given url + * @param url the url of the resource + * @return true if the content of the url is in the cache, false othervise + */ + public static boolean has(final yacyURL url) { try { - fileDB.put(url.hash().getBytes("UTF-8"), file); + return responseHeaderDB.has(url.hash()) && fileDB.has(url.hash().getBytes()); } catch (IOException e) { e.printStackTrace(); + return false; } } @@ -140,7 +148,7 @@ public final class Cache { * @throws UnsupportedProtocolException if the protocol is not supported and therefore the * info object couldn't be created */ - public static ResponseHeader loadResponseHeader(final yacyURL url) { + public static ResponseHeader getResponseHeader(final yacyURL url) { // loading data from database Map hdb; @@ -161,14 +169,21 @@ public final class Cache { * is available or the cached file is not readable, null * is returned. 
*/ - public static InputStream getResourceContentStream(final yacyURL url) { + public static InputStream getContentStream(final yacyURL url) { // load the url as resource from the cache - byte[] b = getResourceContent(url); + byte[] b = getContent(url); if (b == null) return null; return new ByteArrayInputStream(b); } - public static byte[] getResourceContent(final yacyURL url) { + /** + * Returns the content of a cached resource as byte[] + * @param url the requested resource + * @return the resource content as byte[]. In no data + * is available or the cached file is not readable, null + * is returned. + */ + public static byte[] getContent(final yacyURL url) { // load the url as resource from the cache try { return fileDB.get(url.hash().getBytes("UTF-8")); @@ -178,8 +193,24 @@ public final class Cache { } } + /** + * requesting the content length of a resource is discouraged since it may + * be performed by loading of the resource from the cache and then measuring the + * size after decompression of the content. This may use a lot of CPU resources + * and maybe cause also high IO. Please omit usage of this method as much as possible. 
+ * @param url + * @return the size of the cached content + */ public static long getResourceContentLength(final yacyURL url) { - // load the url as resource from the cache + // first try to get the length from the response header, + // this is less costly than loading the content from its gzipped cache + ResponseHeader responseHeader = getResponseHeader(url); + if (responseHeader != null) { + long length = responseHeader.getContentLength(); + if (length > 0) return length; + } + // load the url as resource from the cache (possibly decompress it), + // and get the length from the content array size try { return fileDB.length(url.hash().getBytes("UTF-8")); } catch (IOException e) { @@ -188,7 +219,12 @@ public final class Cache { } } - public static void deleteFromCache(yacyURL url) throws IOException { + /** + * removed response header and cached content from the database + * @param url + * @throws IOException + */ + public static void delete(yacyURL url) throws IOException { responseHeaderDB.remove(url.hash()); fileDB.remove(url.hash().getBytes("UTF-8")); } diff --git a/source/de/anomic/http/client/MultiOutputStream.java b/source/de/anomic/http/client/MultiOutputStream.java index 3b9ca89ba..7009454af 100644 --- a/source/de/anomic/http/client/MultiOutputStream.java +++ b/source/de/anomic/http/client/MultiOutputStream.java @@ -36,9 +36,24 @@ public class MultiOutputStream extends OutputStream { */ @Override public void write(int b) throws IOException { - for(OutputStream stream: streams) { + for (OutputStream stream: streams) { stream.write(b); } } + + /** + * writes the byte[] to each of the streams + * overriding this high-level method causes less overhead + * than overriding only the low-level write method: + * it causes (a large number) less 'for' loops + * + * @see java.io.OutputStream#write(int) + */ + @Override + public void write(byte[] b, int start, int len) throws IOException { + for (OutputStream stream: streams) { + stream.write(b, start, len); + } + } } 
diff --git a/source/de/anomic/http/server/HTTPDProxyHandler.java b/source/de/anomic/http/server/HTTPDProxyHandler.java index 950c2f3dc..4e30f4ef8 100644 --- a/source/de/anomic/http/server/HTTPDProxyHandler.java +++ b/source/de/anomic/http/server/HTTPDProxyHandler.java @@ -358,7 +358,7 @@ public final class HTTPDProxyHandler { // handle outgoing cookies handleOutgoingCookies(requestHeader, host, ip); prepareRequestHeader(conProp, requestHeader, hostlow); - ResponseHeader cachedResponseHeader = Cache.loadResponseHeader(url); + ResponseHeader cachedResponseHeader = Cache.getResponseHeader(url); // why are files unzipped upon arrival? why not zip all files in cache? // This follows from the following premises @@ -404,10 +404,8 @@ public final class HTTPDProxyHandler { "200 OK", sb.crawler.defaultProxyProfile ); - //Cache.storeMetadata(cachedResponseHeader, response); // TODO: check if this storeMetadata is necessary - - byte[] cacheContent = Cache.getResourceContent(url); - if (cacheContent != null && response.shallUseCacheForProxy()) { + byte[] cacheContent = Cache.getContent(url); + if (cacheContent != null && response.isFreshForProxy()) { if (theLogger.isFinest()) theLogger.logFinest(reqID + " fulfill request from cache"); fulfillRequestFromCache(conProp, url, requestHeader, cachedResponseHeader, cacheContent, countedRespond); } else { @@ -502,7 +500,7 @@ public final class HTTPDProxyHandler { if (cachedResponseHeader != null) { // delete the cache sizeBeforeDelete = Cache.getResourceContentLength(url); - Cache.deleteFromCache(url); + Cache.delete(url); conProp.setProperty(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_CODE, "TCP_REFRESH_MISS"); } @@ -518,14 +516,7 @@ public final class HTTPDProxyHandler { 0, 0, 0); - final Response response = new Response( - request, - requestHeader, - responseHeader, - res.getStatusLine(), - sb.crawler.defaultProxyProfile - ); - Cache.storeMetadata(request.url(), responseHeader); + // handle incoming cookies 
handleIncomingCookies(responseHeader, host, ip); @@ -549,8 +540,14 @@ public final class HTTPDProxyHandler { if (hasBody(res.getStatusCode())) { final OutputStream outStream = (gzippedOut != null) ? gzippedOut : ((chunkedOut != null)? chunkedOut : respond); - - final String storeError = response.shallStoreCacheForProxy(); + final Response response = new Response( + request, + requestHeader, + responseHeader, + res.getStatusLine(), + sb.crawler.defaultProxyProfile + ); + final String storeError = response.shallStoreCache(); final boolean storeHTCache = response.profile().storeHTCache(); final String supportError = Parser.supports(response.url(), response.getMimeType()); if ( @@ -582,22 +579,21 @@ public final class HTTPDProxyHandler { if (sizeBeforeDelete == -1) { // totally fresh file - //cacheEntry.status = plasmaHTCache.CACHE_FILL; // it's an insert response.setContent(cacheArray); - sb.htEntryStoreProcess(response); - conProp.setProperty(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_CODE,"TCP_MISS"); + Cache.store(response.url(), response.getResponseHeader(), cacheArray); + sb.toIndexer(response); + conProp.setProperty(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_CODE, "TCP_MISS"); } else if (cacheArray != null && sizeBeforeDelete == cacheArray.length) { // before we came here we deleted a cache entry cacheArray = null; - //cacheEntry.status = plasmaHTCache.CACHE_STALE_RELOAD_BAD; //cacheManager.push(cacheEntry); // unnecessary update - conProp.setProperty(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_CODE,"TCP_REF_FAIL_HIT"); + conProp.setProperty(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_CODE, "TCP_REF_FAIL_HIT"); } else { // before we came here we deleted a cache entry - //cacheEntry.status = plasmaHTCache.CACHE_STALE_RELOAD_GOOD; response.setContent(cacheArray); - sb.htEntryStoreProcess(response); - conProp.setProperty(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_CODE,"TCP_REFRESH_MISS"); + Cache.store(response.url(), response.getResponseHeader(), 
cacheArray); + sb.toIndexer(response); + conProp.setProperty(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_CODE, "TCP_REFRESH_MISS"); } } else { // no caching diff --git a/source/de/anomic/kelondro/blob/Compressor.java b/source/de/anomic/kelondro/blob/Compressor.java index efe9d33cf..663c61be5 100644 --- a/source/de/anomic/kelondro/blob/Compressor.java +++ b/source/de/anomic/kelondro/blob/Compressor.java @@ -150,7 +150,7 @@ public class Compressor implements BLOB { return null; } } else if (ByteArray.equals(b, plainMagic)) { - System.out.print("-"); // DEBUG + //System.out.print("-"); // DEBUG byte[] r = new byte[b.length - 2]; System.arraycopy(b, 2, r, 0, b.length - 2); return r; diff --git a/source/de/anomic/search/SnippetCache.java b/source/de/anomic/search/SnippetCache.java index e01a67fe4..1611181ea 100644 --- a/source/de/anomic/search/SnippetCache.java +++ b/source/de/anomic/search/SnippetCache.java @@ -344,8 +344,8 @@ public class SnippetCache { return new TextSnippet(url, loc, SOURCE_METADATA, null, null, faviconCache.get(url.hash())); } else { // trying to load the resource from the cache - resContent = Cache.getResourceContentStream(url); - responseHeader = Cache.loadResponseHeader(url); + resContent = Cache.getContentStream(url); + responseHeader = Cache.getResponseHeader(url); if (resContent != null && ((resContentLength = Cache.getResourceContentLength(url)) > maxDocLen) && (!fetchOnline)) { // content may be too large to be parsed here. 
To be fast, we omit calculation of snippet here return new TextSnippet(url, null, ERROR_SOURCE_LOADING, queryhashes, "resource available, but too large: " + resContentLength + " bytes"); @@ -353,12 +353,12 @@ public class SnippetCache { // if not found try to download it // download resource using the crawler and keep resource in memory if possible - final Response entry = Switchboard.getSwitchboard().crawlQueues.loadResourceFromWeb(url, true, reindexing); + final Response entry = Switchboard.getSwitchboard().loader.load(url, true, reindexing); // getting resource metadata (e.g. the http headers for http resources) if (entry != null) { - // place entry on crawl queue - sb.htEntryStoreProcess(entry); + // place entry on indexing queue + sb.toIndexer(entry); // read resource body (if it is there) final byte []resourceArray = entry.getContent(); @@ -366,7 +366,7 @@ public class SnippetCache { resContent = new ByteArrayInputStream(resourceArray); resContentLength = resourceArray.length; } else { - resContent = Cache.getResourceContentStream(url); + resContent = Cache.getContentStream(url); resContentLength = Cache.getResourceContentLength(url); } } @@ -456,8 +456,8 @@ public class SnippetCache { ResponseHeader responseHeader = null; try { // trying to load the resource from the cache - resContent = Cache.getResourceContentStream(url); - responseHeader = Cache.loadResponseHeader(url); + resContent = Cache.getContentStream(url); + responseHeader = Cache.getResponseHeader(url); if (resContent != null) { // if the content was found resContentLength = Cache.getResourceContentLength(url); @@ -465,7 +465,7 @@ public class SnippetCache { // if not found try to download it // download resource using the crawler and keep resource in memory if possible - final Response entry = Switchboard.getSwitchboard().crawlQueues.loadResourceFromWeb(url, forText, global); + final Response entry = Switchboard.getSwitchboard().loader.load(url, forText, global); // getting resource metadata 
(e.g. the http headers for http resources) if (entry != null) { @@ -476,7 +476,7 @@ public class SnippetCache { resContent = new ByteArrayInputStream(resourceArray); resContentLength = resourceArray.length; } else { - resContent = Cache.getResourceContentStream(url); + resContent = Cache.getContentStream(url); resContentLength = Cache.getResourceContentLength(url); } } @@ -844,7 +844,7 @@ public class SnippetCache { if (responseHeader == null) { // try to get the header from the htcache directory try { - responseHeader = Cache.loadResponseHeader(url); + responseHeader = Cache.getResponseHeader(url); } catch (final Exception e) { // ignore this. resource info loading failed } @@ -897,14 +897,14 @@ public class SnippetCache { long contentLength = -1; // trying to load the resource body from cache - InputStream resource = Cache.getResourceContentStream(url); + InputStream resource = Cache.getContentStream(url); if (resource != null) { contentLength = Cache.getResourceContentLength(url); } else if (fetchOnline) { // if the content is not available in cache try to download it from web // try to download the resource using a crawler - final Response entry = Switchboard.getSwitchboard().crawlQueues.loadResourceFromWeb(url, forText, reindexing); + final Response entry = Switchboard.getSwitchboard().loader.load(url, forText, reindexing); if (entry == null) return null; // not found in web // read resource body (if it is there) @@ -912,7 +912,7 @@ public class SnippetCache { // in case that the resource was not in ram, read it from disk if (resourceArray == null) { - resource = Cache.getResourceContentStream(url); + resource = Cache.getContentStream(url); contentLength = Cache.getResourceContentLength(url); } else { resource = new ByteArrayInputStream(resourceArray); diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java index 9d688990e..f071423b1 100644 --- a/source/de/anomic/search/Switchboard.java +++ 
b/source/de/anomic/search/Switchboard.java @@ -230,6 +230,7 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi public File surrogatesOutPath; public Map rankingPermissions; public Segment indexSegment; + public LoaderDispatcher loader; public CrawlSwitchboard crawler; public CrawlQueues crawlQueues; public ResultURLs crawlResults; @@ -514,6 +515,7 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi // start a loader log.logConfig("Starting Crawl Loader"); + this.loader = new LoaderDispatcher(this); this.crawlQueues = new CrawlQueues(this, queuesRoot); this.crawlQueues.noticeURL.setMinimumDelta( this.getConfigLong("minimumLocalDelta", this.crawlQueues.noticeURL.getMinimumLocalDelta()), @@ -1092,90 +1094,6 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi return this.crawler.cleanProfiles(); } - public boolean htEntryStoreProcess(final Response entry) { - - if (entry == null) return false; - - /* ========================================================================= - * PARSER SUPPORT - * - * Testing if the content type is supported by the available parsers - * ========================================================================= */ - final String supportError = Parser.supports(entry.url(), entry.getMimeType()); - if (log.isFinest()) log.logFinest("STORE "+ entry.url() +" content of type "+ entry.getMimeType() + " is supported: " + supportError); - - /* ========================================================================= - * INDEX CONTROL HEADER - * - * With the X-YACY-Index-Control header set to "no-index" a client could disallow - * yacy to index the response returned as answer to a request - * ========================================================================= */ - boolean doIndexing = true; - if (entry.requestProhibitsIndexing()) { - doIndexing = false; - if (this.log.isFine()) this.log.logFine("Crawling of " + entry.url() + " prohibited by 
request."); - } - - /* ========================================================================= - * LOCAL IP ADDRESS CHECK - * - * check if ip is local ip address // TODO: remove this procotol specific code here - * ========================================================================= */ - final String urlRejectReason = crawlStacker.urlInAcceptedDomain(entry.url()); - if (urlRejectReason != null) { - if (this.log.isFine()) this.log.logFine("Rejected URL '" + entry.url() + "': " + urlRejectReason); - doIndexing = false; - } - - /* ========================================================================= - * STORING DATA - * - * Now we store the response header and response content if - * a) the user has configured to use the htcache or - * b) the content should be indexed - * ========================================================================= */ - if (((entry.profile() != null) && (entry.profile().storeHTCache())) || (doIndexing && supportError == null)) { - // store response header - /* - if (entry.writeResourceInfo()) { - this.log.logInfo("WROTE HEADER for " + entry.cacheFile()); - } - */ - - // work off unwritten files - if (entry.getContent() != null) { - final String error = (entry.initiator() == null) ? 
entry.shallStoreCacheForProxy() : null; - if (error == null) { - Cache.storeFile(entry.url(), entry.getContent()); - if (this.log.isFine()) this.log.logFine("WROTE FILE (" + entry.getContent().length + " bytes) for " + entry.url()); - } else { - if (this.log.isWarning()) this.log.logWarning("WRITE OF FILE " + entry.url() + " FORBIDDEN: " + error); - } - //} else { - //this.log.logFine("EXISTING FILE (" + entry.cacheFile.length() + " bytes) for " + entry.cacheFile); - } - } - - /* ========================================================================= - * INDEXING - * ========================================================================= */ - if (doIndexing && supportError == null) { - - // enqueue for further crawling - enQueue(entry); - } else { - if (!entry.profile().storeHTCache()) { - try { - Cache.deleteFromCache(entry.url()); - } catch (IOException e) { - e.printStackTrace(); - } - } - } - - return true; - } - public void close() { log.logConfig("SWITCHBOARD SHUTDOWN STEP 1: sending termination signal to managed threads:"); serverProfiling.stopSystemProfiling(); @@ -1215,44 +1133,65 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi log.logConfig("SWITCHBOARD SHUTDOWN TERMINATED"); } - public void enQueue(final Response queueEntry) { - assert queueEntry != null; + public boolean toIndexer(final Response response) { + assert response != null; // get next queue entry and start a queue processing - if (queueEntry == null) { + if (response == null) { if (this.log.isFine()) log.logFine("deQueue: queue entry is null"); - return; + return false; } - if (queueEntry.profile() == null) { + if (response.profile() == null) { if (this.log.isFine()) log.logFine("deQueue: profile is null"); - return; + return false; } - // check if the document should be indexed + // check if the document should be indexed based on proxy/crawler rules String noIndexReason = "unspecified indexing error"; - if (queueEntry.processCase(peers.mySeed().hash) 
== SwitchboardConstants.PROCESSCASE_4_PROXY_LOAD) { + if (response.processCase(peers.mySeed().hash) == SwitchboardConstants.PROCESSCASE_4_PROXY_LOAD) { // proxy-load - noIndexReason = queueEntry.shallIndexCacheForProxy(); + noIndexReason = response.shallIndexCacheForProxy(); } else { // normal crawling - noIndexReason = queueEntry.shallIndexCacheForCrawler(); + noIndexReason = response.shallIndexCacheForCrawler(); + } + + // check if the parser supports the mime type + if (noIndexReason == null) { + noIndexReason = Parser.supports(response.url(), response.getMimeType()); } + + // check X-YACY-Index-Control + // With the X-YACY-Index-Control header set to "no-index" a client could disallow + // yacy to index the response returned as answer to a request + if (noIndexReason == null && response.requestProhibitsIndexing()) { + noIndexReason = "X-YACY-Index-Control header prohibits indexing"; + } + + // check accepted domain / localhost accesses + if (noIndexReason == null) { + noIndexReason = crawlStacker.urlInAcceptedDomain(response.url()); + } + + // in the noIndexReason is set, indexing is not allowed if (noIndexReason != null) { - // this document should not be indexed. log cause and close queue - final yacyURL referrerURL = queueEntry.referrerURL(); - if (log.isFine()) log.logFine("deQueue: not indexed any word in URL " + queueEntry.url() + "; cause: " + noIndexReason); - addURLtoErrorDB(queueEntry.url(), (referrerURL == null) ? "" : referrerURL.hash(), queueEntry.initiator(), queueEntry.name(), noIndexReason); + // log cause and close queue + final yacyURL referrerURL = response.referrerURL(); + if (log.isFine()) log.logFine("deQueue: not indexed any word in URL " + response.url() + "; cause: " + noIndexReason); + addURLtoErrorDB(response.url(), (referrerURL == null) ? 
"" : referrerURL.hash(), response.initiator(), response.name(), noIndexReason); // finish this entry - return; + return false; } // put document into the concurrent processing queue - if (log.isFinest()) log.logFinest("deQueue: passing entry to indexing queue"); + if (log.isFinest()) log.logFinest("deQueue: passing to indexing queue: " + response.url().toNormalform(true, false)); try { - this.indexingDocumentProcessor.enQueue(new indexingQueueEntry(queueEntry, null, null)); + this.indexingDocumentProcessor.enQueue(new indexingQueueEntry(response, null, null)); + return true; } catch (InterruptedException e) { e.printStackTrace(); + return false; } } @@ -1649,7 +1588,7 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi try { // parse the document - document = Parser.parseSource(entry.url(), entry.getMimeType(), entry.getCharacterEncoding(), Cache.getResourceContent(entry.url())); + document = Parser.parseSource(entry.url(), entry.getMimeType(), entry.getCharacterEncoding(), Cache.getContent(entry.url())); assert(document != null) : "Unexpected error. Parser returned null."; } catch (final ParserException e) { this.log.logWarning("Unable to parse the resource '" + entry.url() + "'. 
" + e.getMessage()); diff --git a/source/de/anomic/ymage/ymageOSM.java b/source/de/anomic/ymage/ymageOSM.java index f5acda301..a3dd2bd4a 100644 --- a/source/de/anomic/ymage/ymageOSM.java +++ b/source/de/anomic/ymage/ymageOSM.java @@ -77,12 +77,12 @@ public class ymageOSM { return null; } System.out.println("*** DEBUG: fetching OSM tile: " + tileURL.toNormalform(true, true)); - InputStream tileStream = Cache.getResourceContentStream(tileURL); + InputStream tileStream = Cache.getContentStream(tileURL); if (tileStream == null) { // download resource using the crawler and keep resource in memory if possible Response entry = null; try { - entry = Switchboard.getSwitchboard().crawlQueues.loadResourceFromWeb(tileURL, false, false); + entry = Switchboard.getSwitchboard().loader.load(tileURL, false, false); } catch (IOException e) { Log.logWarning("yamyOSM", "cannot load: " + e.getMessage()); return null;