From 7bcfa033c93da424d5f535bc1a62bdb1a0990602 Mon Sep 17 00:00:00 2001
From: orbiter
Date: Mon, 21 Jun 2010 14:54:54 +0000
Subject: [PATCH] more abstraction of the htcache when using the LoaderDispatcher

The cache shall no longer be accessed directly; all loading attempts shall go
through the LoaderDispatcher. To control the usage of the cache, an enum
instance of CrawlProfile.CacheStrategy shall be used. Some direct loading
methods that did not take a cache strategy have been removed.

This also affects the verify option of the yacysearch servlet. After this
commit, 'verify=false' does not necessarily mean that no snippets are
generated. Instead, all snippets that can be generated from the cache alone
are presented. Such a search hit still counts as unverified, because the
snippet was generated from the cache. If a cache-based snippet cannot be
generated, verify=false causes the link not to be rejected.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6936 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 htroot/Bookmarks.java                         |  3 +-
 htroot/IndexControlRWIs_p.java                |  5 +-
 htroot/IndexControlURLs_p.java                |  3 +-
 htroot/RSSLoader_p.java                       |  3 +-
 htroot/ViewFile.java                          | 84 ++++---------------
 htroot/ViewImage.java                         |  3 +-
 htroot/yacy/search.java                       |  5 +-
 htroot/yacysearch.java                        | 27 +++---
 source/de/anomic/crawler/CrawlProfile.java    | 16 ++++
 source/de/anomic/crawler/CrawlQueues.java     |  3 +-
 source/de/anomic/search/MediaSnippet.java     |  5 +-
 source/de/anomic/search/QueryParams.java      | 11 +--
 source/de/anomic/search/ResultFetcher.java    | 29 +++----
 source/de/anomic/search/Segment.java          | 25 ++++--
 source/de/anomic/search/TextSnippet.java      | 20 +++--
 source/de/anomic/server/serverCore.java       |  2 +-
 .../net/yacy/document/parser/pdfParser.java   | 10 ++-
 .../net/yacy/repository/LoaderDispatcher.java | 76 +++++------
 18 files changed, 142 insertions(+), 188 deletions(-)

diff --git a/htroot/Bookmarks.java b/htroot/Bookmarks.java
index b99969e37..f59b54b34 100644
--- a/htroot/Bookmarks.java
+++ b/htroot/Bookmarks.java
@@ -43,6 +43,7 @@ import net.yacy.kelondro.logging.Log;
 import net.yacy.kelondro.util.DateFormatter;
 import net.yacy.repository.LoaderDispatcher;
 
+import de.anomic.crawler.CrawlProfile;
 import de.anomic.data.BookmarkHelper;
 import de.anomic.data.bookmarksDB;
 import de.anomic.data.listManager;
@@ -188,7 +189,7 @@ public class Bookmarks {
             Document document = null;
             if (urlentry != null) {
                 final URIMetadataRow.Components metadata = urlentry.metadata();
-                document = LoaderDispatcher.retrieveDocument(metadata.url(), true, 5000, true, false, Long.MAX_VALUE);
+                document = LoaderDispatcher.retrieveDocument(metadata.url(), CrawlProfile.CacheStrategy.IFEXIST, 5000, true, false, Long.MAX_VALUE);
                 prop.put("mode_edit", "0"); // create mode
                 prop.put("mode_url", metadata.url().toNormalform(false, true));
                 prop.putHTML("mode_title", metadata.dc_title());
diff --git a/htroot/IndexControlRWIs_p.java b/htroot/IndexControlRWIs_p.java
index 8ad3555d3..b5763daa2 100644
--- a/htroot/IndexControlRWIs_p.java
+++ b/htroot/IndexControlRWIs_p.java
@@ -52,6 +52,7 @@ import net.yacy.kelondro.util.DateFormatter;
 import net.yacy.kelondro.util.FileUtils;
 import net.yacy.repository.Blacklist;
 
+import de.anomic.crawler.CrawlProfile;
 import de.anomic.data.listManager;
 import de.anomic.http.server.RequestHeader;
 import de.anomic.search.QueryParams;
@@ -162,7 +163,7 @@ public class IndexControlRWIs_p {
                 index = null;
             }
             if (delurlref) {
-                segment.removeAllUrlReferences(urlb, 
sb.loader, true); + segment.removeAllUrlReferences(urlb, sb.loader, CrawlProfile.CacheStrategy.IFEXIST); } // delete the word first because that is much faster than the deletion of the urls from the url database segment.termIndex().delete(keyhash); @@ -179,7 +180,7 @@ public class IndexControlRWIs_p { // delete selected URLs if (post.containsKey("keyhashdelete")) try { if (delurlref) { - segment.removeAllUrlReferences(urlb, sb.loader, true); + segment.removeAllUrlReferences(urlb, sb.loader, CrawlProfile.CacheStrategy.IFEXIST); } if (delurl || delurlref) { for (byte[] b: urlb) sb.urlRemove(segment, b); diff --git a/htroot/IndexControlURLs_p.java b/htroot/IndexControlURLs_p.java index f8d8f3a89..1bed53899 100644 --- a/htroot/IndexControlURLs_p.java +++ b/htroot/IndexControlURLs_p.java @@ -38,6 +38,7 @@ import net.yacy.kelondro.order.Base64Order; import net.yacy.kelondro.order.RotateIterator; import net.yacy.kelondro.util.DateFormatter; +import de.anomic.crawler.CrawlProfile; import de.anomic.http.server.RequestHeader; import de.anomic.search.MetadataRepository; import de.anomic.search.Segment; @@ -140,7 +141,7 @@ public class IndexControlURLs_p { prop.put("result", " "); if (post.containsKey("urlhashdeleteall")) { - i = segment.removeAllUrlReferences(urlhash.getBytes(), sb.loader, true); + i = segment.removeAllUrlReferences(urlhash.getBytes(), sb.loader, CrawlProfile.CacheStrategy.IFEXIST); prop.put("result", "Deleted URL and " + i + " references from " + i + " word indexes."); prop.put("lurlexport", 0); prop.put("reload", 0); diff --git a/htroot/RSSLoader_p.java b/htroot/RSSLoader_p.java index 159300f69..295715492 100644 --- a/htroot/RSSLoader_p.java +++ b/htroot/RSSLoader_p.java @@ -33,6 +33,7 @@ import net.yacy.document.ParserException; import net.yacy.document.parser.rssParser; import net.yacy.kelondro.data.meta.DigestURI; +import de.anomic.crawler.CrawlProfile; import de.anomic.crawler.retrieval.Response; import de.anomic.http.server.RequestHeader; import de.anomic.search.Switchboard; @@ -63,7 +64,7 @@ public class RSSLoader_p { // if the resource body was not cached we try to load it from web Response entry = null; try { - entry = sb.loader.load(url, true, false, Long.MAX_VALUE); + entry = sb.loader.load(url, true, false, CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE); } catch (final Exception e) { return prop; } diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java index cb3bd045e..a2fba71d1 100644 --- a/htroot/ViewFile.java +++ b/htroot/ViewFile.java @@ -43,9 +43,9 @@ import net.yacy.document.parser.html.CharacterCoding; import net.yacy.document.parser.html.ImageEntry; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataRow; -import net.yacy.kelondro.logging.Log; import net.yacy.repository.LoaderDispatcher; +import de.anomic.crawler.CrawlProfile; import de.anomic.crawler.retrieval.Response; import de.anomic.http.client.Cache; import de.anomic.http.server.RequestHeader; @@ -150,7 +150,7 @@ public class ViewFile { } // define an url by post parameter - url = new DigestURI(urlString, null); + url = new DigestURI(MultiProtocolURI.unescape(urlString), null); urlHash = new String(url.hash()); pre = post.get("pre", "false").equals("true"); } catch (final MalformedURLException e) {} @@ -168,87 +168,35 @@ public class ViewFile { // loading the resource content as byte array prop.put("error_incache", Cache.has(url) ? 
1 : 0);
 
-        ResponseHeader responseHeader = null;
         String resMime = null;
+        ResponseHeader responseHeader = Cache.getResponseHeader(url);
         byte[] resource = Cache.getContent(url);
-        if (resource == null && authorized) {
+        if ((resource == null || responseHeader == null) && authorized) {
             // load resource from net
             Response response = null;
             try {
-                response = sb.loader.load(url, true, false, Long.MAX_VALUE);
+                response = sb.loader.load(url, true, false, CrawlProfile.CacheStrategy.IFEXIST, Long.MAX_VALUE);
             } catch (IOException e) {
-                Log.logException(e);
-            }
-            if (response != null) {
-                resource = response.getContent();
-                responseHeader = response.getResponseHeader();
-            }
-        }
-
-        if (responseHeader == null) responseHeader = Cache.getResponseHeader(url);
-
-        // if the resource body was not cached we try to load it from web
-        if (resource == null) {
-            Response entry = null;
-            try {
-                entry = sb.loader.load(url, true, false, Long.MAX_VALUE);
-            } catch (final Exception e) {
                 prop.put("error", "4");
                 prop.putHTML("error_errorText", e.getMessage());
                 prop.put("viewMode", VIEW_MODE_NO_TEXT);
                 return prop;
             }
-
-            if (entry != null) {
-                resource = entry.getContent();
-            }
-
-            if (resource == null) {
-                prop.put("error", "4");
-                prop.put("error_errorText", "No resource available");
-                prop.put("viewMode", VIEW_MODE_NO_TEXT);
-                return prop;
+            if (response != null) {
+                resource = response.getContent();
+                responseHeader = response.getResponseHeader();
             }
         }
-
-        // try to load resource metadata
-        if (responseHeader == null) {
-
-            // try to load the metadata from cache
-            try {
-                responseHeader = Cache.getResponseHeader(url);
-            } catch (final Exception e) {
-                /* ignore this */
-            }
-
-            // if the metadata was not cached try to load it from web
-            if (responseHeader == null) {
-                final String protocol = url.getProtocol();
-                if (!((protocol.equals("http") || protocol.equals("https")))) {
-                    prop.put("error", "6");
-                    prop.put("viewMode", VIEW_MODE_NO_TEXT);
-                    return prop;
-                }
-
-                try {
-                    Response response = sb.loader.load(url, true, false, Long.MAX_VALUE);
-                    responseHeader = response.getResponseHeader();
-                    resource = response.getContent();
-                } catch (IOException e) {
-                    Log.logException(e);
-                }
-                if (responseHeader == null) {
-                    prop.put("error", "4");
-                    prop.put("error_errorText", "Unable to load resource metadata.");
-                    prop.put("viewMode", VIEW_MODE_NO_TEXT);
-                    return prop;
-                }
-                resMime = responseHeader.mime();
-            }
-        } else {
-            resMime = responseHeader.mime();
+
+        // if resource not available just fail
+        if (resource == null || responseHeader == null) {
+            prop.put("error", "4");
+            prop.put("error_errorText", "No resource available");
+            prop.put("viewMode", VIEW_MODE_NO_TEXT);
+            return prop;
         }
+        resMime = responseHeader.mime();
 
         final String[] wordArray = wordArray(post.get("words", null));
diff --git a/htroot/ViewImage.java b/htroot/ViewImage.java
index dbf42e51d..92e62c2b3 100644
--- a/htroot/ViewImage.java
+++ b/htroot/ViewImage.java
@@ -35,6 +35,7 @@ import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.kelondro.util.FileUtils;
 
+import de.anomic.crawler.CrawlProfile;
 import de.anomic.http.server.HeaderFramework;
 import de.anomic.http.server.RequestHeader;
 import de.anomic.search.Switchboard;
@@ -90,7 +91,7 @@ public class ViewImage {
         if (scaled == null) {
             byte[] resourceb = null;
             if (url != null) try {
-                resourceb = sb.loader.getResource(url, true, timeout, false, true);
+                resourceb = sb.loader.getResource(url, CrawlProfile.CacheStrategy.IFEXIST, timeout, false, 
true); } catch (IOException e) { Log.logWarning("ViewImage", "cannot load: " + e.getMessage()); } diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index 0167692ba..54667a061 100644 --- a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -47,6 +47,7 @@ import net.yacy.kelondro.util.EventTracker; import net.yacy.kelondro.util.SortStack; import net.yacy.kelondro.util.ISO639; +import de.anomic.crawler.CrawlProfile; import de.anomic.http.server.HeaderFramework; import de.anomic.http.server.RequestHeader; import de.anomic.net.natLib; @@ -206,7 +207,7 @@ public final class search { ContentDomain.contentdomParser(contentdom), language, "", // no navigation - false, + CrawlProfile.CacheStrategy.CACHEONLY, count, 0, filter, @@ -259,7 +260,7 @@ public final class search { ContentDomain.contentdomParser(contentdom), language, "", // no navigation - false, + CrawlProfile.CacheStrategy.CACHEONLY, count, 0, filter, diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index 003f0f486..a6b87c30c 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -51,6 +51,7 @@ import net.yacy.kelondro.util.SetTools; import net.yacy.kelondro.util.ISO639; import net.yacy.repository.LoaderDispatcher; +import de.anomic.crawler.CrawlProfile; import de.anomic.data.DidYouMean; import de.anomic.data.LibraryProvider; import de.anomic.http.server.HeaderFramework; @@ -67,7 +68,6 @@ import de.anomic.search.SwitchboardConstants; import de.anomic.server.serverCore; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; -import de.anomic.yacy.yacyNewsDB; import de.anomic.yacy.yacyNewsPool; import de.anomic.yacy.graphics.ProfilingGraph; @@ -97,7 +97,8 @@ public class yacysearch { // get query String originalquerystring = (post == null) ? "" : post.get("query", post.get("search", "")).trim(); String querystring = originalquerystring.replace('+', ' '); - boolean fetchSnippets = (post != null && post.get("verify", "false").equals("true")); + CrawlProfile.CacheStrategy snippetFetchStrategy = (post != null && post.get("verify", "false").equals("true")) ? CrawlProfile.CacheStrategy.IFFRESH : CrawlProfile.CacheStrategy.parse(post.get("verify", "cacheonly")); + if (snippetFetchStrategy == null) snippetFetchStrategy = CrawlProfile.CacheStrategy.CACHEONLY; final serverObjects prop = new serverObjects(); // get segment @@ -164,7 +165,7 @@ public class yacysearch { // collect search attributes boolean newsearch = post.hasValue("query") && post.hasValue("former") && !post.get("query","").equalsIgnoreCase(post.get("former","")); //new search term - int itemsPerPage = Math.min((authenticated) ? (fetchSnippets ? 100 : 1000) : (fetchSnippets ? 10 : 100), post.getInt("maximumRecords", post.getInt("count", 10))); // SRU syntax with old property as alternative + int itemsPerPage = Math.min((authenticated) ? (snippetFetchStrategy.isAllowedToFetchOnline() ? 100 : 1000) : (snippetFetchStrategy.isAllowedToFetchOnline() ? 10 : 100), post.getInt("maximumRecords", post.getInt("count", 10))); // SRU syntax with old property as alternative int offset = (newsearch) ? 
0 : post.getInt("startRecord", post.getInt("offset", 0)); boolean global = post.get("resource", "local").equals("global"); @@ -228,12 +229,12 @@ public class yacysearch { boolean block = false; if (Domains.matchesList(client, sb.networkBlacklist)) { global = false; - fetchSnippets = false; + snippetFetchStrategy = CrawlProfile.CacheStrategy.CACHEONLY; block = true; Log.logWarning("LOCAL_SEARCH", "ACCECC CONTROL: BLACKLISTED CLIENT FROM " + client + " gets no permission to search"); } else if (Domains.matchesList(client, sb.networkWhitelist)) { Log.logInfo("LOCAL_SEARCH", "ACCECC CONTROL: WHITELISTED CLIENT FROM " + client + " gets no search restrictions"); - } else if (global || fetchSnippets) { + } else if (global || snippetFetchStrategy.isAllowedToFetchOnline()) { // in case that we do a global search or we want to fetch snippets, we check for DoS cases synchronized (trackerHandles) { int accInOneSecond = trackerHandles.tailSet(Long.valueOf(System.currentTimeMillis() - 1000)).size(); @@ -242,21 +243,21 @@ public class yacysearch { int accInTenMinutes = trackerHandles.tailSet(Long.valueOf(System.currentTimeMillis() - 600000)).size(); if (accInTenMinutes > 600) { global = false; - fetchSnippets = false; + snippetFetchStrategy = CrawlProfile.CacheStrategy.CACHEONLY; block = true; Log.logWarning("LOCAL_SEARCH", "ACCECC CONTROL: CLIENT FROM " + client + ": " + accInTenMinutes + " searches in ten minutes, fully blocked (no results generated)"); } else if (accInOneMinute > 200) { global = false; - fetchSnippets = false; + snippetFetchStrategy = CrawlProfile.CacheStrategy.CACHEONLY; block = true; Log.logWarning("LOCAL_SEARCH", "ACCECC CONTROL: CLIENT FROM " + client + ": " + accInOneMinute + " searches in one minute, fully blocked (no results generated)"); } else if (accInThreeSeconds > 1) { global = false; - fetchSnippets = false; + snippetFetchStrategy = CrawlProfile.CacheStrategy.CACHEONLY; Log.logWarning("LOCAL_SEARCH", "ACCECC CONTROL: CLIENT FROM " + client + ": " + accInThreeSeconds + " searches in three seconds, blocked global search and snippets"); } else if (accInOneSecond > 2) { global = false; - fetchSnippets = false; + snippetFetchStrategy = CrawlProfile.CacheStrategy.CACHEONLY; Log.logWarning("LOCAL_SEARCH", "ACCECC CONTROL: CLIENT FROM " + client + ": " + accInOneSecond + " searches in one second, blocked global search and snippets"); } } @@ -428,7 +429,7 @@ public class yacysearch { if (urlentry != null) { final URIMetadataRow.Components metadata = urlentry.metadata(); Document document; - document = LoaderDispatcher.retrieveDocument(metadata.url(), true, 5000, true, false, Long.MAX_VALUE); + document = LoaderDispatcher.retrieveDocument(metadata.url(), CrawlProfile.CacheStrategy.IFEXIST, 5000, true, false, Long.MAX_VALUE); if (document != null) { // create a news message final HashMap map = new HashMap(); @@ -460,7 +461,7 @@ public class yacysearch { contentdom, language, navigation, - fetchSnippets, + snippetFetchStrategy, itemsPerPage, offset, urlmask, @@ -538,7 +539,7 @@ public class yacysearch { "&maximumRecords="+ theQuery.displayResults() + "&startRecord=" + (0 * theQuery.displayResults()) + "&resource=" + ((theQuery.isLocal()) ? "local" : "global") + - "&verify=" + ((theQuery.onlineSnippetFetch) ? "true" : "false") + + "&verify=" + (theQuery.snippetCacheStrategy.mustBeOffline() ? 
"false" : "true") + "&nav=" + theQuery.navigators + "&urlmaskfilter=" + originalUrlMask.toString() + "&prefermaskfilter=" + theQuery.prefer.toString() + @@ -684,7 +685,7 @@ public class yacysearch { prop.putHTML("prefermaskfilter", prefermask); prop.put("indexof", (indexof) ? "on" : "off"); prop.put("constraint", (constraint == null) ? "" : constraint.exportB64()); - prop.put("verify", (fetchSnippets) ? "true" : "false"); + prop.put("verify", snippetFetchStrategy.toName()); prop.put("contentdom", (post == null ? "text" : post.get("contentdom", "text"))); prop.put("searchdomswitches", sb.getConfigBool("search.text", true) || sb.getConfigBool("search.audio", true) || sb.getConfigBool("search.video", true) || sb.getConfigBool("search.image", true) || sb.getConfigBool("search.app", true) ? 1 : 0); prop.put("searchdomswitches_searchtext", sb.getConfigBool("search.text", true) ? 1 : 0); diff --git a/source/de/anomic/crawler/CrawlProfile.java b/source/de/anomic/crawler/CrawlProfile.java index 99cb89562..6d3b84174 100644 --- a/source/de/anomic/crawler/CrawlProfile.java +++ b/source/de/anomic/crawler/CrawlProfile.java @@ -259,6 +259,22 @@ public class CrawlProfile { for (CacheStrategy strategy: CacheStrategy.values()) if (strategy.code == code) return strategy; return NOCACHE; } + public static CacheStrategy parse(String name) { + if (name.equals("nocache")) return NOCACHE; + if (name.equals("iffresh")) return IFFRESH; + if (name.equals("ifexist")) return IFEXIST; + if (name.equals("cacheonly")) return CACHEONLY; + return null; + } + public String toName() { + return this.name().toLowerCase(); + } + public boolean isAllowedToFetchOnline() { + return this.code < 3; + } + public boolean mustBeOffline() { + return this.code == 3; + } } public static class entry { diff --git a/source/de/anomic/crawler/CrawlQueues.java b/source/de/anomic/crawler/CrawlQueues.java index 827db3285..172454036 100644 --- a/source/de/anomic/crawler/CrawlQueues.java +++ b/source/de/anomic/crawler/CrawlQueues.java @@ -563,7 +563,8 @@ public class CrawlQueues { try { request.setStatus("loading", WorkflowJob.STATUS_RUNNING); final long maxFileSize = sb.getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE); - Response response = sb.loader.load(request, maxFileSize); + CrawlProfile.entry e = sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()); + Response response = sb.loader.load(request, e == null ? 
CrawlProfile.CacheStrategy.IFEXIST : e.cacheStrategy(), maxFileSize); if (response == null) { request.setStatus("error", WorkflowJob.STATUS_FINISHED); if (log.isFine()) log.logFine("problem loading " + request.url().toString() + ": no content (possibly caused by cache policy)"); diff --git a/source/de/anomic/search/MediaSnippet.java b/source/de/anomic/search/MediaSnippet.java index ada9138cc..0cbd5ddcd 100644 --- a/source/de/anomic/search/MediaSnippet.java +++ b/source/de/anomic/search/MediaSnippet.java @@ -30,6 +30,7 @@ import java.util.Iterator; import java.util.Map; import java.util.TreeSet; +import de.anomic.crawler.CrawlProfile; import de.anomic.data.MimeTable; import net.yacy.cora.document.MultiProtocolURI; @@ -111,13 +112,13 @@ public class MediaSnippet implements Comparable, Comparator retrieveMediaSnippets(final DigestURI url, final HandleSet queryhashes, final ContentDomain mediatype, final boolean fetchOnline, final int timeout, final boolean reindexing) { + public static ArrayList retrieveMediaSnippets(final DigestURI url, final HandleSet queryhashes, final ContentDomain mediatype, final CrawlProfile.CacheStrategy cacheStrategy, final int timeout, final boolean reindexing) { if (queryhashes.isEmpty()) { Log.logFine("snippet fetch", "no query hashes given for url " + url); return new ArrayList(); } - final Document document = LoaderDispatcher.retrieveDocument(url, fetchOnline, timeout, false, reindexing, Long.MAX_VALUE); + final Document document = LoaderDispatcher.retrieveDocument(url, cacheStrategy, timeout, false, reindexing, Long.MAX_VALUE); final ArrayList a = new ArrayList(); if (document != null) { if ((mediatype == ContentDomain.ALL) || (mediatype == ContentDomain.AUDIO)) a.addAll(computeMediaSnippets(document, queryhashes, ContentDomain.AUDIO)); diff --git a/source/de/anomic/search/QueryParams.java b/source/de/anomic/search/QueryParams.java index 1c1826ff3..4ab21a73e 100644 --- a/source/de/anomic/search/QueryParams.java +++ b/source/de/anomic/search/QueryParams.java @@ -44,6 +44,7 @@ import net.yacy.kelondro.order.Bitfield; import net.yacy.kelondro.order.NaturalOrder; import net.yacy.kelondro.util.SetTools; +import de.anomic.crawler.CrawlProfile; import de.anomic.yacy.yacySeed; public final class QueryParams { @@ -79,7 +80,7 @@ public final class QueryParams { public final int maxDistance; public final Bitfield constraint; public final boolean allofconstraint; - public final boolean onlineSnippetFetch; + public final CrawlProfile.CacheStrategy snippetCacheStrategy; public final RankingProfile ranking; private final Segment indexSegment; public final String host; // this is the client host that starts the query, not a site operator @@ -130,7 +131,7 @@ public final class QueryParams { this.domMaxTargets = 0; this.constraint = constraint; this.allofconstraint = false; - this.onlineSnippetFetch = false; + this.snippetCacheStrategy = CrawlProfile.CacheStrategy.CACHEONLY; this.host = null; this.sitehash = null; this.authorhash = null; @@ -149,7 +150,7 @@ public final class QueryParams { final int maxDistance, final String prefer, final ContentDomain contentdom, final String language, final String navigators, - final boolean onlineSnippetFetch, + final CrawlProfile.CacheStrategy snippetCacheStrategy, final int itemsPerPage, final int offset, final String urlMask, final int domType, final int domMaxTargets, final Bitfield constraint, final boolean allofconstraint, @@ -184,7 +185,7 @@ public final class QueryParams { this.allofconstraint = allofconstraint; this.sitehash = site; 
assert site == null || site.length() == 6; this.authorhash = authorhash; assert authorhash == null || authorhash.length() > 0; - this.onlineSnippetFetch = onlineSnippetFetch; + this.snippetCacheStrategy = snippetCacheStrategy; this.host = host; this.remotepeer = null; this.handle = Long.valueOf(System.currentTimeMillis()); @@ -375,7 +376,7 @@ public final class QueryParams { "&maximumRecords="+ theQuery.displayResults() + "&startRecord=" + (page * theQuery.displayResults()) + "&resource=" + ((theQuery.isLocal()) ? "local" : "global") + - "&verify=" + ((theQuery.onlineSnippetFetch) ? "true" : "false") + + "&verify=" + (theQuery.snippetCacheStrategy.mustBeOffline() ? "false" : "true") + "&nav=" + nav + "&urlmaskfilter=" + originalUrlMask + "&prefermaskfilter=" + theQuery.prefer + diff --git a/source/de/anomic/search/ResultFetcher.java b/source/de/anomic/search/ResultFetcher.java index 228693432..622bc9aeb 100644 --- a/source/de/anomic/search/ResultFetcher.java +++ b/source/de/anomic/search/ResultFetcher.java @@ -42,6 +42,7 @@ import net.yacy.kelondro.util.SortStack; import net.yacy.kelondro.util.SortStore; import net.yacy.repository.LoaderDispatcher; +import de.anomic.crawler.CrawlProfile; import de.anomic.search.MediaSnippet; import de.anomic.yacy.yacySeedDB; import de.anomic.yacy.graphics.ProfilingGraph; @@ -105,9 +106,9 @@ public class ResultFetcher { public void deployWorker(int deployCount, int neededResults) { if (anyWorkerAlive()) return; - this.workerThreads = new Worker[(query.onlineSnippetFetch) ? deployCount : 1]; + this.workerThreads = new Worker[(query.snippetCacheStrategy.isAllowedToFetchOnline()) ? deployCount : 1]; for (int i = 0; i < workerThreads.length; i++) { - this.workerThreads[i] = new Worker(i, 10000, (query.onlineSnippetFetch) ? 
2 : 0, neededResults); + this.workerThreads[i] = new Worker(i, 10000, query.snippetCacheStrategy, neededResults); this.workerThreads[i].start(); } } @@ -135,12 +136,12 @@ public class ResultFetcher { private final long timeout; // the date until this thread should try to work private long lastLifeSign; // when the last time the run()-loop was executed private final int id; - private final int snippetMode; + private final CrawlProfile.CacheStrategy cacheStrategy; private final int neededResults; - public Worker(final int id, final long maxlifetime, int snippetMode, int neededResults) { + public Worker(final int id, final long maxlifetime, CrawlProfile.CacheStrategy cacheStrategy, int neededResults) { this.id = id; - this.snippetMode = snippetMode; + this.cacheStrategy = cacheStrategy; this.lastLifeSign = System.currentTimeMillis(); this.timeout = System.currentTimeMillis() + Math.max(1000, maxlifetime); this.neededResults = neededResults; @@ -166,7 +167,7 @@ public class ResultFetcher { if (page == null) break; if (failedURLs.has(page.hash())) continue; - final ResultEntry resultEntry = fetchSnippet(page, snippetMode); // does not fetch snippets if snippetMode == 0 + final ResultEntry resultEntry = fetchSnippet(page, cacheStrategy); // does not fetch snippets if snippetMode == 0 if (resultEntry == null) continue; // the entry had some problems, cannot be used if (result.exists(resultEntry)) continue; @@ -195,7 +196,7 @@ public class ResultFetcher { } } - protected ResultEntry fetchSnippet(final URIMetadataRow page, final int snippetMode) { + protected ResultEntry fetchSnippet(final URIMetadataRow page, CrawlProfile.CacheStrategy cacheStrategy) { // Snippet Fetching can has 3 modes: // 0 - do not fetch snippets // 1 - fetch snippets offline only @@ -209,7 +210,7 @@ public class ResultFetcher { if (metadata == null) return null; final long dbRetrievalTime = System.currentTimeMillis() - startTime; - if (snippetMode == 0) { + if (cacheStrategy == null) { return new ResultEntry(page, query.getSegment(), peers, null, null, dbRetrievalTime, 0); // result without snippet } @@ -221,10 +222,10 @@ public class ResultFetcher { this.loader, metadata, snippetFetchWordHashes, - (snippetMode == 2), + cacheStrategy, ((query.constraint != null) && (query.constraint.get(Condenser.flag_cat_indexof))), 180, - (snippetMode == 2) ? Integer.MAX_VALUE : 30000, + Integer.MAX_VALUE, query.isGlobal()); final long snippetComputationTime = System.currentTimeMillis() - startTime; Log.logInfo("SEARCH", "text snippet load time for " + metadata.url() + ": " + snippetComputationTime + ", " + ((snippet.getErrorCode() < 11) ? 
"snippet found" : ("no snippet found (" + snippet.getError() + ")"))); @@ -232,26 +233,26 @@ public class ResultFetcher { if (snippet.getErrorCode() < 11) { // we loaded the file and found the snippet return new ResultEntry(page, query.getSegment(), peers, snippet, null, dbRetrievalTime, snippetComputationTime); // result with snippet attached - } else if (snippetMode == 1) { + } else if (cacheStrategy.mustBeOffline()) { // we did not demand online loading, therefore a failure does not mean that the missing snippet causes a rejection of this result // this may happen during a remote search, because snippet loading is omitted to retrieve results faster return new ResultEntry(page, query.getSegment(), peers, null, null, dbRetrievalTime, snippetComputationTime); // result without snippet } else { // problems with snippet fetch - registerFailure(page.hash(), "no text snippet for URL " + metadata.url()); + registerFailure(page.hash(), "no text snippet for URL " + metadata.url() + "; errorCode = " + snippet.getErrorCode()); return null; } } else { // attach media information startTime = System.currentTimeMillis(); - final ArrayList mediaSnippets = MediaSnippet.retrieveMediaSnippets(metadata.url(), snippetFetchWordHashes, query.contentdom, (snippetMode == 2), 6000, query.isGlobal()); + final ArrayList mediaSnippets = MediaSnippet.retrieveMediaSnippets(metadata.url(), snippetFetchWordHashes, query.contentdom, cacheStrategy, 6000, query.isGlobal()); final long snippetComputationTime = System.currentTimeMillis() - startTime; Log.logInfo("SEARCH", "media snippet load time for " + metadata.url() + ": " + snippetComputationTime); if (mediaSnippets != null && !mediaSnippets.isEmpty()) { // found media snippets, return entry return new ResultEntry(page, query.getSegment(), peers, null, mediaSnippets, dbRetrievalTime, snippetComputationTime); - } else if (snippetMode == 1) { + } else if (cacheStrategy.mustBeOffline()) { return new ResultEntry(page, query.getSegment(), peers, null, null, dbRetrievalTime, snippetComputationTime); } else { // problems with snippet fetch diff --git a/source/de/anomic/search/Segment.java b/source/de/anomic/search/Segment.java index 340395716..ac3e78145 100644 --- a/source/de/anomic/search/Segment.java +++ b/source/de/anomic/search/Segment.java @@ -62,6 +62,7 @@ import net.yacy.kelondro.util.ISO639; import net.yacy.repository.Blacklist; import net.yacy.repository.LoaderDispatcher; +import de.anomic.crawler.CrawlProfile; import de.anomic.crawler.retrieval.Response; public class Segment { @@ -360,18 +361,24 @@ public class Segment { // method for index deletion - public int removeAllUrlReferences(final DigestURI url, LoaderDispatcher loader, final boolean fetchOnline) { - return removeAllUrlReferences(url.hash(), loader, fetchOnline); + public int removeAllUrlReferences(final DigestURI url, LoaderDispatcher loader, final CrawlProfile.CacheStrategy cacheStrategy) { + return removeAllUrlReferences(url.hash(), loader, cacheStrategy); } - public void removeAllUrlReferences(final HandleSet urls, LoaderDispatcher loader, final boolean fetchOnline) { - for (byte[] urlhash: urls) removeAllUrlReferences(urlhash, loader, fetchOnline); + public void removeAllUrlReferences(final HandleSet urls, LoaderDispatcher loader, final CrawlProfile.CacheStrategy cacheStrategy) { + for (byte[] urlhash: urls) removeAllUrlReferences(urlhash, loader, cacheStrategy); } - public int removeAllUrlReferences(final byte[] urlhash, LoaderDispatcher loader, final boolean fetchOnline) { - // find all the words in a 
specific resource and remove the url reference from every word index - // finally, delete the url entry - + /** + * find all the words in a specific resource and remove the url reference from every word index + * finally, delete the url entry + * @param urlhash the hash of the url that shall be removed + * @param loader + * @param cacheStrategy + * @return number of removed words + */ + public int removeAllUrlReferences(final byte[] urlhash, LoaderDispatcher loader, final CrawlProfile.CacheStrategy cacheStrategy) { + if (urlhash == null) return 0; // determine the url string final URIMetadataRow entry = urlMetadata().load(urlhash, null, 0); @@ -384,7 +391,7 @@ public class Segment { // get the resource content byte[] resourceb = null; try { - resourceb = loader.getResource(metadata.url(), fetchOnline, 10000, true, false); + resourceb = loader.getResource(metadata.url(), cacheStrategy, 10000, true, false); } catch (IOException e) { Log.logWarning("removeAllUrlReferences", "cannot load: " + e.getMessage()); } diff --git a/source/de/anomic/search/TextSnippet.java b/source/de/anomic/search/TextSnippet.java index f1f70ac15..091ba4b52 100644 --- a/source/de/anomic/search/TextSnippet.java +++ b/source/de/anomic/search/TextSnippet.java @@ -48,6 +48,7 @@ import net.yacy.kelondro.order.Base64Order; import net.yacy.kelondro.util.ByteArray; import net.yacy.repository.LoaderDispatcher; +import de.anomic.crawler.CrawlProfile; import de.anomic.crawler.retrieval.Response; import de.anomic.http.client.Cache; import de.anomic.http.server.ResponseHeader; @@ -308,7 +309,7 @@ public class TextSnippet implements Comparable, Comparator, Comparator, Comparator) this.supportedProtocols.clone(); } - public Response load( - final DigestURI url, - final boolean forText, - final boolean global, - final long maxFileSize) throws IOException { - return load(request(url, forText, global), maxFileSize); - } - /** * load a resource from the web, from ftp, from smb or a file * @param url - * @param forText - * @param global + * @param forText shows that this was a for-text crawling request + * @param global shows that this was a global crawling request * @param cacheStratgy strategy according to CACHE_STRATEGY_NOCACHE,CACHE_STRATEGY_IFFRESH,CACHE_STRATEGY_IFEXIST,CACHE_STRATEGY_CACHEONLY * @return the loaded entity in a Response object * @throws IOException @@ -169,13 +161,6 @@ public final class LoaderDispatcher { 0); } - public Response load(final Request request, long maxFileSize) throws IOException { - CrawlProfile.entry crawlProfile = sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()); - CrawlProfile.CacheStrategy cacheStrategy = CrawlProfile.CacheStrategy.IFEXIST; - if (crawlProfile != null) cacheStrategy = crawlProfile.cacheStrategy(); - return load(request, cacheStrategy, maxFileSize); - } - public Response load(final Request request, CrawlProfile.CacheStrategy cacheStrategy, long maxFileSize) throws IOException { // get the protocol of the next URL final String protocol = request.url().getProtocol(); @@ -295,15 +280,10 @@ public final class LoaderDispatcher { * @return the content as {@link byte[]} * @throws IOException */ - public byte[] getResource(final DigestURI url, final boolean fetchOnline, final int socketTimeout, final boolean forText, final boolean reindexing) throws IOException { - byte[] resource = Cache.getContent(url); - if (resource != null) return resource; - - if (!fetchOnline) return null; - + public byte[] getResource(final DigestURI url, CrawlProfile.CacheStrategy cacheStrategy, 
final int socketTimeout, final boolean forText, final boolean reindexing) throws IOException { // try to download the resource using the loader final long maxFileSize = sb.getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE); - final Response entry = load(url, forText, reindexing, maxFileSize); + final Response entry = load(url, forText, reindexing, cacheStrategy, maxFileSize); if (entry == null) return null; // not found in web // read resource body (if it is there) @@ -322,45 +302,27 @@ public final class LoaderDispatcher { * @param global the domain of the search. If global == true then the content is re-indexed * @return the parsed document as {@link Document} */ - public static Document retrieveDocument(final DigestURI url, final boolean fetchOnline, final int timeout, final boolean forText, final boolean global, long maxFileSize) { + public static Document retrieveDocument(final DigestURI url, final CrawlProfile.CacheStrategy cacheStrategy, final int timeout, final boolean forText, final boolean global, long maxFileSize) { // load resource byte[] resContent = null; ResponseHeader responseHeader = null; try { - // trying to load the resource from the cache - resContent = Cache.getContent(url); - responseHeader = Cache.getResponseHeader(url); - if (resContent != null) { - // if the content was found - } else if (fetchOnline) { - // if not found try to download it - - // download resource using the crawler and keep resource in memory if possible - final Response entry = Switchboard.getSwitchboard().loader.load(url, forText, global, maxFileSize); - - // getting resource metadata (e.g. the http headers for http resources) - if (entry != null) { + final Response entry = Switchboard.getSwitchboard().loader.load(url, forText, global, cacheStrategy, maxFileSize); + if (entry == null) { + Log.logFine("snippet fetch", "no Response for url " + url); + return null; + } - // read resource body (if it is there) - final byte[] resourceArray = entry.getContent(); - if (resourceArray != null) { - resContent = resourceArray; - } else { - resContent = Cache.getContent(url); - } - - // read a fresh header - responseHeader = entry.getResponseHeader(); - } - - // if it is still not available, report an error - if (resContent == null) { - Log.logFine("snippet fetch", "plasmaHTCache.Entry cache is NULL for url " + url); - return null; - } - } else { - Log.logFine("snippet fetch", "no resource available for url " + url); + // read resource body (if it is there) + resContent = entry.getContent(); + + // read a fresh header + responseHeader = entry.getResponseHeader(); + + // if it is still not available, report an error + if (resContent == null || responseHeader == null) { + Log.logFine("snippet fetch", "no Content available for url " + url); return null; } } catch (final Exception e) {
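
Note: the following standalone sketch only illustrates the CacheStrategy semantics
described in the commit message above; it is not part of the patch. The enum
condenses what this patch adds to CrawlProfile.CacheStrategy (codes 0-3, parse(),
toName(), isAllowedToFetchOnline(), mustBeOffline()). The chooseStrategy() helper
and the main() driver are hypothetical and merely mirror how yacysearch.java maps
the verify request parameter to a snippet fetch strategy.

    // Standalone sketch of the CacheStrategy semantics introduced by this patch.
    // The enum mirrors CrawlProfile.CacheStrategy; chooseStrategy() is a hypothetical
    // condensation of the verify-parameter handling in yacysearch.java.
    public class CacheStrategyDemo {

        enum CacheStrategy {
            NOCACHE(0),    // never use the cache, load from the net
            IFFRESH(1),    // use a cache entry only if it is not stale
            IFEXIST(2),    // use a cache entry if one exists, otherwise load online
            CACHEONLY(3);  // never load online, use only cache entries

            public final int code;
            CacheStrategy(final int code) { this.code = code; }

            public static CacheStrategy parse(final String name) {
                if ("nocache".equals(name))   return NOCACHE;
                if ("iffresh".equals(name))   return IFFRESH;
                if ("ifexist".equals(name))   return IFEXIST;
                if ("cacheonly".equals(name)) return CACHEONLY;
                return null;
            }

            public String toName() { return name().toLowerCase(); }
            public boolean isAllowedToFetchOnline() { return code < 3; }
            public boolean mustBeOffline() { return code == 3; }
        }

        /** Hypothetical condensation of the verify handling in yacysearch.java:
         *  verify=true maps to IFFRESH (online snippet fetch allowed); any other
         *  value is parsed as a strategy name and falls back to CACHEONLY, which
         *  is why verify=false now yields cache-only, unverified snippets. */
        static CacheStrategy chooseStrategy(final String verify) {
            final CacheStrategy s = "true".equals(verify)
                    ? CacheStrategy.IFFRESH
                    : CacheStrategy.parse(verify == null ? "cacheonly" : verify);
            return s == null ? CacheStrategy.CACHEONLY : s;
        }

        public static void main(final String[] args) {
            for (final String verify : new String[] {"true", "false", "ifexist", "cacheonly"}) {
                final CacheStrategy s = chooseStrategy(verify);
                System.out.println("verify=" + verify
                        + " -> " + s.toName()
                        + ", fetch online: " + s.isAllowedToFetchOnline()
                        + ", cache only: " + s.mustBeOffline());
            }
        }
    }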