more abstraction of the htcache when using the LoaderDispatcher:

Cache accesses shall no longer be made directly against the cache; all loading attempts shall go through the LoaderDispatcher.
To control how the cache is used, an enum instance of CrawlProfile.CacheStrategy shall be passed along.
Some direct loading methods that did not take a cache strategy have been removed. This also affects the verify option
of the yacysearch servlet. After this commit, a 'verify=false' does not necessarily mean that no snippets
are generated. Instead, all snippets that can be produced from the cache alone are presented. The search hit still counts as unverified, because the snippet was generated from the cache. If a cache-based snippet cannot be produced, verify=false causes the link not to be rejected.
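
For illustration only (this sketch is not part of the committed code): a minimal outline of the new flow in a servlet, assuming a non-null serverObjects 'post' and the URIMetadataRow.Components 'metadata' that the changed servlets already have in scope:

    // map the verify parameter to a cache strategy:
    // "true" -> IFFRESH, other known names are parsed, unknown values fall back to CACHEONLY
    CrawlProfile.CacheStrategy strategy = post.get("verify", "false").equals("true")
            ? CrawlProfile.CacheStrategy.IFFRESH
            : CrawlProfile.CacheStrategy.parse(post.get("verify", "cacheonly"));
    if (strategy == null) strategy = CrawlProfile.CacheStrategy.CACHEONLY;

    // every load now goes through the LoaderDispatcher with an explicit strategy;
    // with CACHEONLY the loader never goes online, so a missing cache entry only
    // means "no snippet", it does not reject the search hit
    Document document = LoaderDispatcher.retrieveDocument(
            metadata.url(), strategy, 5000, true, false, Long.MAX_VALUE);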

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6936 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 15 years ago
parent fd9f0714a3
commit 7bcfa033c9

@@ -43,6 +43,7 @@ import net.yacy.kelondro.logging.Log;
 import net.yacy.kelondro.util.DateFormatter;
 import net.yacy.repository.LoaderDispatcher;
+import de.anomic.crawler.CrawlProfile;
 import de.anomic.data.BookmarkHelper;
 import de.anomic.data.bookmarksDB;
 import de.anomic.data.listManager;
@@ -188,7 +189,7 @@ public class Bookmarks {
 Document document = null;
 if (urlentry != null) {
 final URIMetadataRow.Components metadata = urlentry.metadata();
-document = LoaderDispatcher.retrieveDocument(metadata.url(), true, 5000, true, false, Long.MAX_VALUE);
+document = LoaderDispatcher.retrieveDocument(metadata.url(), CrawlProfile.CacheStrategy.IFEXIST, 5000, true, false, Long.MAX_VALUE);
 prop.put("mode_edit", "0"); // create mode
 prop.put("mode_url", metadata.url().toNormalform(false, true));
 prop.putHTML("mode_title", metadata.dc_title());

@@ -52,6 +52,7 @@ import net.yacy.kelondro.util.DateFormatter;
 import net.yacy.kelondro.util.FileUtils;
 import net.yacy.repository.Blacklist;
+import de.anomic.crawler.CrawlProfile;
 import de.anomic.data.listManager;
 import de.anomic.http.server.RequestHeader;
 import de.anomic.search.QueryParams;
@@ -162,7 +163,7 @@ public class IndexControlRWIs_p {
 index = null;
 }
 if (delurlref) {
-segment.removeAllUrlReferences(urlb, sb.loader, true);
+segment.removeAllUrlReferences(urlb, sb.loader, CrawlProfile.CacheStrategy.IFEXIST);
 }
 // delete the word first because that is much faster than the deletion of the urls from the url database
 segment.termIndex().delete(keyhash);
@@ -179,7 +180,7 @@ public class IndexControlRWIs_p {
 // delete selected URLs
 if (post.containsKey("keyhashdelete")) try {
 if (delurlref) {
-segment.removeAllUrlReferences(urlb, sb.loader, true);
+segment.removeAllUrlReferences(urlb, sb.loader, CrawlProfile.CacheStrategy.IFEXIST);
 }
 if (delurl || delurlref) {
 for (byte[] b: urlb) sb.urlRemove(segment, b);

@@ -38,6 +38,7 @@ import net.yacy.kelondro.order.Base64Order;
 import net.yacy.kelondro.order.RotateIterator;
 import net.yacy.kelondro.util.DateFormatter;
+import de.anomic.crawler.CrawlProfile;
 import de.anomic.http.server.RequestHeader;
 import de.anomic.search.MetadataRepository;
 import de.anomic.search.Segment;
@@ -140,7 +141,7 @@ public class IndexControlURLs_p {
 prop.put("result", " ");
 if (post.containsKey("urlhashdeleteall")) {
-i = segment.removeAllUrlReferences(urlhash.getBytes(), sb.loader, true);
+i = segment.removeAllUrlReferences(urlhash.getBytes(), sb.loader, CrawlProfile.CacheStrategy.IFEXIST);
 prop.put("result", "Deleted URL and " + i + " references from " + i + " word indexes.");
 prop.put("lurlexport", 0);
 prop.put("reload", 0);

@@ -33,6 +33,7 @@ import net.yacy.document.ParserException;
 import net.yacy.document.parser.rssParser;
 import net.yacy.kelondro.data.meta.DigestURI;
+import de.anomic.crawler.CrawlProfile;
 import de.anomic.crawler.retrieval.Response;
 import de.anomic.http.server.RequestHeader;
 import de.anomic.search.Switchboard;
@@ -63,7 +64,7 @@ public class RSSLoader_p {
 // if the resource body was not cached we try to load it from web
 Response entry = null;
 try {
-entry = sb.loader.load(url, true, false, Long.MAX_VALUE);
+entry = sb.loader.load(url, true, false, CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
 } catch (final Exception e) {
 return prop;
 }

@@ -43,9 +43,9 @@ import net.yacy.document.parser.html.CharacterCoding;
 import net.yacy.document.parser.html.ImageEntry;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.data.meta.URIMetadataRow;
-import net.yacy.kelondro.logging.Log;
 import net.yacy.repository.LoaderDispatcher;
+import de.anomic.crawler.CrawlProfile;
 import de.anomic.crawler.retrieval.Response;
 import de.anomic.http.client.Cache;
 import de.anomic.http.server.RequestHeader;
@@ -150,7 +150,7 @@ public class ViewFile {
 }
 // define an url by post parameter
-url = new DigestURI(urlString, null);
+url = new DigestURI(MultiProtocolURI.unescape(urlString), null);
 urlHash = new String(url.hash());
 pre = post.get("pre", "false").equals("true");
 } catch (final MalformedURLException e) {}
@@ -168,87 +168,35 @@ public class ViewFile {
 // loading the resource content as byte array
 prop.put("error_incache", Cache.has(url) ? 1 : 0);
-ResponseHeader responseHeader = null;
 String resMime = null;
+ResponseHeader responseHeader = Cache.getResponseHeader(url);
 byte[] resource = Cache.getContent(url);
-if (resource == null && authorized) {
+if ((resource == null || responseHeader == null) && authorized) {
 // load resource from net
 Response response = null;
 try {
-response = sb.loader.load(url, true, false, Long.MAX_VALUE);
-} catch (IOException e) {
-Log.logException(e);
-}
-if (response != null) {
-resource = response.getContent();
-responseHeader = response.getResponseHeader();
-}
-}
-if (responseHeader == null) responseHeader = Cache.getResponseHeader(url);
-// if the resource body was not cached we try to load it from web
-if (resource == null) {
-Response entry = null;
-try {
-entry = sb.loader.load(url, true, false, Long.MAX_VALUE);
-} catch (final Exception e) {
-prop.put("error", "4");
-prop.putHTML("error_errorText", e.getMessage());
-prop.put("viewMode", VIEW_MODE_NO_TEXT);
-return prop;
-}
-if (entry != null) {
-resource = entry.getContent();
-}
-if (resource == null) {
-prop.put("error", "4");
-prop.put("error_errorText", "No resource available");
-prop.put("viewMode", VIEW_MODE_NO_TEXT);
-return prop;
-}
-}
-// try to load resource metadata
-if (responseHeader == null) {
-// try to load the metadata from cache
-try {
-responseHeader = Cache.getResponseHeader(url);
-} catch (final Exception e) {
-/* ignore this */
-}
-// if the metadata was not cached try to load it from web
-if (responseHeader == null) {
-final String protocol = url.getProtocol();
-if (!((protocol.equals("http") || protocol.equals("https")))) {
-prop.put("error", "6");
-prop.put("viewMode", VIEW_MODE_NO_TEXT);
-return prop;
-}
-try {
-Response response = sb.loader.load(url, true, false, Long.MAX_VALUE);
-responseHeader = response.getResponseHeader();
-resource = response.getContent();
-} catch (IOException e) {
-Log.logException(e);
-}
-if (responseHeader == null) {
-prop.put("error", "4");
-prop.put("error_errorText", "Unable to load resource metadata.");
-prop.put("viewMode", VIEW_MODE_NO_TEXT);
-return prop;
-}
-resMime = responseHeader.mime();
-}
-} else {
-resMime = responseHeader.mime();
-}
+response = sb.loader.load(url, true, false, CrawlProfile.CacheStrategy.IFEXIST, Long.MAX_VALUE);
+} catch (IOException e) {
+prop.put("error", "4");
+prop.putHTML("error_errorText", e.getMessage());
+prop.put("viewMode", VIEW_MODE_NO_TEXT);
+return prop;
+}
+if (response != null) {
+resource = response.getContent();
+responseHeader = response.getResponseHeader();
+}
+}
+// if resource not available just fail
+if (resource == null || responseHeader == null) {
+prop.put("error", "4");
+prop.put("error_errorText", "No resource available");
+prop.put("viewMode", VIEW_MODE_NO_TEXT);
+return prop;
+}
+resMime = responseHeader.mime();
 final String[] wordArray = wordArray(post.get("words", null));

@@ -35,6 +35,7 @@ import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.kelondro.util.FileUtils;
+import de.anomic.crawler.CrawlProfile;
 import de.anomic.http.server.HeaderFramework;
 import de.anomic.http.server.RequestHeader;
 import de.anomic.search.Switchboard;
@@ -90,7 +91,7 @@ public class ViewImage {
 if (scaled == null) {
 byte[] resourceb = null;
 if (url != null) try {
-resourceb = sb.loader.getResource(url, true, timeout, false, true);
+resourceb = sb.loader.getResource(url, CrawlProfile.CacheStrategy.IFEXIST, timeout, false, true);
 } catch (IOException e) {
 Log.logWarning("ViewImage", "cannot load: " + e.getMessage());
 }

@@ -47,6 +47,7 @@ import net.yacy.kelondro.util.EventTracker;
 import net.yacy.kelondro.util.SortStack;
 import net.yacy.kelondro.util.ISO639;
+import de.anomic.crawler.CrawlProfile;
 import de.anomic.http.server.HeaderFramework;
 import de.anomic.http.server.RequestHeader;
 import de.anomic.net.natLib;
@@ -206,7 +207,7 @@ public final class search {
 ContentDomain.contentdomParser(contentdom),
 language,
 "", // no navigation
-false,
+CrawlProfile.CacheStrategy.CACHEONLY,
 count,
 0,
 filter,
@@ -259,7 +260,7 @@ public final class search {
 ContentDomain.contentdomParser(contentdom),
 language,
 "", // no navigation
-false,
+CrawlProfile.CacheStrategy.CACHEONLY,
 count,
 0,
 filter,

@@ -51,6 +51,7 @@ import net.yacy.kelondro.util.SetTools;
 import net.yacy.kelondro.util.ISO639;
 import net.yacy.repository.LoaderDispatcher;
+import de.anomic.crawler.CrawlProfile;
 import de.anomic.data.DidYouMean;
 import de.anomic.data.LibraryProvider;
 import de.anomic.http.server.HeaderFramework;
@@ -67,7 +68,6 @@ import de.anomic.search.SwitchboardConstants;
 import de.anomic.server.serverCore;
 import de.anomic.server.serverObjects;
 import de.anomic.server.serverSwitch;
-import de.anomic.yacy.yacyNewsDB;
 import de.anomic.yacy.yacyNewsPool;
 import de.anomic.yacy.graphics.ProfilingGraph;
@@ -97,7 +97,8 @@ public class yacysearch {
 // get query
 String originalquerystring = (post == null) ? "" : post.get("query", post.get("search", "")).trim();
 String querystring = originalquerystring.replace('+', ' ');
-boolean fetchSnippets = (post != null && post.get("verify", "false").equals("true"));
+CrawlProfile.CacheStrategy snippetFetchStrategy = (post != null && post.get("verify", "false").equals("true")) ? CrawlProfile.CacheStrategy.IFFRESH : CrawlProfile.CacheStrategy.parse(post.get("verify", "cacheonly"));
+if (snippetFetchStrategy == null) snippetFetchStrategy = CrawlProfile.CacheStrategy.CACHEONLY;
 final serverObjects prop = new serverObjects();
 // get segment
@@ -164,7 +165,7 @@ public class yacysearch {
 // collect search attributes
 boolean newsearch = post.hasValue("query") && post.hasValue("former") && !post.get("query","").equalsIgnoreCase(post.get("former","")); //new search term
-int itemsPerPage = Math.min((authenticated) ? (fetchSnippets ? 100 : 1000) : (fetchSnippets ? 10 : 100), post.getInt("maximumRecords", post.getInt("count", 10))); // SRU syntax with old property as alternative
+int itemsPerPage = Math.min((authenticated) ? (snippetFetchStrategy.isAllowedToFetchOnline() ? 100 : 1000) : (snippetFetchStrategy.isAllowedToFetchOnline() ? 10 : 100), post.getInt("maximumRecords", post.getInt("count", 10))); // SRU syntax with old property as alternative
 int offset = (newsearch) ? 0 : post.getInt("startRecord", post.getInt("offset", 0));
 boolean global = post.get("resource", "local").equals("global");
@@ -228,12 +229,12 @@ public class yacysearch {
 boolean block = false;
 if (Domains.matchesList(client, sb.networkBlacklist)) {
 global = false;
-fetchSnippets = false;
+snippetFetchStrategy = CrawlProfile.CacheStrategy.CACHEONLY;
 block = true;
 Log.logWarning("LOCAL_SEARCH", "ACCECC CONTROL: BLACKLISTED CLIENT FROM " + client + " gets no permission to search");
 } else if (Domains.matchesList(client, sb.networkWhitelist)) {
 Log.logInfo("LOCAL_SEARCH", "ACCECC CONTROL: WHITELISTED CLIENT FROM " + client + " gets no search restrictions");
-} else if (global || fetchSnippets) {
+} else if (global || snippetFetchStrategy.isAllowedToFetchOnline()) {
 // in case that we do a global search or we want to fetch snippets, we check for DoS cases
 synchronized (trackerHandles) {
 int accInOneSecond = trackerHandles.tailSet(Long.valueOf(System.currentTimeMillis() - 1000)).size();
@@ -242,21 +243,21 @@ public class yacysearch {
 int accInTenMinutes = trackerHandles.tailSet(Long.valueOf(System.currentTimeMillis() - 600000)).size();
 if (accInTenMinutes > 600) {
 global = false;
-fetchSnippets = false;
+snippetFetchStrategy = CrawlProfile.CacheStrategy.CACHEONLY;
 block = true;
 Log.logWarning("LOCAL_SEARCH", "ACCECC CONTROL: CLIENT FROM " + client + ": " + accInTenMinutes + " searches in ten minutes, fully blocked (no results generated)");
 } else if (accInOneMinute > 200) {
 global = false;
-fetchSnippets = false;
+snippetFetchStrategy = CrawlProfile.CacheStrategy.CACHEONLY;
 block = true;
 Log.logWarning("LOCAL_SEARCH", "ACCECC CONTROL: CLIENT FROM " + client + ": " + accInOneMinute + " searches in one minute, fully blocked (no results generated)");
 } else if (accInThreeSeconds > 1) {
 global = false;
-fetchSnippets = false;
+snippetFetchStrategy = CrawlProfile.CacheStrategy.CACHEONLY;
 Log.logWarning("LOCAL_SEARCH", "ACCECC CONTROL: CLIENT FROM " + client + ": " + accInThreeSeconds + " searches in three seconds, blocked global search and snippets");
 } else if (accInOneSecond > 2) {
 global = false;
-fetchSnippets = false;
+snippetFetchStrategy = CrawlProfile.CacheStrategy.CACHEONLY;
 Log.logWarning("LOCAL_SEARCH", "ACCECC CONTROL: CLIENT FROM " + client + ": " + accInOneSecond + " searches in one second, blocked global search and snippets");
 }
 }
@@ -428,7 +429,7 @@ public class yacysearch {
 if (urlentry != null) {
 final URIMetadataRow.Components metadata = urlentry.metadata();
 Document document;
-document = LoaderDispatcher.retrieveDocument(metadata.url(), true, 5000, true, false, Long.MAX_VALUE);
+document = LoaderDispatcher.retrieveDocument(metadata.url(), CrawlProfile.CacheStrategy.IFEXIST, 5000, true, false, Long.MAX_VALUE);
 if (document != null) {
 // create a news message
 final HashMap<String, String> map = new HashMap<String, String>();
@@ -460,7 +461,7 @@ public class yacysearch {
 contentdom,
 language,
 navigation,
-fetchSnippets,
+snippetFetchStrategy,
 itemsPerPage,
 offset,
 urlmask,
@@ -538,7 +539,7 @@ public class yacysearch {
 "&maximumRecords="+ theQuery.displayResults() +
 "&startRecord=" + (0 * theQuery.displayResults()) +
 "&resource=" + ((theQuery.isLocal()) ? "local" : "global") +
-"&verify=" + ((theQuery.onlineSnippetFetch) ? "true" : "false") +
+"&verify=" + (theQuery.snippetCacheStrategy.mustBeOffline() ? "false" : "true") +
 "&nav=" + theQuery.navigators +
 "&urlmaskfilter=" + originalUrlMask.toString() +
 "&prefermaskfilter=" + theQuery.prefer.toString() +
@@ -684,7 +685,7 @@ public class yacysearch {
 prop.putHTML("prefermaskfilter", prefermask);
 prop.put("indexof", (indexof) ? "on" : "off");
 prop.put("constraint", (constraint == null) ? "" : constraint.exportB64());
-prop.put("verify", (fetchSnippets) ? "true" : "false");
+prop.put("verify", snippetFetchStrategy.toName());
 prop.put("contentdom", (post == null ? "text" : post.get("contentdom", "text")));
 prop.put("searchdomswitches", sb.getConfigBool("search.text", true) || sb.getConfigBool("search.audio", true) || sb.getConfigBool("search.video", true) || sb.getConfigBool("search.image", true) || sb.getConfigBool("search.app", true) ? 1 : 0);
 prop.put("searchdomswitches_searchtext", sb.getConfigBool("search.text", true) ? 1 : 0);

@@ -259,6 +259,22 @@ public class CrawlProfile {
 for (CacheStrategy strategy: CacheStrategy.values()) if (strategy.code == code) return strategy;
 return NOCACHE;
 }
+public static CacheStrategy parse(String name) {
+if (name.equals("nocache")) return NOCACHE;
+if (name.equals("iffresh")) return IFFRESH;
+if (name.equals("ifexist")) return IFEXIST;
+if (name.equals("cacheonly")) return CACHEONLY;
+return null;
+}
+public String toName() {
+return this.name().toLowerCase();
+}
+public boolean isAllowedToFetchOnline() {
+return this.code < 3;
+}
+public boolean mustBeOffline() {
+return this.code == 3;
+}
 }
 public static class entry {

@@ -563,7 +563,8 @@ public class CrawlQueues {
 try {
 request.setStatus("loading", WorkflowJob.STATUS_RUNNING);
 final long maxFileSize = sb.getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
-Response response = sb.loader.load(request, maxFileSize);
+CrawlProfile.entry e = sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle());
+Response response = sb.loader.load(request, e == null ? CrawlProfile.CacheStrategy.IFEXIST : e.cacheStrategy(), maxFileSize);
 if (response == null) {
 request.setStatus("error", WorkflowJob.STATUS_FINISHED);
 if (log.isFine()) log.logFine("problem loading " + request.url().toString() + ": no content (possibly caused by cache policy)");

@@ -30,6 +30,7 @@ import java.util.Iterator;
 import java.util.Map;
 import java.util.TreeSet;
+import de.anomic.crawler.CrawlProfile;
 import de.anomic.data.MimeTable;
 import net.yacy.cora.document.MultiProtocolURI;
@@ -111,13 +112,13 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaS
 return o1.compareTo(o2);
 }
-public static ArrayList<MediaSnippet> retrieveMediaSnippets(final DigestURI url, final HandleSet queryhashes, final ContentDomain mediatype, final boolean fetchOnline, final int timeout, final boolean reindexing) {
+public static ArrayList<MediaSnippet> retrieveMediaSnippets(final DigestURI url, final HandleSet queryhashes, final ContentDomain mediatype, final CrawlProfile.CacheStrategy cacheStrategy, final int timeout, final boolean reindexing) {
 if (queryhashes.isEmpty()) {
 Log.logFine("snippet fetch", "no query hashes given for url " + url);
 return new ArrayList<MediaSnippet>();
 }
-final Document document = LoaderDispatcher.retrieveDocument(url, fetchOnline, timeout, false, reindexing, Long.MAX_VALUE);
+final Document document = LoaderDispatcher.retrieveDocument(url, cacheStrategy, timeout, false, reindexing, Long.MAX_VALUE);
 final ArrayList<MediaSnippet> a = new ArrayList<MediaSnippet>();
 if (document != null) {
 if ((mediatype == ContentDomain.ALL) || (mediatype == ContentDomain.AUDIO)) a.addAll(computeMediaSnippets(document, queryhashes, ContentDomain.AUDIO));

@@ -44,6 +44,7 @@ import net.yacy.kelondro.order.Bitfield;
 import net.yacy.kelondro.order.NaturalOrder;
 import net.yacy.kelondro.util.SetTools;
+import de.anomic.crawler.CrawlProfile;
 import de.anomic.yacy.yacySeed;
 public final class QueryParams {
@@ -79,7 +80,7 @@ public final class QueryParams {
 public final int maxDistance;
 public final Bitfield constraint;
 public final boolean allofconstraint;
-public final boolean onlineSnippetFetch;
+public final CrawlProfile.CacheStrategy snippetCacheStrategy;
 public final RankingProfile ranking;
 private final Segment indexSegment;
 public final String host; // this is the client host that starts the query, not a site operator
@@ -130,7 +131,7 @@ public final class QueryParams {
 this.domMaxTargets = 0;
 this.constraint = constraint;
 this.allofconstraint = false;
-this.onlineSnippetFetch = false;
+this.snippetCacheStrategy = CrawlProfile.CacheStrategy.CACHEONLY;
 this.host = null;
 this.sitehash = null;
 this.authorhash = null;
@@ -149,7 +150,7 @@ public final class QueryParams {
 final int maxDistance, final String prefer, final ContentDomain contentdom,
 final String language,
 final String navigators,
-final boolean onlineSnippetFetch,
+final CrawlProfile.CacheStrategy snippetCacheStrategy,
 final int itemsPerPage, final int offset, final String urlMask,
 final int domType, final int domMaxTargets,
 final Bitfield constraint, final boolean allofconstraint,
@@ -184,7 +185,7 @@ public final class QueryParams {
 this.allofconstraint = allofconstraint;
 this.sitehash = site; assert site == null || site.length() == 6;
 this.authorhash = authorhash; assert authorhash == null || authorhash.length() > 0;
-this.onlineSnippetFetch = onlineSnippetFetch;
+this.snippetCacheStrategy = snippetCacheStrategy;
 this.host = host;
 this.remotepeer = null;
 this.handle = Long.valueOf(System.currentTimeMillis());
@@ -375,7 +376,7 @@ public final class QueryParams {
 "&maximumRecords="+ theQuery.displayResults() +
 "&startRecord=" + (page * theQuery.displayResults()) +
 "&resource=" + ((theQuery.isLocal()) ? "local" : "global") +
-"&verify=" + ((theQuery.onlineSnippetFetch) ? "true" : "false") +
+"&verify=" + (theQuery.snippetCacheStrategy.mustBeOffline() ? "false" : "true") +
 "&nav=" + nav +
 "&urlmaskfilter=" + originalUrlMask +
 "&prefermaskfilter=" + theQuery.prefer +

@@ -42,6 +42,7 @@ import net.yacy.kelondro.util.SortStack;
 import net.yacy.kelondro.util.SortStore;
 import net.yacy.repository.LoaderDispatcher;
+import de.anomic.crawler.CrawlProfile;
 import de.anomic.search.MediaSnippet;
 import de.anomic.yacy.yacySeedDB;
 import de.anomic.yacy.graphics.ProfilingGraph;
@@ -105,9 +106,9 @@ public class ResultFetcher {
 public void deployWorker(int deployCount, int neededResults) {
 if (anyWorkerAlive()) return;
-this.workerThreads = new Worker[(query.onlineSnippetFetch) ? deployCount : 1];
+this.workerThreads = new Worker[(query.snippetCacheStrategy.isAllowedToFetchOnline()) ? deployCount : 1];
 for (int i = 0; i < workerThreads.length; i++) {
-this.workerThreads[i] = new Worker(i, 10000, (query.onlineSnippetFetch) ? 2 : 0, neededResults);
+this.workerThreads[i] = new Worker(i, 10000, query.snippetCacheStrategy, neededResults);
 this.workerThreads[i].start();
 }
 }
@@ -135,12 +136,12 @@ public class ResultFetcher {
 private final long timeout; // the date until this thread should try to work
 private long lastLifeSign; // when the last time the run()-loop was executed
 private final int id;
-private final int snippetMode;
+private final CrawlProfile.CacheStrategy cacheStrategy;
 private final int neededResults;
-public Worker(final int id, final long maxlifetime, int snippetMode, int neededResults) {
+public Worker(final int id, final long maxlifetime, CrawlProfile.CacheStrategy cacheStrategy, int neededResults) {
 this.id = id;
-this.snippetMode = snippetMode;
+this.cacheStrategy = cacheStrategy;
 this.lastLifeSign = System.currentTimeMillis();
 this.timeout = System.currentTimeMillis() + Math.max(1000, maxlifetime);
 this.neededResults = neededResults;
@@ -166,7 +167,7 @@ public class ResultFetcher {
 if (page == null) break;
 if (failedURLs.has(page.hash())) continue;
-final ResultEntry resultEntry = fetchSnippet(page, snippetMode); // does not fetch snippets if snippetMode == 0
+final ResultEntry resultEntry = fetchSnippet(page, cacheStrategy); // does not fetch snippets if snippetMode == 0
 if (resultEntry == null) continue; // the entry had some problems, cannot be used
 if (result.exists(resultEntry)) continue;
@@ -195,7 +196,7 @@ public class ResultFetcher {
 }
 }
-protected ResultEntry fetchSnippet(final URIMetadataRow page, final int snippetMode) {
+protected ResultEntry fetchSnippet(final URIMetadataRow page, CrawlProfile.CacheStrategy cacheStrategy) {
 // Snippet Fetching can has 3 modes:
 // 0 - do not fetch snippets
 // 1 - fetch snippets offline only
@@ -209,7 +210,7 @@ public class ResultFetcher {
 if (metadata == null) return null;
 final long dbRetrievalTime = System.currentTimeMillis() - startTime;
-if (snippetMode == 0) {
+if (cacheStrategy == null) {
 return new ResultEntry(page, query.getSegment(), peers, null, null, dbRetrievalTime, 0); // result without snippet
 }
@@ -221,10 +222,10 @@ public class ResultFetcher {
 this.loader,
 metadata,
 snippetFetchWordHashes,
-(snippetMode == 2),
+cacheStrategy,
 ((query.constraint != null) && (query.constraint.get(Condenser.flag_cat_indexof))),
 180,
-(snippetMode == 2) ? Integer.MAX_VALUE : 30000,
+Integer.MAX_VALUE,
 query.isGlobal());
 final long snippetComputationTime = System.currentTimeMillis() - startTime;
 Log.logInfo("SEARCH", "text snippet load time for " + metadata.url() + ": " + snippetComputationTime + ", " + ((snippet.getErrorCode() < 11) ? "snippet found" : ("no snippet found (" + snippet.getError() + ")")));
@@ -232,26 +233,26 @@ public class ResultFetcher {
 if (snippet.getErrorCode() < 11) {
 // we loaded the file and found the snippet
 return new ResultEntry(page, query.getSegment(), peers, snippet, null, dbRetrievalTime, snippetComputationTime); // result with snippet attached
-} else if (snippetMode == 1) {
+} else if (cacheStrategy.mustBeOffline()) {
 // we did not demand online loading, therefore a failure does not mean that the missing snippet causes a rejection of this result
 // this may happen during a remote search, because snippet loading is omitted to retrieve results faster
 return new ResultEntry(page, query.getSegment(), peers, null, null, dbRetrievalTime, snippetComputationTime); // result without snippet
 } else {
 // problems with snippet fetch
-registerFailure(page.hash(), "no text snippet for URL " + metadata.url());
+registerFailure(page.hash(), "no text snippet for URL " + metadata.url() + "; errorCode = " + snippet.getErrorCode());
 return null;
 }
 } else {
 // attach media information
 startTime = System.currentTimeMillis();
-final ArrayList<MediaSnippet> mediaSnippets = MediaSnippet.retrieveMediaSnippets(metadata.url(), snippetFetchWordHashes, query.contentdom, (snippetMode == 2), 6000, query.isGlobal());
+final ArrayList<MediaSnippet> mediaSnippets = MediaSnippet.retrieveMediaSnippets(metadata.url(), snippetFetchWordHashes, query.contentdom, cacheStrategy, 6000, query.isGlobal());
 final long snippetComputationTime = System.currentTimeMillis() - startTime;
 Log.logInfo("SEARCH", "media snippet load time for " + metadata.url() + ": " + snippetComputationTime);
 if (mediaSnippets != null && !mediaSnippets.isEmpty()) {
 // found media snippets, return entry
 return new ResultEntry(page, query.getSegment(), peers, null, mediaSnippets, dbRetrievalTime, snippetComputationTime);
-} else if (snippetMode == 1) {
+} else if (cacheStrategy.mustBeOffline()) {
 return new ResultEntry(page, query.getSegment(), peers, null, null, dbRetrievalTime, snippetComputationTime);
 } else {
 // problems with snippet fetch

@@ -62,6 +62,7 @@ import net.yacy.kelondro.util.ISO639;
 import net.yacy.repository.Blacklist;
 import net.yacy.repository.LoaderDispatcher;
+import de.anomic.crawler.CrawlProfile;
 import de.anomic.crawler.retrieval.Response;
 public class Segment {
@@ -360,17 +361,23 @@ public class Segment {
 // method for index deletion
-public int removeAllUrlReferences(final DigestURI url, LoaderDispatcher loader, final boolean fetchOnline) {
-return removeAllUrlReferences(url.hash(), loader, fetchOnline);
+public int removeAllUrlReferences(final DigestURI url, LoaderDispatcher loader, final CrawlProfile.CacheStrategy cacheStrategy) {
+return removeAllUrlReferences(url.hash(), loader, cacheStrategy);
 }
-public void removeAllUrlReferences(final HandleSet urls, LoaderDispatcher loader, final boolean fetchOnline) {
-for (byte[] urlhash: urls) removeAllUrlReferences(urlhash, loader, fetchOnline);
+public void removeAllUrlReferences(final HandleSet urls, LoaderDispatcher loader, final CrawlProfile.CacheStrategy cacheStrategy) {
+for (byte[] urlhash: urls) removeAllUrlReferences(urlhash, loader, cacheStrategy);
 }
-public int removeAllUrlReferences(final byte[] urlhash, LoaderDispatcher loader, final boolean fetchOnline) {
-// find all the words in a specific resource and remove the url reference from every word index
-// finally, delete the url entry
+/**
+ * find all the words in a specific resource and remove the url reference from every word index
+ * finally, delete the url entry
+ * @param urlhash the hash of the url that shall be removed
+ * @param loader
+ * @param cacheStrategy
+ * @return number of removed words
+ */
+public int removeAllUrlReferences(final byte[] urlhash, LoaderDispatcher loader, final CrawlProfile.CacheStrategy cacheStrategy) {
 if (urlhash == null) return 0;
 // determine the url string
@@ -384,7 +391,7 @@ public class Segment {
 // get the resource content
 byte[] resourceb = null;
 try {
-resourceb = loader.getResource(metadata.url(), fetchOnline, 10000, true, false);
+resourceb = loader.getResource(metadata.url(), cacheStrategy, 10000, true, false);
 } catch (IOException e) {
 Log.logWarning("removeAllUrlReferences", "cannot load: " + e.getMessage());
 }

@@ -48,6 +48,7 @@ import net.yacy.kelondro.order.Base64Order;
 import net.yacy.kelondro.util.ByteArray;
 import net.yacy.repository.LoaderDispatcher;
+import de.anomic.crawler.CrawlProfile;
 import de.anomic.crawler.retrieval.Response;
 import de.anomic.http.client.Cache;
 import de.anomic.http.server.ResponseHeader;
@@ -308,7 +309,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
 return al;
 }
-public static TextSnippet retrieveTextSnippet(final LoaderDispatcher loader, final URIMetadataRow.Components comp, final HandleSet queryhashes, final boolean fetchOnline, final boolean pre, final int snippetMaxLength, final int maxDocLen, final boolean reindexing) {
+public static TextSnippet retrieveTextSnippet(final LoaderDispatcher loader, final URIMetadataRow.Components comp, final HandleSet queryhashes, final CrawlProfile.CacheStrategy cacheStrategy, final boolean pre, final int snippetMaxLength, final int maxDocLen, final boolean reindexing) {
 // heise = "0OQUNU3JSs05"
 final DigestURI url = comp.url();
 if (queryhashes.isEmpty()) {
@@ -351,11 +352,11 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
 // trying to load the resource from the cache
 resContent = Cache.getContent(url);
 responseHeader = Cache.getResponseHeader(url);
-if ((resContent == null || responseHeader == null) && fetchOnline) {
+if ((resContent == null || responseHeader == null) && cacheStrategy.isAllowedToFetchOnline()) {
 // if not found try to download it
-// download resource using the crawler and keep resource in memory if possible
-final Response entry = loader.load(url, true, reindexing, Long.MAX_VALUE);
+// download resource or get it from the cache
+final Response entry = loader.load(url, true, reindexing, cacheStrategy, Long.MAX_VALUE);
 // get resource metadata (e.g. the http headers for http resources)
 if (entry != null) {
@@ -371,10 +372,17 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
 }
 }
-// if it is still not available, report an error
-if (resContent == null) return new TextSnippet(url, null, ERROR_RESOURCE_LOADING, queryhashes, "error loading resource from net, no cache entry");
 source = SOURCE_WEB;
 }
+if (resContent == null) {
+// in case that we did not get any result we can still return a success when we are not allowed to go online
+if (cacheStrategy.mustBeOffline()) {
+return new TextSnippet(url, null, ERROR_SOURCE_LOADING, queryhashes, "omitted network load (not allowed), no cache entry");
+}
+// if it is still not available, report an error
+return new TextSnippet(url, null, ERROR_RESOURCE_LOADING, queryhashes, "error loading resource from net, no cache entry");
+}
 } catch (final Exception e) {
 //Log.logException(e);

@@ -783,7 +783,7 @@ public final class serverCore extends AbstractBusyThread implements BusyThread {
 } catch (final Exception e) {
 log.logSevere("command execution, generic exception " + e.getMessage() + " for client " + this.userAddress.getHostAddress(), e);
 // whatever happens: the thread has to survive!
-writeLine("UNKNOWN REASON:" + this.commandObj.error(e));
+writeLine("UNKNOWN REASON:" + ((this.commandObj == null) ? "no command object" : this.commandObj.error(e)));
 break;
 }
 // check if we should still keep this alive:

@@ -134,10 +134,11 @@ public class pdfParser extends AbstractParser implements Idiom {
 Writer writer = null;
 File writerFile = null;
+PDFTextStripper stripper = null;
 try {
 // create a writer for output
 writer = new CharBuffer();
-final PDFTextStripper stripper = new PDFTextStripper();
+stripper = new PDFTextStripper();
 stripper.writeText(theDocument, writer); // may throw a NPE
 theDocument.close();
 writer.close();
@@ -155,6 +156,7 @@ public class pdfParser extends AbstractParser implements Idiom {
 if (docKeywordStr != null) docKeywords = docKeywordStr.split(" |,");
 Document theDoc = null;
+if (docTitle == null) docTitle = docSubject;
 if (writer instanceof CharBuffer) {
 byte[] contentBytes;
@@ -170,7 +172,7 @@ public class pdfParser extends AbstractParser implements Idiom {
 "UTF-8",
 null,
 docKeywords,
-(docTitle == null) ? docSubject : docTitle,
+docTitle,
 docAuthor,
 docPublisher,
 null,
@@ -186,7 +188,7 @@ public class pdfParser extends AbstractParser implements Idiom {
 "UTF-8",
 null,
 docKeywords,
-(docTitle == null) ? docSubject : docTitle,
+docTitle,
 docAuthor,
 docPublisher,
 null,

@@ -99,19 +99,11 @@ public final class LoaderDispatcher {
 return (HashSet<String>) this.supportedProtocols.clone();
 }
-public Response load(
-final DigestURI url,
-final boolean forText,
-final boolean global,
-final long maxFileSize) throws IOException {
-return load(request(url, forText, global), maxFileSize);
-}
 /**
 * load a resource from the web, from ftp, from smb or a file
 * @param url
-* @param forText
-* @param global
+* @param forText shows that this was a for-text crawling request
+* @param global shows that this was a global crawling request
 * @param cacheStratgy strategy according to CACHE_STRATEGY_NOCACHE,CACHE_STRATEGY_IFFRESH,CACHE_STRATEGY_IFEXIST,CACHE_STRATEGY_CACHEONLY
 * @return the loaded entity in a Response object
 * @throws IOException
@@ -169,13 +161,6 @@ public final class LoaderDispatcher {
 0);
 }
-public Response load(final Request request, long maxFileSize) throws IOException {
-CrawlProfile.entry crawlProfile = sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle());
-CrawlProfile.CacheStrategy cacheStrategy = CrawlProfile.CacheStrategy.IFEXIST;
-if (crawlProfile != null) cacheStrategy = crawlProfile.cacheStrategy();
-return load(request, cacheStrategy, maxFileSize);
-}
 public Response load(final Request request, CrawlProfile.CacheStrategy cacheStrategy, long maxFileSize) throws IOException {
 // get the protocol of the next URL
 final String protocol = request.url().getProtocol();
@@ -295,15 +280,10 @@ public final class LoaderDispatcher {
 * @return the content as {@link byte[]}
 * @throws IOException
 */
-public byte[] getResource(final DigestURI url, final boolean fetchOnline, final int socketTimeout, final boolean forText, final boolean reindexing) throws IOException {
-byte[] resource = Cache.getContent(url);
-if (resource != null) return resource;
-if (!fetchOnline) return null;
+public byte[] getResource(final DigestURI url, CrawlProfile.CacheStrategy cacheStrategy, final int socketTimeout, final boolean forText, final boolean reindexing) throws IOException {
 // try to download the resource using the loader
 final long maxFileSize = sb.getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
-final Response entry = load(url, forText, reindexing, maxFileSize);
+final Response entry = load(url, forText, reindexing, cacheStrategy, maxFileSize);
 if (entry == null) return null; // not found in web
 // read resource body (if it is there)
@@ -322,45 +302,27 @@ public final class LoaderDispatcher {
 * @param global the domain of the search. If global == true then the content is re-indexed
 * @return the parsed document as {@link Document}
 */
-public static Document retrieveDocument(final DigestURI url, final boolean fetchOnline, final int timeout, final boolean forText, final boolean global, long maxFileSize) {
+public static Document retrieveDocument(final DigestURI url, final CrawlProfile.CacheStrategy cacheStrategy, final int timeout, final boolean forText, final boolean global, long maxFileSize) {
 // load resource
 byte[] resContent = null;
 ResponseHeader responseHeader = null;
 try {
-// trying to load the resource from the cache
-resContent = Cache.getContent(url);
-responseHeader = Cache.getResponseHeader(url);
-if (resContent != null) {
-// if the content was found
-} else if (fetchOnline) {
-// if not found try to download it
-// download resource using the crawler and keep resource in memory if possible
-final Response entry = Switchboard.getSwitchboard().loader.load(url, forText, global, maxFileSize);
-// getting resource metadata (e.g. the http headers for http resources)
-if (entry != null) {
-// read resource body (if it is there)
-final byte[] resourceArray = entry.getContent();
-if (resourceArray != null) {
-resContent = resourceArray;
-} else {
-resContent = Cache.getContent(url);
-}
-// read a fresh header
-responseHeader = entry.getResponseHeader();
-}
-// if it is still not available, report an error
-if (resContent == null) {
-Log.logFine("snippet fetch", "plasmaHTCache.Entry cache is NULL for url " + url);
-return null;
-}
-} else {
-Log.logFine("snippet fetch", "no resource available for url " + url);
+final Response entry = Switchboard.getSwitchboard().loader.load(url, forText, global, cacheStrategy, maxFileSize);
+if (entry == null) {
+Log.logFine("snippet fetch", "no Response for url " + url);
+return null;
+}
+// read resource body (if it is there)
+resContent = entry.getContent();
+// read a fresh header
+responseHeader = entry.getResponseHeader();
+// if it is still not available, report an error
+if (resContent == null || responseHeader == null) {
+Log.logFine("snippet fetch", "no Content available for url " + url);
 return null;
 }
 } catch (final Exception e) {