more abstraction of the htcache when using the LoaderDispatcher:

A cache access shall no longer be made directly to the cache; all loading attempts shall go through the LoaderDispatcher.
To control the usage of the cache, an enum instance from CrawlProfile.CacheStrategy shall be used.
Some direct loading methods that did not take a cache strategy have been removed. This also affects the verify option
of the yacysearch servlet. After this commit, 'verify=false' does not necessarily mean that no snippets
are generated. Instead, all snippets that can be retrieved from the cache alone are presented. The search hit still counts as unverified, because the snippet was generated from the cache. If a cache-based snippet generation is not possible, verify=false causes the link not to be rejected.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6936 6c8d7289-2bf4-0310-a012-ef5d649a1542
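
As a rough illustration of the new behaviour, the following minimal sketch restates the mapping from the verify request parameter to a cache strategy as it is introduced in yacysearch.java below; the class and method names are invented for this example and are not part of the commit:

import de.anomic.crawler.CrawlProfile;

public class VerifyMappingSketch {
    // "verify=true" keeps the old meaning: snippets may be fetched online (IFFRESH).
    // Any other value is parsed as a strategy name; unknown values (including "false")
    // fall back to CACHEONLY, so cache-based snippets are still generated.
    public static CrawlProfile.CacheStrategy resolveSnippetStrategy(final String verify) {
        if ("true".equals(verify)) return CrawlProfile.CacheStrategy.IFFRESH;
        final CrawlProfile.CacheStrategy s = CrawlProfile.CacheStrategy.parse(verify);
        return (s == null) ? CrawlProfile.CacheStrategy.CACHEONLY : s;
    }
}

With this mapping, verify=false ends up as CACHEONLY: the result is still shown as unverified, but any snippet that the local cache can provide is attached to it.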
orbiter 15 years ago
parent fd9f0714a3
commit 7bcfa033c9

@ -43,6 +43,7 @@ import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.DateFormatter;
import net.yacy.repository.LoaderDispatcher;
import de.anomic.crawler.CrawlProfile;
import de.anomic.data.BookmarkHelper;
import de.anomic.data.bookmarksDB;
import de.anomic.data.listManager;
@ -188,7 +189,7 @@ public class Bookmarks {
Document document = null;
if (urlentry != null) {
final URIMetadataRow.Components metadata = urlentry.metadata();
document = LoaderDispatcher.retrieveDocument(metadata.url(), true, 5000, true, false, Long.MAX_VALUE);
document = LoaderDispatcher.retrieveDocument(metadata.url(), CrawlProfile.CacheStrategy.IFEXIST, 5000, true, false, Long.MAX_VALUE);
prop.put("mode_edit", "0"); // create mode
prop.put("mode_url", metadata.url().toNormalform(false, true));
prop.putHTML("mode_title", metadata.dc_title());

@ -52,6 +52,7 @@ import net.yacy.kelondro.util.DateFormatter;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.repository.Blacklist;
import de.anomic.crawler.CrawlProfile;
import de.anomic.data.listManager;
import de.anomic.http.server.RequestHeader;
import de.anomic.search.QueryParams;
@ -162,7 +163,7 @@ public class IndexControlRWIs_p {
index = null;
}
if (delurlref) {
segment.removeAllUrlReferences(urlb, sb.loader, true);
segment.removeAllUrlReferences(urlb, sb.loader, CrawlProfile.CacheStrategy.IFEXIST);
}
// delete the word first because that is much faster than the deletion of the urls from the url database
segment.termIndex().delete(keyhash);
@ -179,7 +180,7 @@ public class IndexControlRWIs_p {
// delete selected URLs
if (post.containsKey("keyhashdelete")) try {
if (delurlref) {
segment.removeAllUrlReferences(urlb, sb.loader, true);
segment.removeAllUrlReferences(urlb, sb.loader, CrawlProfile.CacheStrategy.IFEXIST);
}
if (delurl || delurlref) {
for (byte[] b: urlb) sb.urlRemove(segment, b);

@ -38,6 +38,7 @@ import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.order.RotateIterator;
import net.yacy.kelondro.util.DateFormatter;
import de.anomic.crawler.CrawlProfile;
import de.anomic.http.server.RequestHeader;
import de.anomic.search.MetadataRepository;
import de.anomic.search.Segment;
@ -140,7 +141,7 @@ public class IndexControlURLs_p {
prop.put("result", " ");
if (post.containsKey("urlhashdeleteall")) {
i = segment.removeAllUrlReferences(urlhash.getBytes(), sb.loader, true);
i = segment.removeAllUrlReferences(urlhash.getBytes(), sb.loader, CrawlProfile.CacheStrategy.IFEXIST);
prop.put("result", "Deleted URL and " + i + " references from " + i + " word indexes.");
prop.put("lurlexport", 0);
prop.put("reload", 0);

@ -33,6 +33,7 @@ import net.yacy.document.ParserException;
import net.yacy.document.parser.rssParser;
import net.yacy.kelondro.data.meta.DigestURI;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.retrieval.Response;
import de.anomic.http.server.RequestHeader;
import de.anomic.search.Switchboard;
@ -63,7 +64,7 @@ public class RSSLoader_p {
// if the resource body was not cached we try to load it from web
Response entry = null;
try {
entry = sb.loader.load(url, true, false, Long.MAX_VALUE);
entry = sb.loader.load(url, true, false, CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
} catch (final Exception e) {
return prop;
}

@ -43,9 +43,9 @@ import net.yacy.document.parser.html.CharacterCoding;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.logging.Log;
import net.yacy.repository.LoaderDispatcher;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.retrieval.Response;
import de.anomic.http.client.Cache;
import de.anomic.http.server.RequestHeader;
@ -150,7 +150,7 @@ public class ViewFile {
}
// define an url by post parameter
url = new DigestURI(urlString, null);
url = new DigestURI(MultiProtocolURI.unescape(urlString), null);
urlHash = new String(url.hash());
pre = post.get("pre", "false").equals("true");
} catch (final MalformedURLException e) {}
@ -168,87 +168,35 @@ public class ViewFile {
// loading the resource content as byte array
prop.put("error_incache", Cache.has(url) ? 1 : 0);
ResponseHeader responseHeader = null;
String resMime = null;
ResponseHeader responseHeader = Cache.getResponseHeader(url);
byte[] resource = Cache.getContent(url);
if (resource == null && authorized) {
if ((resource == null || responseHeader == null) && authorized) {
// load resource from net
Response response = null;
try {
response = sb.loader.load(url, true, false, Long.MAX_VALUE);
response = sb.loader.load(url, true, false, CrawlProfile.CacheStrategy.IFEXIST, Long.MAX_VALUE);
} catch (IOException e) {
Log.logException(e);
}
if (response != null) {
resource = response.getContent();
responseHeader = response.getResponseHeader();
}
}
if (responseHeader == null) responseHeader = Cache.getResponseHeader(url);
// if the resource body was not cached we try to load it from web
if (resource == null) {
Response entry = null;
try {
entry = sb.loader.load(url, true, false, Long.MAX_VALUE);
} catch (final Exception e) {
prop.put("error", "4");
prop.putHTML("error_errorText", e.getMessage());
prop.put("viewMode", VIEW_MODE_NO_TEXT);
return prop;
}
if (entry != null) {
resource = entry.getContent();
}
if (resource == null) {
prop.put("error", "4");
prop.put("error_errorText", "No resource available");
prop.put("viewMode", VIEW_MODE_NO_TEXT);
return prop;
if (response != null) {
resource = response.getContent();
responseHeader = response.getResponseHeader();
}
}
// try to load resource metadata
if (responseHeader == null) {
// try to load the metadata from cache
try {
responseHeader = Cache.getResponseHeader(url);
} catch (final Exception e) {
/* ignore this */
}
// if the metadata was not cached try to load it from web
if (responseHeader == null) {
final String protocol = url.getProtocol();
if (!((protocol.equals("http") || protocol.equals("https")))) {
prop.put("error", "6");
prop.put("viewMode", VIEW_MODE_NO_TEXT);
return prop;
}
try {
Response response = sb.loader.load(url, true, false, Long.MAX_VALUE);
responseHeader = response.getResponseHeader();
resource = response.getContent();
} catch (IOException e) {
Log.logException(e);
}
if (responseHeader == null) {
prop.put("error", "4");
prop.put("error_errorText", "Unable to load resource metadata.");
prop.put("viewMode", VIEW_MODE_NO_TEXT);
return prop;
}
resMime = responseHeader.mime();
}
} else {
resMime = responseHeader.mime();
// if resource not available just fail
if (resource == null || responseHeader == null) {
prop.put("error", "4");
prop.put("error_errorText", "No resource available");
prop.put("viewMode", VIEW_MODE_NO_TEXT);
return prop;
}
resMime = responseHeader.mime();
final String[] wordArray = wordArray(post.get("words", null));

@ -35,6 +35,7 @@ import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils;
import de.anomic.crawler.CrawlProfile;
import de.anomic.http.server.HeaderFramework;
import de.anomic.http.server.RequestHeader;
import de.anomic.search.Switchboard;
@ -90,7 +91,7 @@ public class ViewImage {
if (scaled == null) {
byte[] resourceb = null;
if (url != null) try {
resourceb = sb.loader.getResource(url, true, timeout, false, true);
resourceb = sb.loader.getResource(url, CrawlProfile.CacheStrategy.IFEXIST, timeout, false, true);
} catch (IOException e) {
Log.logWarning("ViewImage", "cannot load: " + e.getMessage());
}

@ -47,6 +47,7 @@ import net.yacy.kelondro.util.EventTracker;
import net.yacy.kelondro.util.SortStack;
import net.yacy.kelondro.util.ISO639;
import de.anomic.crawler.CrawlProfile;
import de.anomic.http.server.HeaderFramework;
import de.anomic.http.server.RequestHeader;
import de.anomic.net.natLib;
@ -206,7 +207,7 @@ public final class search {
ContentDomain.contentdomParser(contentdom),
language,
"", // no navigation
false,
CrawlProfile.CacheStrategy.CACHEONLY,
count,
0,
filter,
@ -259,7 +260,7 @@ public final class search {
ContentDomain.contentdomParser(contentdom),
language,
"", // no navigation
false,
CrawlProfile.CacheStrategy.CACHEONLY,
count,
0,
filter,

@ -51,6 +51,7 @@ import net.yacy.kelondro.util.SetTools;
import net.yacy.kelondro.util.ISO639;
import net.yacy.repository.LoaderDispatcher;
import de.anomic.crawler.CrawlProfile;
import de.anomic.data.DidYouMean;
import de.anomic.data.LibraryProvider;
import de.anomic.http.server.HeaderFramework;
@ -67,7 +68,6 @@ import de.anomic.search.SwitchboardConstants;
import de.anomic.server.serverCore;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.yacy.yacyNewsDB;
import de.anomic.yacy.yacyNewsPool;
import de.anomic.yacy.graphics.ProfilingGraph;
@ -97,7 +97,8 @@ public class yacysearch {
// get query
String originalquerystring = (post == null) ? "" : post.get("query", post.get("search", "")).trim();
String querystring = originalquerystring.replace('+', ' ');
boolean fetchSnippets = (post != null && post.get("verify", "false").equals("true"));
CrawlProfile.CacheStrategy snippetFetchStrategy = (post != null && post.get("verify", "false").equals("true")) ? CrawlProfile.CacheStrategy.IFFRESH : CrawlProfile.CacheStrategy.parse(post.get("verify", "cacheonly"));
if (snippetFetchStrategy == null) snippetFetchStrategy = CrawlProfile.CacheStrategy.CACHEONLY;
final serverObjects prop = new serverObjects();
// get segment
@ -164,7 +165,7 @@ public class yacysearch {
// collect search attributes
boolean newsearch = post.hasValue("query") && post.hasValue("former") && !post.get("query","").equalsIgnoreCase(post.get("former","")); //new search term
int itemsPerPage = Math.min((authenticated) ? (fetchSnippets ? 100 : 1000) : (fetchSnippets ? 10 : 100), post.getInt("maximumRecords", post.getInt("count", 10))); // SRU syntax with old property as alternative
int itemsPerPage = Math.min((authenticated) ? (snippetFetchStrategy.isAllowedToFetchOnline() ? 100 : 1000) : (snippetFetchStrategy.isAllowedToFetchOnline() ? 10 : 100), post.getInt("maximumRecords", post.getInt("count", 10))); // SRU syntax with old property as alternative
int offset = (newsearch) ? 0 : post.getInt("startRecord", post.getInt("offset", 0));
boolean global = post.get("resource", "local").equals("global");
@ -228,12 +229,12 @@ public class yacysearch {
boolean block = false;
if (Domains.matchesList(client, sb.networkBlacklist)) {
global = false;
fetchSnippets = false;
snippetFetchStrategy = CrawlProfile.CacheStrategy.CACHEONLY;
block = true;
Log.logWarning("LOCAL_SEARCH", "ACCECC CONTROL: BLACKLISTED CLIENT FROM " + client + " gets no permission to search");
} else if (Domains.matchesList(client, sb.networkWhitelist)) {
Log.logInfo("LOCAL_SEARCH", "ACCECC CONTROL: WHITELISTED CLIENT FROM " + client + " gets no search restrictions");
} else if (global || fetchSnippets) {
} else if (global || snippetFetchStrategy.isAllowedToFetchOnline()) {
// in case that we do a global search or we want to fetch snippets, we check for DoS cases
synchronized (trackerHandles) {
int accInOneSecond = trackerHandles.tailSet(Long.valueOf(System.currentTimeMillis() - 1000)).size();
@ -242,21 +243,21 @@ public class yacysearch {
int accInTenMinutes = trackerHandles.tailSet(Long.valueOf(System.currentTimeMillis() - 600000)).size();
if (accInTenMinutes > 600) {
global = false;
fetchSnippets = false;
snippetFetchStrategy = CrawlProfile.CacheStrategy.CACHEONLY;
block = true;
Log.logWarning("LOCAL_SEARCH", "ACCECC CONTROL: CLIENT FROM " + client + ": " + accInTenMinutes + " searches in ten minutes, fully blocked (no results generated)");
} else if (accInOneMinute > 200) {
global = false;
fetchSnippets = false;
snippetFetchStrategy = CrawlProfile.CacheStrategy.CACHEONLY;
block = true;
Log.logWarning("LOCAL_SEARCH", "ACCECC CONTROL: CLIENT FROM " + client + ": " + accInOneMinute + " searches in one minute, fully blocked (no results generated)");
} else if (accInThreeSeconds > 1) {
global = false;
fetchSnippets = false;
snippetFetchStrategy = CrawlProfile.CacheStrategy.CACHEONLY;
Log.logWarning("LOCAL_SEARCH", "ACCECC CONTROL: CLIENT FROM " + client + ": " + accInThreeSeconds + " searches in three seconds, blocked global search and snippets");
} else if (accInOneSecond > 2) {
global = false;
fetchSnippets = false;
snippetFetchStrategy = CrawlProfile.CacheStrategy.CACHEONLY;
Log.logWarning("LOCAL_SEARCH", "ACCECC CONTROL: CLIENT FROM " + client + ": " + accInOneSecond + " searches in one second, blocked global search and snippets");
}
}
@ -428,7 +429,7 @@ public class yacysearch {
if (urlentry != null) {
final URIMetadataRow.Components metadata = urlentry.metadata();
Document document;
document = LoaderDispatcher.retrieveDocument(metadata.url(), true, 5000, true, false, Long.MAX_VALUE);
document = LoaderDispatcher.retrieveDocument(metadata.url(), CrawlProfile.CacheStrategy.IFEXIST, 5000, true, false, Long.MAX_VALUE);
if (document != null) {
// create a news message
final HashMap<String, String> map = new HashMap<String, String>();
@ -460,7 +461,7 @@ public class yacysearch {
contentdom,
language,
navigation,
fetchSnippets,
snippetFetchStrategy,
itemsPerPage,
offset,
urlmask,
@ -538,7 +539,7 @@ public class yacysearch {
"&maximumRecords="+ theQuery.displayResults() +
"&startRecord=" + (0 * theQuery.displayResults()) +
"&resource=" + ((theQuery.isLocal()) ? "local" : "global") +
"&verify=" + ((theQuery.onlineSnippetFetch) ? "true" : "false") +
"&verify=" + (theQuery.snippetCacheStrategy.mustBeOffline() ? "false" : "true") +
"&nav=" + theQuery.navigators +
"&urlmaskfilter=" + originalUrlMask.toString() +
"&prefermaskfilter=" + theQuery.prefer.toString() +
@ -684,7 +685,7 @@ public class yacysearch {
prop.putHTML("prefermaskfilter", prefermask);
prop.put("indexof", (indexof) ? "on" : "off");
prop.put("constraint", (constraint == null) ? "" : constraint.exportB64());
prop.put("verify", (fetchSnippets) ? "true" : "false");
prop.put("verify", snippetFetchStrategy.toName());
prop.put("contentdom", (post == null ? "text" : post.get("contentdom", "text")));
prop.put("searchdomswitches", sb.getConfigBool("search.text", true) || sb.getConfigBool("search.audio", true) || sb.getConfigBool("search.video", true) || sb.getConfigBool("search.image", true) || sb.getConfigBool("search.app", true) ? 1 : 0);
prop.put("searchdomswitches_searchtext", sb.getConfigBool("search.text", true) ? 1 : 0);

@ -259,6 +259,22 @@ public class CrawlProfile {
for (CacheStrategy strategy: CacheStrategy.values()) if (strategy.code == code) return strategy;
return NOCACHE;
}
public static CacheStrategy parse(String name) {
if (name.equals("nocache")) return NOCACHE;
if (name.equals("iffresh")) return IFFRESH;
if (name.equals("ifexist")) return IFEXIST;
if (name.equals("cacheonly")) return CACHEONLY;
return null;
}
public String toName() {
return this.name().toLowerCase();
}
public boolean isAllowedToFetchOnline() {
return this.code < 3;
}
public boolean mustBeOffline() {
return this.code == 3;
}
}
public static class entry {
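
A short usage sketch of the helper methods added to CacheStrategy above (standalone illustration, not part of the commit; it assumes the usual code ordering NOCACHE..CACHEONLY that the helpers rely on):

CrawlProfile.CacheStrategy s = CrawlProfile.CacheStrategy.parse("iffresh");
if (s == null) s = CrawlProfile.CacheStrategy.CACHEONLY; // parse() returns null for unknown names
System.out.println(s.toName());                  // "iffresh"
System.out.println(s.isAllowedToFetchOnline());  // true - only CACHEONLY must stay offline
System.out.println(s.mustBeOffline());           // false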

@ -563,7 +563,8 @@ public class CrawlQueues {
try {
request.setStatus("loading", WorkflowJob.STATUS_RUNNING);
final long maxFileSize = sb.getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
Response response = sb.loader.load(request, maxFileSize);
CrawlProfile.entry e = sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle());
Response response = sb.loader.load(request, e == null ? CrawlProfile.CacheStrategy.IFEXIST : e.cacheStrategy(), maxFileSize);
if (response == null) {
request.setStatus("error", WorkflowJob.STATUS_FINISHED);
if (log.isFine()) log.logFine("problem loading " + request.url().toString() + ": no content (possibly caused by cache policy)");

@ -30,6 +30,7 @@ import java.util.Iterator;
import java.util.Map;
import java.util.TreeSet;
import de.anomic.crawler.CrawlProfile;
import de.anomic.data.MimeTable;
import net.yacy.cora.document.MultiProtocolURI;
@ -111,13 +112,13 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaS
return o1.compareTo(o2);
}
public static ArrayList<MediaSnippet> retrieveMediaSnippets(final DigestURI url, final HandleSet queryhashes, final ContentDomain mediatype, final boolean fetchOnline, final int timeout, final boolean reindexing) {
public static ArrayList<MediaSnippet> retrieveMediaSnippets(final DigestURI url, final HandleSet queryhashes, final ContentDomain mediatype, final CrawlProfile.CacheStrategy cacheStrategy, final int timeout, final boolean reindexing) {
if (queryhashes.isEmpty()) {
Log.logFine("snippet fetch", "no query hashes given for url " + url);
return new ArrayList<MediaSnippet>();
}
final Document document = LoaderDispatcher.retrieveDocument(url, fetchOnline, timeout, false, reindexing, Long.MAX_VALUE);
final Document document = LoaderDispatcher.retrieveDocument(url, cacheStrategy, timeout, false, reindexing, Long.MAX_VALUE);
final ArrayList<MediaSnippet> a = new ArrayList<MediaSnippet>();
if (document != null) {
if ((mediatype == ContentDomain.ALL) || (mediatype == ContentDomain.AUDIO)) a.addAll(computeMediaSnippets(document, queryhashes, ContentDomain.AUDIO));

@ -44,6 +44,7 @@ import net.yacy.kelondro.order.Bitfield;
import net.yacy.kelondro.order.NaturalOrder;
import net.yacy.kelondro.util.SetTools;
import de.anomic.crawler.CrawlProfile;
import de.anomic.yacy.yacySeed;
public final class QueryParams {
@ -79,7 +80,7 @@ public final class QueryParams {
public final int maxDistance;
public final Bitfield constraint;
public final boolean allofconstraint;
public final boolean onlineSnippetFetch;
public final CrawlProfile.CacheStrategy snippetCacheStrategy;
public final RankingProfile ranking;
private final Segment indexSegment;
public final String host; // this is the client host that starts the query, not a site operator
@ -130,7 +131,7 @@ public final class QueryParams {
this.domMaxTargets = 0;
this.constraint = constraint;
this.allofconstraint = false;
this.onlineSnippetFetch = false;
this.snippetCacheStrategy = CrawlProfile.CacheStrategy.CACHEONLY;
this.host = null;
this.sitehash = null;
this.authorhash = null;
@ -149,7 +150,7 @@ public final class QueryParams {
final int maxDistance, final String prefer, final ContentDomain contentdom,
final String language,
final String navigators,
final boolean onlineSnippetFetch,
final CrawlProfile.CacheStrategy snippetCacheStrategy,
final int itemsPerPage, final int offset, final String urlMask,
final int domType, final int domMaxTargets,
final Bitfield constraint, final boolean allofconstraint,
@ -184,7 +185,7 @@ public final class QueryParams {
this.allofconstraint = allofconstraint;
this.sitehash = site; assert site == null || site.length() == 6;
this.authorhash = authorhash; assert authorhash == null || authorhash.length() > 0;
this.onlineSnippetFetch = onlineSnippetFetch;
this.snippetCacheStrategy = snippetCacheStrategy;
this.host = host;
this.remotepeer = null;
this.handle = Long.valueOf(System.currentTimeMillis());
@ -375,7 +376,7 @@ public final class QueryParams {
"&maximumRecords="+ theQuery.displayResults() +
"&startRecord=" + (page * theQuery.displayResults()) +
"&resource=" + ((theQuery.isLocal()) ? "local" : "global") +
"&verify=" + ((theQuery.onlineSnippetFetch) ? "true" : "false") +
"&verify=" + (theQuery.snippetCacheStrategy.mustBeOffline() ? "false" : "true") +
"&nav=" + nav +
"&urlmaskfilter=" + originalUrlMask +
"&prefermaskfilter=" + theQuery.prefer +

@ -42,6 +42,7 @@ import net.yacy.kelondro.util.SortStack;
import net.yacy.kelondro.util.SortStore;
import net.yacy.repository.LoaderDispatcher;
import de.anomic.crawler.CrawlProfile;
import de.anomic.search.MediaSnippet;
import de.anomic.yacy.yacySeedDB;
import de.anomic.yacy.graphics.ProfilingGraph;
@ -105,9 +106,9 @@ public class ResultFetcher {
public void deployWorker(int deployCount, int neededResults) {
if (anyWorkerAlive()) return;
this.workerThreads = new Worker[(query.onlineSnippetFetch) ? deployCount : 1];
this.workerThreads = new Worker[(query.snippetCacheStrategy.isAllowedToFetchOnline()) ? deployCount : 1];
for (int i = 0; i < workerThreads.length; i++) {
this.workerThreads[i] = new Worker(i, 10000, (query.onlineSnippetFetch) ? 2 : 0, neededResults);
this.workerThreads[i] = new Worker(i, 10000, query.snippetCacheStrategy, neededResults);
this.workerThreads[i].start();
}
}
@ -135,12 +136,12 @@ public class ResultFetcher {
private final long timeout; // the date until this thread should try to work
private long lastLifeSign; // when the last time the run()-loop was executed
private final int id;
private final int snippetMode;
private final CrawlProfile.CacheStrategy cacheStrategy;
private final int neededResults;
public Worker(final int id, final long maxlifetime, int snippetMode, int neededResults) {
public Worker(final int id, final long maxlifetime, CrawlProfile.CacheStrategy cacheStrategy, int neededResults) {
this.id = id;
this.snippetMode = snippetMode;
this.cacheStrategy = cacheStrategy;
this.lastLifeSign = System.currentTimeMillis();
this.timeout = System.currentTimeMillis() + Math.max(1000, maxlifetime);
this.neededResults = neededResults;
@ -166,7 +167,7 @@ public class ResultFetcher {
if (page == null) break;
if (failedURLs.has(page.hash())) continue;
final ResultEntry resultEntry = fetchSnippet(page, snippetMode); // does not fetch snippets if snippetMode == 0
final ResultEntry resultEntry = fetchSnippet(page, cacheStrategy); // does not fetch snippets if snippetMode == 0
if (resultEntry == null) continue; // the entry had some problems, cannot be used
if (result.exists(resultEntry)) continue;
@ -195,7 +196,7 @@ public class ResultFetcher {
}
}
protected ResultEntry fetchSnippet(final URIMetadataRow page, final int snippetMode) {
protected ResultEntry fetchSnippet(final URIMetadataRow page, CrawlProfile.CacheStrategy cacheStrategy) {
// Snippet Fetching can has 3 modes:
// 0 - do not fetch snippets
// 1 - fetch snippets offline only
@ -209,7 +210,7 @@ public class ResultFetcher {
if (metadata == null) return null;
final long dbRetrievalTime = System.currentTimeMillis() - startTime;
if (snippetMode == 0) {
if (cacheStrategy == null) {
return new ResultEntry(page, query.getSegment(), peers, null, null, dbRetrievalTime, 0); // result without snippet
}
@ -221,10 +222,10 @@ public class ResultFetcher {
this.loader,
metadata,
snippetFetchWordHashes,
(snippetMode == 2),
cacheStrategy,
((query.constraint != null) && (query.constraint.get(Condenser.flag_cat_indexof))),
180,
(snippetMode == 2) ? Integer.MAX_VALUE : 30000,
Integer.MAX_VALUE,
query.isGlobal());
final long snippetComputationTime = System.currentTimeMillis() - startTime;
Log.logInfo("SEARCH", "text snippet load time for " + metadata.url() + ": " + snippetComputationTime + ", " + ((snippet.getErrorCode() < 11) ? "snippet found" : ("no snippet found (" + snippet.getError() + ")")));
@ -232,26 +233,26 @@ public class ResultFetcher {
if (snippet.getErrorCode() < 11) {
// we loaded the file and found the snippet
return new ResultEntry(page, query.getSegment(), peers, snippet, null, dbRetrievalTime, snippetComputationTime); // result with snippet attached
} else if (snippetMode == 1) {
} else if (cacheStrategy.mustBeOffline()) {
// we did not demand online loading, therefore a failure does not mean that the missing snippet causes a rejection of this result
// this may happen during a remote search, because snippet loading is omitted to retrieve results faster
return new ResultEntry(page, query.getSegment(), peers, null, null, dbRetrievalTime, snippetComputationTime); // result without snippet
} else {
// problems with snippet fetch
registerFailure(page.hash(), "no text snippet for URL " + metadata.url());
registerFailure(page.hash(), "no text snippet for URL " + metadata.url() + "; errorCode = " + snippet.getErrorCode());
return null;
}
} else {
// attach media information
startTime = System.currentTimeMillis();
final ArrayList<MediaSnippet> mediaSnippets = MediaSnippet.retrieveMediaSnippets(metadata.url(), snippetFetchWordHashes, query.contentdom, (snippetMode == 2), 6000, query.isGlobal());
final ArrayList<MediaSnippet> mediaSnippets = MediaSnippet.retrieveMediaSnippets(metadata.url(), snippetFetchWordHashes, query.contentdom, cacheStrategy, 6000, query.isGlobal());
final long snippetComputationTime = System.currentTimeMillis() - startTime;
Log.logInfo("SEARCH", "media snippet load time for " + metadata.url() + ": " + snippetComputationTime);
if (mediaSnippets != null && !mediaSnippets.isEmpty()) {
// found media snippets, return entry
return new ResultEntry(page, query.getSegment(), peers, null, mediaSnippets, dbRetrievalTime, snippetComputationTime);
} else if (snippetMode == 1) {
} else if (cacheStrategy.mustBeOffline()) {
return new ResultEntry(page, query.getSegment(), peers, null, null, dbRetrievalTime, snippetComputationTime);
} else {
// problems with snippet fetch
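
For orientation, a hypothetical helper (not in the codebase) showing how the removed integer snippetMode values roughly correspond to the CacheStrategy arguments now used in ResultFetcher:

static CrawlProfile.CacheStrategy fromLegacySnippetMode(final int snippetMode) {
    switch (snippetMode) {
        case 0:  return null;                                 // do not fetch snippets at all
        case 1:  return CrawlProfile.CacheStrategy.CACHEONLY; // offline/cache-based snippets only
        default: return CrawlProfile.CacheStrategy.IFEXIST;   // mode 2: online loading allowed (e.g. IFEXIST or IFFRESH)
    }
}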

@ -62,6 +62,7 @@ import net.yacy.kelondro.util.ISO639;
import net.yacy.repository.Blacklist;
import net.yacy.repository.LoaderDispatcher;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.retrieval.Response;
public class Segment {
@ -360,18 +361,24 @@ public class Segment {
// method for index deletion
public int removeAllUrlReferences(final DigestURI url, LoaderDispatcher loader, final boolean fetchOnline) {
return removeAllUrlReferences(url.hash(), loader, fetchOnline);
public int removeAllUrlReferences(final DigestURI url, LoaderDispatcher loader, final CrawlProfile.CacheStrategy cacheStrategy) {
return removeAllUrlReferences(url.hash(), loader, cacheStrategy);
}
public void removeAllUrlReferences(final HandleSet urls, LoaderDispatcher loader, final boolean fetchOnline) {
for (byte[] urlhash: urls) removeAllUrlReferences(urlhash, loader, fetchOnline);
public void removeAllUrlReferences(final HandleSet urls, LoaderDispatcher loader, final CrawlProfile.CacheStrategy cacheStrategy) {
for (byte[] urlhash: urls) removeAllUrlReferences(urlhash, loader, cacheStrategy);
}
public int removeAllUrlReferences(final byte[] urlhash, LoaderDispatcher loader, final boolean fetchOnline) {
// find all the words in a specific resource and remove the url reference from every word index
// finally, delete the url entry
/**
* find all the words in a specific resource and remove the url reference from every word index
* finally, delete the url entry
* @param urlhash the hash of the url that shall be removed
* @param loader
* @param cacheStrategy
* @return number of removed words
*/
public int removeAllUrlReferences(final byte[] urlhash, LoaderDispatcher loader, final CrawlProfile.CacheStrategy cacheStrategy) {
if (urlhash == null) return 0;
// determine the url string
final URIMetadataRow entry = urlMetadata().load(urlhash, null, 0);
@ -384,7 +391,7 @@ public class Segment {
// get the resource content
byte[] resourceb = null;
try {
resourceb = loader.getResource(metadata.url(), fetchOnline, 10000, true, false);
resourceb = loader.getResource(metadata.url(), cacheStrategy, 10000, true, false);
} catch (IOException e) {
Log.logWarning("removeAllUrlReferences", "cannot load: " + e.getMessage());
}

@ -48,6 +48,7 @@ import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.util.ByteArray;
import net.yacy.repository.LoaderDispatcher;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.retrieval.Response;
import de.anomic.http.client.Cache;
import de.anomic.http.server.ResponseHeader;
@ -308,7 +309,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
return al;
}
public static TextSnippet retrieveTextSnippet(final LoaderDispatcher loader, final URIMetadataRow.Components comp, final HandleSet queryhashes, final boolean fetchOnline, final boolean pre, final int snippetMaxLength, final int maxDocLen, final boolean reindexing) {
public static TextSnippet retrieveTextSnippet(final LoaderDispatcher loader, final URIMetadataRow.Components comp, final HandleSet queryhashes, final CrawlProfile.CacheStrategy cacheStrategy, final boolean pre, final int snippetMaxLength, final int maxDocLen, final boolean reindexing) {
// heise = "0OQUNU3JSs05"
final DigestURI url = comp.url();
if (queryhashes.isEmpty()) {
@ -351,11 +352,11 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
// trying to load the resource from the cache
resContent = Cache.getContent(url);
responseHeader = Cache.getResponseHeader(url);
if ((resContent == null || responseHeader == null) && fetchOnline) {
if ((resContent == null || responseHeader == null) && cacheStrategy.isAllowedToFetchOnline()) {
// if not found try to download it
// download resource using the crawler and keep resource in memory if possible
final Response entry = loader.load(url, true, reindexing, Long.MAX_VALUE);
// download resource or get it from the cache
final Response entry = loader.load(url, true, reindexing, cacheStrategy, Long.MAX_VALUE);
// get resource metadata (e.g. the http headers for http resources)
if (entry != null) {
@ -371,10 +372,17 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
}
}
// if it is still not available, report an error
if (resContent == null) return new TextSnippet(url, null, ERROR_RESOURCE_LOADING, queryhashes, "error loading resource from net, no cache entry");
source = SOURCE_WEB;
}
if (resContent == null) {
// in case that we did not get any result we can still return a success when we are not allowed to go online
if (cacheStrategy.mustBeOffline()) {
return new TextSnippet(url, null, ERROR_SOURCE_LOADING, queryhashes, "omitted network load (not allowed), no cache entry");
}
// if it is still not available, report an error
return new TextSnippet(url, null, ERROR_RESOURCE_LOADING, queryhashes, "error loading resource from net, no cache entry");
}
}
} catch (final Exception e) {
//Log.logException(e);

@ -783,7 +783,7 @@ public final class serverCore extends AbstractBusyThread implements BusyThread {
} catch (final Exception e) {
log.logSevere("command execution, generic exception " + e.getMessage() + " for client " + this.userAddress.getHostAddress(), e);
// whatever happens: the thread has to survive!
writeLine("UNKNOWN REASON:" + this.commandObj.error(e));
writeLine("UNKNOWN REASON:" + ((this.commandObj == null) ? "no command object" : this.commandObj.error(e)));
break;
}
// check if we should still keep this alive:

@ -134,10 +134,11 @@ public class pdfParser extends AbstractParser implements Idiom {
Writer writer = null;
File writerFile = null;
PDFTextStripper stripper = null;
try {
// create a writer for output
writer = new CharBuffer();
final PDFTextStripper stripper = new PDFTextStripper();
stripper = new PDFTextStripper();
stripper.writeText(theDocument, writer); // may throw a NPE
theDocument.close();
writer.close();
@ -150,11 +151,12 @@ public class pdfParser extends AbstractParser implements Idiom {
if (writerFile != null) FileUtils.deletedelete(writerFile);
throw new ParserException(e.getMessage(), location);
}
String[] docKeywords = null;
if (docKeywordStr != null) docKeywords = docKeywordStr.split(" |,");
Document theDoc = null;
if (docTitle == null) docTitle = docSubject;
if (writer instanceof CharBuffer) {
byte[] contentBytes;
@ -170,7 +172,7 @@ public class pdfParser extends AbstractParser implements Idiom {
"UTF-8",
null,
docKeywords,
(docTitle == null) ? docSubject : docTitle,
docTitle,
docAuthor,
docPublisher,
null,
@ -186,7 +188,7 @@ public class pdfParser extends AbstractParser implements Idiom {
"UTF-8",
null,
docKeywords,
(docTitle == null) ? docSubject : docTitle,
docTitle,
docAuthor,
docPublisher,
null,

@ -99,19 +99,11 @@ public final class LoaderDispatcher {
return (HashSet<String>) this.supportedProtocols.clone();
}
public Response load(
final DigestURI url,
final boolean forText,
final boolean global,
final long maxFileSize) throws IOException {
return load(request(url, forText, global), maxFileSize);
}
/**
* load a resource from the web, from ftp, from smb or a file
* @param url
* @param forText
* @param global
* @param forText shows that this was a for-text crawling request
* @param global shows that this was a global crawling request
@param cacheStrategy strategy according to CACHE_STRATEGY_NOCACHE,CACHE_STRATEGY_IFFRESH,CACHE_STRATEGY_IFEXIST,CACHE_STRATEGY_CACHEONLY
* @return the loaded entity in a Response object
* @throws IOException
@ -169,13 +161,6 @@ public final class LoaderDispatcher {
0);
}
public Response load(final Request request, long maxFileSize) throws IOException {
CrawlProfile.entry crawlProfile = sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle());
CrawlProfile.CacheStrategy cacheStrategy = CrawlProfile.CacheStrategy.IFEXIST;
if (crawlProfile != null) cacheStrategy = crawlProfile.cacheStrategy();
return load(request, cacheStrategy, maxFileSize);
}
public Response load(final Request request, CrawlProfile.CacheStrategy cacheStrategy, long maxFileSize) throws IOException {
// get the protocol of the next URL
final String protocol = request.url().getProtocol();
@ -295,15 +280,10 @@ public final class LoaderDispatcher {
* @return the content as {@link byte[]}
* @throws IOException
*/
public byte[] getResource(final DigestURI url, final boolean fetchOnline, final int socketTimeout, final boolean forText, final boolean reindexing) throws IOException {
byte[] resource = Cache.getContent(url);
if (resource != null) return resource;
if (!fetchOnline) return null;
public byte[] getResource(final DigestURI url, CrawlProfile.CacheStrategy cacheStrategy, final int socketTimeout, final boolean forText, final boolean reindexing) throws IOException {
// try to download the resource using the loader
final long maxFileSize = sb.getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
final Response entry = load(url, forText, reindexing, maxFileSize);
final Response entry = load(url, forText, reindexing, cacheStrategy, maxFileSize);
if (entry == null) return null; // not found in web
// read resource body (if it is there)
@ -322,45 +302,27 @@ public final class LoaderDispatcher {
* @param global the domain of the search. If global == true then the content is re-indexed
* @return the parsed document as {@link Document}
*/
public static Document retrieveDocument(final DigestURI url, final boolean fetchOnline, final int timeout, final boolean forText, final boolean global, long maxFileSize) {
public static Document retrieveDocument(final DigestURI url, final CrawlProfile.CacheStrategy cacheStrategy, final int timeout, final boolean forText, final boolean global, long maxFileSize) {
// load resource
byte[] resContent = null;
ResponseHeader responseHeader = null;
try {
// trying to load the resource from the cache
resContent = Cache.getContent(url);
responseHeader = Cache.getResponseHeader(url);
if (resContent != null) {
// if the content was found
} else if (fetchOnline) {
// if not found try to download it
// download resource using the crawler and keep resource in memory if possible
final Response entry = Switchboard.getSwitchboard().loader.load(url, forText, global, maxFileSize);
// getting resource metadata (e.g. the http headers for http resources)
if (entry != null) {
final Response entry = Switchboard.getSwitchboard().loader.load(url, forText, global, cacheStrategy, maxFileSize);
if (entry == null) {
Log.logFine("snippet fetch", "no Response for url " + url);
return null;
}
// read resource body (if it is there)
final byte[] resourceArray = entry.getContent();
if (resourceArray != null) {
resContent = resourceArray;
} else {
resContent = Cache.getContent(url);
}
// read a fresh header
responseHeader = entry.getResponseHeader();
}
// if it is still not available, report an error
if (resContent == null) {
Log.logFine("snippet fetch", "plasmaHTCache.Entry cache is NULL for url " + url);
return null;
}
} else {
Log.logFine("snippet fetch", "no resource available for url " + url);
// read resource body (if it is there)
resContent = entry.getContent();
// read a fresh header
responseHeader = entry.getResponseHeader();
// if it is still not available, report an error
if (resContent == null || responseHeader == null) {
Log.logFine("snippet fetch", "no Content available for url " + url);
return null;
}
} catch (final Exception e) {
