redesign of access to the HTCache (now http.client.Cache):

- better control over the cache by using combined request-header and content access methods
- refactoring of many classes to comply with this new access method
- make sure that the cache is always written when something has been loaded
- some redesign of the process by which http response results are fed into the new indexing queue
- introduction of a cache read policy (a minimal sketch follows after this list):
 * never use the cache
 * use the cache if an entry exists
 * use the cache if the proxy freshness rule confirms it
 * use only the cache and never go online
- added configuration options for the crawl profiles to use the new cache policies. There is not yet an input during crawl start to set the policy, but this will be added in another step.
- set the default policies for the existing crawl profiles. If you want them to appear in your default profiles you must delete the crawl profiles database; otherwise the policy is 'proxy freshness rule'
- enhanced some cache access methods so that unnecessary retrievals are omitted (e.g. for size computation). This should reduce some IO and also a lot of CPU load, because sizes were previously computed by decompressing the content after retrieving it from disk.
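
The following is a minimal sketch of the cache read policy described above. The four constant names match CrawlProfile in this commit; the surrounding class, the CachedEntry placeholder type and the useCache helper are simplified assumptions for illustration, not the actual LoaderDispatcher code.

// Sketch of the cache read policy selection, assuming a simplified CachedEntry type.
public class CachePolicySketch {

    public static final int CACHE_STRATEGY_NOCACHE   = 0; // never use the cache
    public static final int CACHE_STRATEGY_IFEXIST   = 1; // use the cache if an entry exists
    public static final int CACHE_STRATEGY_IFFRESH   = 2; // use the cache if the proxy freshness rule confirms it
    public static final int CACHE_STRATEGY_CACHEONLY = 3; // use only the cache, never go online

    /** Placeholder for a cached response (header + content); not a YaCy class. */
    public static class CachedEntry {
        private final boolean freshForProxy;
        public CachedEntry(boolean freshForProxy) { this.freshForProxy = freshForProxy; }
        public boolean isFreshForProxy() { return freshForProxy; }
    }

    /**
     * Decide whether a cached entry may be served under the given strategy.
     * Returns the entry if it may be used, null if the caller must go online,
     * and throws if CACHEONLY is requested but nothing is in the cache.
     */
    public static CachedEntry useCache(int strategy, CachedEntry cached) {
        if (strategy == CACHE_STRATEGY_NOCACHE) return null;              // always load online
        if (cached == null) {
            if (strategy == CACHE_STRATEGY_CACHEONLY)
                throw new IllegalStateException("offline mode: entry not in cache");
            return null;                                                  // nothing cached, go online
        }
        if (strategy == CACHE_STRATEGY_IFEXIST || strategy == CACHE_STRATEGY_CACHEONLY)
            return cached;                                                // take it regardless of age
        // CACHE_STRATEGY_IFFRESH: apply the proxy freshness rule
        return cached.isFreshForProxy() ? cached : null;
    }

    public static void main(String[] args) {
        System.out.println(useCache(CACHE_STRATEGY_IFFRESH, new CachedEntry(false))); // null -> reload
        System.out.println(useCache(CACHE_STRATEGY_IFEXIST, new CachedEntry(false))); // cached entry
    }
}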

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6239 6c8d7289-2bf4-0310-a012-ef5d649a1542

@ -147,7 +147,8 @@ public class QuickCrawlLink_p {
remoteIndexing,
xsstopw,
xdstopw,
xpstopw
xpstopw,
CrawlProfile.CACHE_STRATEGY_IFFRESH
);
} catch (final Exception e) {
// mist

@ -146,15 +146,15 @@ public class ViewFile {
ResponseHeader responseHeader = null;
String resMime = null;
// trying to load the resource body
resource = Cache.getResourceContentStream(url);
resource = Cache.getContentStream(url);
resourceLength = Cache.getResourceContentLength(url);
responseHeader = Cache.loadResponseHeader(url);
responseHeader = Cache.getResponseHeader(url);
// if the resource body was not cached we try to load it from web
if (resource == null) {
Response entry = null;
try {
entry = sb.crawlQueues.loadResourceFromWeb(url, true, false);
entry = sb.loader.load(url, true, false);
} catch (final Exception e) {
prop.put("error", "4");
prop.putHTML("error_errorText", e.getMessage());
@ -163,7 +163,7 @@ public class ViewFile {
}
if (entry != null) {
resource = Cache.getResourceContentStream(url);
resource = Cache.getContentStream(url);
resourceLength = Cache.getResourceContentLength(url);
}
@ -180,7 +180,7 @@ public class ViewFile {
// try to load the metadata from cache
try {
responseHeader = Cache.loadResponseHeader(url);
responseHeader = Cache.getResponseHeader(url);
} catch (final Exception e) {
/* ignore this */
}

@ -224,7 +224,7 @@ public class WatchCrawler_p {
crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages,
crawlingQ,
indexText, indexMedia,
storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw);
storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw, CrawlProfile.CACHE_STRATEGY_IFFRESH);
final String reasonString = sb.crawlStacker.stackCrawl(new Request(
sb.peers.mySeed().hash,
url,
@ -350,7 +350,8 @@ public class WatchCrawler_p {
storeHTCache,
true,
crawlOrder,
xsstopw, xdstopw, xpstopw);
xsstopw, xdstopw, xpstopw,
CrawlProfile.CACHE_STRATEGY_IFFRESH);
// pause local crawl here
sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
@ -408,7 +409,9 @@ public class WatchCrawler_p {
crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages,
crawlingQ,
indexText, indexMedia,
storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw);
storeHTCache, true, crawlOrder,
xsstopw, xdstopw, xpstopw,
CrawlProfile.CACHE_STRATEGY_IFFRESH);
// create a new sitemap importer
final SitemapImporter importerThread = new SitemapImporter(sb, sb.dbImportManager, new yacyURL(sitemapURLStr, null), pe);

@ -168,7 +168,8 @@ public class CrawlProfile {
final boolean indexText, final boolean indexMedia,
final boolean storeHTCache, final boolean storeTXCache,
final boolean remoteIndexing,
final boolean xsstopw, final boolean xdstopw, final boolean xpstopw) {
final boolean xsstopw, final boolean xdstopw, final boolean xpstopw,
final int cacheStrategy) {
final entry ne = new entry(
name, startURL,
@ -179,7 +180,8 @@ public class CrawlProfile {
indexText, indexMedia,
storeHTCache, storeTXCache,
remoteIndexing,
xsstopw, xdstopw, xpstopw);
xsstopw, xdstopw, xpstopw,
cacheStrategy);
try {
profileTable.put(ne.handle(), ne.map());
} catch (final kelondroException e) {
@ -247,6 +249,11 @@ public class CrawlProfile {
}
public final static int CACHE_STRATEGY_NOCACHE = 0;
public final static int CACHE_STRATEGY_IFEXIST = 1;
public final static int CACHE_STRATEGY_IFFRESH = 2;
public final static int CACHE_STRATEGY_CACHEONLY = 3;
public static class entry {
// this is a simple record structure that hold all properties of a single crawl start
@ -268,6 +275,7 @@ public class CrawlProfile {
public static final String XSSTOPW = "xsstopw";
public static final String XDSTOPW = "xdstopw";
public static final String XPSTOPW = "xpstopw";
public static final String CACHE_STRAGEGY = "cacheStrategy";
Map<String, String> mem;
private ConcurrentHashMap<String, DomProfile> doms;
@ -284,7 +292,8 @@ public class CrawlProfile {
final boolean indexText, final boolean indexMedia,
final boolean storeHTCache, final boolean storeTXCache,
final boolean remoteIndexing,
final boolean xsstopw, final boolean xdstopw, final boolean xpstopw) {
final boolean xsstopw, final boolean xdstopw, final boolean xpstopw,
final int cacheStrategy) {
if (name == null || name.length() == 0) throw new NullPointerException("name must not be null");
final String handle = (startURL == null) ? Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(Long.toString(System.currentTimeMillis()))).substring(0, yacySeedDB.commonHashLength) : startURL.hash();
mem = new HashMap<String, String>();
@ -306,7 +315,7 @@ public class CrawlProfile {
mem.put(XSSTOPW, Boolean.toString(xsstopw)); // exclude static stop-words
mem.put(XDSTOPW, Boolean.toString(xdstopw)); // exclude dynamic stop-word
mem.put(XPSTOPW, Boolean.toString(xpstopw)); // exclude parent stop-words
mem.put(CACHE_STRAGEGY, Integer.toString(cacheStrategy));
doms = new ConcurrentHashMap<String, DomProfile>();
}
@ -368,6 +377,15 @@ public class CrawlProfile {
return 0;
}
}
public int cacheStrategy() {
final String r = mem.get(CACHE_STRAGEGY);
if (r == null) return CACHE_STRATEGY_IFFRESH;
try {
return Integer.parseInt(r);
} catch (final NumberFormatException e) {
return CACHE_STRATEGY_IFFRESH;
}
}
public long recrawlIfOlder() {
// returns a long (millis) that is the minimum age that
// an entry must have to be re-crawled

@ -38,7 +38,6 @@ import java.util.concurrent.ConcurrentHashMap;
import de.anomic.content.RSSMessage;
import de.anomic.crawler.retrieval.Request;
import de.anomic.crawler.retrieval.LoaderDispatcher;
import de.anomic.crawler.retrieval.Response;
import de.anomic.document.parser.xml.RSSFeed;
import de.anomic.http.client.Client;
@ -59,7 +58,6 @@ public class CrawlQueues {
protected Switchboard sb;
protected Log log;
protected Map<Integer, crawlWorker> workers; // mapping from url hash to Worker thread object
protected LoaderDispatcher loader;
private final ArrayList<String> remoteCrawlProviderHashes;
public NoticedURL noticeURL;
@ -69,7 +67,6 @@ public class CrawlQueues {
this.sb = sb;
this.log = new Log("CRAWLER");
this.workers = new ConcurrentHashMap<Integer, crawlWorker>();
this.loader = new LoaderDispatcher(sb, log);
this.remoteCrawlProviderHashes = new ArrayList<String>();
// start crawling management
@ -94,7 +91,7 @@ public class CrawlQueues {
if (delegatedURL.exists(hash)) return "delegated";
if (errorURL.exists(hash)) return "errors";
for (final crawlWorker worker: workers.values()) {
if (worker.entry.url().hash().equals(hash)) return "worker";
if (worker.request.url().hash().equals(hash)) return "worker";
}
return null;
}
@ -115,7 +112,7 @@ public class CrawlQueues {
ee = errorURL.getEntry(urlhash);
if (ee != null) return ee.url();
for (final crawlWorker w: workers.values()) {
if (w.entry.url().hash().equals(urlhash)) return w.entry.url();
if (w.request.url().hash().equals(urlhash)) return w.request.url();
}
return null;
}
@ -170,15 +167,11 @@ public class CrawlQueues {
synchronized (workers) {
final Request[] e = new Request[workers.size()];
int i = 0;
for (final crawlWorker w: workers.values()) e[i++] = w.entry;
for (final crawlWorker w: workers.values()) e[i++] = w.request;
return e;
}
}
public boolean isSupportedProtocol(final String protocol) {
return loader.isSupportedProtocol(protocol);
}
public int coreCrawlJobSize() {
return noticeURL.stackSize(NoticedURL.STACK_TYPE_CORE);
}
@ -243,7 +236,7 @@ public class CrawlQueues {
// check if the protocol is supported
final yacyURL url = urlEntry.url();
final String urlProtocol = url.getProtocol();
if (this.isSupportedProtocol(urlProtocol)) {
if (sb.loader.isSupportedProtocol(urlProtocol)) {
if (this.log.isFine())
log.logFine(stats + ": URL=" + urlEntry.url()
@ -494,48 +487,20 @@ public class CrawlQueues {
}
}
public Response loadResourceFromWeb(
final yacyURL url,
final boolean forText,
final boolean global
) throws IOException {
final Request centry = new Request(
sb.peers.mySeed().hash,
url,
"",
"",
new Date(),
new Date(),
(forText) ?
((global) ?
sb.crawler.defaultTextSnippetGlobalProfile.handle() :
sb.crawler.defaultTextSnippetLocalProfile.handle())
:
((global) ?
sb.crawler.defaultMediaSnippetGlobalProfile.handle() :
sb.crawler.defaultMediaSnippetLocalProfile.handle()), // crawl profile
0,
0,
0);
return loader.load(centry);
}
public int size() {
return workers.size();
}
protected final class crawlWorker extends Thread {
protected Request entry;
protected Request request;
private final Integer code;
private long start;
public crawlWorker(final Request entry) {
this.start = System.currentTimeMillis();
this.entry = entry;
this.entry.setStatus("worker-initialized", serverProcessorJob.STATUS_INITIATED);
this.request = entry;
this.request.setStatus("worker-initialized", serverProcessorJob.STATUS_INITIATED);
this.code = Integer.valueOf(entry.hashCode());
if (!workers.containsKey(code)) {
workers.put(code, this);
@ -550,39 +515,57 @@ public class CrawlQueues {
public void run() {
try {
// checking robots.txt for http(s) resources
this.entry.setStatus("worker-checkingrobots", serverProcessorJob.STATUS_STARTED);
if ((entry.url().getProtocol().equals("http") || entry.url().getProtocol().equals("https")) && sb.robots.isDisallowed(entry.url())) {
if (log.isFine()) log.logFine("Crawling of URL '" + entry.url().toString() + "' disallowed by robots.txt.");
this.request.setStatus("worker-checkingrobots", serverProcessorJob.STATUS_STARTED);
if ((request.url().getProtocol().equals("http") || request.url().getProtocol().equals("https")) && sb.robots.isDisallowed(request.url())) {
if (log.isFine()) log.logFine("Crawling of URL '" + request.url().toString() + "' disallowed by robots.txt.");
final ZURL.Entry eentry = errorURL.newEntry(
this.entry,
this.request,
sb.peers.mySeed().hash,
new Date(),
1,
"denied by robots.txt");
eentry.store();
errorURL.push(eentry);
this.entry.setStatus("worker-disallowed", serverProcessorJob.STATUS_FINISHED);
this.request.setStatus("worker-disallowed", serverProcessorJob.STATUS_FINISHED);
} else {
// starting a load from the internet
this.entry.setStatus("worker-loading", serverProcessorJob.STATUS_RUNNING);
final String result = loader.process(this.entry);
this.request.setStatus("worker-loading", serverProcessorJob.STATUS_RUNNING);
String result = null;
// load a resource, store it to htcache and push queue entry to switchboard queue
// returns null if everything went fine, a fail reason string if a problem occurred
Response response;
try {
request.setStatus("loading", serverProcessorJob.STATUS_RUNNING);
response = sb.loader.load(request);
assert response != null;
request.setStatus("loaded", serverProcessorJob.STATUS_RUNNING);
final boolean stored = sb.toIndexer(response);
request.setStatus("enqueued-" + ((stored) ? "ok" : "fail"), serverProcessorJob.STATUS_FINISHED);
result = (stored) ? null : "not enqueued to indexer";
} catch (IOException e) {
request.setStatus("error", serverProcessorJob.STATUS_FINISHED);
if (log.isFine()) log.logFine("problem loading " + request.url().toString() + ": " + e.getMessage());
result = "load error - " + e.getMessage();
}
if (result != null) {
final ZURL.Entry eentry = errorURL.newEntry(
this.entry,
this.request,
sb.peers.mySeed().hash,
new Date(),
1,
"cannot load: " + result);
eentry.store();
errorURL.push(eentry);
this.entry.setStatus("worker-error", serverProcessorJob.STATUS_FINISHED);
this.request.setStatus("worker-error", serverProcessorJob.STATUS_FINISHED);
} else {
this.entry.setStatus("worker-processed", serverProcessorJob.STATUS_FINISHED);
this.request.setStatus("worker-processed", serverProcessorJob.STATUS_FINISHED);
}
}
} catch (final Exception e) {
final ZURL.Entry eentry = errorURL.newEntry(
this.entry,
this.request,
sb.peers.mySeed().hash,
new Date(),
1,
@ -591,7 +574,7 @@ public class CrawlQueues {
errorURL.push(eentry);
e.printStackTrace();
Client.initConnectionManager();
this.entry.setStatus("worker-exception", serverProcessorJob.STATUS_FINISHED);
this.request.setStatus("worker-exception", serverProcessorJob.STATUS_FINISHED);
} finally {
crawlWorker w = workers.remove(code);
assert w != null;

@ -46,7 +46,7 @@ public final class CrawlStacker {
private Log log = new Log("STACKCRAWL");
private serverProcessor<Request> fastQueue, slowQueue;
private serverProcessor<Request> fastQueue, slowQueue;
private long dnsHit, dnsMiss;
private CrawlQueues nextQueue;
private CrawlSwitchboard crawler;
@ -177,7 +177,7 @@ public final class CrawlStacker {
// check if the protocol is supported
final String urlProtocol = entry.url().getProtocol();
if (!nextQueue.isSupportedProtocol(urlProtocol)) {
if (!Switchboard.getSwitchboard().loader.isSupportedProtocol(urlProtocol)) {
this.log.logSevere("Unsupported protocol in URL '" + entry.url().toString() + "'. " +
"Stack processing time: " + (System.currentTimeMillis() - startTime) + "ms");
return "unsupported protocol";

@ -178,37 +178,38 @@ public final class CrawlSwitchboard {
true /*getConfigBool(PROXY_INDEXING_LOCAL_TEXT, true)*/,
true /*getConfigBool(PROXY_INDEXING_LOCAL_MEDIA, true)*/,
true, true,
false /*getConfigBool(PROXY_INDEXING_REMOTE, false)*/, true, true, true);
false /*getConfigBool(PROXY_INDEXING_REMOTE, false)*/, true, true, true,
CrawlProfile.CACHE_STRATEGY_IFFRESH);
}
if (this.defaultRemoteProfile == null) {
// generate new default entry for remote crawling
defaultRemoteProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
-1, -1, -1, true, true, true, false, true, false, true, true, false);
-1, -1, -1, true, true, true, false, true, false, true, true, false, CrawlProfile.CACHE_STRATEGY_IFFRESH);
}
if (this.defaultTextSnippetLocalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
defaultTextSnippetLocalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, -1, true, false, false, false, false, false, true, true, false);
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, -1, true, false, false, false, false, false, true, true, false, CrawlProfile.CACHE_STRATEGY_IFFRESH);
}
if (this.defaultTextSnippetGlobalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
defaultTextSnippetGlobalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, -1, true, true, true, true, true, false, true, true, false);
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, -1, true, true, true, true, true, false, true, true, false, CrawlProfile.CACHE_STRATEGY_CACHEONLY);
}
if (this.defaultMediaSnippetLocalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
defaultMediaSnippetLocalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, false, false, false, false, true, true, false);
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, false, false, false, false, true, true, false, CrawlProfile.CACHE_STRATEGY_IFEXIST);
}
if (this.defaultMediaSnippetGlobalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
defaultMediaSnippetGlobalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, true, true, true, false, true, true, false);
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, true, true, true, false, true, true, false, CrawlProfile.CACHE_STRATEGY_IFEXIST);
}
if (this.defaultSurrogateProfile == null) {
// generate new default entry for surrogate parsing
defaultSurrogateProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, -1, true, true, false, false, false, false, true, true, false);
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, -1, true, true, false, false, false, false, true, true, false, CrawlProfile.CACHE_STRATEGY_NOCACHE);
}
}

@ -34,7 +34,6 @@ import java.util.Date;
import de.anomic.crawler.Latency;
import de.anomic.document.Parser;
import de.anomic.http.client.Cache;
import de.anomic.http.metadata.HeaderFramework;
import de.anomic.http.metadata.RequestHeader;
import de.anomic.http.metadata.ResponseHeader;
@ -110,20 +109,22 @@ public class FTPLoader {
// directory -> get list of files
RequestHeader requestHeader = new RequestHeader();
if (request.referrerhash() != null) requestHeader.put(RequestHeader.REFERER, sb.getURL(request.referrerhash()).toNormalform(true, false));
ResponseHeader responseHeader = new ResponseHeader();
responseHeader.put(HeaderFramework.LAST_MODIFIED, DateFormatter.formatRFC1123(new Date()));
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
response = new Response(
request,
requestHeader,
responseHeader,
"OK",
sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()));
Cache.storeMetadata(request.url(), responseHeader);
byte[] dirList = generateDirlist(ftpClient, request, path);
if (dirList == null) {
response = null;
} else {
ResponseHeader responseHeader = new ResponseHeader();
responseHeader.put(HeaderFramework.LAST_MODIFIED, DateFormatter.formatRFC1123(new Date()));
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
response = new Response(
request,
requestHeader,
responseHeader,
"OK",
sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()),
dirList);
}
} else {
// file -> download
@ -236,6 +237,9 @@ public class FTPLoader {
// determine the file date
final Date fileDate = ftpClient.entryDate(path);
// download the remote file
byte[] b = ftpClient.get(path);
// create a cache entry
RequestHeader requestHeader = new RequestHeader();
if (request.referrerhash() != null) requestHeader.put(RequestHeader.REFERER, sb.getURL(request.referrerhash()).toNormalform(true, false));
@ -247,12 +251,8 @@ public class FTPLoader {
requestHeader,
responseHeader,
"OK",
sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()));
Cache.storeMetadata(request.url(), responseHeader);
// download the remote file
byte[] b = ftpClient.get(path);
response.setContent(b);
sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()),
b);
} else {
log.logInfo("REJECTED TOO BIG FILE with size " + size + " Bytes for URL " + request.url().toString());
sb.crawlQueues.errorURL.newEntry(request, this.sb.peers.mySeed().hash, new Date(), 1, "file size limit exceeded");

@ -32,7 +32,6 @@ import de.anomic.crawler.Latency;
import de.anomic.data.Blacklist;
import de.anomic.document.Parser;
import de.anomic.http.client.Client;
import de.anomic.http.client.Cache;
import de.anomic.http.metadata.HeaderFramework;
import de.anomic.http.metadata.RequestHeader;
import de.anomic.http.metadata.ResponseContainer;
@ -135,18 +134,6 @@ public final class HTTPLoader {
if (res.getStatusCode() == 200 || res.getStatusCode() == 203) {
// the transfer is ok
// create a new cache entry
response = new Response(
request,
requestHeader,
res.getResponseHeader(),
res.getStatusLine(),
sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle())
);
Cache.storeMetadata(request.url(), res.getResponseHeader());
// request has been placed and result has been returned. work off response
// if the response has not the right file type then reject file
supportError = Parser.supports(request.url(), res.getResponseHeader().mime());
if (supportError != null) {
@ -165,7 +152,15 @@ public final class HTTPLoader {
throw new IOException("REJECTED URL " + request.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes. (GET)");
}
response.setContent(responseBody);
// create a new cache entry
response = new Response(
request,
requestHeader,
res.getResponseHeader(),
res.getStatusLine(),
sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()),
responseBody
);
return response;
} else if (res.getStatusLine().startsWith("30")) {

@ -28,14 +28,20 @@ package de.anomic.crawler.retrieval;
import java.io.IOException;
import java.util.Arrays;
import java.util.Date;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import de.anomic.crawler.CrawlProfile;
import de.anomic.http.client.Cache;
import de.anomic.http.metadata.HeaderFramework;
import de.anomic.http.metadata.RequestHeader;
import de.anomic.http.metadata.ResponseHeader;
import de.anomic.search.Switchboard;
import de.anomic.server.serverCore;
import de.anomic.server.serverProcessorJob;
import de.anomic.yacy.yacyURL;
import de.anomic.yacy.logging.Log;
public final class LoaderDispatcher {
@ -44,17 +50,17 @@ public final class LoaderDispatcher {
private static final ConcurrentHashMap<String, Long> accessTime = new ConcurrentHashMap<String, Long>(); // to protect targets from DDoS
private final Switchboard sb;
private final Log log;
private final HashSet<String> supportedProtocols;
private final HTTPLoader httpLoader;
private final FTPLoader ftpLoader;
private final Log log;
public LoaderDispatcher(final Switchboard sb, final Log log) {
public LoaderDispatcher(final Switchboard sb) {
this.sb = sb;
this.log = log;
this.supportedProtocols = new HashSet<String>(Arrays.asList(new String[]{"http","https","ftp"}));
// initiate loader objects
this.log = new Log("LOADER");
httpLoader = new HTTPLoader(sb, log);
ftpLoader = new FTPLoader(sb, log);
}
@ -69,17 +75,100 @@ public final class LoaderDispatcher {
return (HashSet<String>) this.supportedProtocols.clone();
}
public Response load(final Request entry) throws IOException {
// getting the protocol of the next URL
final String protocol = entry.url().getProtocol();
final String host = entry.url().getHost();
public Response load(
final yacyURL url,
final boolean forText,
final boolean global
) throws IOException {
final Request centry = new Request(
sb.peers.mySeed().hash,
url,
"",
"",
new Date(),
new Date(),
(forText) ?
((global) ?
sb.crawler.defaultTextSnippetGlobalProfile.handle() :
sb.crawler.defaultTextSnippetLocalProfile.handle())
:
((global) ?
sb.crawler.defaultMediaSnippetGlobalProfile.handle() :
sb.crawler.defaultMediaSnippetLocalProfile.handle()), // crawl profile
0,
0,
0);
return load(centry);
}
public Response load(final Request request) throws IOException {
// get the protocol of the next URL
final String protocol = request.url().getProtocol();
final String host = request.url().getHost();
// check if this loads a page from localhost, which must be prevented to protect the server
// against attacks to the administration interface when localhost access is granted
if (serverCore.isLocalhost(host) && sb.getConfigBool("adminAccountForLocalhost", false)) throw new IOException("access to localhost not granted for url " + entry.url());
if (serverCore.isLocalhost(host) && sb.getConfigBool("adminAccountForLocalhost", false)) throw new IOException("access to localhost not granted for url " + request.url());
// check if we have the page in the cache
CrawlProfile.entry crawlProfile = sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle());
int cacheStrategy = CrawlProfile.CACHE_STRATEGY_NOCACHE;
if (crawlProfile != null && (cacheStrategy = crawlProfile.cacheStrategy()) != CrawlProfile.CACHE_STRATEGY_NOCACHE) {
// we have passed a first test if caching is allowed
// now see if there is a cache entry
ResponseHeader cachedResponse = (request.url().isLocal()) ? null : Cache.getResponseHeader(request.url());
byte[] content = (cachedResponse == null) ? null : Cache.getContent(request.url());
if (cachedResponse != null && content != null) {
// yes we have the content
// create request header values and a response object because we need that
// in case that we want to return the cached content in the next step
final RequestHeader requestHeader = new RequestHeader();
requestHeader.put(HeaderFramework.USER_AGENT, HTTPLoader.crawlerUserAgent);
yacyURL refererURL = null;
if (request.referrerhash() != null) refererURL = sb.getURL(request.referrerhash());
if (refererURL != null) requestHeader.put(RequestHeader.REFERER, refererURL.toNormalform(true, true));
Response response = new Response(
request,
requestHeader,
cachedResponse,
"200",
crawlProfile,
content);
// check which caching strategy shall be used
if (cacheStrategy == CrawlProfile.CACHE_STRATEGY_IFEXIST || cacheStrategy == CrawlProfile.CACHE_STRATEGY_CACHEONLY) {
// well, just take the cache and don't care about freshness of the content
log.logInfo("cache hit/useall for: " + request.url().toNormalform(true, false));
return response;
}
// now the cacheStrategy must be CACHE_STRATEGY_IFFRESH, that means we should do a proxy freshness test
assert cacheStrategy == CrawlProfile.CACHE_STRATEGY_IFFRESH : "cacheStrategy = " + cacheStrategy;
if (response.isFreshForProxy()) {
log.logInfo("cache hit/fresh for: " + request.url().toNormalform(true, false));
return response;
} else {
log.logInfo("cache hit/stale for: " + request.url().toNormalform(true, false));
}
}
}
// check access time
if (!entry.url().isLocal()) {
// check case where we want results from the cache exclusively, and never from the internet (offline mode)
if (cacheStrategy == CrawlProfile.CACHE_STRATEGY_CACHEONLY) {
// we had a chance to get the content from the cache .. its over. We don't have it.
return null;
}
// now forget about the cache, nothing there. Try to load the content from the internet
// check access time: this is a double-check (we checked possibly already in the balancer)
// to make shure that we don't DoS the target by mistake
if (!request.url().isLocal()) {
final Long lastAccess = accessTime.get(host);
long wait = 0;
if (lastAccess != null) wait = Math.max(0, minDelay + lastAccess.longValue() - System.currentTimeMillis());
@ -91,13 +180,26 @@ public final class LoaderDispatcher {
try {Thread.sleep(untilTime - System.currentTimeMillis());} catch (final InterruptedException ee) {}
}
}
// now it's for shure that we will access the target. Remember the access time
accessTime.put(host, System.currentTimeMillis());
// load resource
if ((protocol.equals("http") || (protocol.equals("https")))) return httpLoader.load(entry);
if (protocol.equals("ftp")) return ftpLoader.load(entry);
// load resource from the internet
Response response = null;
if ((protocol.equals("http") || (protocol.equals("https")))) response = httpLoader.load(request);
if (protocol.equals("ftp")) response = ftpLoader.load(request);
if (response != null) {
// we got something. Now check if we want to store that to the cache
String storeError = response.shallStoreCache();
if (storeError == null) {
Cache.store(request.url(), response.getResponseHeader(), response.getContent());
} else {
if (Cache.log.isFine()) Cache.log.logFine("no storage of url " + request.url() + ": " + storeError);
}
return response;
}
throw new IOException("Unsupported protocol '" + protocol + "' in url " + entry.url());
throw new IOException("Unsupported protocol '" + protocol + "' in url " + request.url());
}
public synchronized void cleanupAccessTimeTable(long timeout) {
@ -109,24 +211,4 @@ public final class LoaderDispatcher {
if (System.currentTimeMillis() - e.getValue().longValue() > minDelay) i.remove();
}
}
public String process(final Request entry) {
// load a resource, store it to htcache and push queue entry to switchboard queue
// returns null if everything went fine, a fail reason string if a problem occurred
Response h;
try {
entry.setStatus("loading", serverProcessorJob.STATUS_RUNNING);
h = load(entry);
assert h != null;
entry.setStatus("loaded", serverProcessorJob.STATUS_RUNNING);
final boolean stored = sb.htEntryStoreProcess(h);
entry.setStatus("stored-" + ((stored) ? "ok" : "fail"), serverProcessorJob.STATUS_FINISHED);
return (stored) ? null : "not stored";
} catch (IOException e) {
entry.setStatus("error", serverProcessorJob.STATUS_FINISHED);
if (log.isFine()) log.logFine("problem loading " + entry.url().toString() + ": " + e.getMessage());
return "load error - " + e.getMessage();
}
}
}

@ -1,4 +1,4 @@
// CrawlEntry.java
// Request.java
// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 14.03.2007 on http://yacy.net
//
@ -69,7 +69,7 @@ public class Request extends serverProcessorJob {
private long loaddate; // the time when the url was loaded
private long serverdate; // the document date from the target server
private long imsdate; // the time of a ifModifiedSince request
private String profileHandle; // the name of the prefetch profile
private String profileHandle; // the name of the fetch profile
private int depth; // the prefetch depth so far, starts at 0
private int anchors; // number of anchors of the parent
private int forkfactor; // sum of anchors of all ancestors
@ -80,7 +80,7 @@ public class Request extends serverProcessorJob {
/**
* A HarvestRequest Entry is a object that is created to provide
* A Request Entry is a object that is created to provide
* all information to load a specific resource.
*
* @param initiator the hash of the initiator peer

@ -145,7 +145,8 @@ public class Response {
final RequestHeader requestHeader,
final ResponseHeader responseHeader,
final String responseStatus,
final CrawlProfile.entry profile) {
final CrawlProfile.entry profile,
final byte[] content) {
this.request = request;
// request and response headers may be zero in case that we process surrogates
this.requestHeader = requestHeader;
@ -153,15 +154,26 @@ public class Response {
this.responseStatus = responseStatus;
this.profile = profile;
this.status = QUEUE_STATE_FRESH;
// to be defined later:
this.content = null;
this.content = content;
}
public Response(
Request request,
final RequestHeader requestHeader,
final ResponseHeader responseHeader,
final String responseStatus,
final CrawlProfile.entry profile) {
this(request, requestHeader, responseHeader, responseStatus, profile, null);
}
public void updateStatus(final int newStatus) {
this.status = newStatus;
}
public ResponseHeader getResponseHeader() {
return this.responseHeader;
}
public int getStatus() {
return this.status;
}
@ -241,7 +253,7 @@ public class Response {
* @return NULL if the answer is TRUE, in case of FALSE, the reason as
* String is returned
*/
public String shallStoreCacheForProxy() {
public String shallStoreCache() {
// check profile (disabled: we will check this in the plasmaSwitchboard)
// if (!this.profile.storeHTCache()) { return "storage_not_wanted"; }
@ -252,7 +264,7 @@ public class Response {
// check storage size: all files will be handled in RAM before storage, so they must not exceed
// a given size, which we consider as 1MB
if (this.size() > 1024L * 1024L) return "too_large_for_caching_" + this.size();
if (this.size() > 10 * 1024L * 1024L) return "too_large_for_caching_" + this.size();
// check status code
if (!validResponseStatus()) {
@ -265,10 +277,15 @@ public class Response {
if (this.url().isPOST() && !this.profile.crawlingQ()) {
return "dynamic_post";
}
if (this.url().isCGI()) {
return "dynamic_cgi";
}
if (this.url().isLocal()) {
return "local_URL_no_cache_needed";
}
if (requestHeader != null) {
// -authorization cases in request
// authorization makes pages very individual, and therefore we cannot use the
@ -338,7 +355,7 @@ public class Response {
*
* @return whether the file should be taken from the cache
*/
public boolean shallUseCacheForProxy() {
public boolean isFreshForProxy() {
// -CGI access in request
// CGI access makes the page very individual, and therefore not usable
@ -488,7 +505,7 @@ public class Response {
// check profile
if (!profile().indexText() && !profile().indexMedia()) {
return "indexing not allowed - indexText and indexMedia not set (for proxy)";
return "indexing not allowed - indexText and indexMedia not set (for proxy = " + profile.name()+ ")";
}
// -CGI access in request
@ -629,7 +646,7 @@ public class Response {
// check profile
if (!profile().indexText() && !profile().indexMedia()) {
return "indexing not allowed - indexText and indexMedia not set (for crawler)";
return "indexing not allowed - indexText and indexMedia not set (for crawler = " + profile.name()+ ")";
}
// -CGI access in request

@ -329,6 +329,7 @@ public class SitemapParser extends DefaultHandler {
// remote Indexing disabled
false,
// exclude stop-words
true, true, true);
true, true, true,
CrawlProfile.CACHE_STRATEGY_IFFRESH);
}
}

@ -170,9 +170,17 @@ public class bookmarksDB {
Integer.parseInt(parser[5]), Integer.parseInt(parser[6]), Boolean.parseBoolean(parser[7]),
Boolean.parseBoolean(parser[8]), Boolean.parseBoolean(parser[9]),
Boolean.parseBoolean(parser[10]), Boolean.parseBoolean(parser[11]),
Boolean.parseBoolean(parser[12])
);
Boolean.parseBoolean(parser[12]), CrawlProfile.CACHE_STRATEGY_IFFRESH
);
}
if (parser.length == 14) {
folderReCrawl(Long.parseLong(parser[0]), parser[1], parser[2], Integer.parseInt(parser[3]), Long.parseLong(parser[4]),
Integer.parseInt(parser[5]), Integer.parseInt(parser[6]), Boolean.parseBoolean(parser[7]),
Boolean.parseBoolean(parser[8]), Boolean.parseBoolean(parser[9]),
Boolean.parseBoolean(parser[10]), Boolean.parseBoolean(parser[11]),
Boolean.parseBoolean(parser[12]), Integer.parseInt(parser[13])
);
}
}
}
in.close();
@ -204,9 +212,9 @@ public class bookmarksDB {
return true;
}
public void folderReCrawl (long schedule, String folder, String crawlingfilter, int newcrawlingdepth, long crawlingIfOlder,
public void folderReCrawl(long schedule, String folder, String crawlingfilter, int newcrawlingdepth, long crawlingIfOlder,
int crawlingDomFilterDepth, int crawlingDomMaxPages, boolean crawlingQ, boolean indexText, boolean indexMedia,
boolean crawlOrder, boolean xsstopw, boolean storeHTCache) {
boolean crawlOrder, boolean xsstopw, boolean storeHTCache, int cacheStrategy) {
Switchboard sb = Switchboard.getSwitchboard();
Iterator<String> bit=getBookmarksIterator(folder, true);
@ -261,7 +269,7 @@ public class bookmarksDB {
sb.crawler.profilesActiveCrawls.getRecrawlDate(crawlingIfOlder), crawlingDomFilterDepth, crawlingDomMaxPages,
crawlingQ,
indexText, indexMedia,
storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw);
storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw, cacheStrategy);
sb.crawlStacker.enqueueEntry(new Request(
sb.peers.mySeed().hash,
crawlingStartURL,

@ -63,7 +63,7 @@ public final class Cache {
private static long maxCacheSize = 0l;
private static File cachePath = null;
private static String prefix;
private static final Log log = new Log("HTCACHE");
public static final Log log = new Log("HTCACHE");
public static void init(final File htCachePath, String peerSalt, final long CacheSizeMax) {
@ -103,31 +103,39 @@ public final class Cache {
fileDBunbuffered.setMaxSize(maxCacheSize);
}
/**
* close the databases
*/
public static void close() {
responseHeaderDB.close();
fileDB.close(true);
}
// Store to Cache
public static void storeMetadata(final yacyURL url, final ResponseHeader responseHeader) {
if (responseHeader != null) try {
public static void store(yacyURL url, final ResponseHeader responseHeader, byte[] file) {
if (responseHeader != null && file != null) try {
// store the response header into the header database
final HashMap<String, String> hm = new HashMap<String, String>();
hm.putAll(responseHeader);
hm.put("@@URL", url.toNormalform(true, false));
responseHeaderDB.put(url.hash(), hm);
} catch (final Exception e) {
log.logWarning("could not write ResourceInfo: "
+ e.getClass() + ": " + e.getMessage());
fileDB.put(url.hash().getBytes("UTF-8"), file);
if (log.isFine()) log.logFine("stored in cache: " + url.toNormalform(true, false));
} catch (IOException e) {
e.printStackTrace();
}
}
public static void storeFile(yacyURL url, byte[] file) {
/**
* check if the responseHeaderDB and the fileDB has an entry for the given url
* @param url the url of the resource
* @return true if the content of the url is in the cache, false othervise
*/
public static boolean has(final yacyURL url) {
try {
fileDB.put(url.hash().getBytes("UTF-8"), file);
return responseHeaderDB.has(url.hash()) && fileDB.has(url.hash().getBytes());
} catch (IOException e) {
e.printStackTrace();
return false;
}
}
@ -140,7 +148,7 @@ public final class Cache {
* @throws <b>UnsupportedProtocolException</b> if the protocol is not supported and therefore the
* info object couldn't be created
*/
public static ResponseHeader loadResponseHeader(final yacyURL url) {
public static ResponseHeader getResponseHeader(final yacyURL url) {
// loading data from database
Map<String, String> hdb;
@ -161,14 +169,21 @@ public final class Cache {
* is available or the cached file is not readable, <code>null</code>
* is returned.
*/
public static InputStream getResourceContentStream(final yacyURL url) {
public static InputStream getContentStream(final yacyURL url) {
// load the url as resource from the cache
byte[] b = getResourceContent(url);
byte[] b = getContent(url);
if (b == null) return null;
return new ByteArrayInputStream(b);
}
public static byte[] getResourceContent(final yacyURL url) {
/**
* Returns the content of a cached resource as byte[]
* @param url the requested resource
* @return the resource content as byte[]. In no data
* is available or the cached file is not readable, <code>null</code>
* is returned.
*/
public static byte[] getContent(final yacyURL url) {
// load the url as resource from the cache
try {
return fileDB.get(url.hash().getBytes("UTF-8"));
@ -178,8 +193,24 @@ public final class Cache {
}
}
/**
* requesting the content length of a resource is discouraged since it may
* be performed by loading of the resource from the cache and then measuring the
* size after decompression of the content. This may use a lot of CPU resources
* and maybe cause also high IO. Please omit usage of this method as much as possible.
* @param url
* @return the size of the cached content
*/
public static long getResourceContentLength(final yacyURL url) {
// load the url as resource from the cache
// first try to get the length from the response header,
// this is less costly than loading the content from its gzipped cache
ResponseHeader responseHeader = getResponseHeader(url);
if (responseHeader != null) {
long length = responseHeader.getContentLength();
if (length > 0) return length;
}
// load the url as resource from the cache (possibly decompress it),
// and get the length from the content array size
try {
return fileDB.length(url.hash().getBytes("UTF-8"));
} catch (IOException e) {
@ -188,7 +219,12 @@ public final class Cache {
}
}
public static void deleteFromCache(yacyURL url) throws IOException {
/**
* removed response header and cached content from the database
* @param url
* @throws IOException
*/
public static void delete(yacyURL url) throws IOException {
responseHeaderDB.remove(url.hash());
fileDB.remove(url.hash().getBytes("UTF-8"));
}

@ -36,9 +36,24 @@ public class MultiOutputStream extends OutputStream {
*/
@Override
public void write(int b) throws IOException {
for(OutputStream stream: streams) {
for (OutputStream stream: streams) {
stream.write(b);
}
}
/**
* writes the byte[] to each of the streams
* overriding this high-level method causes less overhead
* than overriding only the low-level write method:
* it causes (a large number) less 'for' loops
*
* @see java.io.OutputStream#write(int)
*/
@Override
public void write(byte[] b, int start, int len) throws IOException {
for (OutputStream stream: streams) {
stream.write(b, start, len);
}
}
}

@ -358,7 +358,7 @@ public final class HTTPDProxyHandler {
// handle outgoing cookies
handleOutgoingCookies(requestHeader, host, ip);
prepareRequestHeader(conProp, requestHeader, hostlow);
ResponseHeader cachedResponseHeader = Cache.loadResponseHeader(url);
ResponseHeader cachedResponseHeader = Cache.getResponseHeader(url);
// why are files unzipped upon arrival? why not zip all files in cache?
// This follows from the following premises
@ -404,10 +404,8 @@ public final class HTTPDProxyHandler {
"200 OK",
sb.crawler.defaultProxyProfile
);
//Cache.storeMetadata(cachedResponseHeader, response); // TODO: check if this storeMetadata is necessary
byte[] cacheContent = Cache.getResourceContent(url);
if (cacheContent != null && response.shallUseCacheForProxy()) {
byte[] cacheContent = Cache.getContent(url);
if (cacheContent != null && response.isFreshForProxy()) {
if (theLogger.isFinest()) theLogger.logFinest(reqID + " fulfill request from cache");
fulfillRequestFromCache(conProp, url, requestHeader, cachedResponseHeader, cacheContent, countedRespond);
} else {
@ -502,7 +500,7 @@ public final class HTTPDProxyHandler {
if (cachedResponseHeader != null) {
// delete the cache
sizeBeforeDelete = Cache.getResourceContentLength(url);
Cache.deleteFromCache(url);
Cache.delete(url);
conProp.setProperty(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_CODE, "TCP_REFRESH_MISS");
}
@ -518,14 +516,7 @@ public final class HTTPDProxyHandler {
0,
0,
0);
final Response response = new Response(
request,
requestHeader,
responseHeader,
res.getStatusLine(),
sb.crawler.defaultProxyProfile
);
Cache.storeMetadata(request.url(), responseHeader);
// handle incoming cookies
handleIncomingCookies(responseHeader, host, ip);
@ -549,8 +540,14 @@ public final class HTTPDProxyHandler {
if (hasBody(res.getStatusCode())) {
final OutputStream outStream = (gzippedOut != null) ? gzippedOut : ((chunkedOut != null)? chunkedOut : respond);
final String storeError = response.shallStoreCacheForProxy();
final Response response = new Response(
request,
requestHeader,
responseHeader,
res.getStatusLine(),
sb.crawler.defaultProxyProfile
);
final String storeError = response.shallStoreCache();
final boolean storeHTCache = response.profile().storeHTCache();
final String supportError = Parser.supports(response.url(), response.getMimeType());
if (
@ -582,22 +579,21 @@ public final class HTTPDProxyHandler {
if (sizeBeforeDelete == -1) {
// totally fresh file
//cacheEntry.status = plasmaHTCache.CACHE_FILL; // it's an insert
response.setContent(cacheArray);
sb.htEntryStoreProcess(response);
conProp.setProperty(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_CODE,"TCP_MISS");
Cache.store(response.url(), response.getResponseHeader(), cacheArray);
sb.toIndexer(response);
conProp.setProperty(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_CODE, "TCP_MISS");
} else if (cacheArray != null && sizeBeforeDelete == cacheArray.length) {
// before we came here we deleted a cache entry
cacheArray = null;
//cacheEntry.status = plasmaHTCache.CACHE_STALE_RELOAD_BAD;
//cacheManager.push(cacheEntry); // unnecessary update
conProp.setProperty(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_CODE,"TCP_REF_FAIL_HIT");
conProp.setProperty(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_CODE, "TCP_REF_FAIL_HIT");
} else {
// before we came here we deleted a cache entry
//cacheEntry.status = plasmaHTCache.CACHE_STALE_RELOAD_GOOD;
response.setContent(cacheArray);
sb.htEntryStoreProcess(response);
conProp.setProperty(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_CODE,"TCP_REFRESH_MISS");
Cache.store(response.url(), response.getResponseHeader(), cacheArray);
sb.toIndexer(response);
conProp.setProperty(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_CODE, "TCP_REFRESH_MISS");
}
} else {
// no caching

@ -150,7 +150,7 @@ public class Compressor implements BLOB {
return null;
}
} else if (ByteArray.equals(b, plainMagic)) {
System.out.print("-"); // DEBUG
//System.out.print("-"); // DEBUG
byte[] r = new byte[b.length - 2];
System.arraycopy(b, 2, r, 0, b.length - 2);
return r;

@ -344,8 +344,8 @@ public class SnippetCache {
return new TextSnippet(url, loc, SOURCE_METADATA, null, null, faviconCache.get(url.hash()));
} else {
// trying to load the resource from the cache
resContent = Cache.getResourceContentStream(url);
responseHeader = Cache.loadResponseHeader(url);
resContent = Cache.getContentStream(url);
responseHeader = Cache.getResponseHeader(url);
if (resContent != null && ((resContentLength = Cache.getResourceContentLength(url)) > maxDocLen) && (!fetchOnline)) {
// content may be too large to be parsed here. To be fast, we omit calculation of snippet here
return new TextSnippet(url, null, ERROR_SOURCE_LOADING, queryhashes, "resource available, but too large: " + resContentLength + " bytes");
@ -353,12 +353,12 @@ public class SnippetCache {
// if not found try to download it
// download resource using the crawler and keep resource in memory if possible
final Response entry = Switchboard.getSwitchboard().crawlQueues.loadResourceFromWeb(url, true, reindexing);
final Response entry = Switchboard.getSwitchboard().loader.load(url, true, reindexing);
// getting resource metadata (e.g. the http headers for http resources)
if (entry != null) {
// place entry on crawl queue
sb.htEntryStoreProcess(entry);
// place entry on indexing queue
sb.toIndexer(entry);
// read resource body (if it is there)
final byte []resourceArray = entry.getContent();
@ -366,7 +366,7 @@ public class SnippetCache {
resContent = new ByteArrayInputStream(resourceArray);
resContentLength = resourceArray.length;
} else {
resContent = Cache.getResourceContentStream(url);
resContent = Cache.getContentStream(url);
resContentLength = Cache.getResourceContentLength(url);
}
}
@ -456,8 +456,8 @@ public class SnippetCache {
ResponseHeader responseHeader = null;
try {
// trying to load the resource from the cache
resContent = Cache.getResourceContentStream(url);
responseHeader = Cache.loadResponseHeader(url);
resContent = Cache.getContentStream(url);
responseHeader = Cache.getResponseHeader(url);
if (resContent != null) {
// if the content was found
resContentLength = Cache.getResourceContentLength(url);
@ -465,7 +465,7 @@ public class SnippetCache {
// if not found try to download it
// download resource using the crawler and keep resource in memory if possible
final Response entry = Switchboard.getSwitchboard().crawlQueues.loadResourceFromWeb(url, forText, global);
final Response entry = Switchboard.getSwitchboard().loader.load(url, forText, global);
// getting resource metadata (e.g. the http headers for http resources)
if (entry != null) {
@ -476,7 +476,7 @@ public class SnippetCache {
resContent = new ByteArrayInputStream(resourceArray);
resContentLength = resourceArray.length;
} else {
resContent = Cache.getResourceContentStream(url);
resContent = Cache.getContentStream(url);
resContentLength = Cache.getResourceContentLength(url);
}
}
@ -844,7 +844,7 @@ public class SnippetCache {
if (responseHeader == null) {
// try to get the header from the htcache directory
try {
responseHeader = Cache.loadResponseHeader(url);
responseHeader = Cache.getResponseHeader(url);
} catch (final Exception e) {
// ignore this. resource info loading failed
}
@ -897,14 +897,14 @@ public class SnippetCache {
long contentLength = -1;
// trying to load the resource body from cache
InputStream resource = Cache.getResourceContentStream(url);
InputStream resource = Cache.getContentStream(url);
if (resource != null) {
contentLength = Cache.getResourceContentLength(url);
} else if (fetchOnline) {
// if the content is not available in cache try to download it from web
// try to download the resource using a crawler
final Response entry = Switchboard.getSwitchboard().crawlQueues.loadResourceFromWeb(url, forText, reindexing);
final Response entry = Switchboard.getSwitchboard().loader.load(url, forText, reindexing);
if (entry == null) return null; // not found in web
// read resource body (if it is there)
@ -912,7 +912,7 @@ public class SnippetCache {
// in case that the resource was not in ram, read it from disk
if (resourceArray == null) {
resource = Cache.getResourceContentStream(url);
resource = Cache.getContentStream(url);
contentLength = Cache.getResourceContentLength(url);
} else {
resource = new ByteArrayInputStream(resourceArray);

@ -230,6 +230,7 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi
public File surrogatesOutPath;
public Map<String, String> rankingPermissions;
public Segment indexSegment;
public LoaderDispatcher loader;
public CrawlSwitchboard crawler;
public CrawlQueues crawlQueues;
public ResultURLs crawlResults;
@ -514,6 +515,7 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi
// start a loader
log.logConfig("Starting Crawl Loader");
this.loader = new LoaderDispatcher(this);
this.crawlQueues = new CrawlQueues(this, queuesRoot);
this.crawlQueues.noticeURL.setMinimumDelta(
this.getConfigLong("minimumLocalDelta", this.crawlQueues.noticeURL.getMinimumLocalDelta()),
@ -1092,90 +1094,6 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi
return this.crawler.cleanProfiles();
}
public boolean htEntryStoreProcess(final Response entry) {
if (entry == null) return false;
/* =========================================================================
* PARSER SUPPORT
*
* Testing if the content type is supported by the available parsers
* ========================================================================= */
final String supportError = Parser.supports(entry.url(), entry.getMimeType());
if (log.isFinest()) log.logFinest("STORE "+ entry.url() +" content of type "+ entry.getMimeType() + " is supported: " + supportError);
/* =========================================================================
* INDEX CONTROL HEADER
*
* With the X-YACY-Index-Control header set to "no-index" a client could disallow
* yacy to index the response returned as answer to a request
* ========================================================================= */
boolean doIndexing = true;
if (entry.requestProhibitsIndexing()) {
doIndexing = false;
if (this.log.isFine()) this.log.logFine("Crawling of " + entry.url() + " prohibited by request.");
}
/* =========================================================================
* LOCAL IP ADDRESS CHECK
*
* check if ip is local ip address // TODO: remove this procotol specific code here
* ========================================================================= */
final String urlRejectReason = crawlStacker.urlInAcceptedDomain(entry.url());
if (urlRejectReason != null) {
if (this.log.isFine()) this.log.logFine("Rejected URL '" + entry.url() + "': " + urlRejectReason);
doIndexing = false;
}
/* =========================================================================
* STORING DATA
*
* Now we store the response header and response content if
* a) the user has configured to use the htcache or
* b) the content should be indexed
* ========================================================================= */
if (((entry.profile() != null) && (entry.profile().storeHTCache())) || (doIndexing && supportError == null)) {
// store response header
/*
if (entry.writeResourceInfo()) {
this.log.logInfo("WROTE HEADER for " + entry.cacheFile());
}
*/
// work off unwritten files
if (entry.getContent() != null) {
final String error = (entry.initiator() == null) ? entry.shallStoreCacheForProxy() : null;
if (error == null) {
Cache.storeFile(entry.url(), entry.getContent());
if (this.log.isFine()) this.log.logFine("WROTE FILE (" + entry.getContent().length + " bytes) for " + entry.url());
} else {
if (this.log.isWarning()) this.log.logWarning("WRITE OF FILE " + entry.url() + " FORBIDDEN: " + error);
}
//} else {
//this.log.logFine("EXISTING FILE (" + entry.cacheFile.length() + " bytes) for " + entry.cacheFile);
}
}
/* =========================================================================
* INDEXING
* ========================================================================= */
if (doIndexing && supportError == null) {
// enqueue for further crawling
enQueue(entry);
} else {
if (!entry.profile().storeHTCache()) {
try {
Cache.deleteFromCache(entry.url());
} catch (IOException e) {
e.printStackTrace();
}
}
}
return true;
}
public void close() {
log.logConfig("SWITCHBOARD SHUTDOWN STEP 1: sending termination signal to managed threads:");
serverProfiling.stopSystemProfiling();
@ -1215,44 +1133,65 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi
log.logConfig("SWITCHBOARD SHUTDOWN TERMINATED");
}
public void enQueue(final Response queueEntry) {
assert queueEntry != null;
public boolean toIndexer(final Response response) {
assert response != null;
// get next queue entry and start a queue processing
if (queueEntry == null) {
if (response == null) {
if (this.log.isFine()) log.logFine("deQueue: queue entry is null");
return;
return false;
}
if (queueEntry.profile() == null) {
if (response.profile() == null) {
if (this.log.isFine()) log.logFine("deQueue: profile is null");
return;
return false;
}
// check if the document should be indexed
// check if the document should be indexed based on proxy/crawler rules
String noIndexReason = "unspecified indexing error";
if (queueEntry.processCase(peers.mySeed().hash) == SwitchboardConstants.PROCESSCASE_4_PROXY_LOAD) {
if (response.processCase(peers.mySeed().hash) == SwitchboardConstants.PROCESSCASE_4_PROXY_LOAD) {
// proxy-load
noIndexReason = queueEntry.shallIndexCacheForProxy();
noIndexReason = response.shallIndexCacheForProxy();
} else {
// normal crawling
noIndexReason = queueEntry.shallIndexCacheForCrawler();
noIndexReason = response.shallIndexCacheForCrawler();
}
// check if the parser supports the mime type
if (noIndexReason == null) {
noIndexReason = Parser.supports(response.url(), response.getMimeType());
}
// check X-YACY-Index-Control
// With the X-YACY-Index-Control header set to "no-index" a client could disallow
// yacy to index the response returned as answer to a request
if (noIndexReason == null && response.requestProhibitsIndexing()) {
noIndexReason = "X-YACY-Index-Control header prohibits indexing";
}
// check accepted domain / localhost accesses
if (noIndexReason == null) {
noIndexReason = crawlStacker.urlInAcceptedDomain(response.url());
}
// in the noIndexReason is set, indexing is not allowed
if (noIndexReason != null) {
// this document should not be indexed. log cause and close queue
final yacyURL referrerURL = queueEntry.referrerURL();
if (log.isFine()) log.logFine("deQueue: not indexed any word in URL " + queueEntry.url() + "; cause: " + noIndexReason);
addURLtoErrorDB(queueEntry.url(), (referrerURL == null) ? "" : referrerURL.hash(), queueEntry.initiator(), queueEntry.name(), noIndexReason);
// log cause and close queue
final yacyURL referrerURL = response.referrerURL();
if (log.isFine()) log.logFine("deQueue: not indexed any word in URL " + response.url() + "; cause: " + noIndexReason);
addURLtoErrorDB(response.url(), (referrerURL == null) ? "" : referrerURL.hash(), response.initiator(), response.name(), noIndexReason);
// finish this entry
return;
return false;
}
// put document into the concurrent processing queue
if (log.isFinest()) log.logFinest("deQueue: passing entry to indexing queue");
if (log.isFinest()) log.logFinest("deQueue: passing to indexing queue: " + response.url().toNormalform(true, false));
try {
this.indexingDocumentProcessor.enQueue(new indexingQueueEntry(queueEntry, null, null));
this.indexingDocumentProcessor.enQueue(new indexingQueueEntry(response, null, null));
return true;
} catch (InterruptedException e) {
e.printStackTrace();
return false;
}
}
@ -1649,7 +1588,7 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi
try {
// parse the document
document = Parser.parseSource(entry.url(), entry.getMimeType(), entry.getCharacterEncoding(), Cache.getResourceContent(entry.url()));
document = Parser.parseSource(entry.url(), entry.getMimeType(), entry.getCharacterEncoding(), Cache.getContent(entry.url()));
assert(document != null) : "Unexpected error. Parser returned null.";
} catch (final ParserException e) {
this.log.logWarning("Unable to parse the resource '" + entry.url() + "'. " + e.getMessage());

@ -77,12 +77,12 @@ public class ymageOSM {
return null;
}
System.out.println("*** DEBUG: fetching OSM tile: " + tileURL.toNormalform(true, true));
InputStream tileStream = Cache.getResourceContentStream(tileURL);
InputStream tileStream = Cache.getContentStream(tileURL);
if (tileStream == null) {
// download resource using the crawler and keep resource in memory if possible
Response entry = null;
try {
entry = Switchboard.getSwitchboard().crawlQueues.loadResourceFromWeb(tileURL, false, false);
entry = Switchboard.getSwitchboard().loader.load(tileURL, false, false);
} catch (IOException e) {
Log.logWarning("yamyOSM", "cannot load: " + e.getMessage());
return null;
