code simplification

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6233 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 16 years ago
parent 1d8d51075c
commit 4da9042e8a

@@ -56,37 +56,20 @@ public class FTPLoader {
maxFileSize = (int) sb.getConfigLong("crawler.ftp.maxFileSize", -1l);
}
protected Response createCacheEntry(final Request request, final String mimeType, final Date fileDate) {
if (request == null) return null;
RequestHeader requestHeader = new RequestHeader();
if (request.referrerhash() != null) requestHeader.put(RequestHeader.REFERER, sb.getURL(request.referrerhash()).toNormalform(true, false));
ResponseHeader responseHeader = new ResponseHeader();
responseHeader.put(HeaderFramework.LAST_MODIFIED, DateFormatter.formatRFC1123(fileDate));
responseHeader.put(HeaderFramework.CONTENT_TYPE, mimeType);
Response metadata = new Response(
request,
requestHeader,
responseHeader,
"OK",
sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()));
Cache.storeMetadata(responseHeader, metadata);
return metadata;
}
/**
* Loads the entry from a ftp-server
*
* @param entry
* @param request
* @return
*/
public Response load(final Request entry) throws IOException {
public Response load(final Request request) throws IOException {
long start = System.currentTimeMillis();
final yacyURL entryUrl = entry.url();
final yacyURL entryUrl = request.url();
final String fullPath = getPath(entryUrl);
// the return value
Response htCache = null;
Response response = null;
// determine filename and path
String file, path;
@@ -125,16 +108,27 @@ public class FTPLoader {
if (file.length() == 0) {
// directory -> get list of files
// create a htcache entry
htCache = createCacheEntry(entry, "text/html", new Date());
byte[] dirList = generateDirlist(ftpClient, entry, path);
RequestHeader requestHeader = new RequestHeader();
if (request.referrerhash() != null) requestHeader.put(RequestHeader.REFERER, sb.getURL(request.referrerhash()).toNormalform(true, false));
ResponseHeader responseHeader = new ResponseHeader();
responseHeader.put(HeaderFramework.LAST_MODIFIED, DateFormatter.formatRFC1123(new Date()));
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
response = new Response(
request,
requestHeader,
responseHeader,
"OK",
sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()));
Cache.storeMetadata(request.url(), responseHeader);
byte[] dirList = generateDirlist(ftpClient, request, path);
if (dirList == null) {
htCache = null;
response = null;
}
} else {
// file -> download
try {
htCache = getFile(ftpClient, entry);
response = getFile(ftpClient, request);
} catch (final Exception e) {
// add message to errorLog
(new PrintStream(berr)).print(e.getMessage());
@@ -144,15 +138,15 @@ public class FTPLoader {
}
// pass the downloaded resource to the cache manager
if (berr.size() > 0 || htCache == null) {
if (berr.size() > 0 || response == null) {
// some error logging
final String detail = (berr.size() > 0) ? "\n Errorlog: " + berr.toString() : "";
sb.crawlQueues.errorURL.newEntry(entry, sb.peers.mySeed().hash, new Date(), 1, "server download" + detail);
throw new IOException("FTPLoader: Unable to download URL " + entry.url().toString() + detail);
sb.crawlQueues.errorURL.newEntry(request, sb.peers.mySeed().hash, new Date(), 1, "server download" + detail);
throw new IOException("FTPLoader: Unable to download URL " + request.url().toString() + detail);
}
Latency.update(entry.url().hash().substring(6), entry.url().getHost(), System.currentTimeMillis() - start);
return htCache;
Latency.update(request.url().hash().substring(6), request.url().getHost(), System.currentTimeMillis() - start);
return response;
}
/**
@@ -211,26 +205,26 @@ public class FTPLoader {
/**
* @param ftpClient
* @param entry
* @param request
* @param htCache
* @param cacheFile
* @return
* @throws Exception
*/
private Response getFile(final ftpc ftpClient, final Request entry) throws Exception {
private Response getFile(final ftpc ftpClient, final Request request) throws Exception {
// determine the mimetype of the resource
final yacyURL entryUrl = entry.url();
final yacyURL entryUrl = request.url();
final String mimeType = Parser.mimeOf(entryUrl);
final String path = getPath(entryUrl);
// if the mimetype and file extension is supported we start to download
// the file
Response htCache = null;
Response response = null;
String supportError = Parser.supports(entryUrl, mimeType);
if (supportError != null) {
// reject file
log.logInfo("PARSER REJECTED URL " + entry.url().toString() + ": " + supportError);
sb.crawlQueues.errorURL.newEntry(entry, this.sb.peers.mySeed().hash, new Date(), 1, supportError);
log.logInfo("PARSER REJECTED URL " + request.url().toString() + ": " + supportError);
sb.crawlQueues.errorURL.newEntry(request, this.sb.peers.mySeed().hash, new Date(), 1, supportError);
throw new Exception(supportError);
} else {
// abort the download if content is too long
@@ -242,19 +236,30 @@ public class FTPLoader {
// determine the file date
final Date fileDate = ftpClient.entryDate(path);
// create a htcache entry
htCache = createCacheEntry(entry, mimeType, fileDate);
// create a cache entry
RequestHeader requestHeader = new RequestHeader();
if (request.referrerhash() != null) requestHeader.put(RequestHeader.REFERER, sb.getURL(request.referrerhash()).toNormalform(true, false));
ResponseHeader responseHeader = new ResponseHeader();
responseHeader.put(HeaderFramework.LAST_MODIFIED, DateFormatter.formatRFC1123(fileDate));
responseHeader.put(HeaderFramework.CONTENT_TYPE, mimeType);
response = new Response(
request,
requestHeader,
responseHeader,
"OK",
sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()));
Cache.storeMetadata(request.url(), responseHeader);
// download the remote file
byte[] b = ftpClient.get(path);
htCache.setContent(b);
response.setContent(b);
} else {
log.logInfo("REJECTED TOO BIG FILE with size " + size + " Bytes for URL " + entry.url().toString());
sb.crawlQueues.errorURL.newEntry(entry, this.sb.peers.mySeed().hash, new Date(), 1, "file size limit exceeded");
log.logInfo("REJECTED TOO BIG FILE with size " + size + " Bytes for URL " + request.url().toString());
sb.crawlQueues.errorURL.newEntry(request, this.sb.peers.mySeed().hash, new Date(), 1, "file size limit exceeded");
throw new Exception("file size exceeds limit");
}
}
return htCache;
return response;
}
/**
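Note on the FTPLoader hunks above: the private createCacheEntry() helper is removed and its body is repeated at both call sites (directory listing and file download), with the header metadata now stored per URL via Cache.storeMetadata(request.url(), responseHeader). The following is a minimal sketch of the resulting pattern, assuming the YaCy trunk classes shown in the diff are on the classpath; the class name FtpResponseSketch and the package names for Request and DateFormatter are assumptions, not taken from this diff.

```java
import java.util.Date;

// packages for Request and DateFormatter are assumed; the diff only shows the simple names
import de.anomic.crawler.retrieval.Request;
import de.anomic.crawler.retrieval.Response;
import de.anomic.http.client.Cache;
import de.anomic.http.metadata.HeaderFramework;
import de.anomic.http.metadata.RequestHeader;
import de.anomic.http.metadata.ResponseHeader;
import de.anomic.kelondro.util.DateFormatter;
import de.anomic.search.Switchboard;

public final class FtpResponseSketch {

    /**
     * Mirrors the inlined pattern from the FTPLoader hunks: build the request and
     * response headers, construct the Response against the active crawl profile,
     * then persist the header metadata keyed by the URL.
     */
    static Response buildFtpResponse(final Switchboard sb, final Request request,
                                     final String mimeType, final Date fileDate) {
        final RequestHeader requestHeader = new RequestHeader();
        if (request.referrerhash() != null) {
            requestHeader.put(RequestHeader.REFERER,
                    sb.getURL(request.referrerhash()).toNormalform(true, false));
        }
        final ResponseHeader responseHeader = new ResponseHeader();
        responseHeader.put(HeaderFramework.LAST_MODIFIED, DateFormatter.formatRFC1123(fileDate));
        responseHeader.put(HeaderFramework.CONTENT_TYPE, mimeType);

        final Response response = new Response(
                request,
                requestHeader,
                responseHeader,
                "OK",
                sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()));

        // header metadata is now stored per URL instead of per Response object
        Cache.storeMetadata(request.url(), responseHeader);
        return response;
    }
}
```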

@@ -36,7 +36,6 @@ import de.anomic.http.client.Cache;
import de.anomic.http.metadata.HeaderFramework;
import de.anomic.http.metadata.RequestHeader;
import de.anomic.http.metadata.ResponseContainer;
import de.anomic.http.metadata.ResponseHeader;
import de.anomic.search.Switchboard;
import de.anomic.yacy.yacyURL;
import de.anomic.yacy.logging.Log;
@@ -73,27 +72,7 @@ public final class HTTPLoader {
// refreshing timeout value
this.socketTimeout = (int) sb.getConfigLong("crawler.clientTimeout", 10000);
}
/**
* @param entry
* @param requestDate
* @param requestHeader
* @param responseHeader
* @param responseStatus Status-Code SPACE Reason-Phrase
* @return
*/
protected Response createCacheEntry(final Request request, final Date requestDate, final RequestHeader requestHeader, final ResponseHeader responseHeader, final String responseStatus) {
Response metadata = new Response(
request,
requestHeader,
responseHeader,
responseStatus,
sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle())
);
Cache.storeMetadata(responseHeader, metadata);
return metadata;
}
}
public Response load(final Request entry) throws IOException {
long start = System.currentTimeMillis();
@@ -102,158 +81,143 @@ public final class HTTPLoader {
return doc;
}
private Response load(final Request entry, final int retryCount) throws IOException {
private Response load(final Request request, final int retryCount) throws IOException {
if (retryCount < 0) {
sb.crawlQueues.errorURL.newEntry(entry, sb.peers.mySeed().hash, new Date(), 1, "redirection counter exceeded").store();
throw new IOException("Redirection counter exceeded for URL " + entry.url().toString() + ". Processing aborted.");
sb.crawlQueues.errorURL.newEntry(request, sb.peers.mySeed().hash, new Date(), 1, "redirection counter exceeded").store();
throw new IOException("Redirection counter exceeded for URL " + request.url().toString() + ". Processing aborted.");
}
final Date requestDate = new Date(); // remember the time...
final String host = entry.url().getHost();
final String path = entry.url().getFile();
int port = entry.url().getPort();
final boolean ssl = entry.url().getProtocol().equals("https");
final String host = request.url().getHost();
final String path = request.url().getFile();
int port = request.url().getPort();
final boolean ssl = request.url().getProtocol().equals("https");
if (port < 0) port = (ssl) ? 443 : 80;
// if not the right file type then reject file
String supportError = Parser.supportsExtension(entry.url());
String supportError = Parser.supportsExtension(request.url());
if (supportError != null) {
sb.crawlQueues.errorURL.newEntry(entry, sb.peers.mySeed().hash, new Date(), 1, supportError);
sb.crawlQueues.errorURL.newEntry(request, sb.peers.mySeed().hash, new Date(), 1, supportError);
throw new IOException("REJECTED WRONG EXTENSION TYPE: " + supportError);
}
// check if url is in blacklist
final String hostlow = host.toLowerCase();
if (Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, hostlow, path)) {
sb.crawlQueues.errorURL.newEntry(entry, sb.peers.mySeed().hash, new Date(), 1, "url in blacklist").store();
throw new IOException("CRAWLER Rejecting URL '" + entry.url().toString() + "'. URL is in blacklist.");
sb.crawlQueues.errorURL.newEntry(request, sb.peers.mySeed().hash, new Date(), 1, "url in blacklist").store();
throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.");
}
// take a file from the net
Response htCache = null;
Response response = null;
final long maxFileSize = sb.getConfigLong("crawler.http.maxFileSize", DEFAULT_MAXFILESIZE);
//try {
// create a request header
final RequestHeader requestHeader = new RequestHeader();
requestHeader.put(HeaderFramework.USER_AGENT, crawlerUserAgent);
yacyURL refererURL = null;
if (entry.referrerhash() != null) refererURL = sb.getURL(entry.referrerhash());
if (refererURL != null) requestHeader.put(RequestHeader.REFERER, refererURL.toNormalform(true, true));
requestHeader.put(HeaderFramework.ACCEPT_LANGUAGE, sb.getConfig("crawler.http.acceptLanguage", DEFAULT_LANGUAGE));
requestHeader.put(HeaderFramework.ACCEPT_CHARSET, sb.getConfig("crawler.http.acceptCharset", DEFAULT_CHARSET));
requestHeader.put(HeaderFramework.ACCEPT_ENCODING, sb.getConfig("crawler.http.acceptEncoding", DEFAULT_ENCODING));
// HTTP-Client
final Client client = new Client(socketTimeout, requestHeader);
ResponseContainer res = null;
try {
// send request
res = client.GET(entry.url().toString(), maxFileSize);
// FIXME: 30*-handling (bottom) is never reached
// we always get the final content because httpClient.followRedirects = true
// create a request header
final RequestHeader requestHeader = new RequestHeader();
requestHeader.put(HeaderFramework.USER_AGENT, crawlerUserAgent);
yacyURL refererURL = null;
if (request.referrerhash() != null) refererURL = sb.getURL(request.referrerhash());
if (refererURL != null) requestHeader.put(RequestHeader.REFERER, refererURL.toNormalform(true, true));
requestHeader.put(HeaderFramework.ACCEPT_LANGUAGE, sb.getConfig("crawler.http.acceptLanguage", DEFAULT_LANGUAGE));
requestHeader.put(HeaderFramework.ACCEPT_CHARSET, sb.getConfig("crawler.http.acceptCharset", DEFAULT_CHARSET));
requestHeader.put(HeaderFramework.ACCEPT_ENCODING, sb.getConfig("crawler.http.acceptEncoding", DEFAULT_ENCODING));
if (res.getStatusCode() == 200 || res.getStatusCode() == 203) {
// the transfer is ok
// create a new cache entry
htCache = createCacheEntry(entry, requestDate, requestHeader, res.getResponseHeader(), res.getStatusLine());
// request has been placed and result has been returned. work off response
//try {
// if the response has not the right file type then reject file
supportError = Parser.supports(entry.url(), res.getResponseHeader().mime());
if (supportError != null) {
sb.crawlQueues.errorURL.newEntry(entry, sb.peers.mySeed().hash, new Date(), 1, supportError);
throw new IOException("REJECTED WRONG MIME TYPE: " + supportError);
}
// HTTP-Client
final Client client = new Client(socketTimeout, requestHeader);
ResponseContainer res = null;
try {
// send request
res = client.GET(request.url().toString(), maxFileSize);
// FIXME: 30*-handling (bottom) is never reached
// we always get the final content because httpClient.followRedirects = true
/*
// check if the content length is allowed
long contentLength = res.getResponseHeader().getContentLength();
if (maxFileSize > 0 && contentLength > maxFileSize) {
sb.crawlQueues.errorURL.newEntry(entry, sb.peers.mySeed().hash, new Date(), 1, "file size limit exceeded");
throw new IOException("REJECTED URL " + entry.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes. (HEAD)");
}
*/
if (res.getStatusCode() == 200 || res.getStatusCode() == 203) {
// the transfer is ok
// create a new cache entry
response = new Response(
request,
requestHeader,
res.getResponseHeader(),
res.getStatusLine(),
sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle())
);
Cache.storeMetadata(request.url(), res.getResponseHeader());
// request has been placed and result has been returned. work off response
// if the response has not the right file type then reject file
supportError = Parser.supports(request.url(), res.getResponseHeader().mime());
if (supportError != null) {
sb.crawlQueues.errorURL.newEntry(request, sb.peers.mySeed().hash, new Date(), 1, supportError);
throw new IOException("REJECTED WRONG MIME TYPE: " + supportError);
}
// we write the new cache entry to file system directly
res.setAccountingName("CRAWLER");
final byte[] responseBody = res.getData();
long contentLength = responseBody.length;
// check length again in case it was not possible to get the length before loading
if (maxFileSize > 0 && contentLength > maxFileSize) {
sb.crawlQueues.errorURL.newEntry(request, sb.peers.mySeed().hash, new Date(), 1, "file size limit exceeded");
throw new IOException("REJECTED URL " + request.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes. (GET)");
}
// we write the new cache entry to file system directly
res.setAccountingName("CRAWLER");
final byte[] responseBody = res.getData();
long contentLength = responseBody.length;
response.setContent(responseBody);
// check length again in case it was not possible to get the length before loading
if (maxFileSize > 0 && contentLength > maxFileSize) {
sb.crawlQueues.errorURL.newEntry(entry, sb.peers.mySeed().hash, new Date(), 1, "file size limit exceeded");
throw new IOException("REJECTED URL " + entry.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes. (GET)");
return response;
} else if (res.getStatusLine().startsWith("30")) {
if (res.getResponseHeader().containsKey(HeaderFramework.LOCATION)) {
// getting redirection URL
String redirectionUrlString = res.getResponseHeader().get(HeaderFramework.LOCATION);
redirectionUrlString = redirectionUrlString.trim();
if (redirectionUrlString.length() == 0) {
sb.crawlQueues.errorURL.newEntry(request, sb.peers.mySeed().hash, new Date(), 1, "redirection header empy");
throw new IOException("CRAWLER Redirection of URL=" + request.url().toString() + " aborted. Location header is empty.");
}
// normalizing URL
final yacyURL redirectionUrl = yacyURL.newURL(request.url(), redirectionUrlString);
htCache.setContent(responseBody);
// restart crawling with new url
this.log.logInfo("CRAWLER Redirection detected ('" + res.getStatusLine() + "') for URL " + request.url().toString());
this.log.logInfo("CRAWLER ..Redirecting request to: " + redirectionUrl);
return htCache;
/*
} catch (final SocketException e) {
// this may happen if the client suddenly closes its connection
// maybe the user has stopped loading
// in that case, we are not responsible and just forget it
// but we clean the cache also, since it may be only partial
// and most possible corrupted
this.log.logSevere("CRAWLER LOADER ERROR1: with URL=" + entry.url().toString() + ": " + e.toString());
sb.crawlQueues.errorURL.newEntry(entry, sb.webIndex.seedDB.mySeed().hash, new Date(), 1, ErrorURL.DENIED_CONNECTION_ERROR);
htCache = null;
}*/
} else if (res.getStatusLine().startsWith("30")) {
if (res.getResponseHeader().containsKey(HeaderFramework.LOCATION)) {
// getting redirection URL
String redirectionUrlString = res.getResponseHeader().get(HeaderFramework.LOCATION);
redirectionUrlString = redirectionUrlString.trim();
if (redirectionUrlString.length() == 0) {
sb.crawlQueues.errorURL.newEntry(entry, sb.peers.mySeed().hash, new Date(), 1, "redirection header empy");
throw new IOException("CRAWLER Redirection of URL=" + entry.url().toString() + " aborted. Location header is empty.");
}
// normalizing URL
final yacyURL redirectionUrl = yacyURL.newURL(entry.url(), redirectionUrlString);
// restart crawling with new url
this.log.logInfo("CRAWLER Redirection detected ('" + res.getStatusLine() + "') for URL " + entry.url().toString());
this.log.logInfo("CRAWLER ..Redirecting request to: " + redirectionUrl);
// if we are already doing a shutdown we don't need to retry crawling
if (Thread.currentThread().isInterrupted()) {
sb.crawlQueues.errorURL.newEntry(entry, sb.peers.mySeed().hash, new Date(), 1, "server shutdown");
throw new IOException("CRAWLER Retry of URL=" + entry.url().toString() + " aborted because of server shutdown.");
}
// generating url hash
final String urlhash = redirectionUrl.hash();
// check if the url was already indexed
final String dbname = sb.urlExists(urlhash);
if (dbname != null) {
sb.crawlQueues.errorURL.newEntry(entry, sb.peers.mySeed().hash, new Date(), 1, "redirection to double content");
throw new IOException("CRAWLER Redirection of URL=" + entry.url().toString() + " ignored. The url appears already in db " + dbname);
}
// retry crawling with new url
entry.redirectURL(redirectionUrl);
return load(entry, retryCount - 1);
}
} else {
// if the response has not the right response type then reject file
sb.crawlQueues.errorURL.newEntry(entry, sb.peers.mySeed().hash, new Date(), 1, "wrong http status code " + res.getStatusCode() + ")");
throw new IOException("REJECTED WRONG STATUS TYPE '" + res.getStatusLine() + "' for URL " + entry.url().toString());
}
} finally {
if(res != null) {
// release connection
res.closeStream();
// if we are already doing a shutdown we don't need to retry crawling
if (Thread.currentThread().isInterrupted()) {
sb.crawlQueues.errorURL.newEntry(request, sb.peers.mySeed().hash, new Date(), 1, "server shutdown");
throw new IOException("CRAWLER Retry of URL=" + request.url().toString() + " aborted because of server shutdown.");
}
// generating url hash
final String urlhash = redirectionUrl.hash();
// check if the url was already indexed
final String dbname = sb.urlExists(urlhash);
if (dbname != null) {
sb.crawlQueues.errorURL.newEntry(request, sb.peers.mySeed().hash, new Date(), 1, "redirection to double content");
throw new IOException("CRAWLER Redirection of URL=" + request.url().toString() + " ignored. The url appears already in db " + dbname);
}
// retry crawling with new url
request.redirectURL(redirectionUrl);
return load(request, retryCount - 1);
}
} else {
// if the response has not the right response type then reject file
sb.crawlQueues.errorURL.newEntry(request, sb.peers.mySeed().hash, new Date(), 1, "wrong http status code " + res.getStatusCode() + ")");
throw new IOException("REJECTED WRONG STATUS TYPE '" + res.getStatusLine() + "' for URL " + request.url().toString());
}
} finally {
if(res != null) {
// release connection
res.closeStream();
}
return htCache;
}
return response;
}
}
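For context on the redirect handling rewritten above: a 30x response decrements retryCount and re-enters load() with the normalized Location URL, so redirects are followed only a bounded number of times before the URL is reported as an error. Below is a standalone illustration of that control flow using java.net.HttpURLConnection rather than YaCy's Client/ResponseContainer classes; the class name and the example URL are illustrative only.

```java
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;

/**
 * Standalone sketch (not YaCy code) of the bounded-redirect pattern used in
 * HTTPLoader.load(request, retryCount): each 30x answer retries with a smaller
 * budget; when the counter is exhausted, loading aborts instead of looping.
 */
public final class BoundedRedirectSketch {

    static byte[] load(final URL url, final int retryCount) throws IOException {
        if (retryCount < 0) {
            throw new IOException("Redirection counter exceeded for URL " + url + ". Processing aborted.");
        }
        final HttpURLConnection con = (HttpURLConnection) url.openConnection();
        con.setInstanceFollowRedirects(false); // handle 30x ourselves, like the crawler does
        final int status = con.getResponseCode();
        if (status == 200 || status == 203) {
            try (InputStream in = con.getInputStream()) {
                return in.readAllBytes();
            }
        } else if (status >= 300 && status < 400) {
            final String location = con.getHeaderField("Location");
            if (location == null || location.trim().isEmpty()) {
                throw new IOException("Redirection of URL=" + url + " aborted. Location header is empty.");
            }
            // normalize relative redirects against the current URL, then retry with a smaller budget
            return load(new URL(url, location.trim()), retryCount - 1);
        }
        throw new IOException("REJECTED WRONG STATUS TYPE '" + status + "' for URL " + url);
    }

    public static void main(final String[] args) throws IOException {
        System.out.println(load(new URL("http://example.org/"), 3).length + " bytes loaded");
    }
}
```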

@@ -29,8 +29,8 @@ package de.anomic.crawler.retrieval;
import java.util.Date;
import de.anomic.crawler.CrawlProfile;
import de.anomic.document.Classification;
import de.anomic.document.Parser;
import de.anomic.http.client.Cache;
import de.anomic.http.metadata.HeaderFramework;
import de.anomic.http.metadata.RequestHeader;
import de.anomic.http.metadata.ResponseHeader;
@@ -182,10 +182,6 @@ public class Response {
return doctype;
}
public String urlHash() {
return this.url().hash();
}
public Date lastModified() {
Date docDate = null;
@@ -389,7 +385,7 @@ public class Response {
}
final String mimeType = getMimeType();
if (!Cache.isPicture(mimeType)) {
if (!Classification.isPictureMime(mimeType)) {
// -cookies in request
// unfortunately, we should reload in case of a cookie
// but we think that pictures can still be considered as fresh
@@ -513,7 +509,7 @@ public class Response {
// we checked that in shallStoreCache
// a picture cannot be indexed
if (Cache.noIndexingURL(url())) {
if (Classification.isMediaExtension(url().getFileExtension())) {
return "Media_Content_(forbidden)";
}
@@ -532,7 +528,7 @@ public class Response {
// a picture cannot be indexed
final String mimeType = responseHeader.mime();
if (Cache.isPicture(mimeType)) {
if (Classification.isPictureMime(mimeType)) {
return "Media_Content_(Picture)";
}
String parserError = Parser.supportsMime(mimeType);
@@ -652,11 +648,11 @@ public class Response {
// a picture cannot be indexed
if (responseHeader != null) {
final String mimeType = responseHeader.mime();
if (Cache.isPicture(mimeType)) { return "Media_Content_(Picture)"; }
if (Classification.isPictureMime(mimeType)) { return "Media_Content_(Picture)"; }
String parserError = Parser.supportsMime(mimeType);
if (parserError != null) { return "Media_Content, parser error: " + parserError; }
}
if (Cache.noIndexingURL(url())) { return "Media_Content_(forbidden)"; }
if (Classification.isMediaExtension(url().getFileExtension())) { return "Media_Content_(forbidden)"; }
// -if-modified-since in request
// if the page is fresh at the very moment we can index it

@@ -97,4 +97,9 @@ public class Classification {
return appsExtSet.contains(appsExt.trim().toLowerCase());
}
public static boolean isPictureMime(final String mimeType) {
if (mimeType == null) return false;
return mimeType.toUpperCase().startsWith("IMAGE");
}
}
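The new Classification.isPictureMime() above replaces Cache.isPicture(): it is null-safe and treats any MIME type whose upper-cased form starts with "IMAGE" as a picture. Here is a standalone copy of that check with a few example inputs; the wrapper class name is made up for the sketch.

```java
/** Standalone copy of the check added to Classification in this commit. */
public final class PictureMimeSketch {

    public static boolean isPictureMime(final String mimeType) {
        if (mimeType == null) return false;
        return mimeType.toUpperCase().startsWith("IMAGE");
    }

    public static void main(final String[] args) {
        System.out.println(isPictureMime("image/png")); // true
        System.out.println(isPictureMime("text/html")); // false
        System.out.println(isPictureMime(null));        // false, null-safe
    }
}
```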

@@ -41,8 +41,6 @@ import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;
import de.anomic.crawler.retrieval.Response;
import de.anomic.document.Classification;
import de.anomic.http.metadata.ResponseHeader;
import de.anomic.kelondro.blob.ArrayStack;
import de.anomic.kelondro.blob.Compressor;
@@ -109,44 +107,15 @@ public final class Cache {
responseHeaderDB.close();
fileDB.close(true);
}
public static boolean isPicture(final String mimeType) {
if (mimeType == null) return false;
return mimeType.toUpperCase().startsWith("IMAGE");
}
public static boolean noIndexingURL(final yacyURL url) {
if (url == null) return false;
String urlString = url.toString().toLowerCase();
//http://www.yacy.net/getimage.php?image.png
int idx = urlString.indexOf("?");
if (idx > 0) urlString = urlString.substring(0,idx);
//http://www.yacy.net/getimage.php
idx = urlString.lastIndexOf(".");
if (idx > 0) urlString = urlString.substring(idx+1);
//php
return Classification.isMediaExtension(urlString);
}
// Store to Cache
public static void storeMetadata(
final ResponseHeader responseHeader,
Response metadata
) {
public static void storeMetadata(final yacyURL url, final ResponseHeader responseHeader) {
if (responseHeader != null) try {
// store the response header into the header database
final HashMap<String, String> hm = new HashMap<String, String>();
hm.putAll(responseHeader);
hm.put("@@URL", metadata.url().toNormalform(false, false));
hm.put("@@DEPTH", Integer.toString(metadata.depth()));
responseHeaderDB.put(metadata.urlHash(), hm);
hm.put("@@URL", url.toNormalform(true, false));
responseHeaderDB.put(url.hash(), hm);
} catch (final Exception e) {
log.logWarning("could not write ResourceInfo: "
+ e.getClass() + ": " + e.getMessage());
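Cache.storeMetadata() now takes the yacyURL directly: the response headers plus an "@@URL" attribute are written under the URL hash, and the former "@@DEPTH" attribute disappears along with the Response parameter. A rough sketch of that keying scheme follows, using a plain HashMap as a stand-in for the kelondro-backed responseHeaderDB; names and types here are simplified, not YaCy's.

```java
import java.util.HashMap;
import java.util.Map;

/**
 * Stand-in illustration of the new Cache.storeMetadata(url, responseHeader) keying:
 * the response headers plus an "@@URL" attribute are stored under the URL hash.
 * An in-memory map replaces the kelondro-backed responseHeaderDB here.
 */
public final class MetadataStoreSketch {

    private static final Map<String, Map<String, String>> responseHeaderDB = new HashMap<>();

    static void storeMetadata(final String urlHash, final String normalizedUrl,
                              final Map<String, String> responseHeader) {
        if (responseHeader == null) return;
        final HashMap<String, String> hm = new HashMap<>(responseHeader);
        hm.put("@@URL", normalizedUrl);    // the URL itself travels with its headers
        responseHeaderDB.put(urlHash, hm); // keyed by the URL hash, as in the diff
    }

    public static void main(final String[] args) {
        final Map<String, String> headers = new HashMap<>();
        headers.put("Content-Type", "text/html");
        storeMetadata("hash-of-url", "http://example.org/", headers);
        System.out.println(responseHeaderDB);
    }
}
```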

@@ -71,20 +71,12 @@ public class ResponseContainer {
*/
public ResponseHeader getResponseHeader() {
final ResponseHeader responseHeader = new ResponseHeader();
for (final Header header : getHeaders()) {
for (final Header header : method.getResponseHeaders()) {
responseHeader.add(header.getName(), header.getValue());
}
return responseHeader;
}
/**
* @see org.apache.commons.httpclient.HttpMethod#getResponseHeaders()
* @return the headers
*/
private Header[] getHeaders() {
return method.getResponseHeaders();
}
/**
* @see org.apache.commons.httpclient.HttpMethod#getResponseBody()
* @return
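In ResponseContainer the private getHeaders() wrapper is removed and getResponseHeader() iterates method.getResponseHeaders() directly, copying each commons-httpclient 3.x Header into the ResponseHeader via add(name, value). A standalone sketch of that copy loop, assuming commons-httpclient 3.x is on the classpath and using a map of value lists in place of ResponseHeader:

```java
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.httpclient.Header;

/**
 * Illustrates what ResponseContainer.getResponseHeader() now does directly:
 * copy the commons-httpclient 3.x response headers into a name -> values map,
 * which stands in here for YaCy's ResponseHeader.add(name, value) calls.
 */
public final class HeaderCopySketch {

    static Map<String, List<String>> toHeaderMap(final Header[] responseHeaders) {
        final Map<String, List<String>> map = new LinkedHashMap<>();
        for (final Header header : responseHeaders) {
            // repeated header names accumulate values, matching add(name, value) semantics
            map.computeIfAbsent(header.getName(), k -> new ArrayList<>()).add(header.getValue());
        }
        return map;
    }

    public static void main(final String[] args) {
        final Header[] headers = {
                new Header("Content-Type", "text/html"),
                new Header("Set-Cookie", "a=1"),
                new Header("Set-Cookie", "b=2")
        };
        System.out.println(toHeaderMap(headers));
    }
}
```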

@@ -397,17 +397,17 @@ public final class HTTPDProxyHandler {
0,
0,
0);
final Response cacheEntry = new Response(
final Response response = new Response(
request,
requestHeader,
cachedResponseHeader,
"200 OK",
sb.crawler.defaultProxyProfile
);
Cache.storeMetadata(cachedResponseHeader, cacheEntry); // TODO: check if this storeMetadata is necessary
//Cache.storeMetadata(cachedResponseHeader, response); // TODO: check if this storeMetadata is necessary
byte[] cacheContent = Cache.getResourceContent(url);
if (cacheContent != null && cacheEntry.shallUseCacheForProxy()) {
if (cacheContent != null && response.shallUseCacheForProxy()) {
if (theLogger.isFinest()) theLogger.logFinest(reqID + " fulfill request from cache");
fulfillRequestFromCache(conProp, url, requestHeader, cachedResponseHeader, cacheContent, countedRespond);
} else {
@@ -518,14 +518,14 @@ public final class HTTPDProxyHandler {
0,
0,
0);
final Response cacheEntry = new Response(
final Response response = new Response(
request,
requestHeader,
responseHeader,
res.getStatusLine(),
sb.crawler.defaultProxyProfile
);
Cache.storeMetadata(responseHeader, cacheEntry);
Cache.storeMetadata(request.url(), responseHeader);
// handle incoming cookies
handleIncomingCookies(responseHeader, host, ip);
@@ -550,9 +550,9 @@ public final class HTTPDProxyHandler {
final OutputStream outStream = (gzippedOut != null) ? gzippedOut : ((chunkedOut != null)? chunkedOut : respond);
final String storeError = cacheEntry.shallStoreCacheForProxy();
final boolean storeHTCache = cacheEntry.profile().storeHTCache();
final String supportError = Parser.supports(cacheEntry.url(), cacheEntry.getMimeType());
final String storeError = response.shallStoreCacheForProxy();
final boolean storeHTCache = response.profile().storeHTCache();
final String supportError = Parser.supports(response.url(), response.getMimeType());
if (
/*
* Now we store the response into the htcache directory if
@@ -583,8 +583,8 @@ public final class HTTPDProxyHandler {
if (sizeBeforeDelete == -1) {
// totally fresh file
//cacheEntry.status = plasmaHTCache.CACHE_FILL; // it's an insert
cacheEntry.setContent(cacheArray);
sb.htEntryStoreProcess(cacheEntry);
response.setContent(cacheArray);
sb.htEntryStoreProcess(response);
conProp.setProperty(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_CODE,"TCP_MISS");
} else if (cacheArray != null && sizeBeforeDelete == cacheArray.length) {
// before we came here we deleted a cache entry
@@ -595,8 +595,8 @@ public final class HTTPDProxyHandler {
} else {
// before we came here we deleted a cache entry
//cacheEntry.status = plasmaHTCache.CACHE_STALE_RELOAD_GOOD;
cacheEntry.setContent(cacheArray);
sb.htEntryStoreProcess(cacheEntry);
response.setContent(cacheArray);
sb.htEntryStoreProcess(response);
conProp.setProperty(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_CODE,"TCP_REFRESH_MISS");
}
} else {

@@ -1266,7 +1266,7 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi
Thread readerThread = new Thread(reader, "Surrogate-Reader " + surrogateFile.getAbsolutePath());
readerThread.start();
DCEntry surrogate;
Response queueentry;
Response response;
while ((surrogate = reader.take()) != DCEntry.poison) {
// check if url is in accepted domain
final String urlRejectReason = crawlStacker.urlInAcceptedDomain(surrogate.url());
@@ -1289,8 +1289,8 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi
0,
0
);
queueentry = new Response(request, null, null, "200", this.crawler.defaultSurrogateProfile);
indexingQueueEntry queueEntry = new indexingQueueEntry(queueentry, document, null);
response = new Response(request, null, null, "200", this.crawler.defaultSurrogateProfile);
indexingQueueEntry queueEntry = new indexingQueueEntry(response, document, null);
// place the queue entry into the concurrent process of the condenser (document analysis)
try {
