added a "fromCache" flag in Response object to omit one cache.has()

check during snippet generation. This should cause less blockings
pull/1/head
Michael Peter Christen 13 years ago
parent 81737dcb18
commit 7e0ddbd275
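For orientation before the diff: a Response built by a loader now records, at construction time, whether its content came from the local cache, so a later consumer such as the snippet generator can ask the object it already holds instead of probing the cache index again with Cache.has(). A minimal sketch of the pattern, using simplified hypothetical names rather than the real YaCy classes:

    public class Response {
        private final byte[] content;
        private final boolean fromCache; // set once by whichever loader built this response

        public Response(final byte[] content, final boolean fromCache) {
            this.content = content;
            this.fromCache = fromCache;
        }

        // true if the content was served from the local cache (HTCache)
        public boolean fromCache() {
            return this.fromCache;
        }
    }

Consumers then swap the extra probe pattern — boolean isInCache = Cache.has(url.hash()); ... if (!isInCache && response != null) — for a plain if (!response.fromCache()), as the TextSnippet hunks at the end of this diff show.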

@@ -134,6 +134,7 @@ public class FTPLoader {
                         responseHeader,
                         "200",
                         profile,
+                        false,
                         dirList.toString().getBytes());
             }
         } else {
@@ -253,6 +254,7 @@ public class FTPLoader {
                     responseHeader,
                     "200",
                     profile,
+                    false,
                     null);
             return response;
         }
@@ -268,6 +270,7 @@ public class FTPLoader {
                     responseHeader,
                     "200",
                     profile,
+                    false,
                     b);
             return response;
         }

@@ -30,8 +30,6 @@ import java.util.ArrayList;
 import java.util.Date;
 import java.util.List;
 
-import de.anomic.crawler.CrawlProfile;
-
 import net.yacy.cora.document.Classification;
 import net.yacy.cora.protocol.HeaderFramework;
 import net.yacy.cora.protocol.RequestHeader;
@@ -43,6 +41,7 @@ import net.yacy.kelondro.logging.Log;
 import net.yacy.kelondro.util.FileUtils;
 import net.yacy.search.Switchboard;
 import net.yacy.search.index.Segments;
+import de.anomic.crawler.CrawlProfile;
 
 public class FileLoader {
@@ -53,7 +52,7 @@ public class FileLoader {
     public FileLoader(final Switchboard sb, final Log log) {
         this.sb = sb;
         this.log = log;
-        maxFileSize = (int) sb.getConfigLong("crawler.file.maxFileSize", -1l);
+        this.maxFileSize = (int) sb.getConfigLong("crawler.file.maxFileSize", -1l);
     }
 
     public Response load(final Request request, boolean acceptOnlyParseable) throws IOException {
@@ -62,7 +61,7 @@ public class FileLoader {
         RequestHeader requestHeader = new RequestHeader();
         if (request.referrerhash() != null) {
-            DigestURI ur = sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash());
+            DigestURI ur = this.sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash());
             if (ur != null) requestHeader.put(RequestHeader.REFERER, ur.toNormalform(true, false));
         }
@@ -89,13 +88,14 @@ public class FileLoader {
             ResponseHeader responseHeader = new ResponseHeader();
             responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date()));
             responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
-            final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
+            final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes());
             Response response = new Response(
                     request,
                     requestHeader,
                     responseHeader,
                     "200",
                     profile,
+                    false,
                     content.toString().getBytes());
             return response;
@@ -117,25 +117,26 @@ public class FileLoader {
         }
 
         String parserError = null;
         if ((acceptOnlyParseable && (parserError = TextParser.supports(url, mime)) != null) ||
-            (size > maxFileSize && maxFileSize >= 0)) {
+            (size > this.maxFileSize && this.maxFileSize >= 0)) {
             // we know that we cannot process that file before loading
             // only the metadata is returned
             if (parserError != null) {
-                log.logInfo("No parser available in File crawler: '" + parserError + "' for URL " + request.url().toString() + ": parsing only metadata");
+                this.log.logInfo("No parser available in File crawler: '" + parserError + "' for URL " + request.url().toString() + ": parsing only metadata");
             } else {
-                log.logInfo("Too big file in File crawler with size = " + size + " Bytes for URL " + request.url().toString() + ": parsing only metadata");
+                this.log.logInfo("Too big file in File crawler with size = " + size + " Bytes for URL " + request.url().toString() + ": parsing only metadata");
             }
 
             // create response with metadata only
             responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
-            final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
+            final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes());
             Response response = new Response(
                     request,
                     requestHeader,
                     responseHeader,
                     "200",
                     profile,
+                    false,
                     url.toTokens().getBytes());
             return response;
         }
@@ -146,13 +147,14 @@ public class FileLoader {
         is.close();
 
         // create response with loaded content
-        final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
+        final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes());
         Response response = new Response(
                 request,
                 requestHeader,
                 responseHeader,
                 "200",
                 profile,
+                false,
                 b);
         return response;
     }

@@ -205,6 +205,7 @@ public final class HTTPLoader {
                     header,
                     Integer.toString(code),
                     profile,
+                    false,
                     responseBody
             );
@@ -273,6 +274,7 @@ public final class HTTPLoader {
                     header,
                     Integer.toString(code),
                     null,
+                    false,
                     responseBody
             );

@@ -66,6 +66,7 @@ public class Response {
     private final CrawlProfile profile;
     private byte[] content;
     private int status; // tracker indexing status, see status defs below
+    private final boolean fromCache;
 
     // doctype calculation
     public static char docType(final MultiProtocolURI url) {
@@ -151,6 +152,7 @@ public class Response {
             final ResponseHeader responseHeader,
             final String responseStatus,
             final CrawlProfile profile,
+            final boolean fromCache,
             final byte[] content) {
         this.request = request;
         // request and response headers may be zero in case that we process surrogates
@@ -160,6 +162,7 @@ public class Response {
         this.profile = profile;
         this.status = QUEUE_STATE_FRESH;
         this.content = content;
+        this.fromCache = fromCache;
     }
 
     /**
@@ -179,6 +182,7 @@ public class Response {
         this.profile = profile;
         this.status = QUEUE_STATE_FRESH;
         this.content = request.name().length() > 0 ? request.name().getBytes() : request.url().toTokens().getBytes();
+        this.fromCache = true;
     }
 
     public Response(
@@ -186,8 +190,9 @@ public class Response {
             final RequestHeader requestHeader,
             final ResponseHeader responseHeader,
             final String responseStatus,
-            final CrawlProfile profile) {
-        this(request, requestHeader, responseHeader, responseStatus, profile, null);
+            final CrawlProfile profile,
+            final boolean fromCache) {
+        this(request, requestHeader, responseHeader, responseStatus, profile, fromCache, null);
     }
 
     public void updateStatus(final int newStatus) {
@@ -198,6 +203,10 @@ public class Response {
         return this.responseHeader;
     }
 
+    public boolean fromCache() {
+        return this.fromCache;
+    }
+
     public int getStatus() {
         return this.status;
     }

@@ -38,9 +38,6 @@ import java.util.List;
 import jcifs.smb.SmbException;
 import jcifs.smb.SmbFile;
 import jcifs.smb.SmbFileInputStream;
 
-import de.anomic.crawler.CrawlProfile;
-
 import net.yacy.cora.document.Classification;
 import net.yacy.cora.document.MultiProtocolURI;
 import net.yacy.cora.protocol.HeaderFramework;
@@ -53,6 +50,7 @@ import net.yacy.kelondro.logging.Log;
 import net.yacy.kelondro.util.FileUtils;
 import net.yacy.search.Switchboard;
 import net.yacy.search.index.Segments;
+import de.anomic.crawler.CrawlProfile;
 
 public class SMBLoader {
@@ -65,7 +63,7 @@ public class SMBLoader {
     public SMBLoader(final Switchboard sb, final Log log) {
         this.sb = sb;
         this.log = log;
-        maxFileSize = sb.getConfigLong("crawler.smb.maxFileSize", -1l);
+        this.maxFileSize = sb.getConfigLong("crawler.smb.maxFileSize", -1l);
     }
@@ -75,7 +73,7 @@ public class SMBLoader {
         RequestHeader requestHeader = new RequestHeader();
         if (request.referrerhash() != null) {
-            DigestURI ur = sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash());
+            DigestURI ur = this.sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash());
             if (ur != null) requestHeader.put(RequestHeader.REFERER, ur.toNormalform(true, false));
         }
@@ -109,13 +107,14 @@ public class SMBLoader {
             ResponseHeader responseHeader = new ResponseHeader();
             responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date()));
             responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
-            final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
+            final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes());
             Response response = new Response(
                     request,
                     requestHeader,
                     responseHeader,
                     "200",
                     profile,
+                    false,
                     content.toString().getBytes());
             return response;
@@ -137,25 +136,26 @@ public class SMBLoader {
         }
 
         String parserError = null;
         if ((acceptOnlyParseable && (parserError = TextParser.supports(url, mime)) != null) ||
-            (size > maxFileSize && maxFileSize >= 0)) {
+            (size > this.maxFileSize && this.maxFileSize >= 0)) {
             // we know that we cannot process that file before loading
             // only the metadata is returned
             if (parserError != null) {
-                log.logInfo("No parser available in SMB crawler: '" + parserError + "' for URL " + request.url().toString() + ": parsing only metadata");
+                this.log.logInfo("No parser available in SMB crawler: '" + parserError + "' for URL " + request.url().toString() + ": parsing only metadata");
             } else {
-                log.logInfo("Too big file in SMB crawler with size = " + size + " Bytes for URL " + request.url().toString() + ": parsing only metadata");
+                this.log.logInfo("Too big file in SMB crawler with size = " + size + " Bytes for URL " + request.url().toString() + ": parsing only metadata");
             }
 
             // create response with metadata only
             responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
-            final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
+            final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes());
             Response response = new Response(
                     request,
                     requestHeader,
                     responseHeader,
                     "200",
                     profile,
+                    false,
                     url.toTokens().getBytes());
             return response;
         }
@@ -166,13 +166,14 @@ public class SMBLoader {
         is.close();
 
         // create response with loaded content
-        final CrawlProfile profile = sb.crawler.getActive(request.profileHandle().getBytes());
+        final CrawlProfile profile = this.sb.crawler.getActive(request.profileHandle().getBytes());
         Response response = new Response(
                 request,
                 requestHeader,
                 responseHeader,
                 "200",
                 profile,
+                false,
                 b);
         return response;
     }

@@ -404,7 +404,8 @@ public final class HTTPDProxyHandler {
                     requestHeader,
                     cachedResponseHeader,
                     "200 OK",
-                    sb.crawler.defaultProxyProfile
+                    sb.crawler.defaultProxyProfile,
+                    false
             );
 
             final byte[] cacheContent = Cache.getContent(url.hash());
             if (cacheContent != null && response.isFreshForProxy()) {
@@ -548,7 +549,8 @@ public final class HTTPDProxyHandler {
                     requestHeader,
                     responseHeader,
                     Integer.toString(client.getHttpResponse().getStatusLine().getStatusCode()),
-                    sb.crawler.defaultProxyProfile
+                    sb.crawler.defaultProxyProfile,
+                    false
             );
 
             final String storeError = response.shallStoreCacheForProxy();
             final boolean storeHTCache = response.profile().storeHTCache();

@@ -186,12 +186,7 @@ public class DigestURI extends MultiProtocolURI implements Serializable {
      */
     public final byte[] hash() {
         // in case that the object was initialized without a known url hash, compute it now
-        if (this.hash == null) {
-            // we check the this.hash value twice to avoid synchronization where possible
-            synchronized (this.protocol) {
-                if (this.hash == null) this.hash = urlHashComputation();
-            }
-        }
+        if (this.hash == null) this.hash = urlHashComputation();
         return this.hash;
     }
@@ -376,11 +371,7 @@ public class DigestURI extends MultiProtocolURI implements Serializable {
     @Override
     public final boolean isLocal() {
         if (this.isFile()) return true;
-        if (this.hash == null) synchronized (this.protocol) {
-            // this is synchronized because another thread may also call the same method in between
-            // that is the reason that this.hash is checked again
-            if (this.hash == null) this.hash = urlHashComputation();
-        }
+        if (this.hash == null) this.hash = urlHashComputation();
         return domDomain(this.hash) == 7;
     }
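Aside: the DigestURI hunks above are the second blocking-reduction in this commit — the double-checked locking around the lazy URL-hash computation is dropped in favor of an unsynchronized check. A sketch of the trade-off, with hypothetical names (the real code calls urlHashComputation() and synchronized on this.protocol):

    public class LazyHash {
        private final Object lock = new Object();
        private volatile byte[] hash; // volatile is what makes the locked variant publish safely

        // Before: double-checked locking. A thread that loses the race blocks
        // on the monitor while another thread finishes the computation.
        public byte[] hashLocked() {
            if (this.hash == null) {
                synchronized (this.lock) {
                    if (this.hash == null) this.hash = compute();
                }
            }
            return this.hash;
        }

        // After: benign race. Two threads may both call compute(), but the
        // result is deterministic, so they write identical values and never block.
        public byte[] hashRacy() {
            if (this.hash == null) this.hash = compute();
            return this.hash;
        }

        private byte[] compute() {
            return new byte[] { 42 }; // stands in for the deterministic hash computation
        }
    }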

@@ -218,6 +218,7 @@ public final class LoaderDispatcher {
                     cachedResponse,
                     "200",
                     crawlProfile,
+                    true,
                     content);
 
             // check which caching strategy shall be used

@@ -1794,7 +1794,7 @@ public final class Switchboard extends serverSwitch
                     0,
                     0,
                     0);
-            response = new Response(request, null, null, "200", this.crawler.defaultSurrogateProfile);
+            response = new Response(request, null, null, "200", this.crawler.defaultSurrogateProfile, false);
             final indexingQueueEntry queueEntry =
                     new indexingQueueEntry(Segments.Process.SURROGATES, response, new Document[] {
                         document

@@ -201,12 +201,10 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnippet> {
         removeMatchingHashes(row.dc_subject(), remainingHashes);
         removeMatchingHashes(row.url().toNormalform(true, true).replace('-', ' '), remainingHashes);
 
-        boolean isInCache = de.anomic.crawler.Cache.has(url.hash());
-
         if (remainingHashes.size() == 0) {
             // the snippet is fully inside the metadata!
-            if (isInCache) {
+            if (de.anomic.crawler.Cache.has(url.hash())) {
                 // get the sentences from the cache
                 final Request request = loader.request(url, true, reindexing);
                 Response response;
@@ -261,7 +259,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnippet> {
             return;
         }
 
-        if (!isInCache && response != null) {
+        if (!response.fromCache()) {
             // place entry on indexing queue
             Switchboard.getSwitchboard().toIndexer(response);
             this.resultStatus = ResultClass.SOURCE_WEB;
