added option to record redirected urls in the solr index

pull/1/head
Michael Peter Christen 13 years ago
parent d763e4d94b
commit 3fd4a01286

@@ -73,7 +73,7 @@ scriptscount_i
## content of <meta name="robots" content=#content#> tag and the "X-Robots-Tag" HTTP property
robots_i
## html status return code (i.e. "200" for ok), -1 if not loaded, int
## html status return code (i.e. "200" for ok), -1 if not loaded (see content of failreason_t for this case), int
httpstatus_i
## content of <meta name="generator" content=#content#> tag, text
@@ -259,7 +259,6 @@ iframesscount_i
## number of matching title expressions, textgen
#ext_title_val
## fail reason if a page was not loaded. if the page was loaded then this field is empty, text
failreason_t
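Taken together, these two schema fields describe failed loads: httpstatus_i keeps the HTTP status (or -1 if nothing was loaded) and failreason_t carries the FailCategory name plus the reason string assembled in ZURL.push below. An illustrative error document for a recorded redirect (field values are hypothetical):

httpstatus_i: 301
failreason_t: FINAL_REDIRECT_RULE redirect to http://example.org/new (http return code = 301)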

@@ -700,6 +700,7 @@ crawler.http.acceptLanguage=en-us,en;q=0.5
crawler.http.acceptCharset=ISO-8859-1,utf-8;q=0.7,*;q=0.7
crawler.http.maxFileSize=10485760
crawler.http.FollowRedirects=true
crawler.http.RecordRedirects=false
# ftp crawler specific settings; size in bytes
crawler.ftp.maxFileSize=10485760
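For reference, a minimal sketch of how the crawler reads the two redirect flags, assuming the Switchboard instance `sb` as used in HTTPLoader below. Note that the in-code default is true while yacy.conf ships RecordRedirects as false:

// minimal sketch, assuming the Switchboard instance `sb` from HTTPLoader below
boolean follow = sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true); // follow the Location header?
boolean record = sb.getConfigBool(SwitchboardConstants.CRAWLER_RECORD_REDIRECTS, true); // store the redirect as an error document?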

@@ -36,9 +36,9 @@ import java.util.concurrent.LinkedBlockingQueue;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.services.federated.solr.ShardSolrConnector;
import net.yacy.cora.services.federated.solr.SolrConnector;
import net.yacy.cora.services.federated.solr.SolrDoc;
import net.yacy.cora.services.federated.solr.ShardSolrConnector;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.index.Index;
@@ -61,10 +61,17 @@ public class ZURL implements Iterable<ZURL.Entry> {
public enum FailCategory {
// TEMPORARY categories are such failure cases that should be tried again
// FINAL categories are such failure cases that are final and should not be tried again
TEMPORARY_NETWORK_FAILURE, // an entity could not be loaded
FINAL_PROCESS_CONTEXT, // because of a processing context we do not want that url again (i.e. remote crawling)
FINAL_LOAD_CONTEXT, // the crawler configuration does not want to load the entity
FINAL_ROBOTS_RULE; // a remote server denies indexing or loading
TEMPORARY_NETWORK_FAILURE(true), // an entity could not be loaded
FINAL_PROCESS_CONTEXT(false), // because of a processing context we do not want that url again (i.e. remote crawling)
FINAL_LOAD_CONTEXT(false), // the crawler configuration does not want to load the entity
FINAL_ROBOTS_RULE(true), // a remote server denies indexing or loading
FINAL_REDIRECT_RULE(true); // the remote server redirects this page, thus disallowing reading of content
public final boolean store;
private FailCategory(boolean store) {
this.store = store;
}
}
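// Note: the new per-category `store` flag replaces the former hard-coded check
// against TEMPORARY_NETWORK_FAILURE and FINAL_ROBOTS_RULE in ZURL.push (see the
// hunk below), so each category now declares itself whether a failed URL is
// forwarded to Solr.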
private final static Row rowdef = new Row(
@@ -153,6 +160,7 @@ public class ZURL implements Iterable<ZURL.Entry> {
String anycause,
final int httpcode) {
// assert executor != null; // null == proxy !
assert failCategory.store || httpcode == -1 : "failCategory=" + failCategory.name();
if (exists(bentry.url().hash())) return; // don't insert double causes
if (anycause == null) anycause = "unknown";
final String reason = anycause + ((httpcode >= 0) ? " (http return code = " + httpcode + ")" : "");
@@ -160,7 +168,7 @@ public class ZURL implements Iterable<ZURL.Entry> {
put(entry);
this.stack.add(entry.hash());
Log.logInfo("Rejected URL", bentry.url().toNormalform(false, false) + " - " + reason);
if (this.solrConnector != null && (failCategory == FailCategory.TEMPORARY_NETWORK_FAILURE || failCategory == FailCategory.FINAL_ROBOTS_RULE)) {
if (this.solrConnector != null && failCategory.store) {
// send the error to solr
try {
SolrDoc errorDoc = this.solrConfiguration.err(bentry.url(), failCategory.name() + " " + reason, httpcode);

@@ -79,8 +79,10 @@ public final class HTTPLoader {
private Response load(final Request request, final int retryCount, final int maxFileSize, final boolean checkBlacklist) throws IOException {
byte[] myHash = this.sb.peers.mySeed().hash.getBytes();
if (retryCount < 0) {
this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "retry counter exceeded", -1);
this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "retry counter exceeded", -1);
throw new IOException("retry counter exceeded for URL " + request.url().toString() + ". Processing aborted.");
}
@@ -96,7 +98,7 @@ public final class HTTPLoader {
// check if url is in blacklist
final String hostlow = host.toLowerCase();
if (checkBlacklist && Switchboard.urlBlacklist.isListed(BlacklistType.CRAWLER, hostlow, path)) {
this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.");
}
@@ -131,60 +133,60 @@ public final class HTTPLoader {
// send request
final byte[] responseBody = client.GETbytes(url, maxFileSize);
final int code = client.getHttpResponse().getStatusLine().getStatusCode();
final ResponseHeader responseHeader = new ResponseHeader(code, client.getHttpResponse().getAllHeaders());
final int statusCode = client.getHttpResponse().getStatusLine().getStatusCode();
final ResponseHeader responseHeader = new ResponseHeader(statusCode, client.getHttpResponse().getAllHeaders());
String requestURLString = request.url().toNormalform(false, false);
if (code > 299 && code < 310) {
// redirection (content may be empty)
if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true)) {
if (responseHeader.containsKey(HeaderFramework.LOCATION)) {
// getting redirection URL
String redirectionUrlString = responseHeader.get(HeaderFramework.LOCATION);
redirectionUrlString = redirectionUrlString.trim();
// check redirection
if (statusCode > 299 && statusCode < 310) {
if (redirectionUrlString.length() == 0) {
this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "redirection header empy", code);
throw new IOException("CRAWLER Redirection of URL=" + request.url().toString() + " aborted. Location header is empty.");
}
// read redirection URL
String redirectionUrlString = responseHeader.get(HeaderFramework.LOCATION);
redirectionUrlString = redirectionUrlString == null ? "" : redirectionUrlString.trim();
// normalizing URL
final DigestURI redirectionUrl = new DigestURI(MultiProtocolURI.newURL(request.url(), redirectionUrlString));
if (redirectionUrlString.length() == 0) {
this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "no redirection url provided, field '" + HeaderFramework.LOCATION + "' is empty", statusCode);
throw new IOException("REJECTED EMTPY REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString);
}
// normalize URL
final DigestURI redirectionUrl = new DigestURI(MultiProtocolURI.newURL(request.url(), redirectionUrlString));
// restart crawling with new url
this.log.logInfo("CRAWLER Redirection detected ('" + client.getHttpResponse().getStatusLine() + "') for URL " + requestURLString);
this.log.logInfo("CRAWLER ..Redirecting request to: " + redirectionUrl);
// restart crawling with new url
this.log.logInfo("CRAWLER Redirection detected ('" + client.getHttpResponse().getStatusLine() + "') for URL " + request.url().toString());
this.log.logInfo("CRAWLER ..Redirecting request to: " + redirectionUrl);
if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_RECORD_REDIRECTS, true)) {
this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.FINAL_REDIRECT_RULE, "redirect to " + redirectionUrlString, statusCode);
}
if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true)) {
// if we are already doing a shutdown we don't need to retry crawling
if (Thread.currentThread().isInterrupted()) {
this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "server shutdown", code);
throw new IOException("CRAWLER Retry of URL=" + request.url().toString() + " aborted because of server shutdown.");
this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "server shutdown", statusCode);
throw new IOException("CRAWLER Retry of URL=" + requestURLString + " aborted because of server shutdown.");
}
// check if the url was already indexed
final String dbname = this.sb.urlExists(Segments.Process.LOCALCRAWLING, redirectionUrl.hash());
if (dbname != null) { //OTTO
this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "redirection to double content", code);
throw new IOException("CRAWLER Redirection of URL=" + request.url().toString() + " ignored. The url appears already in db " + dbname);
if (dbname != null) { // customer request
this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "redirection to double content", statusCode);
throw new IOException("CRAWLER Redirection of URL=" + requestURLString + " ignored. The url appears already in db " + dbname);
}
// retry crawling with new url
request.redirectURL(redirectionUrl);
return load(request, retryCount - 1, maxFileSize, checkBlacklist);
} else {
// no redirection url provided
this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "no redirection url provided", code);
throw new IOException("REJECTED EMTPY REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString());
}
} else {
// we don't want to follow redirects
this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", code);
throw new IOException("REJECTED UNWANTED REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString());
this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode);
throw new IOException("REJECTED UNWANTED REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString);
}
} else if (responseBody == null) {
// no response, reject file
this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "no response body", code);
throw new IOException("REJECTED EMPTY RESPONSE BODY '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString());
} else if (code == 200 || code == 203) {
this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "no response body", statusCode);
throw new IOException("REJECTED EMPTY RESPONSE BODY '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString);
} else if (statusCode == 200 || statusCode == 203) {
// the transfer is ok
// we write the new cache entry to file system directly
@@ -193,7 +195,7 @@ public final class HTTPLoader {
// check length again in case it was not possible to get the length before loading
if (maxFileSize > 0 && contentLength > maxFileSize) {
this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.FINAL_PROCESS_CONTEXT, "file size limit exceeded", code);
this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.FINAL_PROCESS_CONTEXT, "file size limit exceeded", statusCode);
throw new IOException("REJECTED URL " + request.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes. (GET)");
}
@@ -211,8 +213,8 @@ public final class HTTPLoader {
return response;
} else {
// if the response has not the right response type then reject file
this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", code);
throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString());
this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", statusCode);
throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL " + requestURLString);
}
}
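Condensed, the new redirect handling in HTTPLoader.load reads the Location header first, optionally records the redirect, and only then decides whether to follow it. A simplified sketch of the flow from the hunk above (error paths and the double-content check omitted):

if (statusCode > 299 && statusCode < 310) {
    // read and normalize the redirection target
    String location = responseHeader.get(HeaderFramework.LOCATION);
    location = location == null ? "" : location.trim();
    final DigestURI redirectionUrl = new DigestURI(MultiProtocolURI.newURL(request.url(), location));
    if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_RECORD_REDIRECTS, true)) {
        // new in this commit: store the redirect itself as a FINAL_REDIRECT_RULE error document
        this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1,
                FailCategory.FINAL_REDIRECT_RULE, "redirect to " + location, statusCode);
    }
    if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true)) {
        // retry crawling with the new url
        request.redirectURL(redirectionUrl);
        return load(request, retryCount - 1, maxFileSize, checkBlacklist);
    }
}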

@@ -296,7 +296,8 @@ public final class SwitchboardConstants {
* <p>Name of the setting how many active crawler-threads may maximal be running on the same time</p>
*/
public static final String CRAWLER_THREADS_ACTIVE_MAX = "crawler.MaxActiveThreads";
public static final String CRAWLER_FOLLOW_REDIRECTS = "crawler.http.FollowRedirects";
public static final String CRAWLER_FOLLOW_REDIRECTS = "crawler.http.FollowRedirects"; // ignore the target url and follow to the redirect
public static final String CRAWLER_RECORD_REDIRECTS = "crawler.http.RecordRedirects"; // record the ignored redirected page to the index store
public static final String YACY_MODE_DEBUG = "yacyDebugMode";
/**

@@ -102,14 +102,14 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
this.localSolr = new EmbeddedSolrConnector(solrLocation, new File(new File(Switchboard.getSwitchboard().appPath,"defaults"), "solr"));
}
public SolrConnector getRemoteSolr() {
return this.remoteSolr;
}
public SolrConnector getLocalSolr() {
return this.localSolr;
}
public SolrConnector getRemoteSolr() {
return this.remoteSolr;
}
public void clearCache() {
if (this.urlIndexFile instanceof Cache) ((Cache) this.urlIndexFile).clearCache();
if (this.statsDump != null) this.statsDump.clear();
@@ -137,8 +137,8 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
this.urlIndexFile.close();
this.urlIndexFile = null;
}
if (this.remoteSolr != null) this.remoteSolr.close();
if (this.localSolr != null) this.localSolr.close();
if (this.remoteSolr != null) this.remoteSolr.close();
}
public int writeCacheSize() {
@@ -208,10 +208,23 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
if (MemoryControl.shortStatus()) clearCache() ;
}
public boolean remove(final byte[] urlHashBytes) {
if (urlHashBytes == null) return false;
public boolean remove(final byte[] urlHash) {
if (urlHash == null) return false;
if (this.localSolr != null || this.remoteSolr != null) {
String urls = ASCII.String(urlHash);
if (this.localSolr != null) try {
this.localSolr.delete(urls);
} catch (final Throwable e) {
Log.logException(e);
}
if (this.remoteSolr != null) try {
this.remoteSolr.delete(urls);
} catch (final Throwable e) {
Log.logException(e);
}
}
try {
final Row.Entry r = this.urlIndexFile.remove(urlHashBytes);
final Row.Entry r = this.urlIndexFile.remove(urlHash);
if (r != null) this.statsDump = null;
return r != null;
} catch (final IOException e) {
@@ -221,11 +234,22 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
public boolean exists(final byte[] urlHash) {
if (urlHash == null) return false;
try {
if (this.remoteSolr != null && this.remoteSolr.exists(ASCII.String(urlHash))) {
return true;
if (this.localSolr != null || this.remoteSolr != null) {
String urls = ASCII.String(urlHash);
try {
if (this.localSolr != null && this.localSolr.exists(urls)) {
return true;
}
} catch (final Throwable e) {
Log.logException(e);
}
try {
if (this.remoteSolr != null && this.remoteSolr.exists(urls)) {
return true;
}
} catch (final Throwable e) {
Log.logException(e);
}
} catch (final Throwable e) {
}
if (this.urlIndexFile == null) return false; // case may happen during shutdown
return this.urlIndexFile.has(urlHash);
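A hypothetical usage sketch of the extended MetadataRepository behaviour; the instance name `metadata` and the hash literal are placeholders, and ASCII is net.yacy.cora.document.ASCII as imported above:

// hypothetical example, assuming a MetadataRepository instance `metadata`
byte[] urlHash = ASCII.getBytes("ABCDEFGHIJKL");   // placeholder 12-char YaCy url hash
metadata.remove(urlHash);                          // now also deletes the document from local and remote Solr
boolean known = metadata.exists(urlHash);          // checks local Solr, then remote Solr, then the url index file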
