From 9be36800a4dc0d2c816c47084e34146a6cef0177 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Sun, 20 Dec 2020 19:44:16 +0100 Subject: [PATCH 1/3] increased redirect depth by one this makes sense if one redirect replaces http with https and another replaces www subdomain by without (and vice versa) --- .../yacy/crawler/retrieval/HTTPLoader.java | 521 +++++++++--------- .../net/yacy/repository/LoaderDispatcher.java | 2 +- 2 files changed, 262 insertions(+), 261 deletions(-) diff --git a/source/net/yacy/crawler/retrieval/HTTPLoader.java b/source/net/yacy/crawler/retrieval/HTTPLoader.java index 1ba2d793e..0703bc85e 100644 --- a/source/net/yacy/crawler/retrieval/HTTPLoader.java +++ b/source/net/yacy/crawler/retrieval/HTTPLoader.java @@ -85,8 +85,8 @@ public final class HTTPLoader { Latency.updateAfterLoad(entry.url(), System.currentTimeMillis() - start); return doc; } - - /** + + /** * Open an input stream on a requested HTTP resource. When the resource content size is small * (lower than {@link Response#CRAWLER_MAX_SIZE_TO_CACHE}, fully load it and use a ByteArrayInputStream instance. * @param request @@ -98,228 +98,231 @@ public final class HTTPLoader { * @return a response with full meta data and embedding on open input stream on content. Don't forget to close the stream. * @throws IOException when an error occurred */ - public StreamResponse openInputStream(final Request request, CrawlProfile profile, final int retryCount, - final int maxFileSize, final BlacklistType blacklistType, final ClientIdentification.Agent agent) - throws IOException { - if (retryCount < 0) { - this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, - FailCategory.TEMPORARY_NETWORK_FAILURE, "retry counter exceeded", -1); - throw new IOException( - "retry counter exceeded for URL " + request.url().toString() + ". Processing aborted.$"); - } - DigestURL url = request.url(); - - final String host = url.getHost(); - if (host == null || host.length() < 2) { - throw new IOException("host is not well-formed: '" + host + "'"); - } - final String path = url.getFile(); - int port = url.getPort(); - final boolean ssl = url.getProtocol().equals("https"); - if (port < 0) - port = (ssl) ? 443 : 80; - - // check if url is in blacklist - final String hostlow = host.toLowerCase(Locale.ROOT); - if (blacklistType != null && Switchboard.urlBlacklist.isListed(blacklistType, hostlow, path)) { - this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_LOAD_CONTEXT, - "url in blacklist", -1); - throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. 
URL is in blacklist.$"); - } - - // resolve yacy and yacyh domains - final AlternativeDomainNames yacyResolver = this.sb.peers; - if (yacyResolver != null) { - final String yAddress = yacyResolver.resolve(host); - if (yAddress != null) { - url = new DigestURL(url.getProtocol() + "://" + yAddress + path); - } - } - - // create a request header - final RequestHeader requestHeader = createRequestheader(request, agent); - - // HTTP-Client - final HTTPClient client = new HTTPClient(agent); - client.setRedirecting(false); // we want to handle redirection - // ourselves, so we don't index pages - // twice - client.setTimout(this.socketTimeout); - client.setHeader(requestHeader.entrySet()); - - // send request - client.GET(url, false); - final StatusLine statusline = client.getHttpResponse().getStatusLine(); - final int statusCode = statusline.getStatusCode(); - final ResponseHeader responseHeader = new ResponseHeader(statusCode, client.getHttpResponse().getAllHeaders()); - String requestURLString = request.url().toNormalform(true); - - // check redirection - if (statusCode > 299 && statusCode < 310) { - client.finish(); - - final DigestURL redirectionUrl = extractRedirectURL(request, profile, url, statusline, - responseHeader, requestURLString); - - if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true)) { - // we have two use cases here: loading from a crawl or just - // loading the url. Check this: - if (profile != null && !CrawlSwitchboard.DEFAULT_PROFILES.contains(profile.name())) { - // put redirect url on the crawler queue to repeat a - // double-check - /* We have to clone the request instance and not to modify directly its URL, - * otherwise the stackCrawl() function would reject it, because detecting it as already in the activeWorkerEntries */ + public StreamResponse openInputStream( + final Request request, CrawlProfile profile, final int retryCount, + final int maxFileSize, final BlacklistType blacklistType, final ClientIdentification.Agent agent + ) throws IOException { + if (retryCount < 0) { + this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "retry counter exceeded", -1); + throw new IOException( "retry counter exceeded for URL " + request.url().toString() + ". Processing aborted.$"); + } + DigestURL url = request.url(); + + final String host = url.getHost(); + if (host == null || host.length() < 2) { + throw new IOException("host is not well-formed: '" + host + "'"); + } + final String path = url.getFile(); + int port = url.getPort(); + final boolean ssl = url.getProtocol().equals("https"); + if (port < 0) + port = (ssl) ? 443 : 80; + + // check if url is in blacklist + final String hostlow = host.toLowerCase(Locale.ROOT); + if (blacklistType != null && Switchboard.urlBlacklist.isListed(blacklistType, hostlow, path)) { + this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_LOAD_CONTEXT, + "url in blacklist", -1); + throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. 
URL is in blacklist.$"); + } + + // resolve yacy and yacyh domains + final AlternativeDomainNames yacyResolver = this.sb.peers; + if (yacyResolver != null) { + final String yAddress = yacyResolver.resolve(host); + if (yAddress != null) { + url = new DigestURL(url.getProtocol() + "://" + yAddress + path); + } + } + + // create a request header + final RequestHeader requestHeader = createRequestheader(request, agent); + + // HTTP-Client + final HTTPClient client = new HTTPClient(agent); + client.setRedirecting(false); // we want to handle redirection + // ourselves, so we don't index pages + // twice + client.setTimout(this.socketTimeout); + client.setHeader(requestHeader.entrySet()); + + // send request + client.GET(url, false); + final StatusLine statusline = client.getHttpResponse().getStatusLine(); + final int statusCode = statusline.getStatusCode(); + final ResponseHeader responseHeader = new ResponseHeader(statusCode, client.getHttpResponse().getAllHeaders()); + String requestURLString = request.url().toNormalform(true); + + // check redirection + if (statusCode > 299 && statusCode < 310) { + client.finish(); + + final DigestURL redirectionUrl = extractRedirectURL(request, profile, url, statusline, responseHeader, requestURLString); + + if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true)) { + // we have two use cases here: loading from a crawl or just + // loading the url. Check this: + if (profile != null && !CrawlSwitchboard.DEFAULT_PROFILES.contains(profile.name())) { + // put redirect url on the crawler queue to repeat a + // double-check + /* We have to clone the request instance and not to modify directly its URL, + * otherwise the stackCrawl() function would reject it, because detecting it as already in the activeWorkerEntries */ Request redirectedRequest = new Request(request.initiator(), - redirectionUrl, - request.referrerhash(), - request.name(), - request.appdate(), - request.profileHandle(), - request.depth(), - request.timezoneOffset()); - String rejectReason = this.sb.crawlStacker.stackCrawl(redirectedRequest); - if(rejectReason != null) { + redirectionUrl, + request.referrerhash(), + request.name(), + request.appdate(), + request.profileHandle(), + request.depth(), + request.timezoneOffset()); + String rejectReason = this.sb.crawlStacker.stackCrawl(redirectedRequest); + if(rejectReason != null) { throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " aborted. 
Reason : " + rejectReason); - } - // in the end we must throw an exception (even if this is - // not an error, just to abort the current process - throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " to " - + redirectionUrl.toNormalform(false) + " placed on crawler queue for double-check"); - } - - // if we are already doing a shutdown we don't need to retry - // crawling - if (Thread.currentThread().isInterrupted()) { - this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, - FailCategory.FINAL_LOAD_CONTEXT, "server shutdown", statusCode); - throw new IOException( - "CRAWLER Redirect of URL=" + requestURLString + " aborted because of server shutdown.$"); - } - - // retry crawling with new url - request.redirectURL(redirectionUrl); - return openInputStream(request, profile, retryCount - 1, maxFileSize, blacklistType, agent); - } - // we don't want to follow redirects - this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, - FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode); - throw new IOException("REJECTED UNWANTED REDIRECTION '" + statusline - + "' for URL '" + requestURLString + "'$"); - } else if (statusCode == HttpStatus.SC_OK || statusCode == HttpStatus.SC_NON_AUTHORITATIVE_INFORMATION) { - // the transfer is ok - - /* - * When content is not large (less than Response.CRAWLER_MAX_SIZE_TO_CACHE), we have better cache it if cache is enabled and url is not local - */ - long contentLength = client.getHttpResponse().getEntity().getContentLength(); - InputStream contentStream; - if (profile != null && profile.storeHTCache() && contentLength > 0 && contentLength < (Response.CRAWLER_MAX_SIZE_TO_CACHE) && !url.isLocal()) { - byte[] content = null; - try { - content = HTTPClient.getByteArray(client.getHttpResponse().getEntity(), maxFileSize); - Cache.store(url, responseHeader, content); - } catch (final IOException e) { - this.log.warn("cannot write " + url + " to Cache (3): " + e.getMessage(), e); - } finally { - client.finish(); - } - - contentStream = new ByteArrayInputStream(content); - } else { - /* - * Content length may already be known now : check it before opening a stream - */ - if (maxFileSize >= 0 && contentLength > maxFileSize) { - throw new IOException("Content to download exceed maximum value of " + maxFileSize + " bytes"); - } - /* - * Create a HTTPInputStream delegating to - * client.getContentstream(). Close method will ensure client is - * properly closed. - */ - contentStream = new HTTPInputStream(client); - /* Anticipated content length may not be already known or incorrect : let's apply now the same eventual content size restriction as when loading in a byte array */ - if(maxFileSize >= 0) { - contentStream = new StrictLimitInputStream(contentStream, maxFileSize, - "Content to download exceed maximum value of " + Formatter.bytesToString(maxFileSize)); - } - } - - return new StreamResponse(new Response(request, requestHeader, responseHeader, profile, false, null), contentStream); - } else { - client.finish(); - // if the response has not the right response type then reject file - this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, - FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", statusCode); - throw new IOException("REJECTED WRONG STATUS TYPE '" + statusline - + "' for URL '" + requestURLString + "'$"); - } - } - - /** - * Extract redirect URL from response header. Status code is supposed to be between 299 and 310. Parameters must not be null. 
- * @return redirect URL - * @throws IOException when an error occured - */ - private DigestURL extractRedirectURL(final Request request, CrawlProfile profile, DigestURL url, - final StatusLine statusline, final ResponseHeader responseHeader, String requestURLString) - throws IOException { - // read redirection URL - String redirectionUrlString = responseHeader.get(HeaderFramework.LOCATION); - redirectionUrlString = redirectionUrlString == null ? "" : redirectionUrlString.trim(); - - if (redirectionUrlString.isEmpty()) { - this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, - FailCategory.TEMPORARY_NETWORK_FAILURE, - "no redirection url provided, field '" + HeaderFramework.LOCATION + "' is empty", statusline.getStatusCode()); - throw new IOException("REJECTED EMTPY REDIRECTION '" + statusline - + "' for URL '" + requestURLString + "'$"); - } - - // normalize URL - final DigestURL redirectionUrl = DigestURL.newURL(request.url(), redirectionUrlString); - - // restart crawling with new url - this.log.info("CRAWLER Redirection detected ('" + statusline + "') for URL " - + requestURLString); - this.log.info("CRAWLER ..Redirecting request to: " + redirectionUrl.toNormalform(false)); - - this.sb.webStructure.generateCitationReference(url, redirectionUrl); - - if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_RECORD_REDIRECTS, true)) { - this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, - FailCategory.FINAL_REDIRECT_RULE, "redirect to " + redirectionUrlString, statusline.getStatusCode()); - } - return redirectionUrl; - } - - /** - * Create request header for loading content. - * @param request search request - * @param agent agent identification information - * @return a request header - * @throws IOException when an error occured - */ - private RequestHeader createRequestheader(final Request request, final ClientIdentification.Agent agent) - throws IOException { - final RequestHeader requestHeader = new RequestHeader(); - requestHeader.put(HeaderFramework.USER_AGENT, agent.userAgent); - if (request.referrerhash() != null) { + } + // in the end we must throw an exception (even if this is + // not an error, just to abort the current process + throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " to " + + redirectionUrl.toNormalform(false) + " placed on crawler queue for double-check"); + } + + // if we are already doing a shutdown we don't need to retry + // crawling + if (Thread.currentThread().isInterrupted()) { + this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, + FailCategory.FINAL_LOAD_CONTEXT, "server shutdown", statusCode); + throw new IOException( + "CRAWLER Redirect of URL=" + requestURLString + " aborted because of server shutdown.$"); + } + + // check if the redirected URL is the same as the requested URL + // this shortcuts a time-out using retryCount + if (redirectionUrl.equals(url)) { + this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "redirect to same url", -1); + throw new IOException( "retry counter exceeded for URL " + request.url().toString() + ". 
Processing aborted.$"); + } + + // retry crawling with new url + request.redirectURL(redirectionUrl); + return openInputStream(request, profile, retryCount - 1, maxFileSize, blacklistType, agent); + } + // we don't want to follow redirects + this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode); + throw new IOException("REJECTED UNWANTED REDIRECTION '" + statusline + "' for URL '" + requestURLString + "'$"); + } else if (statusCode == HttpStatus.SC_OK || statusCode == HttpStatus.SC_NON_AUTHORITATIVE_INFORMATION) { + // the transfer is ok + + /* + * When content is not large (less than Response.CRAWLER_MAX_SIZE_TO_CACHE), we have better cache it if cache is enabled and url is not local + */ + long contentLength = client.getHttpResponse().getEntity().getContentLength(); + InputStream contentStream; + if (profile != null && profile.storeHTCache() && contentLength > 0 && contentLength < (Response.CRAWLER_MAX_SIZE_TO_CACHE) && !url.isLocal()) { + byte[] content = null; + try { + content = HTTPClient.getByteArray(client.getHttpResponse().getEntity(), maxFileSize); + Cache.store(url, responseHeader, content); + } catch (final IOException e) { + this.log.warn("cannot write " + url + " to Cache (3): " + e.getMessage(), e); + } finally { + client.finish(); + } + + contentStream = new ByteArrayInputStream(content); + } else { + /* + * Content length may already be known now : check it before opening a stream + */ + if (maxFileSize >= 0 && contentLength > maxFileSize) { + throw new IOException("Content to download exceed maximum value of " + maxFileSize + " bytes"); + } + /* + * Create a HTTPInputStream delegating to + * client.getContentstream(). Close method will ensure client is + * properly closed. + */ + contentStream = new HTTPInputStream(client); + /* Anticipated content length may not be already known or incorrect : let's apply now the same eventual content size restriction as when loading in a byte array */ + if(maxFileSize >= 0) { + contentStream = new StrictLimitInputStream(contentStream, maxFileSize, + "Content to download exceed maximum value of " + Formatter.bytesToString(maxFileSize)); + } + } + + return new StreamResponse(new Response(request, requestHeader, responseHeader, profile, false, null), contentStream); + } else { + client.finish(); + // if the response has not the right response type then reject file + this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, + FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", statusCode); + throw new IOException("REJECTED WRONG STATUS TYPE '" + statusline + + "' for URL '" + requestURLString + "'$"); + } + } + + /** + * Extract redirect URL from response header. Status code is supposed to be between 299 and 310. Parameters must not be null. + * @return redirect URL + * @throws IOException when an error occured + */ + private DigestURL extractRedirectURL(final Request request, CrawlProfile profile, DigestURL url, + final StatusLine statusline, final ResponseHeader responseHeader, String requestURLString) + throws IOException { + // read redirection URL + String redirectionUrlString = responseHeader.get(HeaderFramework.LOCATION); + redirectionUrlString = redirectionUrlString == null ? 
"" : redirectionUrlString.trim(); + + if (redirectionUrlString.isEmpty()) { + this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, + FailCategory.TEMPORARY_NETWORK_FAILURE, + "no redirection url provided, field '" + HeaderFramework.LOCATION + "' is empty", statusline.getStatusCode()); + throw new IOException("REJECTED EMTPY REDIRECTION '" + statusline + + "' for URL '" + requestURLString + "'$"); + } + + // normalize URL + final DigestURL redirectionUrl = DigestURL.newURL(request.url(), redirectionUrlString); + + // restart crawling with new url + this.log.info("CRAWLER Redirection detected ('" + statusline + "') for URL " + + requestURLString); + this.log.info("CRAWLER ..Redirecting request to: " + redirectionUrl.toNormalform(false)); + + this.sb.webStructure.generateCitationReference(url, redirectionUrl); + + if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_RECORD_REDIRECTS, true)) { + this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, + FailCategory.FINAL_REDIRECT_RULE, "redirect to " + redirectionUrlString, statusline.getStatusCode()); + } + return redirectionUrl; + } + + /** + * Create request header for loading content. + * @param request search request + * @param agent agent identification information + * @return a request header + * @throws IOException when an error occured + */ + private RequestHeader createRequestheader(final Request request, final ClientIdentification.Agent agent) + throws IOException { + final RequestHeader requestHeader = new RequestHeader(); + requestHeader.put(HeaderFramework.USER_AGENT, agent.userAgent); + if (request.referrerhash() != null) { DigestURL refererURL = this.sb.getURL(request.referrerhash()); if (refererURL != null) { requestHeader.put(RequestHeader.REFERER, refererURL.toNormalform(true)); } - } - - requestHeader.put(HeaderFramework.ACCEPT, this.sb.getConfig("crawler.http.accept", DEFAULT_ACCEPT)); - requestHeader.put(HeaderFramework.ACCEPT_LANGUAGE, - this.sb.getConfig("crawler.http.acceptLanguage", DEFAULT_LANGUAGE)); - requestHeader.put(HeaderFramework.ACCEPT_CHARSET, - this.sb.getConfig("crawler.http.acceptCharset", DEFAULT_CHARSET)); - requestHeader.put(HeaderFramework.ACCEPT_ENCODING, - this.sb.getConfig("crawler.http.acceptEncoding", DEFAULT_ENCODING)); - return requestHeader; - } + } + + requestHeader.put(HeaderFramework.ACCEPT, this.sb.getConfig("crawler.http.accept", DEFAULT_ACCEPT)); + requestHeader.put(HeaderFramework.ACCEPT_LANGUAGE, + this.sb.getConfig("crawler.http.acceptLanguage", DEFAULT_LANGUAGE)); + requestHeader.put(HeaderFramework.ACCEPT_CHARSET, + this.sb.getConfig("crawler.http.acceptCharset", DEFAULT_CHARSET)); + requestHeader.put(HeaderFramework.ACCEPT_ENCODING, + this.sb.getConfig("crawler.http.acceptEncoding", DEFAULT_ENCODING)); + return requestHeader; + } private Response load(final Request request, CrawlProfile profile, final int retryCount, final int maxFileSize, final BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException { @@ -347,10 +350,10 @@ public final class HTTPLoader { // resolve yacy and yacyh domains final AlternativeDomainNames yacyResolver = this.sb.peers; if(yacyResolver != null) { - final String yAddress = yacyResolver.resolve(host); - if(yAddress != null) { - url = new DigestURL(url.getProtocol() + "://" + yAddress + path); - } + final String yAddress = yacyResolver.resolve(host); + if(yAddress != null) { + url = new DigestURL(url.getProtocol() + "://" + yAddress + path); + } } // take a file from the net @@ 
-366,41 +369,39 @@ public final class HTTPLoader { client.setHeader(requestHeader.entrySet()); // send request - final byte[] responseBody = client.GETbytes(url, sb.getConfig(SwitchboardConstants.ADMIN_ACCOUNT_USER_NAME, "admin"), sb.getConfig(SwitchboardConstants.ADMIN_ACCOUNT_B64MD5, ""), maxFileSize, false); + final byte[] responseBody = client.GETbytes(url, sb.getConfig(SwitchboardConstants.ADMIN_ACCOUNT_USER_NAME, "admin"), sb.getConfig(SwitchboardConstants.ADMIN_ACCOUNT_B64MD5, ""), maxFileSize, false); final int statusCode = client.getHttpResponse().getStatusLine().getStatusCode(); - final ResponseHeader responseHeader = new ResponseHeader(statusCode, client.getHttpResponse().getAllHeaders()); + final ResponseHeader responseHeader = new ResponseHeader(statusCode, client.getHttpResponse().getAllHeaders()); String requestURLString = request.url().toNormalform(true); // check redirection - if (statusCode > 299 && statusCode < 310) { + if (statusCode > 299 && statusCode < 310) { - final DigestURL redirectionUrl = extractRedirectURL(request, profile, url, client.getHttpResponse().getStatusLine(), - responseHeader, requestURLString); + final DigestURL redirectionUrl = extractRedirectURL(request, profile, url, client.getHttpResponse().getStatusLine(), + responseHeader, requestURLString); - if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true)) { - // we have two use cases here: loading from a crawl or just loading the url. Check this: - if (profile != null && !CrawlSwitchboard.DEFAULT_PROFILES.contains(profile.name())) { + if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true)) { + // we have two use cases here: loading from a crawl or just loading the url. Check this: + if (profile != null && !CrawlSwitchboard.DEFAULT_PROFILES.contains(profile.name())) { // put redirect url on the crawler queue to repeat a double-check - /* We have to clone the request instance and not to modify directly its URL, - * otherwise the stackCrawl() function would reject it, because detecting it as already in the activeWorkerEntries */ + /* We have to clone the request instance and not to modify directly its URL, + * otherwise the stackCrawl() function would reject it, because detecting it as already in the activeWorkerEntries */ Request redirectedRequest = new Request(request.initiator(), - redirectionUrl, - request.referrerhash(), - request.name(), - request.appdate(), - request.profileHandle(), - request.depth(), - request.timezoneOffset()); - String rejectReason = this.sb.crawlStacker.stackCrawl(redirectedRequest); - // in the end we must throw an exception (even if this is not an error, just to abort the current process - if(rejectReason != null) { + redirectionUrl, + request.referrerhash(), + request.name(), + request.appdate(), + request.profileHandle(), + request.depth(), + request.timezoneOffset()); + String rejectReason = this.sb.crawlStacker.stackCrawl(redirectedRequest); + // in the end we must throw an exception (even if this is not an error, just to abort the current process + if(rejectReason != null) { throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " aborted. 
Reason : " + rejectReason); - } + } throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " to " + redirectionUrl.toNormalform(false) + " placed on crawler queue for double-check"); - + } - } - // if we are already doing a shutdown we don't need to retry crawling if (Thread.currentThread().isInterrupted()) { this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_LOAD_CONTEXT, "server shutdown", statusCode); @@ -410,15 +411,15 @@ public final class HTTPLoader { // retry crawling with new url request.redirectURL(redirectionUrl); return load(request, profile, retryCount - 1, maxFileSize, blacklistType, agent); - } + } // we don't want to follow redirects this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode); throw new IOException("REJECTED UNWANTED REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL '" + requestURLString + "'$"); } else if (responseBody == null) { - // no response, reject file + // no response, reject file this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "no response body", statusCode); throw new IOException("REJECTED EMPTY RESPONSE BODY '" + client.getHttpResponse().getStatusLine() + "' for URL '" + requestURLString + "'$"); - } else if (statusCode == 200 || statusCode == 203) { + } else if (statusCode == 200 || statusCode == 203) { // the transfer is ok // we write the new cache entry to file system directly @@ -427,8 +428,8 @@ public final class HTTPLoader { // check length again in case it was not possible to get the length before loading if (maxFileSize >= 0 && contentLength > maxFileSize) { - this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "file size limit exceeded", statusCode); - throw new IOException("REJECTED URL " + request.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes. (GET)$"); + this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "file size limit exceeded", statusCode); + throw new IOException("REJECTED URL " + request.url() + " because file size '" + contentLength + "' exceeds max filesize limit of " + maxFileSize + " bytes. 
(GET)$"); } // create a new cache entry @@ -442,9 +443,9 @@ public final class HTTPLoader { ); return response; - } else { + } else { // if the response has not the right response type then reject file - this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", statusCode); + this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", statusCode); throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL '" + requestURLString + "'$"); } } @@ -485,17 +486,17 @@ public final class HTTPLoader { final HTTPClient client = new HTTPClient(agent); client.setTimout(20000); client.setHeader(requestHeader.entrySet()); - final byte[] responseBody = client.GETbytes(request.url(), null, null, false); + final byte[] responseBody = client.GETbytes(request.url(), null, null, false); final int code = client.getHttpResponse().getStatusLine().getStatusCode(); - final ResponseHeader header = new ResponseHeader(code, client.getHttpResponse().getAllHeaders()); + final ResponseHeader header = new ResponseHeader(code, client.getHttpResponse().getAllHeaders()); // FIXME: 30*-handling (bottom) is never reached // we always get the final content because httpClient.followRedirects = true - if (responseBody != null && (code == 200 || code == 203)) { + if (responseBody != null && (code == 200 || code == 203)) { // the transfer is ok - //statistics: - ByteCount.addAccountCount(ByteCount.CRAWLER, responseBody.length); + //statistics: + ByteCount.addAccountCount(ByteCount.CRAWLER, responseBody.length); // we write the new cache entry to file system directly @@ -513,7 +514,7 @@ public final class HTTPLoader { } else if (code > 299 && code < 310) { if (header.containsKey(HeaderFramework.LOCATION)) { // getting redirection URL - String redirectionUrlString = header.get(HeaderFramework.LOCATION); + String redirectionUrlString = header.get(HeaderFramework.LOCATION); redirectionUrlString = redirectionUrlString.trim(); if (redirectionUrlString.isEmpty()) { @@ -535,7 +536,7 @@ public final class HTTPLoader { } } else { // if the response has not the right response type then reject file - throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString()); + throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + "' for URL " + request.url().toString()); } return response; } diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java index 03feb047d..cccec50c3 100644 --- a/source/net/yacy/repository/LoaderDispatcher.java +++ b/source/net/yacy/repository/LoaderDispatcher.java @@ -399,7 +399,7 @@ public final class LoaderDispatcher { // load resource from the internet StreamResponse response; if (protocol.equals("http") || protocol.equals("https")) { - response = this.httpLoader.openInputStream(request, crawlProfile, 1, maxFileSize, blacklistType, agent); + response = this.httpLoader.openInputStream(request, crawlProfile, 2, maxFileSize, blacklistType, agent); } else if (protocol.equals("ftp")) { response = this.ftpLoader.openInputStream(request, true); } else if (protocol.equals("smb")) { From 63f58e4785817e49d18653d71d7663ee11597589 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Sun, 20 Dec 2020 23:15:55 +0100 Subject: [PATCH 2/3] enhanced 
strategy in host browser limit number of fresh hosts in round robin hashes --- source/net/yacy/crawler/HostBalancer.java | 69 +++++++++++++---------- 1 file changed, 40 insertions(+), 29 deletions(-) diff --git a/source/net/yacy/crawler/HostBalancer.java b/source/net/yacy/crawler/HostBalancer.java index 6eaa377b8..be7fee676 100644 --- a/source/net/yacy/crawler/HostBalancer.java +++ b/source/net/yacy/crawler/HostBalancer.java @@ -65,7 +65,7 @@ public class HostBalancer implements Balancer { private final static ConcurrentLog log = new ConcurrentLog("HostBalancer"); public final static HandleMap depthCache = new RowHandleMap(Word.commonHashLength, Word.commonHashOrder, 2, 8 * 1024 * 1024, "HostBalancer.DepthCache"); - + private final File hostsPath; private final boolean exceed134217727; private final Map queues; @@ -84,7 +84,7 @@ public class HostBalancer implements Balancer { final boolean exceed134217727) { this(hostsPath, onDemandLimit, exceed134217727, true); } - + /** * Create a new instance and fills the queue by scanning the hostsPath directory. * @param hostsPath @@ -100,7 +100,7 @@ public class HostBalancer implements Balancer { this.hostsPath = hostsPath; this.onDemandLimit = onDemandLimit; this.exceed134217727 = exceed134217727; - + // create a stack for newly entered entries if (!(hostsPath.exists())) hostsPath.mkdirs(); // make the path this.queues = new ConcurrentHashMap(); @@ -114,7 +114,7 @@ public class HostBalancer implements Balancer { * return immediately (as large unfinished crawls may take longer to load) */ private void init(final boolean async) { - if(async) { + if(async) { Thread t = new Thread("HostBalancer.init") { @Override public void run() { @@ -122,10 +122,10 @@ public class HostBalancer implements Balancer { } }; - t.start(); - } else { - runInit(); - } + t.start(); + } else { + runInit(); + } } /** @@ -185,7 +185,7 @@ public class HostBalancer implements Balancer { } return c; } - + /** * delete all urls which are stored for given host hashes * @param hosthashes @@ -230,11 +230,11 @@ public class HostBalancer implements Balancer { return c; } - /** - * @return true when the URL is queued is this or any other HostBalancer - * instance (as {@link #depthCache} is shared between all HostBalancer - * instances) - */ + /** + * @return true when the URL is queued is this or any other HostBalancer + * instance (as {@link #depthCache} is shared between all HostBalancer + * instances) + */ @Override public boolean has(final byte[] urlhashb) { if (depthCache.has(urlhashb)) return true; @@ -313,7 +313,7 @@ public class HostBalancer implements Balancer { tryagain: while (true) try { HostQueue rhq = null; String rhh = null; - + synchronized (this) { if (this.roundRobinHostHashes.size() == 0) { // refresh the round-robin cache @@ -331,14 +331,21 @@ public class HostBalancer implements Balancer { if (size <= 10) {smallStacksExist = true; break smallsearch;} } } - if (singletonStacksExist || smallStacksExist) { - Iterator i = this.roundRobinHostHashes.iterator(); - smallstacks: while (i.hasNext()) { - if (this.roundRobinHostHashes.size() <= 10) break smallstacks; // don't shrink the hosts until nothing is left - String s = i.next(); - HostQueue hq = this.queues.get(s); - if (hq == null) {i.remove(); continue smallstacks;} - int delta = Latency.waitingRemainingGuessed(hq.getHost(), hq.getPort(), s, robots, ClientIdentification.yacyInternetCrawlerAgent); + Set freshhosts = new HashSet<>(); + Iterator i = this.roundRobinHostHashes.iterator(); + smallstacks: while (i.hasNext()) 
{ + if (this.roundRobinHostHashes.size() <= 10) break smallstacks; // don't shrink the hosts until nothing is left + String hosthash = i.next(); + HostQueue hq = this.queues.get(hosthash); + if (hq == null) {i.remove(); continue smallstacks;} + int delta = Latency.waitingRemainingGuessed(hq.getHost(), hq.getPort(), hosthash, robots, ClientIdentification.yacyInternetCrawlerAgent); + if (delta == Integer.MIN_VALUE) { + // never-crawled hosts; we do not want to have too many of them in here. Loading new hosts means: waiting for robots.txt to load + freshhosts.add(hosthash); + i.remove(); + continue smallstacks; + } + if (singletonStacksExist || smallStacksExist) { if (delta < 0) continue; // keep all non-waiting stacks; they are useful to speed up things // to protect all small stacks which have a fast throughput, remove all with long waiting time if (delta >= 1000) {i.remove(); continue smallstacks;} @@ -350,6 +357,10 @@ public class HostBalancer implements Balancer { } } } + // put at least one of the fresh hosts back + if (freshhosts.size() > 0) this.roundRobinHostHashes.add(freshhosts.iterator().next()); + + // result if (this.roundRobinHostHashes.size() == 1) { if (log.isFine()) log.fine("(re-)initialized the round-robin queue with one host"); } else { @@ -357,13 +368,13 @@ public class HostBalancer implements Balancer { } } if (this.roundRobinHostHashes.size() == 0) return null; - + // if the queue size is 1, just take that if (this.roundRobinHostHashes.size() == 1) { rhh = this.roundRobinHostHashes.iterator().next(); rhq = this.queues.get(rhh); } - + if (rhq == null) { // mixed minimum sleep time / largest queue strategy: // create a map of sleep time / queue relations with a fuzzy sleep time (ms / 500). @@ -449,7 +460,7 @@ public class HostBalancer implements Balancer { } */ } - + if (rhq == null) { this.roundRobinHostHashes.clear(); // force re-initialization continue tryagain; @@ -458,7 +469,7 @@ public class HostBalancer implements Balancer { long timestamp = System.currentTimeMillis(); Request request = rhq.pop(delay, cs, robots); // this pop is outside of synchronization to prevent blocking of pushes long actualwaiting = System.currentTimeMillis() - timestamp; - + if (actualwaiting > 1000) { synchronized (this) { // to prevent that this occurs again, remove all stacks with positive delay times (which may be less after that waiting) @@ -473,7 +484,7 @@ public class HostBalancer implements Balancer { } } } - + if (rhq.isEmpty()) { synchronized (this) { this.queues.remove(rhh); @@ -545,7 +556,7 @@ public class HostBalancer implements Balancer { @Override public List getDomainStackReferences(String host, int maxcount, long maxtime) { if (host == null) { - return Collections.emptyList(); + return Collections.emptyList(); } try { HostQueue hq = this.queues.get(DigestURL.hosthash(host, host.startsWith("ftp.") ? 21 : 80)); From 7997836506944adab5552b4b1f8be6339cfb0a7e Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Sun, 20 Dec 2020 23:18:50 +0100 Subject: [PATCH 3/3] fixed lock image --- htroot/Network.html | 4 ++-- htroot/Status.html | 2 +- htroot/Surftips.html | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/htroot/Network.html b/htroot/Network.html index daf1305e6..1457720ed 100644 --- a/htroot/Network.html +++ b/htroot/Network.html @@ -133,7 +133,7 @@ document.getElementById("apilink").setAttribute("href", "Network.xml?" + window. 
#{list}# #[hash]# - #[shortname]##(ssl)#::https supported#(/ssl)# + #[shortname]##(ssl)#::https supported#(/ssl)# #(type)##(direct)#Junior passive::Junior direct::Junior offline#(/direct)#::#(direct)#senior passive::Senior direct::Senior offline#(/direct)#::#(direct)#Principal passive::Principal active::Principal offline#(/direct)##(/type)##(acceptcrawl)#no crawl::crawl possible::crawl possible#(/acceptcrawl)##(dhtreceive)#no DHT receive::DHT receive enabled::DHT receive enabled#(/dhtreceive)##{ips}##{/ips}# #[version]# @@ -249,7 +249,7 @@ document.getElementById("apilink").setAttribute("href", "Network.xml?" + window. QPH
(remote) - #[my-name]##(my-ssl)#::https supported#(/my-ssl)# + #[my-name]##(my-ssl)#::https supported#(/my-ssl)# #(my-info)#Virgin::Junior::Senior::Principal#(/my-info)##(my-acceptcrawl)#no crawl::Crawl enabled#(/my-acceptcrawl)##(my-dhtreceive)#no DHT receive::DHT Receive enabled#(/my-dhtreceive)##{ips}##{/ips}# #[my-version]# #[my-utc]# diff --git a/htroot/Status.html b/htroot/Status.html index d50b8b698..7702787c2 100644 --- a/htroot/Status.html +++ b/htroot/Status.html @@ -134,7 +134,7 @@ You can download a more recent version of YaCy. Click here to install this update and restart YaCy:
diff --git a/htroot/Surftips.html b/htroot/Surftips.html index 7d225c6d5..9e50a8e5e 100644 --- a/htroot/Surftips.html +++ b/htroot/Surftips.html @@ -57,12 +57,12 @@
#(publicSurftips)# :: #(/publicSurftips)#
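
Illustration (not part of the patches above): a minimal, standalone sketch of the reasoning behind PATCH 1/3. A site that first upgrades http to https and then drops the www subdomain needs two redirect hops, so a retry budget of 1 gives up one hop short of the final URL while a budget of 2 reaches it; the sketch also mirrors the "redirect to same url" short-circuit added in openInputStream(). The class name, the REDIRECTS map and the exception type are hypothetical stand-ins and are independent of YaCy's HTTPLoader API.

import java.util.Map;

public class RedirectDepthDemo {

    // Hypothetical redirect table standing in for real HTTP 3xx responses:
    // the first hop upgrades http to https, the second drops the www subdomain.
    private static final Map<String, String> REDIRECTS = Map.of(
            "http://www.example.com/",  "https://www.example.com/",
            "https://www.example.com/", "https://example.com/");

    // Follow redirects recursively in the retry-counter style of the loader:
    // every hop consumes one unit of the budget, a negative budget aborts,
    // and a redirect pointing back to itself aborts immediately.
    static String fetch(String url, int retryCount) {
        if (retryCount < 0) {
            throw new IllegalStateException("retry counter exceeded for URL " + url);
        }
        String target = REDIRECTS.get(url);
        if (target == null) {
            return url; // no redirect: this is the final resource
        }
        if (target.equals(url)) {
            throw new IllegalStateException("redirect to same url: " + url);
        }
        return fetch(target, retryCount - 1);
    }

    public static void main(String[] args) {
        // With a budget of 2 the chain http -> https -> no-www resolves fully.
        System.out.println(fetch("http://www.example.com/", 2));
        // With the previous budget of 1 the same chain aborts one hop too early.
        try {
            fetch("http://www.example.com/", 1);
        } catch (IllegalStateException e) {
            System.out.println("aborted: " + e.getMessage());
        }
    }
}

This is the same trade-off the LoaderDispatcher change makes by passing 2 instead of 1 to openInputStream(): the crawler tolerates one extra hop for the common https-plus-subdomain canonicalization chain without following arbitrarily long redirect loops.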