diff --git a/htroot/ViewImage.java b/htroot/ViewImage.java index f264dd5a4..690b7d590 100644 --- a/htroot/ViewImage.java +++ b/htroot/ViewImage.java @@ -28,10 +28,13 @@ import java.awt.Image; import java.awt.MediaTracker; import java.awt.image.BufferedImage; import java.awt.image.Raster; -import java.io.ByteArrayInputStream; import java.io.IOException; +import java.io.InputStream; import java.util.Map; +import javax.imageio.ImageIO; +import javax.imageio.stream.ImageInputStream; + import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.federate.yacy.CacheStrategy; @@ -42,11 +45,11 @@ import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.storage.ConcurrentARC; import net.yacy.cora.util.ConcurrentLog; import net.yacy.data.URLLicense; -import net.yacy.document.ImageParser; import net.yacy.kelondro.util.MemoryControl; import net.yacy.kelondro.workflow.WorkflowProcessor; import net.yacy.peers.graphics.EncodedImage; import net.yacy.repository.Blacklist.BlacklistType; +import net.yacy.repository.LoaderDispatcher; import net.yacy.search.Switchboard; import net.yacy.server.serverObjects; import net.yacy.server.serverSwitch; @@ -74,8 +77,8 @@ public class ViewImage { * when specified url is malformed, or a read/write error * occured, or input or target image format is not supported. * Sould end in a HTTP 500 error whose processing is more - * consistent across browsers than a response with zero - * content bytes. + * consistent across browsers than a response with zero content + * bytes. */ public static Object respond(final RequestHeader header, final serverObjects post, final serverSwitch env) throws IOException { @@ -113,39 +116,81 @@ public class ViewImage { if (image != null) { encodedImage = new EncodedImage(image, ext, post.getBoolean("isStatic")); } else { - byte[] resourceb = null; - if (url != null) - try { - String agentName = post.get("agentName", auth ? 
ClientIdentification.yacyIntranetCrawlerAgentName - : ClientIdentification.yacyInternetCrawlerAgentName); - ClientIdentification.Agent agent = ClientIdentification.getAgent(agentName); - resourceb = sb.loader.loadContent(sb.loader.request(url, false, true), CacheStrategy.IFEXIST, - BlacklistType.SEARCH, agent); - } catch (final IOException e) { - ConcurrentLog.fine("ViewImage", "cannot load: " + e.getMessage()); - throw e; - } - boolean okToCache = true; - if (resourceb == null) { - /* - * Throw an exception, wich will end in a HTTP 500 response, - * better handled by browsers than an empty image - */ - throw new IOException("Image could not be loaded."); - } String urlExt = MultiProtocolURL.getFileExtension(url.getFileName()); if (ext != null && ext.equalsIgnoreCase(urlExt) && isBrowserRendered(urlExt)) { - return new ByteArrayInputStream(resourceb); + return openInputStream(post, sb.loader, auth, url); } - // read image - encodedImage = parseAndScale(post, auth, urlString, ext, okToCache, resourceb); + ImageInputStream imageInStream = null; + InputStream inStream = null; + /* + * When opening a file, the most efficient is to open + * ImageInputStream directly on file + */ + if (url.isFile()) { + imageInStream = ImageIO.createImageInputStream(url.getFSFile()); + } else { + inStream = openInputStream(post, sb.loader, auth, url); + imageInStream = ImageIO.createImageInputStream(inStream); + } + try { + // read image + encodedImage = parseAndScale(post, auth, urlString, ext, imageInStream); + } finally { + /* + * imageInStream.close() method doesn't close source input + * stream + */ + if (inStream != null) { + try { + inStream.close(); + } catch (IOException ignored) { + } + } + } } return encodedImage; } + /** + * Open input stream on image url using provided loader. All parameters must + * not be null. + * + * @param post + * post parameters. + * @param loader. + * Resources loader. + * @param auth + * true when user has credentials to load full images. 
+ * @param url + * image url. + * @return an open input stream instance (don't forget to close it). + * @throws IOException + * when a read/write error occured. + */ + private static InputStream openInputStream(final serverObjects post, final LoaderDispatcher loader, + final boolean auth, DigestURL url) throws IOException { + InputStream inStream = null; + if (url != null) { + try { + String agentName = post.get("agentName", auth ? ClientIdentification.yacyIntranetCrawlerAgentName + : ClientIdentification.yacyInternetCrawlerAgentName); + ClientIdentification.Agent agent = ClientIdentification.getAgent(agentName); + inStream = loader.openInputStream(loader.request(url, false, true), CacheStrategy.IFEXIST, + BlacklistType.SEARCH, agent); + } catch (final IOException e) { + ConcurrentLog.fine("ViewImage", "cannot load: " + e.getMessage()); + throw e; + } + } + if (inStream == null) { + throw new IOException("Input stream could no be open"); + } + return inStream; + } + /** * @param formatName * informal file format name. For example : "png". @@ -165,31 +210,35 @@ public class ViewImage { } /** - * Process resourceb byte array to try to produce an EncodedImage instance - * eventually scaled and cropped depending on post parameters. + * Process source image to try to produce an EncodedImage instance + * eventually scaled and clipped depending on post parameters. When + * processed, imageInStream is closed. * * @param post * request post parameters. Must not be null. * @param auth * true when access rigths are OK. * @param urlString - * image source URL. Must not be null. + * image source URL as String. Must not be null. * @param ext - * image file extension. May be null. - * @param okToCache - * true when image can be cached - * @param resourceb - * byte array. Must not be null. + * target image file format. May be null. + * @param imageInStream + * open stream on image content. Must not be null. * @return an EncodedImage instance. 
* @throws IOException * when image could not be parsed or encoded to specified format */ protected static EncodedImage parseAndScale(serverObjects post, boolean auth, String urlString, String ext, - boolean okToCache, byte[] resourceb) throws IOException { + ImageInputStream imageInStream) throws IOException { EncodedImage encodedImage = null; - Image image = ImageParser.parse(urlString, resourceb); + Image image = ImageIO.read(imageInStream); if (image == null) { + try { + /* When a null image is returned, we have to close the stream */ + imageInStream.close(); + } catch (IOException ignoredException) { + } /* * Throw an exception, wich will end in a HTTP 500 response, better * handled by browsers than an empty image @@ -197,54 +246,53 @@ public class ViewImage { throw new IOException("Image format is not supported."); } - if (image != null) { - int maxwidth = post.getInt("maxwidth", 0); - int maxheight = post.getInt("maxheight", 0); - final boolean quadratic = post.containsKey("quadratic"); - boolean isStatic = post.getBoolean("isStatic"); - if (!auth || maxwidth != 0 || maxheight != 0) { - - // find original size - int h = image.getHeight(null); - int w = image.getWidth(null); - - // in case of not-authorized access shrink the image to - // prevent - // copyright problems, so that images are not larger than - // thumbnails - Dimension maxDimensions = calculateMaxDimensions(auth, w, h, maxwidth, maxheight); - - // if a quadratic flag is set, we cut the image out to be in - // quadratic shape - if (quadratic && w != h) { - image = makeSquare(image, h, w); - h = image.getHeight(null); - w = image.getWidth(null); - } - - Dimension finalDimensions = calculateDimensions(w, h, maxDimensions); - - if (w != finalDimensions.width && h != finalDimensions.height) { - image = scale(finalDimensions.width, finalDimensions.height, image); + int maxwidth = post.getInt("maxwidth", 0); + int maxheight = post.getInt("maxheight", 0); + final boolean quadratic = 
post.containsKey("quadratic"); + boolean isStatic = post.getBoolean("isStatic"); + if (!auth || maxwidth != 0 || maxheight != 0) { + + // find original size + final int originWidth = image.getWidth(null); + final int originHeigth = image.getHeight(null); + + // in case of not-authorized access shrink the image to + // prevent + // copyright problems, so that images are not larger than + // thumbnails + Dimension maxDimensions = calculateMaxDimensions(auth, originWidth, originHeigth, maxwidth, maxheight); + + // if a quadratic flag is set, we cut the image out to be in + // quadratic shape + int w = originWidth; + int h = originHeigth; + if (quadratic && originWidth != originHeigth) { + image = makeSquare(image, originHeigth, originWidth); + h = image.getHeight(null); + w = image.getWidth(null); + } - } + Dimension finalDimensions = calculateDimensions(w, h, maxDimensions); - if ((finalDimensions.width == 16) && (finalDimensions.height == 16) && okToCache) { - // this might be a favicon, store image to cache for - // faster - // re-load later on - iconcache.put(urlString, image); - } + if (w != finalDimensions.width && h != finalDimensions.height) { + image = scale(finalDimensions.width, finalDimensions.height, image); } - /* - * An error can still occur when transcoding from buffered image to - * target ext : in that case return null - */ - encodedImage = new EncodedImage(image, ext, isStatic); - if (encodedImage.getImage().length() == 0) { - throw new IOException("Image could not be encoded to format : " + ext); + + if (finalDimensions.width == 16 && finalDimensions.height == 16) { + // this might be a favicon, store image to cache for + // faster + // re-load later on + iconcache.put(urlString, image); } } + /* + * An error can still occur when transcoding from buffered image to + * target ext : in that case return null + */ + encodedImage = new EncodedImage(image, ext, isStatic); + if (encodedImage.getImage().length() == 0) { + throw new IOException("Image could 
not be encoded to format : " + ext); + } return encodedImage; } diff --git a/source/net/yacy/cora/util/HTTPInputStream.java b/source/net/yacy/cora/util/HTTPInputStream.java new file mode 100755 index 000000000..035edfad9 --- /dev/null +++ b/source/net/yacy/cora/util/HTTPInputStream.java @@ -0,0 +1,125 @@ +/** + * HTTPInputStream + * Copyright 2014 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany + * First published 26.11.2014 on http://yacy.net + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . + */ + +package net.yacy.cora.util; + +import java.io.IOException; +import java.io.InputStream; + +import net.yacy.cora.protocol.http.HTTPClient; + +/** + * A HTTP InputStream delegating to HTTPClient. Use it when streaming HTTP content to easily finish HTTP client when closing stream. + * @author luc + * + */ +public class HTTPInputStream extends InputStream { + + /** HTTP client */ + private HTTPClient httpClient; + + /** Encapsulated HTTP content stream */ + private InputStream contentStream; + + + /** + * Constructs from a httpClient. + * @param httpClient a httpClient with accessible stream content. 
+ * @throws IOException when content stream can not be open on httpClient + */ + public HTTPInputStream(HTTPClient httpClient) throws IOException { + if(httpClient == null) { + throw new IllegalArgumentException("httpClient is null"); + } + this.httpClient = httpClient; + this.contentStream = httpClient.getContentstream(); + if(this.contentStream == null) { + throw new IOException("content stream is null"); + } + } + + /** + * Close properly HTTP connection with httpClient + */ + @Override + public void close() throws IOException { + httpClient.finish(); + } + + + @Override + public int read() throws IOException { + return contentStream.read(); + } + + + @Override + public int hashCode() { + return contentStream.hashCode(); + } + + @Override + public int read(byte[] b) throws IOException { + return contentStream.read(b); + } + + @Override + public boolean equals(Object obj) { + return contentStream.equals(obj); + } + + @Override + public int read(byte[] b, int off, int len) throws IOException { + return contentStream.read(b, off, len); + } + + @Override + public long skip(long n) throws IOException { + return contentStream.skip(n); + } + + @Override + public String toString() { + return contentStream.toString(); + } + + @Override + public int available() throws IOException { + return contentStream.available(); + } + + @Override + public synchronized void mark(int readlimit) { + contentStream.mark(readlimit); + } + + @Override + public synchronized void reset() throws IOException { + contentStream.reset(); + } + + @Override + public boolean markSupported() { + return contentStream.markSupported(); + } + + + +} diff --git a/source/net/yacy/crawler/retrieval/HTTPLoader.java b/source/net/yacy/crawler/retrieval/HTTPLoader.java index 1381548e5..9cf9ce1a7 100644 --- a/source/net/yacy/crawler/retrieval/HTTPLoader.java +++ b/source/net/yacy/crawler/retrieval/HTTPLoader.java @@ -24,7 +24,9 @@ package net.yacy.crawler.retrieval; +import java.io.ByteArrayInputStream; import 
java.io.IOException; +import java.io.InputStream; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.federate.solr.FailCategory; @@ -34,7 +36,9 @@ import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.ResponseHeader; import net.yacy.cora.protocol.http.HTTPClient; import net.yacy.cora.util.ConcurrentLog; +import net.yacy.cora.util.HTTPInputStream; import net.yacy.crawler.CrawlSwitchboard; +import net.yacy.crawler.data.Cache; import net.yacy.crawler.data.CrawlProfile; import net.yacy.crawler.data.Latency; import net.yacy.kelondro.io.ByteCount; @@ -75,6 +79,208 @@ public final class HTTPLoader { Latency.updateAfterLoad(entry.url(), System.currentTimeMillis() - start); return doc; } + + /** + * Open input stream on a requested HTTP resource. When resource is small, fully load it and returns a ByteArrayInputStream instance. + * @param request + * @param profile crawl profile + * @param retryCount remaining redirect retries count + * @param maxFileSize max file size to load. -1 means no limit. + * @param blacklistType blacklist type to use + * @param agent agent identifier + * @return an open input stream. Don't forget to close it. + * @throws IOException when an error occured + */ + public InputStream openInputStream(final Request request, CrawlProfile profile, final int retryCount, + final int maxFileSize, final BlacklistType blacklistType, final ClientIdentification.Agent agent) + throws IOException { + if (retryCount < 0) { + this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, + FailCategory.TEMPORARY_NETWORK_FAILURE, "retry counter exceeded", -1); + throw new IOException( + "retry counter exceeded for URL " + request.url().toString() + ". 
Processing aborted.$"); + } + DigestURL url = request.url(); + + final String host = url.getHost(); + if (host == null || host.length() < 2) { + throw new IOException("host is not well-formed: '" + host + "'"); + } + final String path = url.getFile(); + int port = url.getPort(); + final boolean ssl = url.getProtocol().equals("https"); + if (port < 0) + port = (ssl) ? 443 : 80; + + // check if url is in blacklist + final String hostlow = host.toLowerCase(); + if (blacklistType != null && Switchboard.urlBlacklist.isListed(blacklistType, hostlow, path)) { + this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_LOAD_CONTEXT, + "url in blacklist", -1); + throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.$"); + } + + // resolve yacy and yacyh domains + final AlternativeDomainNames yacyResolver = this.sb.peers; + if (yacyResolver != null) { + final String yAddress = yacyResolver.resolve(host); + if (yAddress != null) { + url = new DigestURL(url.getProtocol() + "://" + yAddress + path); + } + } + + // create a request header + final RequestHeader requestHeader = createRequestheader(request, agent); + + // HTTP-Client + final HTTPClient client = new HTTPClient(agent); + client.setRedirecting(false); // we want to handle redirection + // ourselves, so we don't index pages + // twice + client.setTimout(this.socketTimeout); + client.setHeader(requestHeader.entrySet()); + + // send request + client.GET(url, false); + final int statusCode = client.getHttpResponse().getStatusLine().getStatusCode(); + final ResponseHeader responseHeader = new ResponseHeader(statusCode, client.getHttpResponse().getAllHeaders()); + String requestURLString = request.url().toNormalform(true); + + // check redirection + if (statusCode > 299 && statusCode < 310) { + + final DigestURL redirectionUrl = extractRedirectURL(request, profile, url, client, statusCode, + responseHeader, requestURLString); + + if 
(this.sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true)) { + // we have two use cases here: loading from a crawl or just + // loading the url. Check this: + if (profile != null && !CrawlSwitchboard.DEFAULT_PROFILES.contains(profile.name())) { + // put redirect url on the crawler queue to repeat a + // double-check + request.redirectURL(redirectionUrl); + this.sb.crawlStacker.stackCrawl(request); + // in the end we must throw an exception (even if this is + // not an error, just to abort the current process + throw new IOException("CRAWLER Redirect of URL=" + requestURLString + " to " + + redirectionUrl.toNormalform(false) + " placed on crawler queue for double-check"); + } + + // if we are already doing a shutdown we don't need to retry + // crawling + if (Thread.currentThread().isInterrupted()) { + this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, + FailCategory.FINAL_LOAD_CONTEXT, "server shutdown", statusCode); + throw new IOException( + "CRAWLER Redirect of URL=" + requestURLString + " aborted because of server shutdown.$"); + } + + // retry crawling with new url + request.redirectURL(redirectionUrl); + return openInputStream(request, profile, retryCount - 1, maxFileSize, blacklistType, agent); + } + // we don't want to follow redirects + this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, + FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode); + throw new IOException("REJECTED UNWANTED REDIRECTION '" + client.getHttpResponse().getStatusLine() + + "' for URL '" + requestURLString + "'$"); + } else if (statusCode == 200 || statusCode == 203) { + // the transfer is ok + + /* + * When content is not large (less than 1MB), we have better cache it if cache is enabled and url is not local + */ + long contentLength = client.getHttpResponse().getEntity().getContentLength(); + if (profile != null && profile.storeHTCache() && contentLength > 0 && contentLength < (1024 * 1024) && 
!url.isLocal()) { + byte[] content = HTTPClient.getByteArray(client.getHttpResponse().getEntity(), maxFileSize); + + try { + Cache.store(url, responseHeader, content); + } catch (final IOException e) { + this.log.warn("cannot write " + url + " to Cache (3): " + e.getMessage(), e); + } + + return new ByteArrayInputStream(content); + } + /* + * Returns a HTTPInputStream delegating to + * client.getContentstream(). Close method will ensure client is + * properly closed. + */ + return new HTTPInputStream(client); + } else { + // if the response has not the right response type then reject file + this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, + FailCategory.TEMPORARY_NETWORK_FAILURE, "wrong http status code", statusCode); + throw new IOException("REJECTED WRONG STATUS TYPE '" + client.getHttpResponse().getStatusLine() + + "' for URL '" + requestURLString + "'$"); + } + } + + /** + * Extract redirect URL from response header. Status code is supposed to be between 299 and 310. Parameters must not be null. + * @return redirect URL + * @throws IOException when an error occured + */ + private DigestURL extractRedirectURL(final Request request, CrawlProfile profile, DigestURL url, + final HTTPClient client, final int statusCode, final ResponseHeader responseHeader, String requestURLString) + throws IOException { + // read redirection URL + String redirectionUrlString = responseHeader.get(HeaderFramework.LOCATION); + redirectionUrlString = redirectionUrlString == null ? 
"" : redirectionUrlString.trim(); + + if (redirectionUrlString.isEmpty()) { + this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, + FailCategory.TEMPORARY_NETWORK_FAILURE, + "no redirection url provided, field '" + HeaderFramework.LOCATION + "' is empty", statusCode); + throw new IOException("REJECTED EMTPY REDIRECTION '" + client.getHttpResponse().getStatusLine() + + "' for URL '" + requestURLString + "'$"); + } + + // normalize URL + final DigestURL redirectionUrl = DigestURL.newURL(request.url(), redirectionUrlString); + + // restart crawling with new url + this.log.info("CRAWLER Redirection detected ('" + client.getHttpResponse().getStatusLine() + "') for URL " + + requestURLString); + this.log.info("CRAWLER ..Redirecting request to: " + redirectionUrl.toNormalform(false)); + + this.sb.webStructure.generateCitationReference(url, redirectionUrl); + + if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_RECORD_REDIRECTS, true)) { + this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, + FailCategory.FINAL_REDIRECT_RULE, "redirect to " + redirectionUrlString, statusCode); + } + return redirectionUrl; + } + + /** + * Create request header for loading content. 
+ * @param request search request + * @param agent agent identification information + * @return a request header + * @throws IOException when an error occured + */ + private RequestHeader createRequestheader(final Request request, final ClientIdentification.Agent agent) + throws IOException { + final RequestHeader requestHeader = new RequestHeader(); + requestHeader.put(HeaderFramework.USER_AGENT, agent.userAgent); + DigestURL refererURL = null; + if (request.referrerhash() != null) { + refererURL = this.sb.getURL(request.referrerhash()); + } + if (refererURL != null) { + requestHeader.put(RequestHeader.REFERER, refererURL.toNormalform(true)); + } + requestHeader.put(HeaderFramework.ACCEPT, this.sb.getConfig("crawler.http.accept", DEFAULT_ACCEPT)); + requestHeader.put(HeaderFramework.ACCEPT_LANGUAGE, + this.sb.getConfig("crawler.http.acceptLanguage", DEFAULT_LANGUAGE)); + requestHeader.put(HeaderFramework.ACCEPT_CHARSET, + this.sb.getConfig("crawler.http.acceptCharset", DEFAULT_CHARSET)); + requestHeader.put(HeaderFramework.ACCEPT_ENCODING, + this.sb.getConfig("crawler.http.acceptEncoding", DEFAULT_ENCODING)); + return requestHeader; + } private Response load(final Request request, CrawlProfile profile, final int retryCount, final int maxFileSize, final BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException { @@ -112,15 +318,7 @@ public final class HTTPLoader { Response response = null; // create a request header - final RequestHeader requestHeader = new RequestHeader(); - requestHeader.put(HeaderFramework.USER_AGENT, agent.userAgent); - DigestURL refererURL = null; - if (request.referrerhash() != null) refererURL = this.sb.getURL(request.referrerhash()); - if (refererURL != null) requestHeader.put(RequestHeader.REFERER, refererURL.toNormalform(true)); - requestHeader.put(HeaderFramework.ACCEPT, this.sb.getConfig("crawler.http.accept", DEFAULT_ACCEPT)); - requestHeader.put(HeaderFramework.ACCEPT_LANGUAGE, 
this.sb.getConfig("crawler.http.acceptLanguage", DEFAULT_LANGUAGE)); - requestHeader.put(HeaderFramework.ACCEPT_CHARSET, this.sb.getConfig("crawler.http.acceptCharset", DEFAULT_CHARSET)); - requestHeader.put(HeaderFramework.ACCEPT_ENCODING, this.sb.getConfig("crawler.http.acceptEncoding", DEFAULT_ENCODING)); + final RequestHeader requestHeader = createRequestheader(request, agent); // HTTP-Client final HTTPClient client = new HTTPClient(agent); @@ -137,27 +335,8 @@ public final class HTTPLoader { // check redirection if (statusCode > 299 && statusCode < 310) { - // read redirection URL - String redirectionUrlString = responseHeader.get(HeaderFramework.LOCATION); - redirectionUrlString = redirectionUrlString == null ? "" : redirectionUrlString.trim(); - - if (redirectionUrlString.isEmpty()) { - this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.TEMPORARY_NETWORK_FAILURE, "no redirection url provided, field '" + HeaderFramework.LOCATION + "' is empty", statusCode); - throw new IOException("REJECTED EMTPY REDIRECTION '" + client.getHttpResponse().getStatusLine() + "' for URL '" + requestURLString + "'$"); - } - - // normalize URL - final DigestURL redirectionUrl = DigestURL.newURL(request.url(), redirectionUrlString); - - // restart crawling with new url - this.log.info("CRAWLER Redirection detected ('" + client.getHttpResponse().getStatusLine() + "') for URL " + requestURLString); - this.log.info("CRAWLER ..Redirecting request to: " + redirectionUrl.toNormalform(false)); - - this.sb.webStructure.generateCitationReference(url, redirectionUrl); - - if (this.sb.getConfigBool(SwitchboardConstants.CRAWLER_RECORD_REDIRECTS, true)) { - this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_REDIRECT_RULE, "redirect to " + redirectionUrlString, statusCode); - } + final DigestURL redirectionUrl = extractRedirectURL(request, profile, url, client, statusCode, + responseHeader, requestURLString); if 
(this.sb.getConfigBool(SwitchboardConstants.CRAWLER_FOLLOW_REDIRECTS, true)) { // we have two use cases here: loading from a crawl or just loading the url. Check this: diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java index 1da658f65..da52e15ab 100644 --- a/source/net/yacy/repository/LoaderDispatcher.java +++ b/source/net/yacy/repository/LoaderDispatcher.java @@ -26,8 +26,10 @@ package net.yacy.repository; +import java.io.ByteArrayInputStream; import java.io.File; import java.io.IOException; +import java.io.InputStream; import java.net.MalformedURLException; import java.util.Arrays; import java.util.Date; @@ -209,7 +211,82 @@ public final class LoaderDispatcher { } // check if we have the page in the cache - if (cacheStrategy != CacheStrategy.NOCACHE && crawlProfile != null) { + Response response = loadFromCache(request, cacheStrategy, agent, url, crawlProfile); + if(response != null) { + return response; + } + + // check case where we want results from the cache exclusively, and never from the Internet (offline mode) + if (cacheStrategy == CacheStrategy.CACHEONLY) { + // we had a chance to get the content from the cache .. its over. We don't have it. + throw new IOException("cache only strategy"); + } + + // now forget about the cache, nothing there. Try to load the content from the Internet + + // check access time: this is a double-check (we checked possibly already in the balancer) + // to make sure that we don't DoS the target by mistake + checkAccessTime(agent, url); + + // now it's for sure that we will access the target. 
Remember the access time + if (host != null) { + if (accessTime.size() > accessTimeMaxsize) accessTime.clear(); // prevent a memory leak here + accessTime.put(host, System.currentTimeMillis()); + } + + // load resource from the internet + if (protocol.equals("http") || protocol.equals("https")) { + response = this.httpLoader.load(request, crawlProfile, maxFileSize, blacklistType, agent); + } else if (protocol.equals("ftp")) { + response = this.ftpLoader.load(request, true); + } else if (protocol.equals("smb")) { + response = this.smbLoader.load(request, true); + } else if (protocol.equals("file")) { + response = this.fileLoader.load(request, true); + } else { + throw new IOException("Unsupported protocol '" + protocol + "' in url " + url); + } + if (response == null) { + throw new IOException("no response (NULL) for url " + url); + } + if (response.getContent() == null) { + throw new IOException("empty response (code " + response.getStatus() + ") for url " + url.toNormalform(true)); + } + + // we got something. Now check if we want to store that to the cache + // first check looks if we want to store the content to the cache + if (crawlProfile == null || !crawlProfile.storeHTCache()) { + // no caching wanted. 
Thats ok, do not write any message + return response; + } + // second check tells us if the protocol tells us something about caching + final String storeError = response.shallStoreCacheForCrawler(); + if (storeError == null) { + try { + Cache.store(url, response.getResponseHeader(), response.getContent()); + } catch (final IOException e) { + LoaderDispatcher.log.warn("cannot write " + response.url() + " to Cache (3): " + e.getMessage(), e); + } + } else { + LoaderDispatcher.log.warn("cannot write " + response.url() + " to Cache (4): " + storeError); + } + return response; + } + + /** + * Try loading requested resource from cache according to cache strategy + * @param request request to resource + * @param cacheStrategy cache strategy to use + * @param agent agent identifier + * @param url resource url + * @param crawlProfile crawl profile + * @return a Response instance when resource could be loaded from cache, or null. + * @throws IOException when an error occured + */ + private Response loadFromCache(final Request request, CacheStrategy cacheStrategy, ClientIdentification.Agent agent, + final DigestURL url, final CrawlProfile crawlProfile) throws IOException { + Response response = null; + if (cacheStrategy != CacheStrategy.NOCACHE && crawlProfile != null) { // we have passed a first test if caching is allowed // now see if there is a cache entry @@ -224,7 +301,7 @@ public final class LoaderDispatcher { DigestURL refererURL = null; if (request.referrerhash() != null) refererURL = this.sb.getURL(request.referrerhash()); if (refererURL != null) requestHeader.put(RequestHeader.REFERER, refererURL.toNormalform(true)); - final Response response = new Response( + response = new Response( request, requestHeader, cachedResponse, @@ -258,6 +335,38 @@ public final class LoaderDispatcher { LoaderDispatcher.log.warn("HTCACHE contained response header, but not content for url " + url.toNormalform(true)); } } + return response; + } + + /** + * Open an InputStream on a 
resource from the web, from ftp, from smb or a file + * @param request the request essentials + * @param cacheStratgy strategy according to NOCACHE, IFFRESH, IFEXIST, CACHEONLY + * @return an open ImageInputStream. Don't forget to close it once used! + * @throws IOException when url is malformed, blacklisted, or CacheStrategy is CACHEONLY and content is unavailable + */ + private InputStream openInputStreamInternal(final Request request, CacheStrategy cacheStrategy, final int maxFileSize, final BlacklistType blacklistType, ClientIdentification.Agent agent) throws IOException { + // get the protocol of the next URL + final DigestURL url = request.url(); + if (url.isFile() || url.isSMB()) { + cacheStrategy = CacheStrategy.NOCACHE; // load just from the file + // system + } + final String protocol = url.getProtocol(); + final String host = url.getHost(); + final CrawlProfile crawlProfile = request.profileHandle() == null ? null : this.sb.crawler.get(UTF8.getBytes(request.profileHandle())); + + // check if url is in blacklist + if (blacklistType != null && host != null && Switchboard.urlBlacklist.isListed(blacklistType, host.toLowerCase(), url.getFile())) { + this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), crawlProfile, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1); + throw new IOException("DISPATCHER Rejecting URL '" + request.url().toString() + "'. 
URL is in blacklist.$"); + } + + // check if we have the page in the cache + Response cachedResponse = loadFromCache(request, cacheStrategy, agent, url, crawlProfile); + if(cachedResponse != null) { + return new ByteArrayInputStream(cachedResponse.getContent()); + } // check case where we want results from the cache exclusively, and never from the Internet (offline mode) if (cacheStrategy == CacheStrategy.CACHEONLY) { @@ -269,21 +378,7 @@ public final class LoaderDispatcher { // check access time: this is a double-check (we checked possibly already in the balancer) // to make sure that we don't DoS the target by mistake - if (!url.isLocal()) { - final Long lastAccess = accessTime.get(host); - long wait = 0; - if (lastAccess != null) wait = Math.max(0, agent.minimumDelta + lastAccess.longValue() - System.currentTimeMillis()); - if (wait > 0) { - // force a sleep here. Instead just sleep we clean up the accessTime map - final long untilTime = System.currentTimeMillis() + wait; - cleanupAccessTimeTable(untilTime); - if (System.currentTimeMillis() < untilTime) { - long frcdslp = untilTime - System.currentTimeMillis(); - LoaderDispatcher.log.info("Forcing sleep of " + frcdslp + " ms for host " + host); - try {Thread.sleep(frcdslp);} catch (final InterruptedException ee) {} - } - } - } + checkAccessTime(agent, url); // now it's for sure that we will access the target. 
Remember the access time if (host != null) { @@ -292,44 +387,52 @@ public final class LoaderDispatcher { } // load resource from the internet - Response response = null; + InputStream inStream = null; if (protocol.equals("http") || protocol.equals("https")) { - response = this.httpLoader.load(request, crawlProfile, maxFileSize, blacklistType, agent); - } else if (protocol.equals("ftp")) { - response = this.ftpLoader.load(request, true); - } else if (protocol.equals("smb")) { - response = this.smbLoader.load(request, true); - } else if (protocol.equals("file")) { - response = this.fileLoader.load(request, true); + inStream = this.httpLoader.openInputStream(request, crawlProfile, 1, maxFileSize, blacklistType, agent); + } else if (protocol.equals("ftp") || protocol.equals("smb") || protocol.equals("file")) { + // may also open directly stream with ftp loader + inStream = url.getInputStream(agent, null, null); } else { throw new IOException("Unsupported protocol '" + protocol + "' in url " + url); } - if (response == null) { - throw new IOException("no response (NULL) for url " + url); - } - if (response.getContent() == null) { - throw new IOException("empty response (code " + response.getStatus() + ") for url " + url.toNormalform(true)); + if (inStream == null) { + throw new IOException("Unable to open content stream"); } - // we got something. Now check if we want to store that to the cache - // first check looks if we want to store the content to the cache - if (crawlProfile == null || !crawlProfile.storeHTCache()) { - // no caching wanted. 
Thats ok, do not write any message - return response; - } - // second check tells us if the protocol tells us something about caching - final String storeError = response.shallStoreCacheForCrawler(); - if (storeError == null) { - try { - Cache.store(url, response.getResponseHeader(), response.getContent()); - } catch (final IOException e) { - LoaderDispatcher.log.warn("cannot write " + response.url() + " to Cache (3): " + e.getMessage(), e); - } - } else { - LoaderDispatcher.log.warn("cannot write " + response.url() + " to Cache (4): " + storeError); - } - return response; + return inStream; } + + + /** + * Check access time: this is a double-check (we checked possibly already in the balancer) + * to make sure that we don't DoS the target by mistake + * @param agent agent identifier + * @param url target url + */ + private void checkAccessTime(ClientIdentification.Agent agent, final DigestURL url) { + if (!url.isLocal()) { + String host = url.getHost(); + final Long lastAccess = accessTime.get(host); + long wait = 0; + if (lastAccess != null) + wait = Math.max(0, agent.minimumDelta + lastAccess.longValue() - System.currentTimeMillis()); + if (wait > 0) { + // force a sleep here. 
Instead just sleep we clean up the + // accessTime map + final long untilTime = System.currentTimeMillis() + wait; + cleanupAccessTimeTable(untilTime); + if (System.currentTimeMillis() < untilTime) { + long frcdslp = untilTime - System.currentTimeMillis(); + LoaderDispatcher.log.info("Forcing sleep of " + frcdslp + " ms for host " + host); + try { + Thread.sleep(frcdslp); + } catch (final InterruptedException ee) { + } + } + } + } + } private int protocolMaxFileSize(final DigestURL url) { if (url.isHTTP() || url.isHTTPS()) @@ -357,6 +460,53 @@ public final class LoaderDispatcher { // read resource body (if it is there) return entry.getContent(); } + + /** + * Open url as InputStream from the web or the cache + * @param request must be not null + * @param cacheStrategy cache strategy to use + * @param blacklistType black list + * @param agent agent identification for HTTP requests + * @return an open InputStream on content. Don't forget to close it once used. + * @throws IOException when url is malformed or blacklisted + */ + public InputStream openInputStream(final Request request, final CacheStrategy cacheStrategy, + BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException { + final int maxFileSize = protocolMaxFileSize(request.url()); + InputStream stream = null; + + Semaphore check = this.loaderSteering.get(request.url()); + if (check != null && cacheStrategy != CacheStrategy.NOCACHE) { + // a loading process is going on for that url + long t = System.currentTimeMillis(); + try { + check.tryAcquire(5, TimeUnit.SECONDS); + } catch (final InterruptedException e) { + } + ConcurrentLog.info("LoaderDispatcher", + "waited " + (System.currentTimeMillis() - t) + " ms for " + request.url().toNormalform(true)); + // now the process may have terminated and we run a normal loading + // which may be successful faster because of a cache hit + } + + this.loaderSteering.put(request.url(), new Semaphore(0)); + try { + stream = 
openInputStreamInternal(request, cacheStrategy, maxFileSize, blacklistType, agent); + } catch(IOException ioe) { + /* Do not re encapsulate eventual IOException in an IOException */ + throw ioe; + } catch (final Throwable e) { + throw new IOException(e); + } finally { + // release the semaphore anyway + check = this.loaderSteering.remove(request.url()); + if (check != null) { + check.release(1000); // don't block any other + } + } + + return stream; + } public Document[] loadDocuments(final Request request, final CacheStrategy cacheStrategy, final int maxFileSize, BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException, Parser.Failure { diff --git a/test/ViewImagePerfTest.java b/test/ViewImagePerfTest.java index 6b5459e71..5a13e30a3 100755 --- a/test/ViewImagePerfTest.java +++ b/test/ViewImagePerfTest.java @@ -8,6 +8,9 @@ import java.util.List; import java.util.Map; import java.util.TreeMap; +import javax.imageio.ImageIO; +import javax.imageio.stream.ImageInputStream; + import net.yacy.cora.util.ConcurrentLog; import net.yacy.peers.graphics.EncodedImage; import net.yacy.server.serverObjects; @@ -75,8 +78,9 @@ public class ViewImagePerfTest extends ViewImageTest { } /** - * Process inFile image, update processedFiles list and failures map, and append measurements to results_perfs.txt. All - * parameters must not be null. + * Process inFile image, update processedFiles list and failures map, and + * append measurements to results_perfs.txt. All parameters must not be + * null. 
* * @param ext * output encoding image format @@ -92,7 +96,7 @@ * when a read/write error occurred */ @Override - protected void processFile(String ext, File outDir, serverObjects post, Map<String, Exception> failures, + protected void processFile(String ext, File outDir, serverObjects post, Map<String, Throwable> failures, File inFile) throws IOException { /* Delete eventual previous result file */ System.out
step + " steps.", - resultsWriter); - writeMessage("Render mean time (ms) : " + (meanTime) / 1000000, resultsWriter); - writeMessage("Render min time (ms) : " + (minTime) / 1000000, resultsWriter); - writeMessage("Render max time (ms) : " + (maxTime) / 1000000, resultsWriter); - } finally { - resultsWriter.close(); - } + time = System.nanoTime() - beginTime; + minTime = Math.min(minTime, time); + maxTime = Math.max(maxTime, time); + totalTime += time; + } + if (step > 0) { + meanTime = totalTime / step; + } else { + meanTime = totalTime; + } + PrintWriter resultsWriter = new PrintWriter(new FileWriter(new File(outDir, "results_perfs.txt"), true)); + try { + writeMessage("Measured ViewImage render with file : " + inFile.getAbsolutePath() + " encoded To : " + ext, + resultsWriter); + if(img == null) { + writeMessage("Image could not be rendered! Measurement show time needed to read and parse image data until error detection.", resultsWriter); } - } catch (Exception e) { - error = e; + writeMessage("Render total time (ms) : " + (totalTime) / 1000000 + " on " + step + " steps.", + resultsWriter); + writeMessage("Render mean time (ms) : " + (meanTime) / 1000000, resultsWriter); + writeMessage("Render min time (ms) : " + (minTime) / 1000000, resultsWriter); + writeMessage("Render max time (ms) : " + (maxTime) / 1000000, resultsWriter); + } finally { + resultsWriter.close(); } if (img == null) { @@ -218,7 +222,7 @@ public class ViewImagePerfTest extends ViewImageTest { System.out.println("Rendered images will be written in dir : " + outDir.getAbsolutePath()); List processedFiles = new ArrayList(); - Map failures = new TreeMap<>(); + Map failures = new TreeMap<>(); try { long time = System.nanoTime(); test.processFiles(ext, recursive, outDir, post, inFiles, processedFiles, failures); diff --git a/test/ViewImageTest.java b/test/ViewImageTest.java index ff0c8986c..a80a7d588 100755 --- a/test/ViewImageTest.java +++ b/test/ViewImageTest.java @@ -1,9 +1,7 @@ import 
java.io.File; -import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.FileWriter; import java.io.IOException; -import java.io.InputStream; import java.io.PrintWriter; import java.net.URL; import java.util.ArrayList; @@ -12,6 +10,9 @@ import java.util.Map; import java.util.Map.Entry; import java.util.TreeMap; +import javax.imageio.ImageIO; +import javax.imageio.stream.ImageInputStream; + import net.yacy.cora.util.ConcurrentLog; import net.yacy.peers.graphics.EncodedImage; import net.yacy.server.serverObjects; @@ -52,24 +53,6 @@ public class ViewImageTest { /** Default output encoding format */ private static final String DEFAULT_OUT_EXT = "png"; - /** - * @param testFile - * file to load - * @return testFile content as a bytes array - * @throws IOException - * when an error occured while loading - */ - protected byte[] getBytes(File testFile) throws IOException { - InputStream inStream = new FileInputStream(testFile); - byte[] res = new byte[inStream.available()]; - try { - inStream.read(res); - } finally { - inStream.close(); - } - return res; - } - /** * @param args * main parameters. 
first item may contain input file or folder @@ -207,7 +190,7 @@ public class ViewImageTest { * @param processedFiles * all processed image files * @param failures - * map input file url which failed with eventual cause exception + * map input file url which failed with eventual cause error * @param time * total processing time in nanoseconds * @param outDir @@ -215,7 +198,7 @@ public class ViewImageTest { * @throws IOException * when a write error occured writing the results file */ - protected void displayResults(List processedFiles, Map failures, long time, File outDir) + protected void displayResults(List processedFiles, Map failures, long time, File outDir) throws IOException { PrintWriter resultsWriter = new PrintWriter(new FileWriter(new File(outDir, "results.txt"))); try { @@ -226,7 +209,7 @@ public class ViewImageTest { } else { writeMessage("Some input files could not be processed :", resultsWriter); } - for (Entry entry : failures.entrySet()) { + for (Entry entry : failures.entrySet()) { writeMessage(entry.getKey(), resultsWriter); if (entry.getValue() != null) { writeMessage("cause : " + entry.getValue(), resultsWriter); @@ -266,7 +249,7 @@ public class ViewImageTest { * when an read/write error occured */ protected void processFiles(String ext, boolean recursive, File outDir, serverObjects post, File[] inFiles, - List processedFiles, Map failures) throws IOException { + List processedFiles, Map failures) throws IOException { for (File inFile : inFiles) { if (inFile.isDirectory()) { if (recursive) { @@ -291,7 +274,7 @@ public class ViewImageTest { * @param inFile file image to process * @throws IOException when an read/write error occured */ - protected void processFile(String ext, File outDir, serverObjects post, Map failures, File inFile) + protected void processFile(String ext, File outDir, serverObjects post, Map failures, File inFile) throws IOException { /* Delete eventual previous result file */ File outFile = new File(outDir, inFile.getName() + 
"." + ext); @@ -299,13 +282,13 @@ public class ViewImageTest { outFile.delete(); } - byte[] resourceb = getBytes(inFile); + ImageInputStream inStream = ImageIO.createImageInputStream(inFile); String urlString = inFile.getAbsolutePath(); EncodedImage img = null; - Exception error = null; + Throwable error = null; try { - img = ViewImage.parseAndScale(post, true, urlString, ext, false, resourceb); - } catch (Exception e) { + img = ViewImage.parseAndScale(post, true, urlString, ext, inStream); + } catch (Throwable e) { error = e; } @@ -383,7 +366,7 @@ public class ViewImageTest { System.out.println("Rendered images will be written in dir : " + outDir.getAbsolutePath()); List processedFiles = new ArrayList(); - Map failures = new TreeMap<>(); + Map failures = new TreeMap<>(); try { long time = System.nanoTime(); test.processFiles(ext, recursive, outDir, post, inFiles, processedFiles, failures);