From a9cb083fa135819b56c98561d60f2ab39d2fd8f6 Mon Sep 17 00:00:00 2001
From: luccioman
Date: Fri, 2 Jun 2017 01:46:06 +0200
Subject: [PATCH] Improved consistency between loader openInputStream and load functions

---
 .../net/yacy/crawler/retrieval/FTPLoader.java      |  24 ++++
 .../yacy/crawler/retrieval/FileLoader.java         |  40 ++++--
 .../yacy/crawler/retrieval/HTTPLoader.java         |  34 +++--
 .../net/yacy/crawler/retrieval/Response.java       |  11 ++
 .../net/yacy/crawler/retrieval/SMBLoader.java      |  38 +++++-
 .../crawler/retrieval/StreamResponse.java          | 120 ++++++++++++++++++
 .../net/yacy/repository/LoaderDispatcher.java      |  80 +++++++++---
 .../net/yacy/visualization/ImageViewer.java        |   4 +-
 8 files changed, 300 insertions(+), 51 deletions(-)
 create mode 100644 source/net/yacy/crawler/retrieval/StreamResponse.java

diff --git a/source/net/yacy/crawler/retrieval/FTPLoader.java b/source/net/yacy/crawler/retrieval/FTPLoader.java
index 0d2dfeb17..c0994d0d7 100644
--- a/source/net/yacy/crawler/retrieval/FTPLoader.java
+++ b/source/net/yacy/crawler/retrieval/FTPLoader.java
@@ -27,6 +27,7 @@ package net.yacy.crawler.retrieval;
 
+import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.PrintStream;
@@ -166,6 +167,29 @@ public class FTPLoader {
         Latency.updateAfterLoad(request.url(), System.currentTimeMillis() - start);
         return response;
     }
+
+    /**
+     * Open a stream on the entry content from an FTP server
+     *
+     * @param request the request to process
+     * @param acceptOnlyParseable when true and no parser can be found to handle the detected MIME type, open a stream on the URL tokens
+     * @return a response with full meta data and embedding an open input stream on the content. Don't forget to close the stream.
+     */
+    public StreamResponse openInputStream(final Request request, final boolean acceptOnlyParseable) throws IOException {
+
+        final Response response = load(request, acceptOnlyParseable);
+        // TODO implement a true ftp content stream instead of a simple ByteArrayInputStream encapsulation
+        final StreamResponse streamResponse;
+        if (response.getContent() != null) {
+            streamResponse = new StreamResponse(response,
+                    new ByteArrayInputStream(response.getContent()));
+        } else {
+            /* content can be null when no parser can handle it: then return the URL tokens as content */
+            streamResponse = new StreamResponse(response,
+                    new ByteArrayInputStream(UTF8.getBytes(request.url().toTokens())));
+        }
+        return streamResponse;
+    }
 
     /**
      * @param ftpClient
diff --git a/source/net/yacy/crawler/retrieval/FileLoader.java b/source/net/yacy/crawler/retrieval/FileLoader.java
index 96495b470..5c07a5cd5 100644
--- a/source/net/yacy/crawler/retrieval/FileLoader.java
+++ b/source/net/yacy/crawler/retrieval/FileLoader.java
@@ -24,6 +24,7 @@ package net.yacy.crawler.retrieval;
 
+import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.ArrayList;
@@ -58,7 +59,31 @@ public class FileLoader {
         this.maxFileSize = (int) sb.getConfigLong("crawler.file.maxFileSize", -1l);
     }
 
+    /**
+     * Fully load the requested file into a byte buffer
+     *
+     * @param request the request to process
+     * @param acceptOnlyParseable when true and no parser can be found to handle the detected MIME type, the response content buffer contains only URL tokens
+     * @return a response with full meta data and embedding the content as a byte buffer
+     */
     public Response load(final Request request, boolean acceptOnlyParseable) throws IOException {
+        StreamResponse streamResponse = openInputStream(request, acceptOnlyParseable);
+
+        /* Fully read the stream and update the response */
+        byte[] content = FileUtils.read(streamResponse.getContentStream());
+        Response response = streamResponse.getResponse();
+        response.setContent(content);
+        return response;
+    }
+
+    /**
+     * Open a stream on the requested file
+     *
+     * @param request the request to process
+     * @param acceptOnlyParseable when true and no parser can be found to handle the detected MIME type, open a stream on the URL tokens
+     * @return a response with full meta data and embedding an open input stream on the content. Don't forget to close the stream.
+     */
+    public StreamResponse openInputStream(final Request request, final boolean acceptOnlyParseable) throws IOException {
         DigestURL url = request.url();
         if (!url.getProtocol().equals("file")) throw new IOException("wrong protocol for FileLoader: " + url.getProtocol());
@@ -93,9 +118,9 @@ public class FileLoader {
             responseHeader,
             profile,
             false,
-            UTF8.getBytes(content.toString()));
+            null);
 
-        return response;
+        return new StreamResponse(response, new ByteArrayInputStream(UTF8.getBytes(content.toString())));
     }
 
     // create response header
@@ -133,13 +158,12 @@ public class FileLoader {
             responseHeader,
             profile,
             false,
-            UTF8.getBytes(url.toTokens()));
-        return response;
+            null);
+        return new StreamResponse(response, new ByteArrayInputStream(UTF8.getBytes(url.toTokens())));
     }
 
     // load the resource
-    InputStream is = url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent);
-    byte[] b = FileUtils.read(is);
+    final InputStream is = url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent);
 
     // create response with loaded content
     final CrawlProfile profile = this.sb.crawler.get(ASCII.getBytes(request.profileHandle()));
@@ -149,7 +173,7 @@ public class FileLoader {
         responseHeader,
         profile,
         false,
-        b);
-    return response;
+        null);
+    return new StreamResponse(response, is);
     }
 }
diff --git a/source/net/yacy/crawler/retrieval/HTTPLoader.java b/source/net/yacy/crawler/retrieval/HTTPLoader.java
index cea02f278..d795be108 100644
--- a/source/net/yacy/crawler/retrieval/HTTPLoader.java
+++ b/source/net/yacy/crawler/retrieval/HTTPLoader.java
@@ -28,6 +28,7 @@
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 
+import org.apache.http.HttpStatus;
 import org.apache.http.StatusLine;
 
 import net.yacy.cora.document.id.DigestURL;
@@ -82,18 +83,19 @@ public final class HTTPLoader {
         return doc;
     }
 
-    /**
-     * Open input stream on a requested HTTP resource. When resource is small, fully load it and returns a ByteArrayInputStream instance.
+    /**
+     * Open an input stream on a requested HTTP resource. When the resource content size is small
+     * (lower than {@link Response#CRAWLER_MAX_SIZE_TO_CACHE}), fully load it and use a ByteArrayInputStream instance.
      * @param request
      * @param profile crawl profile
      * @param retryCount remaining redirect retries count
      * @param maxFileSize max file size to load. -1 means no limit.
      * @param blacklistType blacklist type to use
      * @param agent agent identifier
-     * @return an open input stream. Don't forget to close it.
-     * @throws IOException when an error occured
+     * @return a response with full meta data and embedding an open input stream on the content. Don't forget to close the stream.
+     * @throws IOException when an error occurred
      */
-    public InputStream openInputStream(final Request request, CrawlProfile profile, final int retryCount,
+    public StreamResponse openInputStream(final Request request, CrawlProfile profile, final int retryCount,
             final int maxFileSize, final BlacklistType blacklistType, final ClientIdentification.Agent agent)
             throws IOException {
         if (retryCount < 0) {
@@ -200,13 +202,14 @@ public final class HTTPLoader {
                 FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode);
             throw new IOException("REJECTED UNWANTED REDIRECTION '" + statusline + "' for URL '" + requestURLString + "'$");
-        } else if (statusCode == 200 || statusCode == 203) {
+        } else if (statusCode == HttpStatus.SC_OK || statusCode == HttpStatus.SC_NON_AUTHORITATIVE_INFORMATION) {
             // the transfer is ok
 
             /*
              * When content is not large (less than Response.CRAWLER_MAX_SIZE_TO_CACHE), we have better cache it if cache is enabled and url is not local
              */
             long contentLength = client.getHttpResponse().getEntity().getContentLength();
+            final InputStream contentStream;
             if (profile != null && profile.storeHTCache() && contentLength > 0 && contentLength < (Response.CRAWLER_MAX_SIZE_TO_CACHE) && !url.isLocal()) {
                 byte[] content = null;
                 try {
@@ -218,14 +221,17 @@ public final class HTTPLoader {
                     client.finish();
                 }
 
-                return new ByteArrayInputStream(content);
+                contentStream = new ByteArrayInputStream(content);
+            } else {
+                /*
+                 * Create a HTTPInputStream delegating to
+                 * client.getContentstream(). Close method will ensure client is
+                 * properly closed.
+                 */
+                contentStream = new HTTPInputStream(client);
             }
-            /*
-             * Returns a HTTPInputStream delegating to
-             * client.getContentstream(). Close method will ensure client is
-             * properly closed.
-             */
-            return new HTTPInputStream(client);
+
+            return new StreamResponse(new Response(request, requestHeader, responseHeader, profile, false, null), contentStream);
         } else {
             client.finish();
             // if the response has not the right response type then reject file
diff --git a/source/net/yacy/crawler/retrieval/Response.java b/source/net/yacy/crawler/retrieval/Response.java
index 2b7edc334..e4d1f41d0 100644
--- a/source/net/yacy/crawler/retrieval/Response.java
+++ b/source/net/yacy/crawler/retrieval/Response.java
@@ -225,10 +225,21 @@ public class Response {
     public void updateStatus(final int newStatus) {
         this.status = newStatus;
     }
+
+    /**
+     * @return the original request that produced this response
+     */
+    public Request getRequest() {
+        return this.request;
+    }
 
     public ResponseHeader getResponseHeader() {
         return this.responseHeader;
     }
+
+    public RequestHeader getRequestHeader() {
+        return this.requestHeader;
+    }
 
     public boolean fromCache() {
         return this.fromCache;
diff --git a/source/net/yacy/crawler/retrieval/SMBLoader.java b/source/net/yacy/crawler/retrieval/SMBLoader.java
index 79661ee71..4cf4d37d5 100644
--- a/source/net/yacy/crawler/retrieval/SMBLoader.java
+++ b/source/net/yacy/crawler/retrieval/SMBLoader.java
@@ -27,6 +27,7 @@ package net.yacy.crawler.retrieval;
 
+import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.net.MalformedURLException;
@@ -69,7 +70,31 @@ public class SMBLoader {
     }
 
+    /**
+     * Fully load the requested file into a byte buffer
+     *
+     * @param request the request to process
+     * @param acceptOnlyParseable when true and no parser can be found to handle the detected MIME type, the response content buffer contains only URL tokens
+     * @return a response with full meta data and embedding the content as a byte buffer
+     */
     public Response load(final Request request, boolean acceptOnlyParseable) throws IOException {
+        StreamResponse streamResponse = openInputStream(request, acceptOnlyParseable);
+
+        /* Fully read the stream and update the response */
+        byte[] content = FileUtils.read(streamResponse.getContentStream());
+        Response response = streamResponse.getResponse();
+        response.setContent(content);
+        return response;
+    }
+
+    /**
+     * Open a stream on the requested file
+     *
+     * @param request the request to process
+     * @param acceptOnlyParseable when true, do not open a stream on the content when no parser can be found to handle the detected MIME type
+     * @return a response with full meta data and embedding an open input stream on the content. Don't forget to close the stream.
+     */
+    public StreamResponse openInputStream(final Request request, final boolean acceptOnlyParseable) throws IOException {
         DigestURL url = request.url();
         if (!url.getProtocol().equals("smb")) throw new IOException("wrong loader for SMBLoader: " + url.getProtocol());
@@ -111,9 +136,9 @@ public class SMBLoader {
             responseHeader,
             profile,
             false,
-            UTF8.getBytes(content.toString()));
+            null);
 
-        return response;
+        return new StreamResponse(response, new ByteArrayInputStream(UTF8.getBytes(content.toString())));
     }
 
     // create response header
@@ -151,13 +176,12 @@ public class SMBLoader {
             responseHeader,
             profile,
             false,
-            url.toTokens().getBytes());
-        return response;
+            null);
+        return new StreamResponse(response, new ByteArrayInputStream(url.toTokens().getBytes()));
     }
 
     // load the resource
     InputStream is = url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent);
-    byte[] b = FileUtils.read(is);
 
     // create response with loaded content
     final CrawlProfile profile = this.sb.crawler.get(request.profileHandle().getBytes());
@@ -167,8 +191,8 @@ public class SMBLoader {
         responseHeader,
         profile,
         false,
-        b);
-    return response;
+        null);
+    return new StreamResponse(response, is);
     }
 
     public static void main(String[] args) {
diff --git a/source/net/yacy/crawler/retrieval/StreamResponse.java b/source/net/yacy/crawler/retrieval/StreamResponse.java
new file mode 100644
index 000000000..19a78501b
--- /dev/null
+++ b/source/net/yacy/crawler/retrieval/StreamResponse.java
@@ -0,0 +1,120 @@
+// StreamResponse.java
+// ---------------------------
+// Copyright 2017 by luccioman; https://github.com/luccioman
+//
+// This is a part of YaCy, a peer-to-peer based web search engine
+//
+// LICENSE
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+package net.yacy.crawler.retrieval;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+
+import net.yacy.cora.util.ConcurrentLog;
+import net.yacy.document.Document;
+import net.yacy.document.Parser;
+import net.yacy.document.TextParser;
+import net.yacy.document.VocabularyScraper;
+
+/**
+ * A crawler load response, holding content as a stream.
+ */
+public class StreamResponse {
+
+    /** Logger */
+    private final static ConcurrentLog log = new ConcurrentLog(StreamResponse.class.getSimpleName());
+
+    /**
+     * Content as a stream.
+     */
+    private InputStream contentStream;
+
+    /**
+     * The response details, notably including the request and response headers.
+     */
+    private Response response;
+
+    /**
+     * @param response
+     *            contains the complete crawler response details
+     * @param contentStream
+     *            an open input stream on the response content
+     * @throws IllegalArgumentException
+     *             when response is null
+     */
+    public StreamResponse(final Response response, final InputStream contentStream) {
+        if (response == null) {
+            throw new IllegalArgumentException("response parameter must not be null");
+        }
+        this.response = response;
+        this.contentStream = contentStream;
+    }
+
+    /**
+     * @return the content stream. Don't forget to close it when processing is
+     *         terminated.
+     */
+    public InputStream getContentStream() {
+        return this.contentStream;
+    }
+
+    /**
+     * @return the crawler response with complete details
+     */
+    public Response getResponse() {
+        return this.response;
+    }
+
+    /**
+     * Parse and close the content stream and return the parsed documents when
+     * possible
+     *
+     * @return the parsed documents or null when an error occurred
+     * @throws Parser.Failure
+     *             when no parser supports the content
+     */
+    public Document[] parse() throws Parser.Failure {
+        final String supportError = TextParser.supports(this.response.url(),
+                this.response.getResponseHeader() == null ? null : this.response.getResponseHeader().getContentType());
+        if (supportError != null) {
+            throw new Parser.Failure("no parser support:" + supportError, this.response.url());
+        }
+        try {
+            return TextParser.parseSource(this.response.url(),
+                    this.response.getResponseHeader() == null ? null
+                            : this.response.getResponseHeader().getContentType(),
+                    this.response.getResponseHeader() == null ? StandardCharsets.UTF_8.name()
+                            : this.response.getResponseHeader().getCharacterEncoding(),
+                    new VocabularyScraper(), this.response.getRequest().timezoneOffset(),
+                    this.response.getRequest().depth(), this.response.size(), this.contentStream);
+        } catch (final Exception e) {
+            return null;
+        } finally {
+            if (this.contentStream != null) {
+                try {
+                    this.contentStream.close();
+                } catch (IOException ignored) {
+                    log.warn("Could not close content stream on url " + this.response.url());
+                }
+            }
+        }
+
+    }
+
+}
diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java
index e8f04318f..32aa2b4d9 100644
--- a/source/net/yacy/repository/LoaderDispatcher.java
+++ b/source/net/yacy/repository/LoaderDispatcher.java
@@ -29,7 +29,6 @@ package net.yacy.repository;
 import java.io.ByteArrayInputStream;
 import java.io.File;
 import java.io.IOException;
-import java.io.InputStream;
 import java.net.MalformedURLException;
 import java.util.Arrays;
 import java.util.Date;
@@ -59,6 +58,7 @@ import net.yacy.crawler.retrieval.HTTPLoader;
 import net.yacy.crawler.retrieval.Request;
 import net.yacy.crawler.retrieval.Response;
 import net.yacy.crawler.retrieval.SMBLoader;
+import net.yacy.crawler.retrieval.StreamResponse;
 import net.yacy.document.Document;
 import net.yacy.document.Parser;
 import net.yacy.document.TextParser;
@@ -347,7 +347,7 @@ public final class LoaderDispatcher {
-     * @return an open ImageInputStream. Don't forget to close it once used!
+     * @return a response with full meta data and an open input stream on the content. Don't forget to close the stream once used!
      * @throws IOException when url is malformed, blacklisted, or CacheStrategy is CACHEONLY and content is unavailable
      */
-    private InputStream openInputStreamInternal(final Request request, CacheStrategy cacheStrategy, final int maxFileSize, final BlacklistType blacklistType, ClientIdentification.Agent agent) throws IOException {
+    private StreamResponse openInputStreamInternal(final Request request, CacheStrategy cacheStrategy, final int maxFileSize, final BlacklistType blacklistType, ClientIdentification.Agent agent) throws IOException {
         // get the protocol of the next URL
         final DigestURL url = request.url();
         if (url.isFile() || url.isSMB()) {
@@ -366,9 +366,9 @@ public final class LoaderDispatcher {
 
         // check if we have the page in the cache
         Response cachedResponse = loadFromCache(request, cacheStrategy, agent, url, crawlProfile);
-        if(cachedResponse != null) {
-            return new ByteArrayInputStream(cachedResponse.getContent());
-        }
+        if (cachedResponse != null) {
+            return new StreamResponse(cachedResponse, new ByteArrayInputStream(cachedResponse.getContent()));
+        }
 
         // check case where we want results from the cache exclusively, and never from the Internet (offline mode)
         if (cacheStrategy == CacheStrategy.CACHEONLY) {
@@ -389,20 +389,20 @@ public final class LoaderDispatcher {
         }
 
         // load resource from the internet
-        InputStream inStream = null;
+        StreamResponse response;
         if (protocol.equals("http") || protocol.equals("https")) {
-            inStream = this.httpLoader.openInputStream(request, crawlProfile, 1, maxFileSize, blacklistType, agent);
-        } else if (protocol.equals("ftp") || protocol.equals("smb") || protocol.equals("file")) {
-            // may also open directly stream with ftp loader
-            inStream = url.getInputStream(agent);
+            response = this.httpLoader.openInputStream(request, crawlProfile, 1, maxFileSize, blacklistType, agent);
+        } else if (protocol.equals("ftp")) {
+            response = this.ftpLoader.openInputStream(request, true);
+        } else if (protocol.equals("smb")) {
+            response = this.smbLoader.openInputStream(request, true);
+        } else if (protocol.equals("file")) {
+            response = this.fileLoader.openInputStream(request, true);
         } else {
             throw new IOException("Unsupported protocol '" + protocol + "' in url " + url);
         }
-        if (inStream == null) {
-            throw new IOException("Unable to open content stream");
-        }
-        return inStream;
+        return response;
     }
 
@@ -464,18 +464,18 @@ public final class LoaderDispatcher {
     }
 
     /**
-     * Open url as InputStream from the web or the cache
+     * Open a stream on the URL content, from the web or the cache
      * @param request must be not null
      * @param cacheStrategy cache strategy to use
     * @param blacklistType black list
      * @param agent agent identification for HTTP requests
-     * @return an open InputStream on content. Don't forget to close it once used.
+     * @return a response with full meta data and embedding an open input stream on the content. Don't forget to close the stream.
      * @throws IOException when url is malformed or blacklisted
      */
-    public InputStream openInputStream(final Request request, final CacheStrategy cacheStrategy,
+    public StreamResponse openInputStream(final Request request, final CacheStrategy cacheStrategy,
             BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
         final int maxFileSize = protocolMaxFileSize(request.url());
-        InputStream stream = null;
+        StreamResponse response;
 
         Semaphore check = this.loaderSteering.get(request.url());
         if (check != null && cacheStrategy != CacheStrategy.NOCACHE) {
@@ -493,9 +493,9 @@ public final class LoaderDispatcher {
             this.loaderSteering.put(request.url(), new Semaphore(0));
             try {
-                stream = openInputStreamInternal(request, cacheStrategy, maxFileSize, blacklistType, agent);
+                response = openInputStreamInternal(request, cacheStrategy, maxFileSize, blacklistType, agent);
             } catch(IOException ioe) {
-                /* Do not re encapsulate eventual IOException in an IOException */
+                /* Do not re-encapsulate an IOException in another IOException */
                 throw ioe;
             } catch (final Throwable e) {
                 throw new IOException(e);
@@ -507,7 +507,7 @@ public final class LoaderDispatcher {
             }
         }
 
-        return stream;
+        return response;
     }
 
     public Document[] loadDocuments(final Request request, final CacheStrategy cacheStrategy, final int maxFileSize, BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException, Parser.Failure {
@@ -554,6 +554,44 @@ public final class LoaderDispatcher {
             throw new IOException(e.getMessage());
         }
     }
+
+    /**
+     * Similar to the loadDocument method, but streams the resource content when possible instead of fully loading it into memory.
+     * @param location URL of the resource to load
+     * @param cachePolicy cache policy strategy
+     * @param blacklistType blacklist to use
+     * @param agent user agent identifier
+     * @return the parsed document or null when an error occurred while parsing
+     * @throws IOException when the content cannot be fetched or no parser supports it
+     */
+    public Document loadDocumentAsStream(final DigestURL location, final CacheStrategy cachePolicy, BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
+        // load resource
+        Request request = request(location, true, false);
+        final StreamResponse streamResponse = this.openInputStream(request, cachePolicy, blacklistType, agent);
+        final Response response = streamResponse.getResponse();
+        final DigestURL url = request.url();
+        if (response == null) throw new IOException("no Response for url " + url);
+
+        // if it is still not available, report an error
+        if (streamResponse.getContentStream() == null || response.getResponseHeader() == null) {
+            throw new IOException("no Content available for url " + url);
+        }
+
+        // parse resource
+        try {
+            Document[] documents = streamResponse.parse();
+            Document merged = Document.mergeDocuments(location, response.getMimeType(), documents);
+
+            String x_robots_tag = response.getResponseHeader().getXRobotsTag();
+            if (x_robots_tag.indexOf("noindex", 0) >= 0) {
+                merged.setIndexingDenied(true);
+            }
+
+            return merged;
+        } catch (final Parser.Failure e) {
+            throw new IOException(e.getMessage());
+        }
+    }
 
     /**
      * load all links from a resource
diff --git a/source/net/yacy/visualization/ImageViewer.java b/source/net/yacy/visualization/ImageViewer.java
index f701e72dd..1b886972b 100644
--- a/source/net/yacy/visualization/ImageViewer.java
+++ b/source/net/yacy/visualization/ImageViewer.java
@@ -44,6 +44,7 @@ import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.Domains;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.util.ConcurrentLog;
+import net.yacy.crawler.retrieval.StreamResponse;
 import net.yacy.data.InvalidURLLicenceException;
 import net.yacy.data.URLLicense;
 import net.yacy.http.servlets.TemplateMissingParameterException;
@@ -122,8 +123,9 @@ public class ImageViewer {
             String agentName = post.get("agentName", auth ? ClientIdentification.yacyIntranetCrawlerAgentName
                     : ClientIdentification.yacyInternetCrawlerAgentName);
             ClientIdentification.Agent agent = ClientIdentification.getAgent(agentName);
-            inStream = loader.openInputStream(loader.request(url, false, true), CacheStrategy.IFEXIST,
+            final StreamResponse response = loader.openInputStream(loader.request(url, false, true), CacheStrategy.IFEXIST,
                     BlacklistType.SEARCH, agent);
+            inStream = response.getContentStream();
         } catch (final IOException e) {
             /** No need to log full stack trace (in most cases resource is not available because of a network error) */
             ConcurrentLog.fine("ImageViewer", "cannot load image. URL : " + url.toNormalform(true));
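
Usage sketch (not part of the patch; the loader, url and agent variables are assumed to be
wired up as elsewhere in YaCy, and the blacklist type is chosen only for illustration):
callers that previously received a bare InputStream from LoaderDispatcher.openInputStream
now receive a StreamResponse bundling the stream with the crawler response meta data:

    // Stream and parse a resource without buffering it fully in memory.
    // StreamResponse.parse() closes the content stream itself when done.
    final StreamResponse streamResponse = loader.openInputStream(
            loader.request(url, true, false), CacheStrategy.IFEXIST,
            BlacklistType.SEARCH, agent);
    try {
        final Document[] documents = streamResponse.parse();
    } catch (final Parser.Failure e) {
        // thrown when no parser supports the detected MIME type
    }

    // When only the raw bytes are needed, read the stream directly and close
    // it explicitly, as the javadoc of openInputStream requires.
    final InputStream contentStream = loader.openInputStream(
            loader.request(url, true, false), CacheStrategy.IFEXIST,
            BlacklistType.SEARCH, agent).getContentStream();
    try {
        // consume contentStream incrementally here
    } finally {
        contentStream.close();
    }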