yacy_search_server/source/net/yacy/crawler/retrieval/StreamResponse.java

// StreamResponse.java
// ---------------------------
// Copyright 2017 by luccioman; https://github.com/luccioman
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

package net.yacy.crawler.retrieval;

import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;

import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;

/**
 * A crawler load response, holding content as a stream.
 */
public class StreamResponse {

	/** Logger */
	private final static ConcurrentLog log = new ConcurrentLog(StreamResponse.class.getSimpleName());

	/**
	 * Content as a stream.
	 */
	private InputStream contentStream;

	/**
	 * The response details, including notably the request and response headers.
	 */
	private Response response;

	/**
	 * @param response
	 *            contains the complete crawler response details
	 * @param contentStream
	 *            an open input stream on the response content
	 * @throws IllegalArgumentException
	 *             when response is null
	 */
	public StreamResponse(final Response response, final InputStream contentStream) {
		if (response == null) {
			throw new IllegalArgumentException("response parameter must not be null");
		}
		this.response = response;
		this.contentStream = contentStream;
	}

	/**
	 * @return the content stream. Don't forget to close it when processing is
	 *         terminated.
	 */
	public InputStream getContentStream() {
		return this.contentStream;
	}

	/**
	 * @return the crawler response with complete details
	 */
	public Response getResponse() {
		return this.response;
	}

	/**
	 * Parse and close the content stream and return the parsed documents when
	 * possible
	 * 
	 * @return the parsed documents or null when an error occurred
	 * @throws Parser.Failure
	 *             when no parser support the content
	 */
	public Document[] parse() throws Parser.Failure {
		final String supportError = TextParser.supports(this.response.url(),
				this.response.getResponseHeader() == null ? null : this.response.getResponseHeader().getContentType());
		if (supportError != null) {
			throw new Parser.Failure("no parser support:" + supportError, this.response.url());
		}
		try {
			return TextParser.parseSource(this.response.url(),
					this.response.getResponseHeader() == null ? null
							: this.response.getResponseHeader().getContentType(),
					this.response.getResponseHeader() == null ? StandardCharsets.UTF_8.name()
							: this.response.getResponseHeader().getCharacterEncoding(),
					new VocabularyScraper(), this.response.getRequest().timezoneOffset(),
					this.response.getRequest().depth(), this.response.size(), this.contentStream);
		} catch (final Exception e) {
			return null;
		} finally {
			if (this.contentStream != null) {
				try {
					this.contentStream.close();
				} catch (IOException ignored) {
					log.warn("Could not close content stream on url " + this.response.url());
				}
			}
		}

	}

}
Improved consistency between loader openInputStream and load functions 8 years ago			`// StreamResponse.java`
			`// ---------------------------`
			`// Copyright 2017 by luccioman; https://github.com/luccioman`
			`//`
			`// This is a part of YaCy, a peer-to-peer based web search engine`
			`//`
			`// LICENSE`
			`//`
			`// This program is free software; you can redistribute it and/or modify`
			`// it under the terms of the GNU General Public License as published by`
			`// the Free Software Foundation; either version 2 of the License, or`
			`// (at your option) any later version.`
			`//`
			`// This program is distributed in the hope that it will be useful,`
			`// but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`// GNU General Public License for more details.`
			`//`
			`// You should have received a copy of the GNU General Public License`
			`// along with this program; if not, write to the Free Software`
			`// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA`

			`package net.yacy.crawler.retrieval;`

			`import java.io.IOException;`
			`import java.io.InputStream;`
			`import java.nio.charset.StandardCharsets;`

			`import net.yacy.cora.util.ConcurrentLog;`
			`import net.yacy.document.Document;`
			`import net.yacy.document.Parser;`
			`import net.yacy.document.TextParser;`
			`import net.yacy.document.VocabularyScraper;`

			`/**`
			`* A crawler load response, holding content as a stream.`
			`*/`
			`public class StreamResponse {`

			`/** Logger */`
			`private final static ConcurrentLog log = new ConcurrentLog(StreamResponse.class.getSimpleName());`

			`/**`
			`* Content as a stream.`
			`*/`
			`private InputStream contentStream;`

			`/**`
			`* The response details, including notably the request and response headers.`
			`*/`
			`private Response response;`

			`/**`
			`* @param response`
			`* contains the complete crawler response details`
			`* @param contentStream`
			`* an open input stream on the response content`
			`* @throws IllegalArgumentException`
			`* when response is null`
			`*/`
			`public StreamResponse(final Response response, final InputStream contentStream) {`
			`if (response == null) {`
			`throw new IllegalArgumentException("response parameter must not be null");`
			`}`
			`this.response = response;`
			`this.contentStream = contentStream;`
			`}`

			`/**`
			`* @return the content stream. Don't forget to close it when processing is`
			`* terminated.`
			`*/`
			`public InputStream getContentStream() {`
			`return this.contentStream;`
			`}`

			`/**`
			`* @return the crawler response with complete details`
			`*/`
			`public Response getResponse() {`
			`return this.response;`
			`}`

			`/**`
			`* Parse and close the content stream and return the parsed documents when`
			`* possible`
			`*`
			`* @return the parsed documents or null when an error occurred`
			`* @throws Parser.Failure`
			`* when no parser support the content`
			`*/`
			`public Document[] parse() throws Parser.Failure {`
			`final String supportError = TextParser.supports(this.response.url(),`
			`this.response.getResponseHeader() == null ? null : this.response.getResponseHeader().getContentType());`
			`if (supportError != null) {`
			`throw new Parser.Failure("no parser support:" + supportError, this.response.url());`
			`}`
			`try {`
			`return TextParser.parseSource(this.response.url(),`
			`this.response.getResponseHeader() == null ? null`
			`: this.response.getResponseHeader().getContentType(),`
			`this.response.getResponseHeader() == null ? StandardCharsets.UTF_8.name()`
			`: this.response.getResponseHeader().getCharacterEncoding(),`
			`new VocabularyScraper(), this.response.getRequest().timezoneOffset(),`
			`this.response.getRequest().depth(), this.response.size(), this.contentStream);`
			`} catch (final Exception e) {`
			`return null;`
			`} finally {`
			`if (this.contentStream != null) {`
			`try {`
			`this.contentStream.close();`
			`} catch (IOException ignored) {`
			`log.warn("Could not close content stream on url " + this.response.url());`
			`}`
			`}`
			`}`

			`}`

			`}`