|
|
|
// StreamResponse.java
// ---------------------------
// Copyright 2017 by luccioman; https://github.com/luccioman
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
|
|
|
|
|
|
|
package net.yacy.crawler.retrieval;
|
|
|
|
|
|
|
|
import java.io.IOException;
|
|
|
|
import java.io.InputStream;
|
|
|
|
import java.nio.charset.StandardCharsets;
|
|
|
|
|
|
|
|
import net.yacy.cora.util.ConcurrentLog;
|
|
|
|
import net.yacy.document.Document;
|
|
|
|
import net.yacy.document.Parser;
|
|
|
|
import net.yacy.document.TextParser;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* A crawler load response, holding content as a stream.
|
|
|
|
*/
|
|
|
|
public class StreamResponse {
|
|
|
|
|
|
|
|
/** Logger */
|
|
|
|
private final static ConcurrentLog log = new ConcurrentLog(StreamResponse.class.getSimpleName());
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Content as a stream.
|
|
|
|
*/
|
|
|
|
private InputStream contentStream;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* The response details, including notably the request and response headers.
|
|
|
|
*/
|
|
|
|
private Response response;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* @param response
|
|
|
|
* contains the complete crawler response details
|
|
|
|
* @param contentStream
|
|
|
|
* an open input stream on the response content
|
|
|
|
* @throws IllegalArgumentException
|
|
|
|
* when response is null
|
|
|
|
*/
|
|
|
|
public StreamResponse(final Response response, final InputStream contentStream) {
|
|
|
|
if (response == null) {
|
|
|
|
throw new IllegalArgumentException("response parameter must not be null");
|
|
|
|
}
|
|
|
|
this.response = response;
|
|
|
|
this.contentStream = contentStream;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* @return the content stream. Don't forget to close it when processing is
|
|
|
|
* terminated.
|
|
|
|
*/
|
|
|
|
public InputStream getContentStream() {
|
|
|
|
return this.contentStream;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* @return the crawler response with complete details
|
|
|
|
*/
|
|
|
|
public Response getResponse() {
|
|
|
|
return this.response;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Parse and close the content stream and return the parsed documents when
|
|
|
|
* possible
|
|
|
|
*
|
|
|
|
* @return the parsed documents or null when an error occurred
|
|
|
|
* @throws Parser.Failure
|
|
|
|
* when no parser support the content
|
|
|
|
*/
|
|
|
|
public Document[] parse() throws Parser.Failure {
|
|
|
|
return parseWithLimits(Integer.MAX_VALUE, Long.MAX_VALUE);
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Parse and close the content stream and return the parsed documents when
|
|
|
|
* possible.<br>
|
|
|
|
* Try to limit the parser processing with a maximum total number of links
|
|
|
|
* detection (anchors, images links, media links...) or a maximum amount of
|
|
|
|
* content bytes to parse.<br>
|
|
|
|
* Limits apply only when the available parsers for the resource media type
|
|
|
|
* support parsing within limits (see
|
|
|
|
* {@link Parser#isParseWithLimitsSupported()}. When available parsers do
|
|
|
|
* not support parsing within limits, an exception is thrown when
|
|
|
|
* content size is beyond maxBytes.
|
|
|
|
*
|
|
|
|
* @param maxLinks
|
|
|
|
* the maximum total number of links to parse and add to the
|
|
|
|
* result documents
|
|
|
|
* @param maxBytes
|
|
|
|
* the maximum number of content bytes to process
|
|
|
|
* @return the parsed documents or null when an error occurred
|
|
|
|
* @throws Parser.Failure
|
|
|
|
* when no parser support the content, or an error occurred while parsing
|
|
|
|
*/
|
|
|
|
public Document[] parseWithLimits(final int maxLinks, final long maxBytes) throws Parser.Failure {
|
|
|
|
final String supportError = TextParser.supports(this.response.url(),
|
|
|
|
this.response.getResponseHeader() == null ? null : this.response.getResponseHeader().getContentType());
|
|
|
|
if (supportError != null) {
|
|
|
|
throw new Parser.Failure("no parser support:" + supportError, this.response.url());
|
|
|
|
}
|
|
|
|
try {
|
|
|
|
final String mimeType = this.response.getResponseHeader() == null ? null
|
|
|
|
: this.response.getResponseHeader().getContentType();
|
|
|
|
final String charsetName = this.response.getResponseHeader() == null ? StandardCharsets.UTF_8.name()
|
|
|
|
: this.response.getResponseHeader().getCharacterEncoding();
|
|
|
|
|
|
|
|
return TextParser.parseWithLimits(this.response.url(), mimeType, charsetName,
|
|
|
|
this.response.getRequest().timezoneOffset(), this.response.getRequest().depth(),
|
|
|
|
this.response.size(), this.contentStream, maxLinks, maxBytes);
|
|
|
|
} catch(Parser.Failure e) {
|
|
|
|
throw e;
|
|
|
|
}catch (final Exception e) {
|
|
|
|
return null;
|
|
|
|
} finally {
|
|
|
|
if (this.contentStream != null) {
|
|
|
|
try {
|
|
|
|
this.contentStream.close();
|
|
|
|
} catch (IOException ignored) {
|
|
|
|
log.warn("Could not close content stream on url " + this.response.url());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|