yacy_search_server/source/net/yacy/crawler/retrieval/StreamResponse.java

// StreamResponse.java
// ---------------------------
// Copyright 2017 by luccioman; https://github.com/luccioman
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

package net.yacy.crawler.retrieval;

import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;

import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;

/**
 * A crawler load response, holding content as a stream.
 */
public class StreamResponse {

    /** Logger */
    private final static ConcurrentLog log = new ConcurrentLog(StreamResponse.class.getSimpleName());

    /**
     * Content as a stream.
     */
    private InputStream contentStream;

    /**
     * The response details, including notably the request and response headers.
     */
    private Response response;

    /**
     * @param response
     *            contains the complete crawler response details
     * @param contentStream
     *            an open input stream on the response content
     * @throws IllegalArgumentException
     *             when response is null
     */
    public StreamResponse(final Response response, final InputStream contentStream) {
        if (response == null) {
            throw new IllegalArgumentException("response parameter must not be null");
        }
        this.response = response;
        this.contentStream = contentStream;
    }

    /**
     * @return the content stream; callers are responsible for closing it when
     *         processing is finished
     */
    public InputStream getContentStream() {
        return this.contentStream;
    }

    /**
     * @return the crawler response with complete details
     */
    public Response getResponse() {
        return this.response;
    }

    /**
     * Parse the content stream, closing it afterwards, and return the parsed
     * documents when possible.
     *
     * @return the parsed documents or null when an error occurred
     * @throws Parser.Failure
     *             when no parser supports the content
     */
    public Document[] parse() throws Parser.Failure {
        final String supportError = TextParser.supports(this.response.url(),
                this.response.getResponseHeader() == null ? null : this.response.getResponseHeader().getContentType());
        if (supportError != null) {
            throw new Parser.Failure("no parser support: " + supportError, this.response.url());
        }
        try {
            /* When no response header is available, fall back to a null content type and UTF-8 encoding */
            return TextParser.parseSource(this.response.url(),
                    this.response.getResponseHeader() == null ? null
                            : this.response.getResponseHeader().getContentType(),
                    this.response.getResponseHeader() == null ? StandardCharsets.UTF_8.name()
                            : this.response.getResponseHeader().getCharacterEncoding(),
                    new VocabularyScraper(), this.response.getRequest().timezoneOffset(),
                    this.response.getRequest().depth(), this.response.size(), this.contentStream);
        } catch (final Exception e) {
            log.warn("Could not parse content on url " + this.response.url() + " : " + e.getMessage());
            return null;
        } finally {
            /* Close the content stream whether parsing succeeded or not */
            if (this.contentStream != null) {
                try {
                    this.contentStream.close();
                } catch (final IOException ignored) {
                    log.warn("Could not close content stream on url " + this.response.url());
                }
            }
        }
    }

}
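
Below is a minimal usage sketch, not part of the YaCy sources: it only exercises the constructor and parse() defined above, and it assumes a Response and an open content InputStream have already been obtained elsewhere (for example by a loader). The helper class and method names in the sketch are hypothetical.

import java.io.InputStream;

import net.yacy.crawler.retrieval.Response;
import net.yacy.crawler.retrieval.StreamResponse;
import net.yacy.document.Document;
import net.yacy.document.Parser;

/** Hypothetical caller illustrating StreamResponse usage; not part of the repository. */
public class StreamResponseUsageSketch {

    /**
     * @param response a crawler response obtained elsewhere
     * @param contentStream an open stream on the response body
     * @return the number of parsed documents, or -1 when parsing was not possible
     */
    public static int countParsedDocuments(final Response response, final InputStream contentStream) {
        final StreamResponse streamResponse = new StreamResponse(response, contentStream);
        try {
            /* parse() closes the content stream itself, whatever the outcome */
            final Document[] documents = streamResponse.parse();
            return documents == null ? -1 : documents.length;
        } catch (final Parser.Failure e) {
            /* Thrown when no parser supports the content type of the response */
            return -1;
        }
    }
}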