yacy_search_server/source/net/yacy/document/parser/GenericXMLParser.java

// GenericXMLParser.java
// ---------------------------
// Copyright 2017 by luccioman; https://github.com/luccioman
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.document.parser;

import java.io.InputStream;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import javax.naming.SizeLimitExceededException;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.apache.commons.io.input.XmlStreamReader;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.util.StreamLimitException;
import net.yacy.cora.util.StrictLimitInputStream;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.xml.GenericXMLContentHandler;
import net.yacy.kelondro.io.CharBuffer;
import net.yacy.kelondro.util.Formatter;
import net.yacy.kelondro.util.MemoryControl;

/**
* A generic XML parser without knowledge of the specific XML vocabulary.
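* <p>Illustrative usage sketch (not part of the original source; the example URL and the
* surrounding error handling are assumptions):</p>
* <pre>{@code
* GenericXMLParser parser = new GenericXMLParser();
* try (InputStream source = new FileInputStream("sample.xml")) {
*     Document[] docs = parser.parse(new DigestURL("http://example.org/sample.xml"),
*             "application/xml", null, new VocabularyScraper(), 0, source);
*     // docs[0] holds the text extracted from the XML and the URLs detected in its content
* } catch (final Parser.Failure | IOException e) {
*     // handle malformed XML, memory shortage or I/O problems
* }
* }</pre>
*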
* @author luccioman
*
*/
public class GenericXMLParser extends AbstractParser implements Parser {

    /** SAX parser instance local to each thread */
    private static final ThreadLocal<SAXParser> tlSax = new ThreadLocal<SAXParser>();

    /**
     * @return a SAXParser instance for the current thread
     * @throws SAXException when an error prevented parser creation
     */
    private static SAXParser getParser() throws SAXException {
        SAXParser parser = tlSax.get();
        if (parser == null) {
            try {
                parser = SAXParserFactory.newInstance().newSAXParser();
            } catch (final ParserConfigurationException e) {
                throw new SAXException(e.getMessage(), e);
            }
            tlSax.set(parser);
        }
        return parser;
    }

    public GenericXMLParser() {
        super("XML Parser");
        this.SUPPORTED_EXTENSIONS.add("xml");
        this.SUPPORTED_MIME_TYPES.add("application/xml");
        this.SUPPORTED_MIME_TYPES.add("text/xml");
    }

    @Override
    public Document[] parse(
            final DigestURL location,
            final String mimeType,
            final String charset,
            final VocabularyScraper scraper,
            final int timezoneOffset,
            final InputStream source)
            throws Failure {

        /* Limit the size of the in-memory buffer to at most 25% of the available memory:
         * some headroom is needed because, before being garbage collected, the buffer will be converted to a String and then to a byte array.
         * Any stricter limit should be handled by the caller (see for example the crawler.[protocol].maxFileSize configuration setting). */
        final long availableMemory = MemoryControl.available();
        final long maxBytes = (long)(availableMemory * 0.25);
        final int maxChars;
        if ((maxBytes / Character.BYTES) > Integer.MAX_VALUE) {
            maxChars = Integer.MAX_VALUE;
        } else {
            maxChars = ((int)maxBytes) / Character.BYTES;
        }
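
        /* Sizing example (illustrative figures, not measured values): with 2 GiB of memory reported
         * as available by MemoryControl, maxBytes is 512 MiB and, since Character.BYTES is 2,
         * maxChars is 256 * 1024 * 1024 characters for the in-memory buffer. */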
        try (/* Automatically closed by this try-with-resources statement */ CharBuffer writer = new CharBuffer(maxChars)) {
            /* Use the commons-io XmlStreamReader advanced rules to help with charset detection when the source contains no BOM or XML declaration
             * (the detection algorithm notably also includes the Content-Type transmitted by HTTP headers, possibly present here as the mimeType and charset parameters). */
            final XmlStreamReader reader = new XmlStreamReader(source, mimeType, true, charset);
            final InputSource saxSource = new InputSource(reader);
            final String detectedCharset = reader.getEncoding();

            final List<AnchorURL> detectedURLs = new ArrayList<>();
            final GenericXMLContentHandler saxHandler = new GenericXMLContentHandler(writer, detectedURLs);

            final SAXParser saxParser = getParser();
            saxParser.parse(saxSource, saxHandler);

            if (writer.isOverflow()) {
                throw new Parser.Failure("Not enough memory available for the generic XML parser: "
                        + Formatter.bytesToString(availableMemory), location);
            }

            /* create the parsed document */
            Document[] docs = null;
            final byte[] contentBytes = UTF8.getBytes(writer.toString());
            docs = new Document[] { new Document(location, mimeType, detectedCharset, this, null, null, null, null, "",
                    null, null, 0.0d, 0.0d, contentBytes, detectedURLs, null, null, false, new Date()) };
            return docs;
        } catch (final Parser.Failure e) {
            throw e;
        } catch (final Exception e) {
            throw new Parser.Failure("Unexpected error while parsing XML file. " + e.getMessage(), location);
        }
    }

    @Override
    public boolean isParseWithLimitsSupported() {
        return true;
    }

    /**
     * {@inheritDoc}
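     * <p>Illustrative call sketch (not part of the original source; the URL, the limit values and
     * the error handling shown here are assumptions):</p>
     * <pre>{@code
     * try (InputStream source = new FileInputStream("large.xml")) {
     *     Document[] docs = new GenericXMLParser().parseWithLimits(
     *             new DigestURL("http://example.org/large.xml"), "text/xml", null,
     *             new VocabularyScraper(), 0, source, 100, 10 * 1024 * 1024);
     *     // docs[0] is flagged as partially parsed when the maxLinks or maxBytes limit was reached
     * } catch (final Parser.Failure | InterruptedException | IOException e) {
     *     // handle parsing, interruption or I/O errors
     * }
     * }</pre>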
     * @param maxBytes the maximum number of content bytes to process. Be careful with too small values:
     *            a Failure exception may be thrown when the maxBytes value is so small that the parser cannot even fill its buffers on the input stream and parse the document declaration.
     */
    @Override
    public Document[] parseWithLimits(DigestURL location, String mimeType, String charsetName, VocabularyScraper scraper,
            int timezoneOffset, InputStream source, int maxLinks, long maxBytes)
            throws Failure, InterruptedException, UnsupportedOperationException {

        /* Limit the size of the in-memory buffer to at most 25% of the available memory:
         * some headroom is needed because, before being garbage collected, the buffer will be converted to a String and then to a byte array.
         * Any stricter limit should be handled by the caller (see for example the crawler.[protocol].maxFileSize configuration setting). */
        final long availableMemory = MemoryControl.available();
        final long maxTextBytes = (long)(availableMemory * 0.25);
        final int maxChars;
        if ((maxTextBytes / Character.BYTES) > Integer.MAX_VALUE) {
            maxChars = Integer.MAX_VALUE;
        } else {
            maxChars = ((int)maxTextBytes) / Character.BYTES;
        }

        try (/* Automatically closed by this try-with-resources statement */ CharBuffer writer = new CharBuffer(maxChars)) {
            final Set<AnchorURL> detectedURLs = new HashSet<>();
            final GenericXMLContentHandler saxHandler = new GenericXMLContentHandler(writer, detectedURLs, maxLinks);
            StrictLimitInputStream limitedSource = new StrictLimitInputStream(source, maxBytes);

            /* Use the commons-io XmlStreamReader advanced rules to help with charset detection when the source contains no BOM or XML declaration
             * (the detection algorithm notably also includes the Content-Type transmitted by HTTP headers, possibly present here as the mimeType and charsetName parameters). */
            final XmlStreamReader reader = new XmlStreamReader(limitedSource, mimeType, true, charsetName);
            final InputSource saxSource = new InputSource(reader);
            final String detectedCharset = reader.getEncoding();

            final SAXParser saxParser = getParser();
            boolean limitExceeded = false;
            try {
                saxParser.parse(saxSource, saxHandler);
            } catch (final SAXException e) {
                if (!(e.getCause() instanceof SizeLimitExceededException)) {
                    /* Only transmit to the upper layer exceptions that are not caused by the maxLinks limit being reached */
                    throw e;
                }
                limitExceeded = true;
            } catch (final StreamLimitException e) {
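                /* The maxBytes limit set on the wrapped input stream was reached:
                 * keep the content parsed so far and mark the resulting document as partially parsed */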
                limitExceeded = true;
            }

            if (writer.isOverflow()) {
                throw new Parser.Failure("Not enough memory available for the generic XML parser: "
                        + Formatter.bytesToString(availableMemory), location);
            }

            /* Create the parsed document with possibly only a partial version of the text and links */
            final byte[] contentBytes = UTF8.getBytes(writer.toString());
            Document[] docs = new Document[] { new Document(location, mimeType, detectedCharset, this, null, null, null, null, "",
                    null, null, 0.0d, 0.0d, contentBytes, detectedURLs, null, null, false, new Date()) };
            docs[0].setPartiallyParsed(limitExceeded);
            return docs;
        } catch (final Exception e) {
            throw new Parser.Failure("Unexpected error while parsing XML file. " + e.getMessage(), location);
        }
    }
}