// GenericXMLParser.java
// ---------------------------
// Copyright 2017 by luccioman; https://github.com/luccioman
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

package net.yacy.document.parser;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import javax.naming.SizeLimitExceededException;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.apache.commons.fileupload.util.LimitedInputStream;
import org.apache.commons.io.input.XmlStreamReader;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.xml.GenericXMLContentHandler;
import net.yacy.kelondro.io.CharBuffer;
import net.yacy.kelondro.util.Formatter;
import net.yacy.kelondro.util.MemoryControl;

/**
 * A generic XML parser without knowledge of the specific XML vocabulary :
 * it extracts the plain text content and the URLs detected in the document.
 * @author luccioman
 *
 */
public class GenericXMLParser extends AbstractParser implements Parser {
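
    /*
     * Usage sketch (illustrative only, not part of the YaCy API contract) : this parser is normally
     * invoked by the YaCy document framework, but a direct call could look like the following.
     * The URL and file name are hypothetical examples.
     *
     * GenericXMLParser parser = new GenericXMLParser();
     * try (InputStream source = new java.io.FileInputStream("example.xml")) {
     *     Document[] docs = parser.parse(new DigestURL("http://example.org/example.xml"),
     *             "text/xml", null, new VocabularyScraper(), 0, source);
     * }
     */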

    /** SAX parser instance local to each thread : SAXParser is not thread-safe, so each thread lazily creates and reuses its own instance */
    private static final ThreadLocal<SAXParser> tlSax = new ThreadLocal<SAXParser>();

    /**
     * @return a SAXParser instance for the current thread
     * @throws SAXException when an error prevented parser creation
     */
    private static SAXParser getParser() throws SAXException {
        SAXParser parser = tlSax.get();
        if (parser == null) {
            try {
                parser = SAXParserFactory.newInstance().newSAXParser();
            } catch (final ParserConfigurationException e) {
                throw new SAXException(e.getMessage(), e);
            }
            tlSax.set(parser);
        }
        return parser;
    }

    public GenericXMLParser() {
        super("XML Parser");
        this.SUPPORTED_EXTENSIONS.add("xml");
        this.SUPPORTED_MIME_TYPES.add("application/xml");
        this.SUPPORTED_MIME_TYPES.add("text/xml");
    }

    @Override
    public Document[] parse(
            final DigestURL location,
            final String mimeType,
            final String charset,
            final VocabularyScraper scraper,
            final int timezoneOffset,
            final InputStream source)
            throws Failure {

        /* Limit the size of the in-memory buffer to at most 25% of the available memory :
         * some headroom is needed because, before the buffer can be garbage collected, its content is converted to a String and then to a byte array.
         * Any stricter limit should be enforced by the caller (see for example the crawler.[protocol].maxFileSize configuration setting). */
        final long availableMemory = MemoryControl.available();
        final long maxBytes = (long)(availableMemory * 0.25);
        final int maxChars;
        if((maxBytes / Character.BYTES) > Integer.MAX_VALUE) {
            maxChars = Integer.MAX_VALUE;
        } else {
            maxChars = ((int)maxBytes) / Character.BYTES;
        }
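
        /* Illustrative figures (Character.BYTES == 2) : with 1 GiB of available memory,
         * maxBytes is 256 MiB and maxChars about 134 million characters. */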

        try (/* Automatically closed by this try-with-resources statement */ CharBuffer writer = new CharBuffer(maxChars);) {

            /* Use the commons-io XmlStreamReader detection rules to help determine the charset when the source contains no BOM or XML declaration
             * (the detection algorithm notably also takes into account the Content-Type transmitted in HTTP headers, possibly available here through the mimeType and charset parameters). */
            final XmlStreamReader reader = new XmlStreamReader(source, mimeType, true, charset);
            final InputSource saxSource = new InputSource(reader);
            final String detectedCharset = reader.getEncoding();

            final List<AnchorURL> detectedURLs = new ArrayList<>();

            final GenericXMLContentHandler saxHandler = new GenericXMLContentHandler(writer, detectedURLs);
            final SAXParser saxParser = getParser();
            saxParser.parse(saxSource, saxHandler);

            if (writer.isOverflow()) {
                throw new Parser.Failure("Not enough memory available for the generic XML parser : "
                        + Formatter.bytesToString(availableMemory), location);
            }

            /* Create the parsed document */
            final byte[] contentBytes = UTF8.getBytes(writer.toString());
            final Document[] docs = new Document[] { new Document(location, mimeType, detectedCharset, this, null, null, null, null, "",
                    null, null, 0.0d, 0.0d, contentBytes, detectedURLs, null, null, false, new Date()) };
            return docs;
        } catch (final Parser.Failure e) {
            throw e;
        } catch (final Exception e) {
            throw new Parser.Failure("Unexpected error while parsing XML file. " + e.getMessage(), location);
        }

    }

    @Override
    public boolean isParseWithLimitsSupported() {
        return true;
    }

    /**
     * {@inheritDoc}
     * @param maxBytes the maximum number of content bytes to process. Be careful with too small values :
     * a Failure exception can be thrown when maxBytes is so small that the parser cannot even fill its buffers from the input stream and parse the document declaration.
     */
    @Override
    public Document[] parseWithLimits(DigestURL location, String mimeType, String charsetName, VocabularyScraper scraper,
            int timezoneOffset, InputStream source, int maxLinks, long maxBytes)
            throws Failure, InterruptedException, UnsupportedOperationException {
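
        /* Illustrative call (hypothetical values) : parseWithLimits(location, "text/xml", null, scraper, 0, source, 100, 10000L)
         * would stop once 100 links have been collected or 10 000 bytes have been read, and flag the resulting document as partially parsed. */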

        /* Limit the size of the in-memory buffer to at most 25% of the available memory :
         * some headroom is needed because, before the buffer can be garbage collected, its content is converted to a String and then to a byte array.
         * Any stricter limit should be enforced by the caller (see for example the crawler.[protocol].maxFileSize configuration setting). */
        final long availableMemory = MemoryControl.available();
        final long maxTextBytes = (long)(availableMemory * 0.25);
        final int maxChars;
        if((maxTextBytes / Character.BYTES) > Integer.MAX_VALUE) {
            maxChars = Integer.MAX_VALUE;
        } else {
            maxChars = ((int)maxTextBytes) / Character.BYTES;
        }

        try (/* Automatically closed by this try-with-resources statement */ CharBuffer writer = new CharBuffer(maxChars);) {

            final Set<AnchorURL> detectedURLs = new HashSet<>();
            final GenericXMLContentHandler saxHandler = new GenericXMLContentHandler(writer, detectedURLs, maxLinks);

            final InputStream limitedSource = new LimitedInputStream(source, maxBytes) {

                @Override
                protected void raiseError(long pSizeMax, long pCount) throws IOException {
                    throw new IOException(new SizeLimitExceededException("Reached maximum bytes to parse : " + maxBytes));
                }
            };
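
            /* When the byte limit above is reached, the IOException carries a SizeLimitExceededException cause :
             * it is caught further below and only used to flag the document as partially parsed. */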

            /* Use the commons-io XmlStreamReader detection rules to help determine the charset when the source contains no BOM or XML declaration
             * (the detection algorithm notably also takes into account the Content-Type transmitted in HTTP headers, possibly available here through the mimeType and charsetName parameters). */
            final XmlStreamReader reader = new XmlStreamReader(limitedSource, mimeType, true, charsetName);
            final InputSource saxSource = new InputSource(reader);
            final String detectedCharset = reader.getEncoding();

            final SAXParser saxParser = getParser();
            boolean limitExceeded = false;
            try {
                saxParser.parse(saxSource, saxHandler);
            } catch (final SAXException | IOException e) {
                if (!(e.getCause() instanceof SizeLimitExceededException)) {
                    /* Only propagate to the upper layer exceptions that are not caused by the maxLinks or maxBytes limits being reached */
                    throw e;
                }
                limitExceeded = true;
            }

            /* Create the parsed document with empty text content */
            final Document[] docs = new Document[] { new Document(location, mimeType, detectedCharset, this, null, null, null, null, "",
                    null, null, 0.0d, 0.0d, new byte[0], detectedURLs, null, null, false, new Date()) };
            docs[0].setPartiallyParsed(limitExceeded);
            return docs;
        } catch (final Exception e) {
            throw new Parser.Failure("Unexpected error while parsing XML file. " + e.getMessage(), location);
        }
    }

}