Added partial bzip2 stream parsing support and a bzipParser JUnit test

8 years ago · 32c9dfa768
parent dd9cb06d25
commit 32c9dfa768
5 changed files with 366 additions and 24 deletions
--- a/source/net/yacy/document/parser/bzipParser.java
+++ b/source/net/yacy/document/parser/bzipParser.java
@ -31,8 +31,12 @@ import java.io.File;
 import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.net.MalformedURLException;
 import java.util.Date;
 import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
 import org.apache.commons.compress.compressors.bzip2.BZip2Utils;
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.document.id.MultiProtocolURL;
 import net.yacy.document.AbstractParser;
@ -42,15 +46,12 @@ import net.yacy.document.TextParser;
 import net.yacy.document.VocabularyScraper;
 import net.yacy.kelondro.util.FileUtils;
 import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
 import org.apache.commons.compress.compressors.bzip2.BZip2Utils;
 /**
 * Parses a bz2 archive.
 * Unzips and parses the content and adds it to the created main document
 */
 public class bzipParser extends AbstractParser implements Parser {
-
+	
    public bzipParser() {
        super("Bzip 2 UNIX Compressed File Parser");
        this.SUPPORTED_EXTENSIONS.add("bz2");
@ -117,27 +118,8 @@ public class bzipParser extends AbstractParser implements Parser {
        	}
        }
        try {
            final String filename = location.getFileName();
             // create maindoc for this bzip container, register with supplied url & mime
-            maindoc = new Document(
+            maindoc = createMainDocument(location, mimeType, charset, this);
                    location,
                    mimeType,
                    charset,
                    this,
                    null,
                    null,
                    AbstractParser.singleList(filename.isEmpty() ? location.toTokens() : MultiProtocolURL.unescape(filename)), // title
                    null,
                    null,
                    null,
                    null,
                    0.0d, 0.0d,
                    (Object) null,
                    null,
                    null,
                    null,
                    false,
                    new Date());
            // creating a new parser class to parse the unzipped content
            final String contentfilename = BZip2Utils.getUncompressedFilename(location.getFileName());
            final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename));
@ -153,4 +135,112 @@ public class bzipParser extends AbstractParser implements Parser {
        }
        return maindoc == null ? null : new Document[]{maindoc};
    }
    /**
     * Signals that this parser provides its own parseWithLimits() implementation.
     * @return always true
     */
    @Override
    public boolean isParseWithLimitsSupported() {
        return true;
    }
    /**
     * Builds the main document standing for a bzip archive as a whole.
     * Its title is the unescaped file name of the URL, or the URL tokens when the
     * URL has an empty file name. The document date is the creation time of the instance.
     * @param location the parsed resource URL
     * @param mimeType the media type of the resource
     * @param charset the charset name if known
     * @param parser the bzipParser instance registered as the parser origin of the document
     * @return a new Document instance
     */
 	public static Document createMainDocument(final DigestURL location, final String mimeType, final String charset, final bzipParser parser) {
 		final String archiveName = location.getFileName();
 		/* Fall back on the URL tokens when the URL carries no file name */
 		final String title;
 		if (archiveName.isEmpty()) {
 			title = location.toTokens();
 		} else {
 			title = MultiProtocolURL.unescape(archiveName);
 		}
 		return new Document(
 				location,
 				mimeType,
 				charset,
 				parser,
 				null, null,
 				AbstractParser.singleList(title), // title
 				null, null, null, null,
 				0.0d, 0.0d,
 				(Object) null,
 				null, null, null,
 				false,
 				new Date());
 	}
 	/**
 	 * Parse content in an open stream uncompressing on the fly a bzipped resource.
 	 * The content is handed to TextParser.parseWithLimits() under a derived URL whose
 	 * file name is the uncompressed file name computed by BZip2Utils, and whose media
 	 * type is guessed from the extension of that uncompressed file name.
 	 * @param location the URL of the bzipped resource
 	 * @param charset the charset name if known
 	 * @param timezoneOffset the local time zone offset
 	 * @param depth forwarded unchanged to TextParser.parseWithLimits()
 	 * @param compressedInStream an open stream uncompressing on the fly the compressed content
 	 * @param maxLinks
 	 *            the maximum total number of links to parse and add to the
 	 *            result documents
 	 * @param maxBytes
 	 *            the maximum number of content bytes to process
 	 * @return a list of documents that result from parsing the source, with
 	 *         empty or null text.
 	 * @throws Parser.Failure
 	 *             when the parser processing failed, or when the URL of the
 	 *             uncompressed content could not be built
 	 */
 	public Document[] parseCompressedInputStream(final DigestURL location, final String charset, final int timezoneOffset, final int depth,
 			final InputStream compressedInStream, final int maxLinks, final long maxBytes) throws Failure {
 		// select the parser of the unzipped content from the uncompressed file name
 		final String compressedFileName = location.getFileName();
 		final String contentfilename = BZip2Utils.getUncompressedFilename(compressedFileName);
 		final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename));
 		try {
 			/* Use the uncompressed file name for sub parsers to not unnecessarily use again the bzipParser */
 			final String locationPath = location.getPath();
 			/* Replace the trailing compressed file name of the path by the uncompressed one */
 			final String contentPath = locationPath.substring(0, locationPath.length() - compressedFileName.length()) + contentfilename;
 			final DigestURL contentLocation = new DigestURL(location.getProtocol(), location.getHost(), location.getPort(), contentPath);
 			/* Rely on the supporting parsers to respect the maxLinks and maxBytes limits on compressed content */
 			/* NOTE(review): the -1 argument is presumably an unknown content length - confirm against TextParser.parseWithLimits() */
 			return TextParser.parseWithLimits(contentLocation, mime, charset, timezoneOffset, depth, -1, compressedInStream, maxLinks, maxBytes);
 		} catch (MalformedURLException e) {
 			/* Message aligned with the other failures of this parser : this is a bzip, not a gzip file */
 			throw new Parser.Failure("Unexpected error while parsing bzip file. " + e.getMessage(), location);
 		}
 	}
    /**
     * Parses a bzipped resource while uncompressing it on the fly, within the given limits.
     * The result is a single main document for the archive, to which the documents parsed
     * from the uncompressed content are added as sub documents. The main document is marked
     * as partially parsed when the first sub document is.
     * @param location the URL of the bzipped resource
     * @param mimeType the media type of the bzipped resource
     * @param charset the charset name if known, used for the main document only
     * @param scraper not used by this implementation
     * @param timezoneOffset the local time zone offset
     * @param source an open stream on the compressed content
     * @param maxLinks the maximum total number of links to parse
     * @param maxBytes the maximum number of content bytes to process
     * @return an array holding the main document
     * @throws Parser.Failure when the stream is not a bzip2 stream or when parsing failed
     */
    @Override
    public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String charset, final VocabularyScraper scraper,
    		final int timezoneOffset, final InputStream source, final int maxLinks, final long maxBytes)
    		throws Parser.Failure {
        final BZip2CompressorInputStream zippedContent;
        try {
            // BZip2CompressorInputStream checks filecontent (magic start-bytes "BZh") and throws ioexception if no match
            zippedContent = new BZip2CompressorInputStream(source);
        } catch (final Exception e) {
            throw new Parser.Failure("Unexpected error while parsing bzip file. " + e.getMessage(), location);
        }
        try {
            // the main document stands for the bzip container, registered with supplied url & mime
            final Document maindoc = createMainDocument(location, mimeType, charset, this);
            // delegate the unzipped content to the parser matching the uncompressed file name
            final Document[] docs = parseCompressedInputStream(location, null, timezoneOffset, 999, zippedContent, maxLinks, maxBytes);
            if (docs != null) {
                maindoc.addSubDocuments(docs);
                final boolean firstIsPartial = docs.length > 0 && docs[0].isPartiallyParsed();
                if (firstIsPartial) {
                    maindoc.setPartiallyParsed(true);
                }
            }
            return new Document[]{maindoc};
        } catch (final Parser.Failure e) {
            // parser failures are propagated as they are
            throw e;
        } catch (final Exception e) {
            throw new Parser.Failure("Unexpected error while parsing bzip file. " + e.getMessage(),location);
        }
    }
 }
--- a/test/java/net/yacy/document/parser/bzipParserTest.java
+++ b/test/java/net/yacy/document/parser/bzipParserTest.java
@ -0,0 +1,252 @@
 // bzipParserTest.java
 // ---------------------------
 // Copyright 2017 by luccioman; https://github.com/luccioman
 //
 // This is a part of YaCy, a peer-to-peer based web search engine
 //
 // LICENSE
 //
 // This program is free software; you can redistribute it and/or modify
 // it under the terms of the GNU General Public License as published by
 // the Free Software Foundation; either version 2 of the License, or
 // (at your option) any later version.
 //
 // This program is distributed in the hope that it will be useful,
 // but WITHOUT ANY WARRANTY; without even the implied warranty of
 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 // GNU General Public License for more details.
 //
 // You should have received a copy of the GNU General Public License
 // along with this program; if not, write to the Free Software
 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 package net.yacy.document.parser;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertTrue;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.nio.charset.StandardCharsets;
 import java.util.Collection;
 import org.junit.Test;
 import net.yacy.cora.document.id.AnchorURL;
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.document.Document;
 import net.yacy.document.Parser.Failure;
 import net.yacy.document.VocabularyScraper;
 /**
 * Unit tests for the {@link bzipParser} class
 * 
 * @author luccioman
 *
 */
 public class bzipParserTest {
 	/** Folder containing test files */
 	private static final File TEST_FOLDER = new File("test" + File.separator + "parsertest" + File.separator);
 	/** Media type passed to the parser for every test file */
 	private static final String BZIP2_MIME = "application/x-bzip2";
 	/** Word with umlaut characters expected in the text of every test file */
 	private static final String UMLAUT_WORD = "Maßkrügen";
 	/**
 	 * Unit test for the bzipParser.parse() function with some small bzip2 test files.
 	 * 
 	 * @throws Failure
 	 *             when a file could not be parsed
 	 * @throws InterruptedException
 	 *             when the test was interrupted before its termination
 	 * @throws IOException
 	 *             when a read/write error occurred
 	 */
 	@Test
 	public void testParse() throws Failure, InterruptedException, IOException {
 		final String[] fileNames = { "umlaute_html_utf8.html.bz2", "umlaute_linux.txt.bz2" };
 		final bzipParser parser = new bzipParser();
 		for (final String fileName : fileNames) {
 			final DigestURL location = new DigestURL("http://localhost/" + fileName);
 			try (final FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, fileName))) {
 				final Document[] documents = parser.parse(location, BZIP2_MIME, StandardCharsets.UTF_8.name(),
 						new VocabularyScraper(), 0, inStream);
 				assertNotNull("Parser result must not be null for file " + fileName, documents);
 				assertNotNull("Parsed text must not be null for file " + fileName, documents[0].getTextString());
 				assertTrue("Parsed text must contain test word with umlaut char in file " + fileName,
 						documents[0].getTextString().contains(UMLAUT_WORD));
 				final Collection<AnchorURL> anchors = documents[0].getAnchors();
 				assertNotNull("Detected URLs must not be null for file " + fileName, anchors);
 				assertEquals("One URL must have been detected for file " + fileName, 1, anchors.size());
 				assertTrue(anchors.iterator().next().toString().startsWith("http://localhost/umlaute_"));
 			}
 		}
 	}
 	/**
 	 * Testing parse integration with the tar parser on a test tbz2 archive.
 	 * 
 	 * @throws Failure
 	 *             when a file could not be parsed
 	 * @throws InterruptedException
 	 *             when the test was interrupted before its termination
 	 * @throws IOException
 	 *             when a read/write error occurred
 	 */
 	@Test
 	public void testParseTbz() throws Failure, InterruptedException, IOException {
 		final String fileName = "umlaute_html_xml_txt_gnu.tbz2";
 		final bzipParser parser = new bzipParser();
 		final DigestURL location = new DigestURL("http://localhost/" + fileName);
 		try (final FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, fileName))) {
 			final Document[] documents = parser.parse(location, BZIP2_MIME, StandardCharsets.UTF_8.name(),
 					new VocabularyScraper(), 0, inStream);
 			assertNotNull("Parser result must not be null for file " + fileName, documents);
 			final String parsedText = documents[0].getTextString();
 			assertNotNull("Parsed text must not be null for file " + fileName, parsedText);
 			assertTrue("Parsed text must contain test word with umlaut char in file " + fileName,
 					parsedText.contains(UMLAUT_WORD));
 			assertTrue(parsedText.contains("Example link in ISO-8859-1 encoded HTML"));
 			assertTrue(parsedText.contains("Example link in UTF-8 encoded HTML"));
 			assertTrue(parsedText.contains("URL reference in raw text file"));
 			assertTrue(parsedText.contains("UTF-8 encoded XML test file"));
 			final Collection<AnchorURL> detectedAnchors = documents[0].getAnchors();
 			assertNotNull(detectedAnchors);
 			assertEquals("Parsed URLs must contain all URLs from each test file included in the archive", 5,
 					detectedAnchors.size());
 			assertTrue(detectedAnchors.contains(new AnchorURL("http://www.w3.org/1999/02/22-rdf-syntax-ns#")));
 			assertTrue(detectedAnchors.contains(new AnchorURL("http://purl.org/dc/elements/1.1/")));
 			assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_html_iso.html")));
 			assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_html_utf8.html")));
 			assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_linux.txt")));
 		}
 	}
 	/**
 	 * Unit test for the bzipParser.parseWithLimits() function with some small bz2
 	 * test files which content is within limits.
 	 * 
 	 * @throws Failure
 	 *             when a file could not be parsed
 	 * @throws InterruptedException
 	 *             when the test was interrupted before its termination
 	 * @throws IOException
 	 *             when a read/write error occurred
 	 */
 	@Test
 	public void testParseWithLimits() throws Failure, InterruptedException, IOException {
 		final String[] fileNames = { "umlaute_html_utf8.html.bz2", "umlaute_linux.txt.bz2" };
 		final bzipParser parser = new bzipParser();
 		for (final String fileName : fileNames) {
 			final DigestURL location = new DigestURL("http://localhost/" + fileName);
 			try (final FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, fileName))) {
 				final Document[] documents = parser.parseWithLimits(location, BZIP2_MIME,
 						StandardCharsets.UTF_8.name(), new VocabularyScraper(), 0, inStream, 10000,
 						10000);
 				assertNotNull("Parser result must not be null for file " + fileName, documents);
 				assertNotNull("Parsed text must not be null for file " + fileName, documents[0].getTextString());
 				assertTrue("Parsed text must contain test word with umlaut char in file " + fileName,
 						documents[0].getTextString().contains(UMLAUT_WORD));
 				final Collection<AnchorURL> anchors = documents[0].getAnchors();
 				assertNotNull("Detected URLs must not be null for file " + fileName, anchors);
 				assertEquals("One URL must have been detected for file " + fileName, 1, anchors.size());
 				assertTrue(anchors.iterator().next().toString().startsWith("http://localhost/umlaute_"));
 				assertFalse("Parsed document must not be marked as partially parsed for file " + fileName,
 						documents[0].isPartiallyParsed());
 			}
 		}
 	}
 	/**
 	 * Unit test for the bzipParser.parseWithLimits() when maxLinks limit is exceeded
 	 * 
 	 * @throws Failure
 	 *             when a file could not be parsed
 	 * @throws InterruptedException
 	 *             when the test was interrupted before its termination
 	 * @throws IOException
 	 *             when a read/write error occurred
 	 */
 	@Test
 	public void testParseWithLimitsLinksExceeded() throws Failure, InterruptedException, IOException {
 		final String[] fileNames = { "umlaute_html_utf8.html.bz2", "umlaute_linux.txt.bz2" };
 		final bzipParser parser = new bzipParser();
 		/* maxLinks limit exceeded : no link at all is allowed */
 		for (final String fileName : fileNames) {
 			assertTruncatedParse(parser, fileName, 0, Long.MAX_VALUE);
 		}
 	}
 	/**
 	 * Unit test for the bzipParser.parseWithLimits() when maxBytes limit is exceeded
 	 * 
 	 * @throws Failure
 	 *             when a file could not be parsed
 	 * @throws InterruptedException
 	 *             when the test was interrupted before its termination
 	 * @throws IOException
 	 *             when a read/write error occurred
 	 */
 	@Test
 	public void testParseWithLimitsBytesExceeded() throws Failure, InterruptedException, IOException {
 		final bzipParser parser = new bzipParser();
 		/* The bytes limit is set to let parsing the beginning text part, but stop before reaching the <a> tag */
 		assertTruncatedParse(parser, "umlaute_html_utf8.html.bz2", Integer.MAX_VALUE, 258);
 		/* The bytes limit is set to let parsing the beginning of the text, but stop before reaching the URL */
 		assertTruncatedParse(parser, "umlaute_linux.txt.bz2", Integer.MAX_VALUE, 65);
 	}
 	/**
 	 * Parses a test file with the given limits and checks that the text up to
 	 * the test word was extracted, that no URL was detected and that the result
 	 * is marked as partially parsed.
 	 * 
 	 * @param parser
 	 *            the parser under test
 	 * @param fileName
 	 *            the name of a file in the test folder
 	 * @param maxLinks
 	 *            the maximum number of links to parse
 	 * @param maxBytes
 	 *            the maximum number of content bytes to process
 	 * @throws Failure
 	 *             when the file could not be parsed
 	 * @throws InterruptedException
 	 *             when the test was interrupted before its termination
 	 * @throws IOException
 	 *             when a read/write error occurred
 	 */
 	private static void assertTruncatedParse(final bzipParser parser, final String fileName, final int maxLinks,
 			final long maxBytes) throws Failure, InterruptedException, IOException {
 		final DigestURL location = new DigestURL("http://localhost/" + fileName);
 		try (final FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, fileName))) {
 			final Document[] documents = parser.parseWithLimits(location, BZIP2_MIME, StandardCharsets.UTF_8.name(),
 					new VocabularyScraper(), 0, inStream, maxLinks, maxBytes);
 			assertNotNull("Parser result must not be null for file " + fileName, documents);
 			assertNotNull("Parsed text must not be null for file " + fileName, documents[0].getTextString());
 			assertTrue("Parsed text must contain test word with umlaut char in file " + fileName,
 					documents[0].getTextString().contains(UMLAUT_WORD));
 			final Collection<AnchorURL> anchors = documents[0].getAnchors();
 			assertTrue("Detected URLs must be empty for file " + fileName, anchors == null || anchors.isEmpty());
 			assertTrue("Parsed document must be marked as partially parsed for file " + fileName,
 					documents[0].isPartiallyParsed());
 		}
 	}
 }
--- a/test/parsertest/umlaute_html_utf8.html.bz2
+++ b/test/parsertest/umlaute_html_utf8.html.bz2
--- a/test/parsertest/umlaute_html_xml_txt_gnu.tbz2
+++ b/test/parsertest/umlaute_html_xml_txt_gnu.tbz2
--- a/test/parsertest/umlaute_linux.txt.bz2
+++ b/test/parsertest/umlaute_linux.txt.bz2