Added partial bzip2 stream parsing support and a bzipParser JUnit test

8 years ago · 32c9dfa768
parent dd9cb06d25
commit 32c9dfa768
5 changed files with 366 additions and 24 deletions
--- a/source/net/yacy/document/parser/bzipParser.java
+++ b/source/net/yacy/document/parser/bzipParser.java
@ -31,8 +31,12 @@ import java.io.File;
 import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.net.MalformedURLException;
 import java.util.Date;

+import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
+import org.apache.commons.compress.compressors.bzip2.BZip2Utils;
+
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.document.id.MultiProtocolURL;
 import net.yacy.document.AbstractParser;
@ -42,15 +46,12 @@ import net.yacy.document.TextParser;
 import net.yacy.document.VocabularyScraper;
 import net.yacy.kelondro.util.FileUtils;

-import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
-import org.apache.commons.compress.compressors.bzip2.BZip2Utils;
-
 /**
 * Parses a bz2 archive.
 * Unzips and parses the content and adds it to the created main document
 */
 public class bzipParser extends AbstractParser implements Parser {
-
+	
    public bzipParser() {
        super("Bzip 2 UNIX Compressed File Parser");
        this.SUPPORTED_EXTENSIONS.add("bz2");
@ -117,27 +118,8 @@ public class bzipParser extends AbstractParser implements Parser {
        	}
        }
        try {
-            final String filename = location.getFileName();
             // create maindoc for this bzip container, register with supplied url & mime
-            maindoc = new Document(
-                    location,
-                    mimeType,
-                    charset,
-                    this,
-                    null,
-                    null,
-                    AbstractParser.singleList(filename.isEmpty() ? location.toTokens() : MultiProtocolURL.unescape(filename)), // title
-                    null,
-                    null,
-                    null,
-                    null,
-                    0.0d, 0.0d,
-                    (Object) null,
-                    null,
-                    null,
-                    null,
-                    false,
-                    new Date());
+            maindoc = createMainDocument(location, mimeType, charset, this);
            // creating a new parser class to parse the unzipped content
            final String contentfilename = BZip2Utils.getUncompressedFilename(location.getFileName());
            final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename));
@ -153,4 +135,112 @@ public class bzipParser extends AbstractParser implements Parser {
        }
        return maindoc == null ? null : new Document[]{maindoc};
    }
+    
+    /**
+     * Tells callers that this parser provides a parseWithLimits() implementation.
+     * @return always true
+     */
+    @Override
+    public boolean isParseWithLimitsSupported() {
+    	return true;
+    }
+    
+    /**
+     * Build the main (container) document that represents a bzip archive in the parsing result.
+     * The document title is the unescaped file name of the location or, when the
+     * location has an empty file name, the tokens of the location.
+     * @param location the parsed resource URL
+     * @param mimeType the media type of the resource
+     * @param charset the charset name if known
+     * @param parser the bzipParser instance that is registered as the parser origin of the document
+     * @return a new Document instance
+     */
+	public static Document createMainDocument(final DigestURL location, final String mimeType, final String charset, final bzipParser parser) {
+		final String archiveName = location.getFileName();
+		final String title;
+		if (archiveName.isEmpty()) {
+			title = location.toTokens();
+		} else {
+			title = MultiProtocolURL.unescape(archiveName);
+		}
+		return new Document(
+				location,
+				mimeType,
+				charset,
+				parser,
+				null, null,
+				AbstractParser.singleList(title), // title
+				null, null, null, null,
+				0.0d, 0.0d,
+				(Object) null,
+				null, null, null,
+				false,
+				new Date());
+	}
+	
+	/**
+	 * Parse content in an open stream uncompressing on the fly a bzipped resource.
+	 * The media type of the content is derived from the file name of the location
+	 * once its bzip2 extension has been removed, and the parsing itself is
+	 * delegated to {@link TextParser#parseWithLimits}.
+	 * @param location the URL of the bzipped resource
+	 * @param charset the charset name if known
+	 * @param timezoneOffset the local time zone offset
+	 * @param depth the depth value handed over unchanged to TextParser.parseWithLimits()
+	 * @param compressedInStream an open stream uncompressing on the fly the compressed content
+	 * @param maxLinks
+	 *            the maximum total number of links to parse and add to the
+	 *            result documents
+	 * @param maxBytes
+	 *            the maximum number of content bytes to process
+	 * @return the documents returned by TextParser.parseWithLimits() for the
+	 *         uncompressed content
+	 * @throws Parser.Failure
+	 *             when the parser processing failed, or when no valid URL could be
+	 *             built for the uncompressed content
+	 */
+	public Document[] parseCompressedInputStream(final DigestURL location, final String charset, final int timezoneOffset, final int depth,
+			final InputStream compressedInStream, final int maxLinks, final long maxBytes) throws Failure {
+		// derive the name and media type of the uncompressed content from the archive file name
+		final String compressedFileName = location.getFileName();
+		final String contentfilename = BZip2Utils.getUncompressedFilename(compressedFileName);
+		final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename));
+		try {
+			/* Use the uncompressed file name for sub parsers to not unnecessarily use again the bzipParser */
+			final String locationPath = location.getPath();
+			final String contentPath = locationPath.substring(0, locationPath.length() - compressedFileName.length()) + contentfilename;
+			final DigestURL contentLocation = new DigestURL(location.getProtocol(), location.getHost(), location.getPort(), contentPath);
+
+			/* Rely on the supporting parsers to respect the maxLinks and maxBytes limits on compressed content */
+			// NOTE(review): the -1 argument is presumably the (unknown) content size - confirm against TextParser.parseWithLimits()
+			return TextParser.parseWithLimits(contentLocation, mime, charset, timezoneOffset, depth, -1, compressedInStream, maxLinks, maxBytes);
+		} catch (MalformedURLException e) {
+			throw new Parser.Failure("Unexpected error while parsing bzip file. " + e.getMessage(), location);
+		}
+	}
+		
+    
+    /**
+     * Parse a bzip2 compressed stream on the fly, within the given limits.
+     * The result is a single main document for the archive; the documents parsed from
+     * the uncompressed content are added to it as sub documents. The main
+     * document is marked as partially parsed when the first sub document is.
+     * This method does not close the source stream.
+     * @param location the URL of the bzipped resource
+     * @param mimeType the media type of the bzipped resource
+     * @param charset the charset name if known (only used for the main document)
+     * @param scraper a vocabulary scraper (not used by this implementation)
+     * @param timezoneOffset the local time zone offset
+     * @param source an open stream on the compressed content
+     * @param maxLinks the maximum total number of links to parse
+     * @param maxBytes the maximum number of content bytes to process
+     * @return an array holding the main document
+     * @throws Parser.Failure when the stream is not a readable bzip2 stream or when parsing its content failed
+     */
+    @Override
+    public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String charset, final VocabularyScraper scraper,
+    		final int timezoneOffset, final InputStream source, final int maxLinks, final long maxBytes)
+    		throws Parser.Failure {
+        final BZip2CompressorInputStream uncompressingStream;
+        try {
+            // the constructor checks the magic start-bytes "BZh" and throws an IOException when they do not match
+            uncompressingStream = new BZip2CompressorInputStream(source);
+        } catch (final Exception e) {
+            throw new Parser.Failure("Unexpected error while parsing bzip file. " + e.getMessage(), location);
+        }
+
+        try {
+            // main document for this bzip container, registered with the supplied url & mime
+            final Document container = createMainDocument(location, mimeType, charset, this);
+
+            // delegate the uncompressed content to the parser matching its file name
+            final Document[] subDocs = parseCompressedInputStream(location, null, timezoneOffset, 999, uncompressingStream, maxLinks, maxBytes);
+            if (subDocs != null) {
+                container.addSubDocuments(subDocs);
+                final boolean contentTruncated = subDocs.length > 0 && subDocs[0].isPartiallyParsed();
+                if (contentTruncated) {
+                    container.setPartiallyParsed(true);
+                }
+            }
+            return new Document[]{container};
+        } catch (final Parser.Failure e) {
+            // already a parser failure : pass it on unchanged
+            throw e;
+        } catch (final Exception e) {
+            throw new Parser.Failure("Unexpected error while parsing bzip file. " + e.getMessage(),location);
+        }
+    }
 }
--- a/test/java/net/yacy/document/parser/bzipParserTest.java
+++ b/test/java/net/yacy/document/parser/bzipParserTest.java
@ -0,0 +1,252 @@
+// bzipParserTest.java
+// ---------------------------
+// Copyright 2017 by luccioman; https://github.com/luccioman
+//
+// This is a part of YaCy, a peer-to-peer based web search engine
+//
+// LICENSE
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+package net.yacy.document.parser;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.Collection;
+
+import org.junit.Test;
+
+import net.yacy.cora.document.id.AnchorURL;
+import net.yacy.cora.document.id.DigestURL;
+import net.yacy.document.Document;
+import net.yacy.document.Parser.Failure;
+import net.yacy.document.VocabularyScraper;
+
+/**
+ * Unit tests for the {@link bzipParser} class. Each test reads small bzip2
+ * sample files from the test/parsertest folder and checks the parsed text,
+ * the detected anchors and, for the limited variants, the partially parsed flag.
+ * 
+ * @author luccioman
+ *
+ */
+public class bzipParserTest {
+	
+	/** Folder containing test files */
+	private static final File TEST_FOLDER = new File("test" + File.separator + "parsertest" + File.separator);
+
+	/**
+	 * Unit test for the bzipParser.parse() function with some small bzip2 test files.
+	 * 
+	 * @throws Failure
+	 *             when a file could not be parsed
+	 * @throws InterruptedException
+	 *             when the test was interrupted before its termination
+	 * @throws IOException
+	 *             when a read/write error occurred
+	 */
+	@Test
+	public void testParse() throws Failure, InterruptedException, IOException {
+		final String[] fileNames = { "umlaute_html_utf8.html.bz2", "umlaute_linux.txt.bz2" };
+		final bzipParser parser = new bzipParser();
+
+		for (final String fileName : fileNames) {
+			final DigestURL location = new DigestURL("http://localhost/" + fileName);
+			try (final FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, fileName));) {
+				final Document[] documents = parser.parse(location, "application/x-bzip2", StandardCharsets.UTF_8.name(),
+						new VocabularyScraper(), 0, inStream);
+				assertNotNull("Parser result must not be null for file " + fileName, documents);
+				assertNotNull("Parsed text must not be null for file " + fileName, documents[0].getTextString());
+				assertTrue("Parsed text must contain test word with umlaut char in file " + fileName,
+						documents[0].getTextString().contains("Maßkrügen"));
+				final Collection<AnchorURL> anchors = documents[0].getAnchors();
+				assertNotNull("Detected URLs must not be null for file " + fileName, anchors);
+				assertEquals("One URL must have been detected for file " + fileName, 1, anchors.size());
+				assertTrue(anchors.iterator().next().toString().startsWith("http://localhost/umlaute_"));
+			}
+		}
+	}
+
+	/**
+	 * Testing parse integration with the tar parser on a test tbz2 archive.
+	 * 
+	 * @throws Failure
+	 *             when a file could not be parsed
+	 * @throws InterruptedException
+	 *             when the test was interrupted before its termination
+	 * @throws IOException
+	 *             when a read/write error occurred
+	 */
+	@Test
+	public void testParseTbz() throws Failure, InterruptedException, IOException {
+		final String fileName = "umlaute_html_xml_txt_gnu.tbz2";
+		final bzipParser parser = new bzipParser();
+
+		final DigestURL location = new DigestURL("http://localhost/" + fileName);
+		try (final FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, fileName));) {
+			final Document[] documents = parser.parse(location, "application/x-bzip2", StandardCharsets.UTF_8.name(),
+					new VocabularyScraper(), 0, inStream);
+			
+			assertNotNull("Parser result must not be null for file " + fileName, documents);
+
+			final String parsedText = documents[0].getTextString();
+			assertNotNull("Parsed text must not be null for file " + fileName, parsedText);
+			assertTrue("Parsed text must contain test word with umlaut char in file " + fileName,
+					parsedText.contains("Maßkrügen"));
+			assertTrue(parsedText.contains("Example link in ISO-8859-1 encoded HTML"));
+			assertTrue(parsedText.contains("Example link in UTF-8 encoded HTML"));
+			assertTrue(parsedText.contains("URL reference in raw text file"));
+			assertTrue(parsedText.contains("UTF-8 encoded XML test file"));
+
+			final Collection<AnchorURL> detectedAnchors = documents[0].getAnchors();
+			assertNotNull(detectedAnchors);
+			assertEquals("Parsed URLs must contain all URLs from each test file included in the archive", 5,
+					detectedAnchors.size());
+			assertTrue(detectedAnchors.contains(new AnchorURL("http://www.w3.org/1999/02/22-rdf-syntax-ns#")));
+			assertTrue(detectedAnchors.contains(new AnchorURL("http://purl.org/dc/elements/1.1/")));
+			assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_html_iso.html")));
+			assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_html_utf8.html")));
+			assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_linux.txt")));
+		}
+	}
+
+	/**
+	 * Unit test for the bzipParser.parseWithLimits() function with some small bz2
+	 * test files whose content is within limits.
+	 * 
+	 * @throws Failure
+	 *             when a file could not be parsed
+	 * @throws InterruptedException
+	 *             when the test was interrupted before its termination
+	 * @throws IOException
+	 *             when a read/write error occurred
+	 */
+	@Test
+	public void testParseWithLimits() throws Failure, InterruptedException, IOException {
+		final String[] fileNames = { "umlaute_html_utf8.html.bz2", "umlaute_linux.txt.bz2" };
+		final bzipParser parser = new bzipParser();
+
+		for (final String fileName : fileNames) {
+			final DigestURL location = new DigestURL("http://localhost/" + fileName);
+			try (final FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, fileName));) {
+				final Document[] documents = parser.parseWithLimits(location, "application/x-bzip2",
+						StandardCharsets.UTF_8.name(), new VocabularyScraper(), 0, inStream, 10000,
+						10000);
+				assertNotNull("Parser result must not be null for file " + fileName, documents);
+				assertNotNull("Parsed text must not be null for file " + fileName, documents[0].getTextString());
+				assertTrue("Parsed text must contain test word with umlaut char in file " + fileName,
+						documents[0].getTextString().contains("Maßkrügen"));
+				final Collection<AnchorURL> anchors = documents[0].getAnchors();
+				assertNotNull("Detected URLs must not be null for file " + fileName, anchors);
+				assertEquals("One URL must have been detected for file " + fileName, 1, anchors.size());
+				assertTrue(anchors.iterator().next().toString().startsWith("http://localhost/umlaute_"));
+				assertFalse("Parsed document must not be marked as partially parsed for file " + fileName,
+						documents[0].isPartiallyParsed());
+			}
+		}
+
+	}
+	
+	/**
+	 * Unit test for the bzipParser.parseWithLimits() when maxLinks limit is exceeded
+	 * 
+	 * @throws Failure
+	 *             when a file could not be parsed
+	 * @throws InterruptedException
+	 *             when the test was interrupted before its termination
+	 * @throws IOException
+	 *             when a read/write error occurred
+	 */
+	@Test
+	public void testParseWithLimitsLinksExceeded() throws Failure, InterruptedException, IOException {
+		final String[] fileNames = { "umlaute_html_utf8.html.bz2", "umlaute_linux.txt.bz2" };
+		final bzipParser parser = new bzipParser();
+
+		/* maxLinks limit exceeded */
+		for (final String fileName : fileNames) {
+			final DigestURL location = new DigestURL("http://localhost/" + fileName);
+			try (final FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, fileName));) {
+				final Document[] documents = parser.parseWithLimits(location, "application/x-bzip2",
+						StandardCharsets.UTF_8.name(), new VocabularyScraper(), 0, inStream, 0, Long.MAX_VALUE);
+				assertNotNull("Parser result must not be null for file " + fileName, documents);
+				assertNotNull("Parsed text must not be null for file " + fileName, documents[0].getTextString());
+				assertTrue("Parsed text must contain test word with umlaut char in file " + fileName,
+						documents[0].getTextString().contains("Maßkrügen"));
+				final Collection<AnchorURL> anchors = documents[0].getAnchors();
+				assertTrue("Detected URLs must be empty for file " + fileName, anchors == null || anchors.isEmpty());
+				assertTrue("Parsed document must be marked as partially parsed for file " + fileName,
+						documents[0].isPartiallyParsed());
+			}
+		}
+	}
+	
+	/**
+	 * Unit test for the bzipParser.parseWithLimits() when maxBytes limit is exceeded
+	 * 
+	 * @throws Failure
+	 *             when a file could not be parsed
+	 * @throws InterruptedException
+	 *             when the test was interrupted before its termination
+	 * @throws IOException
+	 *             when a read/write error occurred
+	 */
+	@Test
+	public void testParseWithLimitsBytesExceeded() throws Failure, InterruptedException, IOException {
+		final String[] fileNames = { "umlaute_html_utf8.html.bz2", "umlaute_linux.txt.bz2" };
+		final bzipParser parser = new bzipParser();
+
+		String fileName = fileNames[0];
+		
+		DigestURL location = new DigestURL("http://localhost/" + fileName);
+		try (final FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, fileName));) {
+			/* The bytes limit is set to let parsing the beginning text part, but stop before reaching the <a> tag */
+			final long maxBytes = 258;
+			final Document[] documents = parser.parseWithLimits(location, "application/x-bzip2", StandardCharsets.UTF_8.name(),
+					new VocabularyScraper(), 0, inStream, Integer.MAX_VALUE, maxBytes);
+			assertNotNull("Parser result must not be null for file " + fileName, documents);
+			assertNotNull("Parsed text must not be null for file " + fileName, documents[0].getTextString());
+			assertTrue("Parsed text must contain test word with umlaut char in file " + fileName,
+					documents[0].getTextString().contains("Maßkrügen"));
+			final Collection<AnchorURL> anchors = documents[0].getAnchors();
+			assertTrue("Detected URLs must be empty for file " + fileName, anchors == null || anchors.isEmpty());
+			assertTrue("Parsed document must be marked as partially parsed for file " + fileName,
+					documents[0].isPartiallyParsed());
+		}
+
+		fileName = fileNames[1];
+		location = new DigestURL("http://localhost/" + fileName);
+		try (final FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, fileName));) {
+			/* The bytes limit is set to let parsing the beginning of the text, but stop before reaching the URL */
+			final long maxBytes = 65;
+			final Document[] documents = parser.parseWithLimits(location, "application/x-bzip2", StandardCharsets.UTF_8.name(),
+					new VocabularyScraper(), 0, inStream, Integer.MAX_VALUE, maxBytes);
+			assertNotNull("Parser result must not be null for file " + fileName, documents);
+			assertNotNull("Parsed text must not be null for file " + fileName, documents[0].getTextString());
+			assertTrue("Parsed text must contain test word with umlaut char in file " + fileName,
+					documents[0].getTextString().contains("Maßkrügen"));
+			final Collection<AnchorURL> anchors = documents[0].getAnchors();
+			assertTrue("Detected URLs must be empty for file " + fileName, anchors == null || anchors.isEmpty());
+			assertTrue("Parsed document must be marked as partially parsed for file " + fileName,
+					documents[0].isPartiallyParsed());
+		}
+	}
+
+}
--- a/test/parsertest/umlaute_html_utf8.html.bz2
+++ b/test/parsertest/umlaute_html_utf8.html.bz2
--- a/test/parsertest/umlaute_html_xml_txt_gnu.tbz2
+++ b/test/parsertest/umlaute_html_xml_txt_gnu.tbz2
--- a/test/parsertest/umlaute_linux.txt.bz2
+++ b/test/parsertest/umlaute_linux.txt.bz2