Added a parser for XZ compressed archives.

As suggested by LA_FORGE on mantis 781
(http://mantis.tokeek.de/view.php?id=781)
pull/218/head
author luccioman
parent 8ce9c066bf
commit 685122363d

@@ -23,6 +23,7 @@
<classpathentry kind="lib" path="lib/json-simple-1.1.1.jar"/>
<classpathentry kind="lib" path="lib/xml-apis.jar"/>
<classpathentry kind="lib" path="lib/commons-compress-1.17.jar"/>
<classpathentry kind="lib" path="lib/xz-1.8.jar"/>
<classpathentry kind="lib" path="lib/commons-lang-2.6.jar"/>
<classpathentry kind="lib" path="lib/commons-codec-1.11.jar"/>
<classpathentry kind="lib" path="lib/jcl-over-slf4j-1.7.25.jar"/>

@@ -179,6 +179,7 @@
<pathelement location="${lib}/common-lang-3.3.2.jar" />
<pathelement location="${lib}/commons-codec-1.11.jar" />
<pathelement location="${lib}/commons-compress-1.17.jar" />
<pathelement location="${lib}/xz-1.8.jar" />
<pathelement location="${lib}/commons-fileupload-1.3.3.jar" />
<pathelement location="${lib}/commons-io-2.6.jar" />
<pathelement location="${lib}/commons-jxpath-1.3.jar" />

@@ -0,0 +1,10 @@
Licensing of XZ for Java
========================
All the files in this package have been written by Lasse Collin
and/or Igor Pavlov. All these files have been put into the
public domain. You can do whatever you want with these files.
This software is provided "as is", without any warranty.

Binary file lib/xz-1.8.jar not shown.

@@ -380,6 +380,12 @@
<version>1.17</version>
<type>jar</type>
</dependency>
<dependency>
<!-- Handle XZ compressed archives. It is an optional dependency of commons-compress. -->
<groupId>org.tukaani</groupId>
<artifactId>xz</artifactId>
<version>1.8</version>
</dependency>
<dependency>
<groupId>commons-fileupload</groupId>
<artifactId>commons-fileupload</artifactId>

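For context, a minimal sketch (not part of this commit) of why the org.tukaani:xz jar has to be on the runtime classpath: commons-compress only ships the XZCompressorInputStream wrapper and loads the actual codec from the XZ for Java library. XZUtils reports whether that optional dependency could be loaded; the file name below is an illustrative assumption.

import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.InputStream;

import org.apache.commons.compress.compressors.xz.XZCompressorInputStream;
import org.apache.commons.compress.compressors.xz.XZUtils;
import org.apache.commons.compress.utils.IOUtils;

public class XZAvailabilityCheck {
    public static void main(final String[] args) throws Exception {
        /* Without the xz jar on the classpath this check returns false, and
         * constructing an XZCompressorInputStream would fail at runtime */
        if (!XZUtils.isXZCompressionAvailable()) {
            System.err.println("org.tukaani:xz is missing from the classpath");
            return;
        }
        /* Decompress a local .xz file on the fly ("example.txt.xz" is hypothetical) */
        try (final InputStream in = new XZCompressorInputStream(
                new BufferedInputStream(new FileInputStream("example.txt.xz")))) {
            System.out.write(IOUtils.toByteArray(in));
        }
    }
}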
@@ -42,6 +42,7 @@ import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.StrictLimitInputStream;
import net.yacy.document.parser.GenericXMLParser;
import net.yacy.document.parser.XZParser;
import net.yacy.document.parser.apkParser;
import net.yacy.document.parser.audioTagParser;
import net.yacy.document.parser.bzipParser;
@@ -93,6 +94,7 @@ public final class TextParser {
static {
initParser(new apkParser());
initParser(new bzipParser());
initParser(new XZParser());
initParser(new csvParser());
initParser(new docParser());
initParser(new gzipParser());
@@ -380,6 +382,32 @@ public final class TextParser {
Integer.MAX_VALUE, Long.MAX_VALUE);
}
/**
* Try to limit the parser processing with a maximum total number of detected links (anchors, image links, media links...)
* or a maximum amount of content bytes to parse. Limits apply only when the available parsers for the resource media type support parsing within limits
* (see {@link Parser#isParseWithLimitsSupported()}). When available parsers do
* not support parsing within limits, an exception is thrown when
* content size is beyond maxBytes.
* @param location the URL of the source
* @param mimeType the mime type of the source, if known
* @param charset the charset name of the source, if known
* @param ignoreClassNames an optional set of CSS class names; the content of matching HTML elements is ignored
* @param timezoneOffset the local time zone offset
* @param depth the current depth of the crawl
* @param contentLength the length of the source, if known (else -1 should be used)
* @param sourceStream an open input stream on the source
* @param maxLinks the maximum total number of links to parse and add to the result documents
* @param maxBytes the maximum number of content bytes to process
* @return a list of documents that result from parsing the source, with empty or null text.
* @throws Parser.Failure when the parser processing failed
*/
public static Document[] parseWithLimits(final DigestURL location, String mimeType, final String charset, final Set<String> ignoreClassNames,
final int timezoneOffset, final int depth, final long contentLength, final InputStream sourceStream, int maxLinks,
long maxBytes) throws Parser.Failure{
return parseSource(location, mimeType, charset, ignoreClassNames, new VocabularyScraper(), timezoneOffset, depth, contentLength,
sourceStream, maxLinks, maxBytes);
}
/**
* Try to limit the parser processing with a maximum total number of detected links (anchors, image links, media links...)
* or a maximum amount of content bytes to parse. Limits apply only when the available parsers for the resource media type support parsing within limits

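To show how this new overload is meant to be called, here is a hypothetical caller (the URL, file name, charset and limit values are illustrative assumptions; exceptions are simply propagated):

import java.io.FileInputStream;
import java.io.InputStream;
import java.util.HashSet;

import net.yacy.cora.document.id.DigestURL;
import net.yacy.document.Document;
import net.yacy.document.TextParser;

public class ParseWithLimitsExample {
    public static void main(final String[] args) throws Exception {
        final DigestURL location = new DigestURL("http://localhost/page.html.xz");
        try (final InputStream in = new FileInputStream("page.html.xz")) {
            final Document[] docs = TextParser.parseWithLimits(location,
                    "application/x-xz", /* mime type, if known */
                    "UTF-8", /* charset, if known */
                    new HashSet<String>(), /* no ignored CSS class names */
                    0, /* time zone offset */
                    0, /* crawl depth */
                    -1, /* content length unknown */
                    in,
                    100, /* maxLinks */
                    1024L * 1024L); /* maxBytes : 1 MiB */
            if (docs != null && docs.length > 0 && docs[0].isPartiallyParsed()) {
                System.out.println("Limits were reached : parsing stopped early");
            }
        }
    }
}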
@@ -0,0 +1,187 @@
// AbstractCompressorParser.java
// ---------------------------
// Copyright 2018 by luccioman; https://github.com/luccioman
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.document.parser;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.util.Date;
import java.util.Set;
import org.apache.commons.compress.compressors.CompressorInputStream;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper;
/**
* Base class for parsing compressed files relying on Apache commons-compress
* tools.
*/
public abstract class AbstractCompressorParser extends AbstractParser implements Parser {
/** Crawl depth applied when parsing internal compressed content */
protected static final int DEFAULT_DEPTH = 999;
/**
* @param name the human readable name of the parser
*/
public AbstractCompressorParser(final String name) {
super(name);
}
/**
* @param source an open input stream on a compressed source
* @return a subclass of CompressorInputStream capable of decompressing the source
* on the fly
* @throws IOException when an error occurs while opening the compressed
* stream
*/
protected abstract CompressorInputStream createDecompressStream(final InputStream source) throws IOException;
/**
* Maps the given name of a compressed file to the name that the
* file should have after uncompression. For example, for "file.txt.xz", "file.txt" is returned.
*
* @param filename name of a compressed file
* @return name of the corresponding uncompressed file
*/
protected abstract String getUncompressedFilename(final String filename);
@Override
public Document[] parse(final DigestURL location, final String mimeType, final String charset,
final Set<String> ignoreClassNames, final VocabularyScraper scraper, final int timezoneOffset,
final InputStream source) throws Parser.Failure, InterruptedException {
return parseWithLimits(location, mimeType, charset, scraper, timezoneOffset, source, Integer.MAX_VALUE,
Long.MAX_VALUE);
}
@Override
public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String charset,
final Set<String> ignoreClassNames, final VocabularyScraper scraper, final int timezoneOffset,
final InputStream source, final int maxLinks, final long maxBytes) throws Parser.Failure {
Document maindoc;
final CompressorInputStream compressedInStream;
try {
compressedInStream = createDecompressStream(source);
} catch (final IOException | RuntimeException e) {
throw new Parser.Failure("Unexpected error while parsing compressed file. " + e.getMessage(), location);
}
try {
// create maindoc for this archive, register with supplied url & mime
maindoc = AbstractCompressorParser.createMainDocument(location, mimeType, charset, this);
final Document[] docs = this.parseCompressedInputStream(location, null, ignoreClassNames, timezoneOffset,
AbstractCompressorParser.DEFAULT_DEPTH, compressedInStream, maxLinks, maxBytes);
if (docs != null) {
maindoc.addSubDocuments(docs);
if (docs.length > 0 && docs[0].isPartiallyParsed()) {
maindoc.setPartiallyParsed(true);
}
}
} catch (final Parser.Failure e) {
throw e;
} catch (final IOException | RuntimeException e) {
throw new Parser.Failure("Unexpected error while parsing compressed file. " + e.getMessage(), location);
}
return new Document[] { maindoc };
}
/**
* Create the main parsed document for the compressed document at the given URL
* and media type
*
* @param location the parsed resource URL
* @param mimeType the media type of the resource
* @param charset the charset name, if known
* @param parser the AbstractCompressorParser instance that is registered as the
* parser origin of the document
* @return a Document instance
*/
protected static Document createMainDocument(final DigestURL location, final String mimeType, final String charset,
final AbstractCompressorParser parser) {
final String filename = location.getFileName();
return new Document(location, mimeType, charset, parser, null, null,
AbstractParser
.singleList(filename.isEmpty() ? location.toTokens() : MultiProtocolURL.unescape(filename)), // title
null, null, null, null, 0.0d, 0.0d, (Object) null, null, null, null, false, new Date());
}
/**
* Parse the content of an open stream that decompresses a compressed
* resource on the fly.
*
* @param location the URL of the compressed resource
* @param charset the charset name, if known
* @param ignoreClassNames an optional set of CSS class names; the content of
* matching HTML elements is ignored
* @param timezoneOffset the local time zone offset
* @param depth the crawl depth to apply to documents parsed from
* the compressed content
* @param compressedInStream an open stream decompressing the compressed
* content on the fly
* @param maxLinks the maximum total number of links to parse and add
* to the result documents
* @param maxBytes the maximum number of content bytes to process
* @return a list of documents that result from parsing the source, with empty
* or null text.
* @throws Parser.Failure when the parser processing failed
*/
protected Document[] parseCompressedInputStream(final DigestURL location, final String charset,
final Set<String> ignoreClassNames, final int timezoneOffset, final int depth,
final CompressorInputStream compressedInStream, final int maxLinks, final long maxBytes) throws Failure {
final String compressedFileName = location.getFileName();
final String contentfilename = getUncompressedFilename(compressedFileName);
final String mime = TextParser.mimeOf(MultiProtocolURL.getFileExtension(contentfilename));
try {
/*
* Use the uncompressed file name so that sub-parsers do not needlessly select
* this same decompressing parser again
*/
final String locationPath = location.getPath();
final String contentPath = locationPath.substring(0, locationPath.length() - compressedFileName.length())
+ contentfilename;
final DigestURL contentLocation = new DigestURL(location.getProtocol(), location.getHost(),
location.getPort(), contentPath);
/*
* Rely on the supporting parsers to respect the maxLinks and maxBytes limits on
* compressed content
*/
return TextParser.parseWithLimits(contentLocation, mime, charset, ignoreClassNames, timezoneOffset, depth,
-1, compressedInStream, maxLinks, maxBytes);
} catch (final MalformedURLException e) {
throw new Parser.Failure("Unexpected error while parsing compressed file. " + e.getMessage(), location);
}
}
@Override
public boolean isParseWithLimitsSupported() {
return true;
}
}

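To make the contract above concrete, a hypothetical minimal subclass could look like the following (illustrative only: YaCy already handles gzip with its dedicated gzipParser; GzipCompressorInputStream and GzipUtils are existing commons-compress classes):

package net.yacy.document.parser;

import java.io.IOException;
import java.io.InputStream;

import org.apache.commons.compress.compressors.CompressorInputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.apache.commons.compress.compressors.gzip.GzipUtils;

/**
 * Hypothetical example : a concrete AbstractCompressorParser only has to name
 * itself, register its file extensions and media types, and implement the two
 * abstract hooks.
 */
public class ExampleGzipCompressorParser extends AbstractCompressorParser {

    public ExampleGzipCompressorParser() {
        super("Example gzip Compressed Archive Parser");
        this.SUPPORTED_EXTENSIONS.add("gz");
        this.SUPPORTED_MIME_TYPES.add("application/gzip");
    }

    @Override
    protected CompressorInputStream createDecompressStream(final InputStream source) throws IOException {
        return new GzipCompressorInputStream(source);
    }

    @Override
    protected String getUncompressedFilename(final String filename) {
        /* GzipUtils maps for example "file.txt.gz" to "file.txt" */
        return GzipUtils.getUncompressedFilename(filename);
    }
}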
@@ -0,0 +1,66 @@
// XZParser.java
// ---------------------------
// Copyright 2018 by luccioman; https://github.com/luccioman
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.document.parser;
import java.io.IOException;
import java.io.InputStream;
import org.apache.commons.compress.compressors.CompressorInputStream;
import org.apache.commons.compress.compressors.xz.XZCompressorInputStream;
import org.apache.commons.compress.compressors.xz.XZUtils;
import net.yacy.kelondro.util.MemoryControl;
/**
* Parser for xz archives. Uncompresses and parses the content and adds it to
* the created main parsed document.
*
* @see <a href="https://tukaani.org/xz/format.html">xz file format website</a>
*/
public class XZParser extends AbstractCompressorParser {
public XZParser() {
super("XZ Compressed Archive Parser");
this.SUPPORTED_EXTENSIONS.add("xz");
this.SUPPORTED_EXTENSIONS.add("txz");
this.SUPPORTED_MIME_TYPES.add("application/x-xz");
}
@Override
protected CompressorInputStream createDecompressStream(final InputStream source) throws IOException {
/*
* Limit the size dedicated to reading compressed blocks to at most 25% of the
* available memory. Any stricter limit should be handled by the caller
* (see for example the crawler.[protocol].maxFileSize configuration setting).
*/
final long availableMemory = MemoryControl.available();
final long maxKBytes = (long) (availableMemory * 0.25 / 1024.0);
return new XZCompressorInputStream(source, false, (int) Math.min(Integer.MAX_VALUE, maxKBytes));
}
@Override
protected String getUncompressedFilename(final String filename) {
return XZUtils.getUncompressedFilename(filename);
}
}

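As a worked example of the memory cap in createDecompressStream, assuming MemoryControl reports 1 GiB of available memory (the version note below comes from the commons-compress documentation, not from this commit):

final long availableMemory = 1024L * 1024L * 1024L; /* 1 GiB, assumed */
final long maxKBytes = (long) (availableMemory * 0.25 / 1024.0); /* 262144 KiB = 256 MiB */
/*
 * The third XZCompressorInputStream constructor argument is a memory limit in
 * KiB (supported since commons-compress 1.14) : when decoding the stream would
 * need more than this limit, an
 * org.apache.commons.compress.MemoryLimitException is thrown.
 */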
@@ -0,0 +1,246 @@
// XZParserTest.java
// ---------------------------
// Copyright 2018 by luccioman; https://github.com/luccioman
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.document.parser;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertEquals;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Collection;
import org.junit.Test;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.document.Document;
import net.yacy.document.Parser.Failure;
import net.yacy.document.VocabularyScraper;
/**
* Unit tests for the {@link XZParser} class
*
* @author luccioman
*
*/
public class XZParserTest {
/** Folder containing test files */
private static final File TEST_FOLDER = new File("test", "parsertest");
/**
* Unit test for the XZParser.parse() function with some small xz test files.
*
* @throws Failure when a file could not be parsed
* @throws InterruptedException when the test was interrupted before its
* termination
* @throws IOException when a read/write error occurred
*/
@Test
public void testParse() throws Failure, InterruptedException, IOException {
final String[] fileNames = { "umlaute_html_utf8.html.xz", "umlaute_linux.txt.xz" };
final XZParser parser = new XZParser();
for (final String fileName : fileNames) {
final DigestURL location = new DigestURL("http://localhost/" + fileName);
try (final FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, fileName))) {
final Document[] documents = parser.parse(location, "application/x-xz", StandardCharsets.UTF_8.name(),
new VocabularyScraper(), 0, inStream);
assertNotNull("Parser result must not be null for file " + fileName, documents);
assertNotNull("Parsed text must not be empty for file " + fileName, documents[0].getTextString());
assertTrue("Parsed text must contain test word with umlaut char" + fileName,
documents[0].getTextString().contains("Maßkrügen"));
final Collection<AnchorURL> anchors = documents[0].getAnchors();
assertNotNull("Detected URLS must not be null for file " + fileName, anchors);
assertEquals("One URL must have been detected for file " + fileName, 1, anchors.size());
assertTrue(anchors.iterator().next().toString().startsWith("http://localhost/umlaute_"));
}
}
}
/**
* Testing parse integration with the tar parser on a test txz archive.
*
* @throws Failure when a file could not be parsed
* @throws InterruptedException when the test was interrupted before its
* termination
* @throws IOException when a read/write error occurred
*/
@Test
public void testParseTxz() throws Failure, InterruptedException, IOException {
final String fileName = "umlaute_html_xml_txt_gnu.txz";
final XZParser parser = new XZParser();
final DigestURL location = new DigestURL("http://localhost/" + fileName);
try (final FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, fileName))) {
final Document[] documents = parser.parse(location, "application/x-xz", StandardCharsets.UTF_8.name(),
new VocabularyScraper(), 0, inStream);
assertNotNull("Parser result must not be null for file " + fileName, documents);
final String parsedText = documents[0].getTextString();
assertNotNull("Parsed text must not be empty for file " + fileName, parsedText);
assertTrue("Parsed text must contain test word with umlaut char in file " + fileName,
parsedText.contains("Maßkrügen"));
assertTrue(parsedText.contains("Example link in ISO-8859-1 encoded HTML"));
assertTrue(parsedText.contains("Example link in UTF-8 encoded HTML"));
assertTrue(parsedText.contains("URL reference in raw text file"));
assertTrue(parsedText.contains("UTF-8 encoded XML test file"));
final Collection<AnchorURL> detectedAnchors = documents[0].getAnchors();
assertNotNull(detectedAnchors);
assertEquals("Parsed URLs must contains all URLs from each test file included in the archive", 5,
detectedAnchors.size());
assertTrue(detectedAnchors.contains(new AnchorURL("http://www.w3.org/1999/02/22-rdf-syntax-ns#")));
assertTrue(detectedAnchors.contains(new AnchorURL("http://purl.org/dc/elements/1.1/")));
assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_html_iso.html")));
assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_html_utf8.html")));
assertTrue(detectedAnchors.contains(new AnchorURL("http://localhost/umlaute_linux.txt")));
}
}
/**
* Unit test for the XZParser.parseWithLimits() function with some small xz test
* files whose content is within limits.
*
* @throws Failure when a file could not be parsed
* @throws InterruptedException when the test was interrupted before its
* termination
* @throws IOException when a read/write error occurred
*/
@Test
public void testParseWithLimits() throws Failure, InterruptedException, IOException {
final String[] fileNames = { "umlaute_html_utf8.html.xz", "umlaute_linux.txt.xz" };
final XZParser parser = new XZParser();
for (final String fileName : fileNames) {
final DigestURL location = new DigestURL("http://localhost/" + fileName);
try (final FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, fileName))) {
final Document[] documents = parser.parseWithLimits(location, "application/x-xz",
StandardCharsets.UTF_8.name(), new VocabularyScraper(), 0, inStream, 10000, 10000);
assertNotNull("Parser result must not be null for file " + fileName, documents);
assertNotNull("Parsed text must not be empty for file " + fileName, documents[0].getTextString());
assertTrue("Parsed text must contain test word with umlaut char" + fileName,
documents[0].getTextString().contains("Maßkrügen"));
final Collection<AnchorURL> anchors = documents[0].getAnchors();
assertNotNull("Detected URLs must not be null for file " + fileName, anchors);
assertEquals("One URL must have been detected for file " + fileName, 1, anchors.size());
assertTrue(anchors.iterator().next().toString().startsWith("http://localhost/umlaute_"));
assertFalse("Parse document must not be marked as partially parsed for file " + fileName,
documents[0].isPartiallyParsed());
}
}
}
/**
* Unit test for the XZParser.parseWithLimits() when maxLinks limit is exceeded
*
* @throws Failure when a file could not be parsed
* @throws InterruptedException when the test was interrupted before its
* termination
* @throws IOException when a read/write error occurred
*/
@Test
public void testParseWithLimitsLinksExceeded() throws Failure, InterruptedException, IOException {
final String[] fileNames = { "umlaute_html_utf8.html.xz", "umlaute_linux.txt.xz" };
final XZParser parser = new XZParser();
/* maxLinks limit exceeded */
for (final String fileName : fileNames) {
final DigestURL location = new DigestURL("http://localhost/" + fileName);
try (final FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, fileName))) {
final Document[] documents = parser.parseWithLimits(location, "application/x-xz",
StandardCharsets.UTF_8.name(), new VocabularyScraper(), 0, inStream, 0, Long.MAX_VALUE);
assertNotNull("Parser result must not be null for file " + fileName, documents);
assertNotNull("Parsed text must not be empty for file " + fileName, documents[0].getTextString());
assertTrue("Parsed text must contain test word with umlaut char" + fileName,
documents[0].getTextString().contains("Maßkrügen"));
final Collection<AnchorURL> anchors = documents[0].getAnchors();
assertTrue("Detected URLs must be empty for file " + fileName, anchors == null || anchors.isEmpty());
assertTrue("Parsed document must be marked as partially parsed for file " + fileName,
documents[0].isPartiallyParsed());
}
}
}
/**
* Unit test for the XZParser.parseWithLimits() when maxBytes limit is exceeded
*
* @throws Failure when a file could not be parsed
* @throws InterruptedException when the test was interrupted before its
* termination
* @throws IOException when a read/write error occurred
*/
@Test
public void testParseWithLimitsBytesExceeded() throws Failure, InterruptedException, IOException {
final String[] fileNames = { "umlaute_html_utf8.html.xz", "umlaute_linux.txt.xz" };
final XZParser parser = new XZParser();
String fileName = fileNames[0];
DigestURL location = new DigestURL("http://localhost/" + fileName);
try (final FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, fileName))) {
/*
* The bytes limit is set so that the beginning of the text is parsed, but
* parsing stops before reaching the <a> tag
*/
final long maxBytes = 258;
final Document[] documents = parser.parseWithLimits(location, "application/x-xz",
StandardCharsets.UTF_8.name(), new VocabularyScraper(), 0, inStream, Integer.MAX_VALUE, maxBytes);
assertNotNull("Parser result must not be null for file " + fileName, documents);
assertNotNull("Parsed text must not be empty for file " + fileName, documents[0].getTextString());
assertTrue("Parsed text must contain test word with umlaut char" + fileName,
documents[0].getTextString().contains("Maßkrügen"));
final Collection<AnchorURL> anchors = documents[0].getAnchors();
assertTrue("Detected URLs must be empty for file " + fileName, anchors == null || anchors.isEmpty());
assertTrue("Parsed document must be marked as partially parsed for file " + fileName,
documents[0].isPartiallyParsed());
}
fileName = fileNames[1];
location = new DigestURL("http://localhost/" + fileName);
try (final FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, fileName))) {
/*
* The bytes limit is set so that the beginning of the text is parsed, but
* parsing stops before reaching the URL
*/
final long maxBytes = 65;
final Document[] documents = parser.parseWithLimits(location, "application/x-xz",
StandardCharsets.UTF_8.name(), new VocabularyScraper(), 0, inStream, Integer.MAX_VALUE, maxBytes);
assertNotNull("Parser result must not be null for file " + fileName, documents);
assertNotNull("Parsed text must not be empty for file " + fileName, documents[0].getTextString());
assertTrue("Parsed text must contain test word with umlaut char" + fileName,
documents[0].getTextString().contains("Maßkrügen"));
final Collection<AnchorURL> anchors = documents[0].getAnchors();
assertTrue("Detected URLs must be empty for file " + fileName, anchors == null || anchors.isEmpty());
assertTrue("Parsed document must be marked as partially parsed for file " + fileName,
documents[0].isPartiallyParsed());
}
}
}