From e76a90837be0eacef61730886a7dd5d92e1d8d71 Mon Sep 17 00:00:00 2001 From: reger Date: Fri, 6 Nov 2015 23:58:55 +0100 Subject: [PATCH 1/2] update zip and tar parser process, to return one document for the file with combined parser results of the contained files. --- source/net/yacy/document/Document.java | 13 ++++++ .../net/yacy/document/parser/tarParser.java | 41 ++++++++++++++----- .../net/yacy/document/parser/zipParser.java | 40 +++++++++++++----- 3 files changed, 73 insertions(+), 21 deletions(-) diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java index a634fb598..cca71e75c 100644 --- a/source/net/yacy/document/Document.java +++ b/source/net/yacy/document/Document.java @@ -671,6 +671,19 @@ dc_rights return v; } + /** + * Adds the main content of subdocuments to this document. + * This is useful if the document is a container for other documents (like zip or other archives) + * to make the content of the subdocuments searchable, + * but has only one url (unlike container-urls such as rss). 
+ * + * This is similar to mergeDocuments but directly joins internal content variables, + * uses fewer parsed details and keeps this document's crawl data (like crawldepth, lastmodified) + * + * @see mergeDocuments() + * @param docs to be included + * @throws IOException + */ public void addSubDocuments(final Document[] docs) throws IOException { for (final Document doc: docs) { this.sections.addAll(doc.sections); diff --git a/source/net/yacy/document/parser/tarParser.java b/source/net/yacy/document/parser/tarParser.java index be4b515fd..c5a5fbd03 100644 --- a/source/net/yacy/document/parser/tarParser.java +++ b/source/net/yacy/document/parser/tarParser.java @@ -29,8 +29,7 @@ import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.io.RandomAccessFile; -import java.util.ArrayList; -import java.util.List; +import java.util.Date; import java.util.zip.GZIPInputStream; import net.yacy.cora.document.encoding.UTF8; @@ -47,7 +46,10 @@ import org.apache.commons.compress.archivers.tar.TarArchiveEntry; import org.apache.commons.compress.archivers.tar.TarArchiveInputStream; // this is a new implementation of this parser idiom using multiple documents as result set - +/** + * Parses the tar file and each contained file, + * returns one document with combined content. 
+ */ public class tarParser extends AbstractParser implements Parser { private final static String MAGIC = "ustar"; // A magic for a tar archive, may appear at #101h-#105 @@ -70,8 +72,6 @@ public class tarParser extends AbstractParser implements Parser { final int timezoneOffset, InputStream source) throws Parser.Failure, InterruptedException { - final List docacc = new ArrayList(); - Document[] subDocs = null; final String ext = MultiProtocolURL.getFileExtension(location.getFileName()); if (ext.equals("gz") || ext.equals("tgz")) { try { @@ -82,11 +82,31 @@ public class tarParser extends AbstractParser implements Parser { } TarArchiveEntry entry; final TarArchiveInputStream tis = new TarArchiveInputStream(source); - File tmp = null; - + + // create maindoc for this tar container + Document maindoc = new Document( + location, + mimeType, + charset, + this, + null, + null, + null, + null, + null, + null, + null, + 0.0d, 0.0d, + (Object) null, + null, + null, + null, + false, + new Date()); // loop through the elements in the tar file and parse every single file inside while (true) { try { + File tmp = null; entry = tis.getNextTarEntry(); if (entry == null) break; if (entry.isDirectory() || entry.getSize() <= 0) continue; @@ -96,9 +116,9 @@ public class tarParser extends AbstractParser implements Parser { try { tmp = FileUtils.createTempFile(this.getClass(), name); FileUtils.copy(tis, tmp, entry.getSize()); - subDocs = TextParser.parseSource(AnchorURL.newAnchor(location, "#" + name), mime, null, scraper, timezoneOffset, 999, tmp); + final Document[] subDocs = TextParser.parseSource(AnchorURL.newAnchor(location, "#" + name), mime, null, scraper, timezoneOffset, 999, tmp); if (subDocs == null) continue; - for (final Document d: subDocs) docacc.add(d); + maindoc.addSubDocuments(subDocs); } catch (final Parser.Failure e) { AbstractParser.log.warn("tar parser entry " + name + ": " + e.getMessage()); } finally { @@ -109,8 +129,7 @@ public class tarParser extends 
AbstractParser implements Parser { break; } } - if (docacc.isEmpty()) return null; - return docacc.toArray(new Document[docacc.size()]); + return new Document[]{maindoc}; } public final static boolean isTar(File f) { diff --git a/source/net/yacy/document/parser/zipParser.java b/source/net/yacy/document/parser/zipParser.java index a924a6e03..155d669ba 100644 --- a/source/net/yacy/document/parser/zipParser.java +++ b/source/net/yacy/document/parser/zipParser.java @@ -27,8 +27,7 @@ package net.yacy.document.parser; import java.io.File; import java.io.IOException; import java.io.InputStream; -import java.util.ArrayList; -import java.util.List; +import java.util.Date; import java.util.zip.ZipEntry; import java.util.zip.ZipInputStream; @@ -43,7 +42,11 @@ import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.MemoryControl; // this is a new implementation of this parser idiom using multiple documents as result set - +/** + * Parses Zip archives. Creates a main document for the zip url/file. + * Each file in the zip is parsed and the result added to the main document. + * parse returns one document with the combined content. 
+ */ public class zipParser extends AbstractParser implements Parser { public zipParser() { @@ -74,15 +77,33 @@ public class zipParser extends AbstractParser implements Parser { if (!MemoryControl.request(200 * 1024 * 1024, false)) throw new Parser.Failure("Not enough Memory available for zip parser: " + MemoryControl.available(), location); - Document[] docs = null; - final List docacc = new ArrayList(); ZipEntry entry; final ZipInputStream zis = new ZipInputStream(source); - File tmp = null; + // create maindoc for this zip container with supplied url and mime + Document maindoc = new Document( + location, + mimeType, + charset, + this, + null, + null, + null, + null, + null, + null, + null, + 0.0d, 0.0d, + (Object)null, + null, + null, + null, + false, + new Date()); // loop through the elements in the zip file and parse every single file inside while (true) { try { + File tmp = null; if (zis.available() <= 0) break; entry = zis.getNextEntry(); if (entry == null) break; @@ -95,9 +116,9 @@ public class zipParser extends AbstractParser implements Parser { FileUtils.copy(zis, tmp, entry.getSize()); final DigestURL virtualURL = DigestURL.newURL(location, "#" + name); //this.log.logInfo("ZIP file parser: " + virtualURL.toNormalform(false, false)); - docs = TextParser.parseSource(new AnchorURL(virtualURL), mime, null, scraper, timezoneOffset, 999, tmp); + final Document[] docs = TextParser.parseSource(new AnchorURL(virtualURL), mime, null, scraper, timezoneOffset, 999, tmp); if (docs == null) continue; - for (final Document d: docs) docacc.add(d); + maindoc.addSubDocuments(docs); } catch (final Parser.Failure e) { AbstractParser.log.warn("ZIP parser entry " + name + ": " + e.getMessage()); } finally { @@ -108,7 +129,6 @@ public class zipParser extends AbstractParser implements Parser { break; } } - if (docacc.isEmpty()) return null; - return docacc.toArray(new Document[docacc.size()]); + return new Document[]{maindoc}; } } From 112ae013f45cac48afab5a54a95d758d7db99c76 
Mon Sep 17 00:00:00 2001 From: reger Date: Sat, 7 Nov 2015 19:13:18 +0100 Subject: [PATCH 2/2] update bzip and gzip parser process, to return one document for the file with combined parser results of the contained file and registers it with supplied url and mime of the archive. --- .../net/yacy/document/parser/bzipParser.java | 36 +++++++++++++++--- .../net/yacy/document/parser/gzipParser.java | 38 ++++++++++++++++--- 2 files changed, 63 insertions(+), 11 deletions(-) diff --git a/source/net/yacy/document/parser/bzipParser.java b/source/net/yacy/document/parser/bzipParser.java index fe95e8ab7..0dc0daad6 100644 --- a/source/net/yacy/document/parser/bzipParser.java +++ b/source/net/yacy/document/parser/bzipParser.java @@ -30,6 +30,7 @@ package net.yacy.document.parser; import java.io.File; import java.io.FileOutputStream; import java.io.InputStream; +import java.util.Date; import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.DigestURL; @@ -43,7 +44,10 @@ import net.yacy.kelondro.util.FileUtils; import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream; import org.apache.commons.compress.compressors.bzip2.BZip2Utils; - +/** + * Parses a bz2 archive. 
+ * Unzips and parses the content and adds it to the created main document + */ public class bzipParser extends AbstractParser implements Parser { public bzipParser() { @@ -69,7 +73,7 @@ public class bzipParser extends AbstractParser implements Parser { throws Parser.Failure, InterruptedException { File tempFile = null; - Document[] docs; + Document maindoc = null; try { int read = 0; final byte[] data = new byte[1024]; @@ -82,18 +86,38 @@ public class bzipParser extends AbstractParser implements Parser { // creating a temp file to store the uncompressed data final FileOutputStream out = new FileOutputStream(tempFile); - // reading gzip file and store it uncompressed + // reading bzip file and store it uncompressed while((read = zippedContent.read(data, 0, 1024)) != -1) { out.write(data, 0, read); } zippedContent.close(); out.close(); + // create maindoc for this bzip container, register with supplied url & mime + maindoc = new Document( + location, + mimeType, + charset, + this, + null, + null, + null, + null, + null, + null, + null, + 0.0d, 0.0d, + (Object) null, + null, + null, + null, + false, + new Date()); // creating a new parser class to parse the unzipped content final String contentfilename = BZip2Utils.getUncompressedFilename(location.getFileName()); final String mime = TextParser.mimeOf(DigestURL.getFileExtension(contentfilename)); - docs = TextParser.parseSource(location, mime, null, scraper, timezoneOffset, 999, tempFile); - // TODO: this could return null from content parsing, even if bz2 successful read (see zipParser for alternative coding) + final Document[] docs = TextParser.parseSource(location, mime, null, scraper, timezoneOffset, 999, tempFile); + if (docs != null) maindoc.addSubDocuments(docs); } catch (final Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof Parser.Failure) throw (Parser.Failure) e; @@ -102,6 +126,6 @@ public class bzipParser extends AbstractParser implements Parser { } 
finally { if (tempFile != null) FileUtils.deletedelete(tempFile); } - return docs; + return maindoc == null ? null : new Document[]{maindoc}; } } diff --git a/source/net/yacy/document/parser/gzipParser.java b/source/net/yacy/document/parser/gzipParser.java index 58f788f37..504dd1116 100644 --- a/source/net/yacy/document/parser/gzipParser.java +++ b/source/net/yacy/document/parser/gzipParser.java @@ -30,17 +30,23 @@ package net.yacy.document.parser; import java.io.File; import java.io.FileOutputStream; import java.io.InputStream; +import java.util.Date; import java.util.zip.GZIPInputStream; import net.yacy.cora.document.id.AnchorURL; +import net.yacy.cora.document.id.DigestURL; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; import net.yacy.document.TextParser; import net.yacy.document.VocabularyScraper; import net.yacy.kelondro.util.FileUtils; +import org.apache.commons.compress.compressors.gzip.GzipUtils; - +/** + * Parses a gz archive. 
+ * Unzips and parses the content and adds it to the created main document + */ public class gzipParser extends AbstractParser implements Parser { public gzipParser() { @@ -65,7 +71,7 @@ public class gzipParser extends AbstractParser implements Parser { final InputStream source) throws Parser.Failure, InterruptedException { File tempFile = null; - Document[] docs = null; + Document maindoc = null; try { int read = 0; final byte[] data = new byte[1024]; @@ -84,9 +90,31 @@ public class gzipParser extends AbstractParser implements Parser { } zippedContent.close(); out.close(); - + // create maindoc for this gzip container, register with supplied url & mime + maindoc = new Document( + location, + mimeType, + charset, + this, + null, + null, + null, + null, + null, + null, + null, + 0.0d, 0.0d, + (Object) null, + null, + null, + null, + false, + new Date()); // creating a new parser class to parse the unzipped content - docs = TextParser.parseSource(location, null, null, scraper, timezoneOffset, 999, tempFile); + final String contentfilename = GzipUtils.getUncompressedFilename(location.getFileName()); + final String mime = TextParser.mimeOf(DigestURL.getFileExtension(contentfilename)); + Document[] docs = TextParser.parseSource(location, mime, null, scraper, timezoneOffset, 999, tempFile); + if (docs != null) maindoc.addSubDocuments(docs); } catch (final Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof Parser.Failure) throw (Parser.Failure) e; @@ -95,7 +123,7 @@ public class gzipParser extends AbstractParser implements Parser { } finally { if (tempFile != null) FileUtils.deletedelete(tempFile); } - return docs; + return maindoc == null ? null : new Document[]{maindoc}; } }