From 112ae013f45cac48afab5a54a95d758d7db99c76 Mon Sep 17 00:00:00 2001 From: reger Date: Sat, 7 Nov 2015 19:13:18 +0100 Subject: [PATCH] update bzip and gzip parser process to return one document for the file, with combined parser results of the contained file, and register it with the supplied url and mime of the archive. --- .../net/yacy/document/parser/bzipParser.java | 36 +++++++++++++++--- .../net/yacy/document/parser/gzipParser.java | 38 ++++++++++++++++--- 2 files changed, 63 insertions(+), 11 deletions(-) diff --git a/source/net/yacy/document/parser/bzipParser.java b/source/net/yacy/document/parser/bzipParser.java index fe95e8ab7..0dc0daad6 100644 --- a/source/net/yacy/document/parser/bzipParser.java +++ b/source/net/yacy/document/parser/bzipParser.java @@ -30,6 +30,7 @@ package net.yacy.document.parser; import java.io.File; import java.io.FileOutputStream; import java.io.InputStream; +import java.util.Date; import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.DigestURL; @@ -43,7 +44,10 @@ import net.yacy.kelondro.util.FileUtils; import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream; import org.apache.commons.compress.compressors.bzip2.BZip2Utils; - +/** + * Parses a bz2 archive. 
+ * Unzips and parses the content and adds it to the created main document + */ public class bzipParser extends AbstractParser implements Parser { public bzipParser() { @@ -69,7 +73,7 @@ public class bzipParser extends AbstractParser implements Parser { throws Parser.Failure, InterruptedException { File tempFile = null; - Document[] docs; + Document maindoc = null; try { int read = 0; final byte[] data = new byte[1024]; @@ -82,18 +86,38 @@ public class bzipParser extends AbstractParser implements Parser { // creating a temp file to store the uncompressed data final FileOutputStream out = new FileOutputStream(tempFile); - // reading gzip file and store it uncompressed + // reading bzip file and store it uncompressed while((read = zippedContent.read(data, 0, 1024)) != -1) { out.write(data, 0, read); } zippedContent.close(); out.close(); + // create maindoc for this bzip container, register with supplied url & mime + maindoc = new Document( + location, + mimeType, + charset, + this, + null, + null, + null, + null, + null, + null, + null, + 0.0d, 0.0d, + (Object) null, + null, + null, + null, + false, + new Date()); // creating a new parser class to parse the unzipped content final String contentfilename = BZip2Utils.getUncompressedFilename(location.getFileName()); final String mime = TextParser.mimeOf(DigestURL.getFileExtension(contentfilename)); - docs = TextParser.parseSource(location, mime, null, scraper, timezoneOffset, 999, tempFile); - // TODO: this could return null from content parsing, even if bz2 successful read (see zipParser for alternative coding) + final Document[] docs = TextParser.parseSource(location, mime, null, scraper, timezoneOffset, 999, tempFile); + if (docs != null) maindoc.addSubDocuments(docs); } catch (final Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof Parser.Failure) throw (Parser.Failure) e; @@ -102,6 +126,6 @@ public class bzipParser extends AbstractParser implements Parser { } 
finally { if (tempFile != null) FileUtils.deletedelete(tempFile); } - return docs; + return maindoc == null ? null : new Document[]{maindoc}; } } diff --git a/source/net/yacy/document/parser/gzipParser.java b/source/net/yacy/document/parser/gzipParser.java index 58f788f37..504dd1116 100644 --- a/source/net/yacy/document/parser/gzipParser.java +++ b/source/net/yacy/document/parser/gzipParser.java @@ -30,17 +30,23 @@ package net.yacy.document.parser; import java.io.File; import java.io.FileOutputStream; import java.io.InputStream; +import java.util.Date; import java.util.zip.GZIPInputStream; import net.yacy.cora.document.id.AnchorURL; +import net.yacy.cora.document.id.DigestURL; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; import net.yacy.document.TextParser; import net.yacy.document.VocabularyScraper; import net.yacy.kelondro.util.FileUtils; +import org.apache.commons.compress.compressors.gzip.GzipUtils; - +/** + * Parses a gz archive. 
+ * Unzips and parses the content and adds it to the created main document + */ public class gzipParser extends AbstractParser implements Parser { public gzipParser() { @@ -65,7 +71,7 @@ public class gzipParser extends AbstractParser implements Parser { final InputStream source) throws Parser.Failure, InterruptedException { File tempFile = null; - Document[] docs = null; + Document maindoc = null; try { int read = 0; final byte[] data = new byte[1024]; @@ -84,9 +90,31 @@ public class gzipParser extends AbstractParser implements Parser { } zippedContent.close(); out.close(); - + // create maindoc for this gzip container, register with supplied url & mime + maindoc = new Document( + location, + mimeType, + charset, + this, + null, + null, + null, + null, + null, + null, + null, + 0.0d, 0.0d, + (Object) null, + null, + null, + null, + false, + new Date()); // creating a new parser class to parse the unzipped content - docs = TextParser.parseSource(location, null, null, scraper, timezoneOffset, 999, tempFile); + final String contentfilename = GzipUtils.getUncompressedFilename(location.getFileName()); + final String mime = TextParser.mimeOf(DigestURL.getFileExtension(contentfilename)); + Document[] docs = TextParser.parseSource(location, mime, null, scraper, timezoneOffset, 999, tempFile); + if (docs != null) maindoc.addSubDocuments(docs); } catch (final Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof Parser.Failure) throw (Parser.Failure) e; @@ -95,7 +123,7 @@ public class gzipParser extends AbstractParser implements Parser { } finally { if (tempFile != null) FileUtils.deletedelete(tempFile); } - return docs; + return maindoc == null ? null : new Document[]{maindoc}; } }