From 112ae013f45cac48afab5a54a95d758d7db99c76 Mon Sep 17 00:00:00 2001
From: reger <reger18@arcor.de>
Date: Sat, 7 Nov 2015 19:13:18 +0100
Subject: [PATCH] update gzip and bzip parser process to return one document
 for the archive, combining the parser results of the contained file, and
 register it with the supplied url and mime of the archive.

---
 .../net/yacy/document/parser/bzipParser.java  | 36 +++++++++++++++---
 .../net/yacy/document/parser/gzipParser.java  | 38 ++++++++++++++++---
 2 files changed, 63 insertions(+), 11 deletions(-)

diff --git a/source/net/yacy/document/parser/bzipParser.java b/source/net/yacy/document/parser/bzipParser.java
index fe95e8ab7..0dc0daad6 100644
--- a/source/net/yacy/document/parser/bzipParser.java
+++ b/source/net/yacy/document/parser/bzipParser.java
@@ -30,6 +30,7 @@ package net.yacy.document.parser;
 import java.io.File;
 import java.io.FileOutputStream;
 import java.io.InputStream;
+import java.util.Date;
 
 import net.yacy.cora.document.id.AnchorURL;
 import net.yacy.cora.document.id.DigestURL;
@@ -43,7 +44,10 @@ import net.yacy.kelondro.util.FileUtils;
 import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
 import org.apache.commons.compress.compressors.bzip2.BZip2Utils;
 
-
+/**
+ * Parses a bzip2 (bz2) compressed file.
+ * Decompresses and parses the content and adds the resulting documents as sub-documents to the created main document.
+ */
 public class bzipParser extends AbstractParser implements Parser {
 
     public bzipParser() {
@@ -69,7 +73,7 @@ public class bzipParser extends AbstractParser implements Parser {
             throws Parser.Failure, InterruptedException {
 
         File tempFile = null;
-        Document[] docs;
+        Document maindoc = null;
         try {
             int read = 0;
             final byte[] data = new byte[1024];
@@ -82,18 +86,38 @@ public class bzipParser extends AbstractParser implements Parser {
             // creating a temp file to store the uncompressed data
             final FileOutputStream out = new FileOutputStream(tempFile);
 
-            // reading gzip file and store it uncompressed
+            // read the bzip file and store it uncompressed
             while((read = zippedContent.read(data, 0, 1024)) != -1) {
                 out.write(data, 0, read);
             }
             zippedContent.close();
             out.close();
 
+            // create maindoc for this bzip container, register with supplied url & mime
+            maindoc = new Document(
+                    location,
+                    mimeType,
+                    charset,
+                    this,
+                    null,
+                    null,
+                    null,
+                    null,
+                    null,
+                    null,
+                    null,
+                    0.0d, 0.0d,
+                    (Object) null,
+                    null,
+                    null,
+                    null,
+                    false,
+                    new Date());
             // creating a new parser class to parse the unzipped content
             final String contentfilename = BZip2Utils.getUncompressedFilename(location.getFileName());
             final String mime = TextParser.mimeOf(DigestURL.getFileExtension(contentfilename));
-            docs = TextParser.parseSource(location, mime, null, scraper, timezoneOffset, 999, tempFile);
-            // TODO: this could return null from content parsing, even if bz2 successful read (see zipParser for alternative coding)
+            final Document[] docs = TextParser.parseSource(location, mime, null, scraper, timezoneOffset, 999, tempFile);
+            if (docs != null) maindoc.addSubDocuments(docs);
         } catch (final Exception e) {
             if (e instanceof InterruptedException) throw (InterruptedException) e;
             if (e instanceof Parser.Failure) throw (Parser.Failure) e;
@@ -102,6 +126,6 @@ public class bzipParser extends AbstractParser implements Parser {
         } finally {
             if (tempFile != null) FileUtils.deletedelete(tempFile);
         }
-        return docs;
+        return maindoc == null ? null : new Document[]{maindoc};
     }
 }
diff --git a/source/net/yacy/document/parser/gzipParser.java b/source/net/yacy/document/parser/gzipParser.java
index 58f788f37..504dd1116 100644
--- a/source/net/yacy/document/parser/gzipParser.java
+++ b/source/net/yacy/document/parser/gzipParser.java
@@ -30,17 +30,23 @@ package net.yacy.document.parser;
 import java.io.File;
 import java.io.FileOutputStream;
 import java.io.InputStream;
+import java.util.Date;
 import java.util.zip.GZIPInputStream;
 
 import net.yacy.cora.document.id.AnchorURL;
+import net.yacy.cora.document.id.DigestURL;
 import net.yacy.document.AbstractParser;
 import net.yacy.document.Document;
 import net.yacy.document.Parser;
 import net.yacy.document.TextParser;
 import net.yacy.document.VocabularyScraper;
 import net.yacy.kelondro.util.FileUtils;
+import org.apache.commons.compress.compressors.gzip.GzipUtils;
 
-
+/**
+ * Parses a gzip (gz) compressed file.
+ * Decompresses and parses the content and adds the resulting documents as sub-documents to the created main document.
+ */
 public class gzipParser extends AbstractParser implements Parser {
 
     public gzipParser() {
@@ -65,7 +71,7 @@ public class gzipParser extends AbstractParser implements Parser {
             final InputStream source) throws Parser.Failure, InterruptedException {
 
         File tempFile = null;
-        Document[] docs = null;
+        Document maindoc = null;
         try {
             int read = 0;
             final byte[] data = new byte[1024];
@@ -84,9 +90,31 @@ public class gzipParser extends AbstractParser implements Parser {
             }
             zippedContent.close();
             out.close();
-
+            // create maindoc for this gzip container, register with supplied url & mime
+            maindoc = new Document(
+                    location,
+                    mimeType,
+                    charset,
+                    this,
+                    null,
+                    null,
+                    null,
+                    null,
+                    null,
+                    null,
+                    null,
+                    0.0d, 0.0d,
+                    (Object) null,
+                    null,
+                    null,
+                    null,
+                    false,
+                    new Date());
             // creating a new parser class to parse the unzipped content
-            docs = TextParser.parseSource(location, null, null, scraper, timezoneOffset, 999, tempFile);
+            final String contentfilename = GzipUtils.getUncompressedFilename(location.getFileName());
+            final String mime = TextParser.mimeOf(DigestURL.getFileExtension(contentfilename));
+            Document[] docs = TextParser.parseSource(location, mime, null, scraper, timezoneOffset, 999, tempFile);
+            if (docs != null) maindoc.addSubDocuments(docs);
         } catch (final Exception e) {
             if (e instanceof InterruptedException) throw (InterruptedException) e;
             if (e instanceof Parser.Failure) throw (Parser.Failure) e;
@@ -95,7 +123,7 @@ public class gzipParser extends AbstractParser implements Parser {
         } finally {
             if (tempFile != null) FileUtils.deletedelete(tempFile);
         }
-        return docs;
+        return maindoc == null ? null : new Document[]{maindoc};
     }
 
 }