update gzip and bzip parser process,

to return one document for the file with the combined parser results of the contained file, and register it with the supplied url and mime of the archive.
9 years ago · 112ae013f4
parent e76a90837b
commit 112ae013f4
2 changed files with 63 additions and 11 deletions
--- a/source/net/yacy/document/parser/bzipParser.java
+++ b/source/net/yacy/document/parser/bzipParser.java
@ -30,6 +30,7 @@ package net.yacy.document.parser;
 import java.io.File;
 import java.io.FileOutputStream;
 import java.io.InputStream;
+import java.util.Date;

 import net.yacy.cora.document.id.AnchorURL;
 import net.yacy.cora.document.id.DigestURL;
@ -43,7 +44,10 @@ import net.yacy.kelondro.util.FileUtils;
 import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
 import org.apache.commons.compress.compressors.bzip2.BZip2Utils;

-
+/**
+ * Parses a bz2 archive.
+ * Unzips and parses the content and adds it to the created main document
+ */
 public class bzipParser extends AbstractParser implements Parser {

    public bzipParser() {
@ -69,7 +73,7 @@ public class bzipParser extends AbstractParser implements Parser {
            throws Parser.Failure, InterruptedException {

        File tempFile = null;
-        Document[] docs;
+        Document maindoc = null;
        try {
            int read = 0;
            final byte[] data = new byte[1024];
@ -82,18 +86,38 @@ public class bzipParser extends AbstractParser implements Parser {
            // creating a temp file to store the uncompressed data
            final FileOutputStream out = new FileOutputStream(tempFile);

-            // reading gzip file and store it uncompressed
+            // reading bzip file and store it uncompressed
            while((read = zippedContent.read(data, 0, 1024)) != -1) {
                out.write(data, 0, read);
            }
            zippedContent.close();
            out.close();

+             // create maindoc for this bzip container, register with supplied url & mime
+            maindoc = new Document(
+                    location,
+                    mimeType,
+                    charset,
+                    this,
+                    null,
+                    null,
+                    null,
+                    null,
+                    null,
+                    null,
+                    null,
+                    0.0d, 0.0d,
+                    (Object) null,
+                    null,
+                    null,
+                    null,
+                    false,
+                    new Date());
            // creating a new parser class to parse the unzipped content
            final String contentfilename = BZip2Utils.getUncompressedFilename(location.getFileName());
            final String mime = TextParser.mimeOf(DigestURL.getFileExtension(contentfilename));
-            docs = TextParser.parseSource(location, mime, null, scraper, timezoneOffset, 999, tempFile);
-            // TODO: this could return null from content parsing, even if bz2 successful read (see zipParser for alternative coding)
+            final Document[] docs = TextParser.parseSource(location, mime, null, scraper, timezoneOffset, 999, tempFile);
+            if (docs != null) maindoc.addSubDocuments(docs);
        } catch (final Exception e) {
            if (e instanceof InterruptedException) throw (InterruptedException) e;
            if (e instanceof Parser.Failure) throw (Parser.Failure) e;
@ -102,6 +126,6 @@ public class bzipParser extends AbstractParser implements Parser {
        } finally {
            if (tempFile != null) FileUtils.deletedelete(tempFile);
        }
-        return docs;
+        return maindoc == null ? null : new Document[]{maindoc};
    }
 }
--- a/source/net/yacy/document/parser/gzipParser.java
+++ b/source/net/yacy/document/parser/gzipParser.java
@ -30,17 +30,23 @@ package net.yacy.document.parser;
 import java.io.File;
 import java.io.FileOutputStream;
 import java.io.InputStream;
+import java.util.Date;
 import java.util.zip.GZIPInputStream;

 import net.yacy.cora.document.id.AnchorURL;
+import net.yacy.cora.document.id.DigestURL;
 import net.yacy.document.AbstractParser;
 import net.yacy.document.Document;
 import net.yacy.document.Parser;
 import net.yacy.document.TextParser;
 import net.yacy.document.VocabularyScraper;
 import net.yacy.kelondro.util.FileUtils;
+import org.apache.commons.compress.compressors.gzip.GzipUtils;

-
+/**
+ * Parses a gz archive.
+ * Unzips and parses the content and adds it to the created main document
+ */
 public class gzipParser extends AbstractParser implements Parser {

    public gzipParser() {
@ -65,7 +71,7 @@ public class gzipParser extends AbstractParser implements Parser {
            final InputStream source) throws Parser.Failure, InterruptedException {

        File tempFile = null;
-        Document[] docs = null;
+        Document maindoc = null;
        try {
            int read = 0;
            final byte[] data = new byte[1024];
@ -84,9 +90,31 @@ public class gzipParser extends AbstractParser implements Parser {
            }
            zippedContent.close();
            out.close();
-
+            // create maindoc for this gzip container, register with supplied url & mime
+            maindoc = new Document(
+                    location,
+                    mimeType,
+                    charset,
+                    this,
+                    null,
+                    null,
+                    null,
+                    null,
+                    null,
+                    null,
+                    null,
+                    0.0d, 0.0d,
+                    (Object) null,
+                    null,
+                    null,
+                    null,
+                    false,
+                    new Date());
            // creating a new parser class to parse the unzipped content
-            docs = TextParser.parseSource(location, null, null, scraper, timezoneOffset, 999, tempFile);
+            final String contentfilename = GzipUtils.getUncompressedFilename(location.getFileName());
+            final String mime = TextParser.mimeOf(DigestURL.getFileExtension(contentfilename));
+            Document[] docs = TextParser.parseSource(location, mime, null, scraper, timezoneOffset, 999, tempFile);
+            if (docs != null) maindoc.addSubDocuments(docs);
        } catch (final Exception e) {
            if (e instanceof InterruptedException) throw (InterruptedException) e;
            if (e instanceof Parser.Failure) throw (Parser.Failure) e;
@ -95,7 +123,7 @@ public class gzipParser extends AbstractParser implements Parser {
        } finally {
            if (tempFile != null) FileUtils.deletedelete(tempFile);
        }
-        return docs;
+        return maindoc == null ? null : new Document[]{maindoc};
    }

 }