update bzip and gzip parser process,

to return one document for the file with the combined parser results of the
contained file, and register it with the supplied url and mime of the archive.
pull/23/head
reger 9 years ago
parent e76a90837b
commit 112ae013f4

@ -30,6 +30,7 @@ package net.yacy.document.parser;
import java.io.File; import java.io.File;
import java.io.FileOutputStream; import java.io.FileOutputStream;
import java.io.InputStream; import java.io.InputStream;
import java.util.Date;
import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.DigestURL;
@ -43,7 +44,10 @@ import net.yacy.kelondro.util.FileUtils;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream; import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
import org.apache.commons.compress.compressors.bzip2.BZip2Utils; import org.apache.commons.compress.compressors.bzip2.BZip2Utils;
/**
* Parses a bz2 archive.
* Unzips and parses the content and adds it to the created main document
*/
public class bzipParser extends AbstractParser implements Parser { public class bzipParser extends AbstractParser implements Parser {
public bzipParser() { public bzipParser() {
@ -69,7 +73,7 @@ public class bzipParser extends AbstractParser implements Parser {
throws Parser.Failure, InterruptedException { throws Parser.Failure, InterruptedException {
File tempFile = null; File tempFile = null;
Document[] docs; Document maindoc = null;
try { try {
int read = 0; int read = 0;
final byte[] data = new byte[1024]; final byte[] data = new byte[1024];
@ -82,18 +86,38 @@ public class bzipParser extends AbstractParser implements Parser {
// creating a temp file to store the uncompressed data // creating a temp file to store the uncompressed data
final FileOutputStream out = new FileOutputStream(tempFile); final FileOutputStream out = new FileOutputStream(tempFile);
// reading gzip file and store it uncompressed // reading bzip file and store it uncompressed
while((read = zippedContent.read(data, 0, 1024)) != -1) { while((read = zippedContent.read(data, 0, 1024)) != -1) {
out.write(data, 0, read); out.write(data, 0, read);
} }
zippedContent.close(); zippedContent.close();
out.close(); out.close();
// create maindoc for this bzip container, register with supplied url & mime
maindoc = new Document(
location,
mimeType,
charset,
this,
null,
null,
null,
null,
null,
null,
null,
0.0d, 0.0d,
(Object) null,
null,
null,
null,
false,
new Date());
// creating a new parser class to parse the unzipped content // creating a new parser class to parse the unzipped content
final String contentfilename = BZip2Utils.getUncompressedFilename(location.getFileName()); final String contentfilename = BZip2Utils.getUncompressedFilename(location.getFileName());
final String mime = TextParser.mimeOf(DigestURL.getFileExtension(contentfilename)); final String mime = TextParser.mimeOf(DigestURL.getFileExtension(contentfilename));
docs = TextParser.parseSource(location, mime, null, scraper, timezoneOffset, 999, tempFile); final Document[] docs = TextParser.parseSource(location, mime, null, scraper, timezoneOffset, 999, tempFile);
// TODO: this could return null from content parsing, even if bz2 successful read (see zipParser for alternative coding) if (docs != null) maindoc.addSubDocuments(docs);
} catch (final Exception e) { } catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof Parser.Failure) throw (Parser.Failure) e; if (e instanceof Parser.Failure) throw (Parser.Failure) e;
@ -102,6 +126,6 @@ public class bzipParser extends AbstractParser implements Parser {
} finally { } finally {
if (tempFile != null) FileUtils.deletedelete(tempFile); if (tempFile != null) FileUtils.deletedelete(tempFile);
} }
return docs; return maindoc == null ? null : new Document[]{maindoc};
} }
} }

@ -30,17 +30,23 @@ package net.yacy.document.parser;
import java.io.File; import java.io.File;
import java.io.FileOutputStream; import java.io.FileOutputStream;
import java.io.InputStream; import java.io.InputStream;
import java.util.Date;
import java.util.zip.GZIPInputStream; import java.util.zip.GZIPInputStream;
import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.document.AbstractParser; import net.yacy.document.AbstractParser;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
import net.yacy.document.TextParser; import net.yacy.document.TextParser;
import net.yacy.document.VocabularyScraper; import net.yacy.document.VocabularyScraper;
import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.FileUtils;
import org.apache.commons.compress.compressors.gzip.GzipUtils;
/**
* Parses a gz archive.
* Unzips and parses the content and adds it to the created main document
*/
public class gzipParser extends AbstractParser implements Parser { public class gzipParser extends AbstractParser implements Parser {
public gzipParser() { public gzipParser() {
@ -65,7 +71,7 @@ public class gzipParser extends AbstractParser implements Parser {
final InputStream source) throws Parser.Failure, InterruptedException { final InputStream source) throws Parser.Failure, InterruptedException {
File tempFile = null; File tempFile = null;
Document[] docs = null; Document maindoc = null;
try { try {
int read = 0; int read = 0;
final byte[] data = new byte[1024]; final byte[] data = new byte[1024];
@ -84,9 +90,31 @@ public class gzipParser extends AbstractParser implements Parser {
} }
zippedContent.close(); zippedContent.close();
out.close(); out.close();
// create maindoc for this gzip container, register with supplied url & mime
maindoc = new Document(
location,
mimeType,
charset,
this,
null,
null,
null,
null,
null,
null,
null,
0.0d, 0.0d,
(Object) null,
null,
null,
null,
false,
new Date());
// creating a new parser class to parse the unzipped content // creating a new parser class to parse the unzipped content
docs = TextParser.parseSource(location, null, null, scraper, timezoneOffset, 999, tempFile); final String contentfilename = GzipUtils.getUncompressedFilename(location.getFileName());
final String mime = TextParser.mimeOf(DigestURL.getFileExtension(contentfilename));
Document[] docs = TextParser.parseSource(location, mime, null, scraper, timezoneOffset, 999, tempFile);
if (docs != null) maindoc.addSubDocuments(docs);
} catch (final Exception e) { } catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof Parser.Failure) throw (Parser.Failure) e; if (e instanceof Parser.Failure) throw (Parser.Failure) e;
@ -95,7 +123,7 @@ public class gzipParser extends AbstractParser implements Parser {
} finally { } finally {
if (tempFile != null) FileUtils.deletedelete(tempFile); if (tempFile != null) FileUtils.deletedelete(tempFile);
} }
return docs; return maindoc == null ? null : new Document[]{maindoc};
} }
} }

Loading…
Cancel
Save