update zip and tar parser process,

to return one document for the file, with the combined parser results of the
contained files.
pull/23/head
reger 9 years ago
parent 8532565c7d
commit e76a90837b

@ -671,6 +671,19 @@ dc_rights
return v; return v;
} }
/**
* Adds the main content of subdocuments to this document.
* This is useful if the document is a container for other documents (like zip or other archives)
* to make the content of the subdocuments searchable,
* but has only one url (unlike container-urls such as rss).
*
* This is similar to mergeDocuments but directly joins internal content variables,
* uses less parsed details and keeps this document's crawl data (like crawldepth, lastmodified)
*
* @see mergeDocuments()
* @param docs to be included
* @throws IOException
*/
public void addSubDocuments(final Document[] docs) throws IOException { public void addSubDocuments(final Document[] docs) throws IOException {
for (final Document doc: docs) { for (final Document doc: docs) {
this.sections.addAll(doc.sections); this.sections.addAll(doc.sections);

@ -29,8 +29,7 @@ import java.io.FileNotFoundException;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.io.RandomAccessFile; import java.io.RandomAccessFile;
import java.util.ArrayList; import java.util.Date;
import java.util.List;
import java.util.zip.GZIPInputStream; import java.util.zip.GZIPInputStream;
import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.document.encoding.UTF8;
@ -47,7 +46,10 @@ import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream; import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
// this is a new implementation of this parser idiom using multiple documents as result set // this is a new implementation of this parser idiom using multiple documents as result set
/**
* Parses the tar file and each contained file,
* returns one document with combined content.
*/
public class tarParser extends AbstractParser implements Parser { public class tarParser extends AbstractParser implements Parser {
private final static String MAGIC = "ustar"; // A magic for a tar archive, may appear at #101h-#105 private final static String MAGIC = "ustar"; // A magic for a tar archive, may appear at #101h-#105
@ -70,8 +72,6 @@ public class tarParser extends AbstractParser implements Parser {
final int timezoneOffset, final int timezoneOffset,
InputStream source) throws Parser.Failure, InterruptedException { InputStream source) throws Parser.Failure, InterruptedException {
final List<Document> docacc = new ArrayList<Document>();
Document[] subDocs = null;
final String ext = MultiProtocolURL.getFileExtension(location.getFileName()); final String ext = MultiProtocolURL.getFileExtension(location.getFileName());
if (ext.equals("gz") || ext.equals("tgz")) { if (ext.equals("gz") || ext.equals("tgz")) {
try { try {
@ -82,11 +82,31 @@ public class tarParser extends AbstractParser implements Parser {
} }
TarArchiveEntry entry; TarArchiveEntry entry;
final TarArchiveInputStream tis = new TarArchiveInputStream(source); final TarArchiveInputStream tis = new TarArchiveInputStream(source);
File tmp = null;
// create maindoc for this tar container
Document maindoc = new Document(
location,
mimeType,
charset,
this,
null,
null,
null,
null,
null,
null,
null,
0.0d, 0.0d,
(Object) null,
null,
null,
null,
false,
new Date());
// loop through the elements in the tar file and parse every single file inside // loop through the elements in the tar file and parse every single file inside
while (true) { while (true) {
try { try {
File tmp = null;
entry = tis.getNextTarEntry(); entry = tis.getNextTarEntry();
if (entry == null) break; if (entry == null) break;
if (entry.isDirectory() || entry.getSize() <= 0) continue; if (entry.isDirectory() || entry.getSize() <= 0) continue;
@ -96,9 +116,9 @@ public class tarParser extends AbstractParser implements Parser {
try { try {
tmp = FileUtils.createTempFile(this.getClass(), name); tmp = FileUtils.createTempFile(this.getClass(), name);
FileUtils.copy(tis, tmp, entry.getSize()); FileUtils.copy(tis, tmp, entry.getSize());
subDocs = TextParser.parseSource(AnchorURL.newAnchor(location, "#" + name), mime, null, scraper, timezoneOffset, 999, tmp); final Document[] subDocs = TextParser.parseSource(AnchorURL.newAnchor(location, "#" + name), mime, null, scraper, timezoneOffset, 999, tmp);
if (subDocs == null) continue; if (subDocs == null) continue;
for (final Document d: subDocs) docacc.add(d); maindoc.addSubDocuments(subDocs);
} catch (final Parser.Failure e) { } catch (final Parser.Failure e) {
AbstractParser.log.warn("tar parser entry " + name + ": " + e.getMessage()); AbstractParser.log.warn("tar parser entry " + name + ": " + e.getMessage());
} finally { } finally {
@ -109,8 +129,7 @@ public class tarParser extends AbstractParser implements Parser {
break; break;
} }
} }
if (docacc.isEmpty()) return null; return new Document[]{maindoc};
return docacc.toArray(new Document[docacc.size()]);
} }
public final static boolean isTar(File f) { public final static boolean isTar(File f) {

@ -27,8 +27,7 @@ package net.yacy.document.parser;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.util.ArrayList; import java.util.Date;
import java.util.List;
import java.util.zip.ZipEntry; import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream; import java.util.zip.ZipInputStream;
@ -43,7 +42,11 @@ import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.MemoryControl; import net.yacy.kelondro.util.MemoryControl;
// this is a new implementation of this parser idiom using multiple documents as result set // this is a new implementation of this parser idiom using multiple documents as result set
/**
* Parses Zip archives. Creates a main document for the zip url/file.
* Each file in the zip is parsed and the result added to the main document.
* parse returns one document with the combined content.
*/
public class zipParser extends AbstractParser implements Parser { public class zipParser extends AbstractParser implements Parser {
public zipParser() { public zipParser() {
@ -74,15 +77,33 @@ public class zipParser extends AbstractParser implements Parser {
if (!MemoryControl.request(200 * 1024 * 1024, false)) if (!MemoryControl.request(200 * 1024 * 1024, false))
throw new Parser.Failure("Not enough Memory available for zip parser: " + MemoryControl.available(), location); throw new Parser.Failure("Not enough Memory available for zip parser: " + MemoryControl.available(), location);
Document[] docs = null;
final List<Document> docacc = new ArrayList<Document>();
ZipEntry entry; ZipEntry entry;
final ZipInputStream zis = new ZipInputStream(source); final ZipInputStream zis = new ZipInputStream(source);
File tmp = null; // create maindoc for this zip container with supplied url and mime
Document maindoc = new Document(
location,
mimeType,
charset,
this,
null,
null,
null,
null,
null,
null,
null,
0.0d, 0.0d,
(Object)null,
null,
null,
null,
false,
new Date());
// loop through the elements in the zip file and parse every single file inside // loop through the elements in the zip file and parse every single file inside
while (true) { while (true) {
try { try {
File tmp = null;
if (zis.available() <= 0) break; if (zis.available() <= 0) break;
entry = zis.getNextEntry(); entry = zis.getNextEntry();
if (entry == null) break; if (entry == null) break;
@ -95,9 +116,9 @@ public class zipParser extends AbstractParser implements Parser {
FileUtils.copy(zis, tmp, entry.getSize()); FileUtils.copy(zis, tmp, entry.getSize());
final DigestURL virtualURL = DigestURL.newURL(location, "#" + name); final DigestURL virtualURL = DigestURL.newURL(location, "#" + name);
//this.log.logInfo("ZIP file parser: " + virtualURL.toNormalform(false, false)); //this.log.logInfo("ZIP file parser: " + virtualURL.toNormalform(false, false));
docs = TextParser.parseSource(new AnchorURL(virtualURL), mime, null, scraper, timezoneOffset, 999, tmp); final Document[] docs = TextParser.parseSource(new AnchorURL(virtualURL), mime, null, scraper, timezoneOffset, 999, tmp);
if (docs == null) continue; if (docs == null) continue;
for (final Document d: docs) docacc.add(d); maindoc.addSubDocuments(docs);
} catch (final Parser.Failure e) { } catch (final Parser.Failure e) {
AbstractParser.log.warn("ZIP parser entry " + name + ": " + e.getMessage()); AbstractParser.log.warn("ZIP parser entry " + name + ": " + e.getMessage());
} finally { } finally {
@ -108,7 +129,6 @@ public class zipParser extends AbstractParser implements Parser {
break; break;
} }
} }
if (docacc.isEmpty()) return null; return new Document[]{maindoc};
return docacc.toArray(new Document[docacc.size()]);
} }
} }

Loading…
Cancel
Save