From 97ab5b90e8e62a4a99e8a1849e13ef76ef3aa328 Mon Sep 17 00:00:00 2001 From: reger Date: Mon, 20 May 2013 01:50:09 +0200 Subject: [PATCH] - odt & ooxml (office document) parser correction to add content to fulltext index - adjust Junit yacyVersionTest & ParserTest - update yacyVersion.combined2prettyVersion to the default 4-digit minor ver. --- .../net/yacy/document/parser/odtParser.java | 22 ++- .../net/yacy/document/parser/ooxmlParser.java | 24 ++- .../net/yacy/peers/operation/yacyVersion.java | 2 +- test/de/anomic/document/ParserTest.java | 148 +++++++++++++++--- test/de/anomic/yacy/yacyVersionTest.java | 41 ++--- 5 files changed, 166 insertions(+), 71 deletions(-) diff --git a/source/net/yacy/document/parser/odtParser.java b/source/net/yacy/document/parser/odtParser.java index 670f68580..ba2a53a68 100644 --- a/source/net/yacy/document/parser/odtParser.java +++ b/source/net/yacy/document/parser/odtParser.java @@ -131,20 +131,18 @@ public class odtParser extends AbstractParser implements Parser { if (entryName.equals("content.xml")) { // create a writer for output - writer = new CharBuffer(MAX_DOCSIZE, (int)zipEntry.getSize()); + writer = new CharBuffer(MAX_DOCSIZE, (int) zipEntry.getSize()); + + // extract data + final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry); try { - // extract data - final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry); - try { - final SAXParser saxParser = getParser(); - saxParser.parse(zipFileEntryStream, new ODContentHandler(writer)); - } finally { - // close readers and writers - zipFileEntryStream.close(); - } + final SAXParser saxParser = getParser(); + saxParser.parse(zipFileEntryStream, new ODContentHandler(writer)); } finally { - writer.close(); + // close readers and writers + zipFileEntryStream.close(); } + } else if (entryName.equals("meta.xml")) { // meta.xml contains metadata about the document final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry); @@ -177,7 +175,7 @@ public class odtParser extends AbstractParser implements Parser { // create the parser document Document[] docs = null; - final byte[] contentBytes = UTF8.getBytes(writer.toString()); + final byte[] contentBytes = (writer == null) ? null : UTF8.getBytes(writer.toString()); docs = new Document[]{new Document( location, mimeType, diff --git a/source/net/yacy/document/parser/ooxmlParser.java b/source/net/yacy/document/parser/ooxmlParser.java index 7021b6ac4..9e579351b 100644 --- a/source/net/yacy/document/parser/ooxmlParser.java +++ b/source/net/yacy/document/parser/ooxmlParser.java @@ -116,21 +116,19 @@ public class ooxmlParser extends AbstractParser implements Parser { || entryName.startsWith("xl/worksheets/sheet")) { // create a writer for output - writer = new CharBuffer(odtParser.MAX_DOCSIZE, (int)zipEntry.getSize()); + writer = new CharBuffer(odtParser.MAX_DOCSIZE, (int) zipEntry.getSize()); + + // extract data + final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry); try { - // extract data - final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry); - try { - final SAXParser saxParser = getParser(); - saxParser.parse(zipFileEntryStream, new ODContentHandler(writer)); - - // close readers and writers - } finally { - zipFileEntryStream.close(); - } + final SAXParser saxParser = getParser(); + saxParser.parse(zipFileEntryStream, new ODContentHandler(writer)); + + // close readers and writers } finally { - writer.close(); + zipFileEntryStream.close(); } + } else if (entryName.equals("docProps/core.xml")) { // meta.xml contains metadata about the document final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry); @@ -162,7 +160,7 @@ public class ooxmlParser extends AbstractParser implements Parser { // create the parser document Document[] docs = null; - final byte[] contentBytes = UTF8.getBytes(writer.toString()); + final byte[] contentBytes = (writer == null) ? null : UTF8.getBytes(writer.toString()); docs = new Document[]{new Document( location, mimeType, diff --git a/source/net/yacy/peers/operation/yacyVersion.java b/source/net/yacy/peers/operation/yacyVersion.java index 4b1f3bf88..85beb0ef7 100644 --- a/source/net/yacy/peers/operation/yacyVersion.java +++ b/source/net/yacy/peers/operation/yacyVersion.java @@ -154,7 +154,7 @@ public class yacyVersion implements Comparator, Comparable