- Fix the odt and ooxml (office document) parsers so that document content is added to the fulltext index

- Adjust the JUnit tests yacyVersionTest and ParserTest
- Update yacyVersion.combined2prettyVersion to pad the revision to the default 4-digit minor version
pull/1/head
reger 12 years ago
parent b68fbe7d21
commit 97ab5b90e8

@ -131,20 +131,18 @@ public class odtParser extends AbstractParser implements Parser {
if (entryName.equals("content.xml")) {
// create a writer for output
writer = new CharBuffer(MAX_DOCSIZE, (int)zipEntry.getSize());
writer = new CharBuffer(MAX_DOCSIZE, (int) zipEntry.getSize());
// extract data
final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry);
try {
// extract data
final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry);
try {
final SAXParser saxParser = getParser();
saxParser.parse(zipFileEntryStream, new ODContentHandler(writer));
} finally {
// close readers and writers
zipFileEntryStream.close();
}
final SAXParser saxParser = getParser();
saxParser.parse(zipFileEntryStream, new ODContentHandler(writer));
} finally {
writer.close();
// close readers and writers
zipFileEntryStream.close();
}
} else if (entryName.equals("meta.xml")) {
// meta.xml contains metadata about the document
final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry);
@ -177,7 +175,7 @@ public class odtParser extends AbstractParser implements Parser {
// create the parser document
Document[] docs = null;
final byte[] contentBytes = UTF8.getBytes(writer.toString());
final byte[] contentBytes = (writer == null) ? null : UTF8.getBytes(writer.toString());
docs = new Document[]{new Document(
location,
mimeType,

@ -116,21 +116,19 @@ public class ooxmlParser extends AbstractParser implements Parser {
|| entryName.startsWith("xl/worksheets/sheet")) {
// create a writer for output
writer = new CharBuffer(odtParser.MAX_DOCSIZE, (int)zipEntry.getSize());
writer = new CharBuffer(odtParser.MAX_DOCSIZE, (int) zipEntry.getSize());
// extract data
final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry);
try {
// extract data
final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry);
try {
final SAXParser saxParser = getParser();
saxParser.parse(zipFileEntryStream, new ODContentHandler(writer));
// close readers and writers
} finally {
zipFileEntryStream.close();
}
final SAXParser saxParser = getParser();
saxParser.parse(zipFileEntryStream, new ODContentHandler(writer));
// close readers and writers
} finally {
writer.close();
zipFileEntryStream.close();
}
} else if (entryName.equals("docProps/core.xml")) {
// meta.xml contains metadata about the document
final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry);
@ -162,7 +160,7 @@ public class ooxmlParser extends AbstractParser implements Parser {
// create the parser document
Document[] docs = null;
final byte[] contentBytes = UTF8.getBytes(writer.toString());
final byte[] contentBytes = (writer == null) ? null : UTF8.getBytes(writer.toString());
docs = new Document[]{new Document(
location,
mimeType,

@ -154,7 +154,7 @@ public class yacyVersion implements Comparator<yacyVersion>, Comparable<yacyVers
final String mainversion = (Double.parseDouble(matcher.group(1)) < 0.11 ? "dev" : matcher.group(1));
String revision = matcher.group(2);
for(int i=revision.length();i<5;++i) revision += "0";
for(int i=revision.length();i<4;++i) revision += "0";
return new String[]{mainversion, revision};
}

@ -11,10 +11,14 @@ import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.parser.docParser;
import net.yacy.document.parser.odtParser;
import net.yacy.document.parser.ooxmlParser;
import net.yacy.document.parser.pdfParser;
import net.yacy.kelondro.data.meta.DigestURI;
import org.junit.Test;
@ -22,40 +26,134 @@ import org.junit.Test;
public class ParserTest {
@Test public void testParsers() throws FileNotFoundException, Parser.Failure, MalformedURLException, UnsupportedEncodingException, IOException {
/**
 * Parses every OOXML sample in test/parsertest with {@link ooxmlParser} and verifies that
 * the extracted text contains the reference sentence and that title, creator and
 * description contain the expected values.
 *
 * @throws AssertionError if the parser is interrupted (the interrupt flag is restored first)
 */
@Test public void testooxmlParsers() throws FileNotFoundException, Parser.Failure, MalformedURLException, UnsupportedEncodingException, IOException {
    final String[][] testFiles = new String[][] {
        // meaning: filename in test/parsertest, mimetype, title, creator, description,
        new String[]{"umlaute_windows.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen", "", ""},
        new String[]{"umlaute_windows.pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation", "Folie 1", "", ""},
    };
    for (final String[] testFile : testFiles) {
        final String filename = "test/parsertest/" + testFile[0];
        final File file = new File(filename);
        final String mimetype = testFile[1];
        final DigestURI url = new DigestURI("http://localhost/"+filename);
        final AbstractParser p = new ooxmlParser();
        final FileInputStream source = new FileInputStream(file);
        try {
            final Document[] docs = p.parse(url, mimetype, null, source);
            for (final Document doc: docs) {
                final Reader content = new InputStreamReader(doc.getTextStream(), doc.getCharset());
                final StringBuilder str = new StringBuilder();
                try {
                    int c;
                    while ((c = content.read()) != -1) {
                        str.append((char) c);
                    }
                } finally {
                    // release the text stream even if reading fails
                    content.close();
                }
                System.out.println("Parsed " + filename + ": " + str);
                assertThat(str.toString(), containsString("In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen"));
                assertThat(doc.dc_title(), containsString(testFile[2]));
                assertThat(doc.dc_creator(), containsString(testFile[3]));
                assertThat(doc.dc_description(), containsString(testFile[4]));
            }
        } catch (InterruptedException ex) {
            // an interrupted parse must not count as a pass: restore the flag and fail the test
            Thread.currentThread().interrupt();
            throw new AssertionError("interrupted while parsing " + filename);
        } finally {
            // closing twice is harmless if the parser already closed the stream
            source.close();
        }
    }
}
/**
 * Parses every OpenDocument sample in test/parsertest with {@link odtParser} and verifies
 * that the extracted text contains the reference sentence and that creator and description
 * contain the expected values.
 *
 * @throws AssertionError if the parser is interrupted (the interrupt flag is restored first)
 */
@Test public void testodtParsers() throws FileNotFoundException, Parser.Failure, MalformedURLException, UnsupportedEncodingException, IOException {
    final String[][] testFiles = new String[][] {
        // meaning: filename in test/parsertest, mimetype, title, creator, description,
        new String[]{"umlaute_linux.odt", "application/vnd.oasis.opendocument.text", "Münchner Hofbräuhaus", "", "Kommentar zum Hofbräuhaus"},
        // mimetype spelling corrected from "...spreadsheat" to the registered "...spreadsheet"
        new String[]{"umlaute_linux.ods", "application/vnd.oasis.opendocument.spreadsheet", "", "", ""},
        new String[]{"umlaute_linux.odp", "application/vnd.oasis.opendocument.presentation", "", "", ""},
    };
    for (final String[] testFile : testFiles) {
        final String filename = "test/parsertest/" + testFile[0];
        final File file = new File(filename);
        final String mimetype = testFile[1];
        final DigestURI url = new DigestURI("http://localhost/"+filename);
        final AbstractParser p = new odtParser();
        final FileInputStream source = new FileInputStream(file);
        try {
            final Document[] docs = p.parse(url, mimetype, null, source);
            for (final Document doc: docs) {
                final Reader content = new InputStreamReader(doc.getTextStream(), doc.getCharset());
                final StringBuilder str = new StringBuilder();
                try {
                    int c;
                    while ((c = content.read()) != -1) {
                        str.append((char) c);
                    }
                } finally {
                    // release the text stream even if reading fails
                    content.close();
                }
                System.out.println("Parsed " + filename + ": " + str);
                assertThat(str.toString(), containsString("In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen"));
                // NOTE(review): the title check was already disabled in the original; the reason is not visible here
                // assertThat(doc.dc_title(), containsString(testFile[2]));
                assertThat(doc.dc_creator(), containsString(testFile[3]));
                assertThat(doc.dc_description(), containsString(testFile[4]));
            }
        } catch (InterruptedException ex) {
            // an interrupted parse must not count as a pass: restore the flag and fail the test
            Thread.currentThread().interrupt();
            throw new AssertionError("interrupted while parsing " + filename);
        } finally {
            // closing twice is harmless if the parser already closed the stream
            source.close();
        }
    }
}
/**
 * Parses the PDF sample in test/parsertest with {@link pdfParser} and verifies that the
 * extracted text contains the reference sentence and that title, creator and description
 * contain the expected values.
 *
 * @throws AssertionError if the parser is interrupted (the interrupt flag is restored first)
 */
@Test public void testpdfParsers() throws FileNotFoundException, Parser.Failure, MalformedURLException, UnsupportedEncodingException, IOException {
    final String[][] testFiles = new String[][] {
        // meaning: filename in test/parsertest, mimetype, title, creator, description,
        // only PDF input belongs here; the application/msword sample is exercised by testdocParsers
        new String[]{"umlaute_linux.pdf", "application/pdf", "", "", ""},
    };
    for (final String[] testFile : testFiles) {
        final String filename = "test/parsertest/" + testFile[0];
        final File file = new File(filename);
        final String mimetype = testFile[1];
        final DigestURI url = new DigestURI("http://localhost/"+filename);
        final AbstractParser p = new pdfParser();
        final FileInputStream source = new FileInputStream(file);
        try {
            final Document[] docs = p.parse(url, mimetype, null, source);
            for (final Document doc: docs) {
                final Reader content = new InputStreamReader(doc.getTextStream(), doc.getCharset());
                final StringBuilder str = new StringBuilder();
                try {
                    int c;
                    while ((c = content.read()) != -1) {
                        str.append((char) c);
                    }
                } finally {
                    // release the text stream even if reading fails
                    content.close();
                }
                System.out.println("Parsed " + filename + ": " + str);
                assertThat(str.toString(), containsString("In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen"));
                assertThat(doc.dc_title(), containsString(testFile[2]));
                assertThat(doc.dc_creator(), containsString(testFile[3]));
                assertThat(doc.dc_description(), containsString(testFile[4]));
            }
        } catch (InterruptedException ex) {
            // an interrupted parse must not count as a pass: restore the flag and fail the test
            Thread.currentThread().interrupt();
            throw new AssertionError("interrupted while parsing " + filename);
        } finally {
            // closing twice is harmless if the parser already closed the stream
            source.close();
        }
    }
}
/**
 * Parses the MS Word sample in test/parsertest through {@code TextParser.parseSource} and
 * verifies that the extracted text contains the reference sentence and that title, creator
 * and description contain the expected values.
 */
@Test public void testdocParsers() throws FileNotFoundException, Parser.Failure, MalformedURLException, UnsupportedEncodingException, IOException {
    final String[][] testFiles = new String[][] {
        // meaning: filename in test/parsertest, mimetype, title, creator, description,
        new String[]{"umlaute_windows.doc", "application/msword", "", "", ""},
    };
    for (final String[] testFile : testFiles) {
        final String filename = "test/parsertest/" + testFile[0];
        final File file = new File(filename);
        final String mimetype = testFile[1];
        final DigestURI url = new DigestURI("http://localhost/"+filename);
        final FileInputStream source = new FileInputStream(file);
        try {
            final Document[] docs = TextParser.parseSource(url, mimetype, null, file.length(), source);
            for (final Document doc: docs) {
                final Reader content = new InputStreamReader(doc.getTextStream(), doc.getCharset());
                final StringBuilder str = new StringBuilder();
                try {
                    int c;
                    while ((c = content.read()) != -1) {
                        str.append((char) c);
                    }
                } finally {
                    // release the text stream even if reading fails
                    content.close();
                }
                System.out.println("Parsed " + filename + ": " + str);
                assertThat(str.toString(), containsString("In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen"));
                assertThat(doc.dc_title(), containsString(testFile[2]));
                assertThat(doc.dc_creator(), containsString(testFile[3]));
                assertThat(doc.dc_description(), containsString(testFile[4]));
            }
        } finally {
            // closing twice is harmless if the parser already closed the stream
            source.close();
        }
    }
}
}
try {
final String filename = "test/parsertest/" + testFile[0];
final File file = new File(filename);
final String mimetype = testFile[1];
final DigestURI url = new DigestURI("http://localhost/"+filename);
AbstractParser p = new docParser();
final Document[] docs = p.parse(url, mimetype, null, new FileInputStream(file));
for (final Document doc: docs) {
final Reader content = new InputStreamReader(doc.getTextStream(), doc.getCharset());
final StringBuilder str = new StringBuilder();
int c;
while( (c = content.read()) != -1 )
str.append((char)c);
System.out.println("Parsed " + filename + ": " + str);
assertThat(str.toString(), containsString("In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen"));
assertThat(doc.dc_title(), containsString(testFile[2]));
assertThat(doc.dc_creator(), containsString(testFile[3]));
assertThat(doc.dc_description(), containsString(testFile[4]));
}
} catch (InterruptedException ex) {}
}
}
}

@ -2,6 +2,7 @@ package de.anomic.yacy;
import net.yacy.peers.operation.yacyVersion;
import junit.framework.TestCase;
import org.junit.Assert;
public class yacyVersionTest extends TestCase {
@ -10,26 +11,26 @@ public class yacyVersionTest extends TestCase {
* @author Bost
*/
public void testCombinedVersionString2PrettyString() {
// Exercises yacyVersion.combined2prettyVersion with well-formed, malformed and boundary inputs.
// First group: expectations are written as a single "main/revision" string with a 5-character revision.
// NOTE(review): these look like the earlier form of the array-based checks further down - confirm whether both groups are meant to coexist.
assertEquals("dev/00000", yacyVersion.combined2prettyVersion("")); // not a number
assertEquals("dev/00000", yacyVersion.combined2prettyVersion(" ")); // not a number
assertEquals("dev/02417", yacyVersion.combined2prettyVersion("0.10002417"));
assertEquals("dev/02440", yacyVersion.combined2prettyVersion("0.1000244"));
assertEquals("dev/02417", yacyVersion.combined2prettyVersion("0.10002417"));
assertEquals("dev/00000", yacyVersion.combined2prettyVersion("0.100024400")); // input is too long
assertEquals("dev/02440", yacyVersion.combined2prettyVersion("0.1090244"));
assertEquals("0.110/02440", yacyVersion.combined2prettyVersion("0.1100244"));
assertEquals("0.111/02440", yacyVersion.combined2prettyVersion("0.1110244"));
assertEquals("dev/00000", yacyVersion.combined2prettyVersion("0.0")); // input is valid - no warning generated
assertEquals("dev/00000", yacyVersion.combined2prettyVersion(" 0.11102440")); // spaces are not allowed
assertEquals("0.111/00000", yacyVersion.combined2prettyVersion("0.111")); // was (input is too short)
assertEquals("dev/00000", yacyVersion.combined2prettyVersion("0.1112440\t\n")); // \t and \n are not allowed
assertEquals("dev/00000", yacyVersion.combined2prettyVersion("124353432xxxx4546399999")); // not a number + too long
assertEquals("dev/00000", yacyVersion.combined2prettyVersion("123456789x")); // not a number
assertEquals("dev/00000", yacyVersion.combined2prettyVersion("9999999999")); // missing decimal point
assertEquals("999.999/99900", yacyVersion.combined2prettyVersion("999.999999")); // was (floating point part must have 3 and SVN-Version 5 digits)
assertEquals("0.999/99999", yacyVersion.combined2prettyVersion("0.99999999"));
assertEquals("99999.004/56789", yacyVersion.combined2prettyVersion("99999.00456789"));
assertEquals("dev/00000", yacyVersion.combined2prettyVersion("99999.003456789")); // input is too long
// Second group: expectations are written as a two-element array {main version, revision}.
// Rejected inputs are expected to yield {"dev","0000"}; a short revision is expected to be
// right-padded with '0' to 4 characters (e.g. "999" -> "9990"), while 5-digit revisions stay as they are.
Assert.assertArrayEquals(new String[]{"dev","0000"}, yacyVersion.combined2prettyVersion("")); // not a number
Assert.assertArrayEquals(new String[]{"dev","0000"}, yacyVersion.combined2prettyVersion(" ")); // not a number
Assert.assertArrayEquals(new String[]{"dev","02417"}, yacyVersion.combined2prettyVersion("0.10002417"));
Assert.assertArrayEquals(new String[]{"dev","0244"}, yacyVersion.combined2prettyVersion("0.1000244"));
Assert.assertArrayEquals(new String[]{"dev","02417"}, yacyVersion.combined2prettyVersion("0.10002417"));
Assert.assertArrayEquals(new String[]{"dev","0000"}, yacyVersion.combined2prettyVersion("0.100024400")); // input is too long
Assert.assertArrayEquals(new String[]{"dev","0244"}, yacyVersion.combined2prettyVersion("0.1090244"));
Assert.assertArrayEquals(new String[]{"0.110","0244"}, yacyVersion.combined2prettyVersion("0.1100244"));
Assert.assertArrayEquals(new String[]{"0.111","0244"}, yacyVersion.combined2prettyVersion("0.1110244"));
Assert.assertArrayEquals(new String[]{"dev","0000"}, yacyVersion.combined2prettyVersion("0.0")); // input is valid - no warning generated
Assert.assertArrayEquals(new String[]{"dev","0000"}, yacyVersion.combined2prettyVersion(" 0.11102440")); // spaces are not allowed
Assert.assertArrayEquals(new String[]{"0.111","0000"}, yacyVersion.combined2prettyVersion("0.111")); // was (input is too short)
Assert.assertArrayEquals(new String[]{"dev","0000"}, yacyVersion.combined2prettyVersion("0.1112440\t\n")); // \t and \n are not allowed
Assert.assertArrayEquals(new String[]{"dev","0000"}, yacyVersion.combined2prettyVersion("124353432xxxx4546399999")); // not a number + too long
Assert.assertArrayEquals(new String[]{"dev","0000"}, yacyVersion.combined2prettyVersion("123456789x")); // not a number
Assert.assertArrayEquals(new String[]{"dev","0000"}, yacyVersion.combined2prettyVersion("9999999999")); // missing decimal point
Assert.assertArrayEquals(new String[]{"999.999","9990"}, yacyVersion.combined2prettyVersion("999.999999")); // was (floating point part must have 3 and SVN-Version 4 digits)
Assert.assertArrayEquals(new String[]{"0.999","99999"}, yacyVersion.combined2prettyVersion("0.99999999"));
Assert.assertArrayEquals(new String[]{"99999.004","56789"}, yacyVersion.combined2prettyVersion("99999.00456789"));
Assert.assertArrayEquals(new String[]{"dev","0000"}, yacyVersion.combined2prettyVersion("99999.003456789")); // input is too long
}
}

Loading…
Cancel
Save