diff --git a/source/net/yacy/document/parser/ooxmlParser.java b/source/net/yacy/document/parser/ooxmlParser.java index 8de8bf6e1..8b92df0ba 100644 --- a/source/net/yacy/document/parser/ooxmlParser.java +++ b/source/net/yacy/document/parser/ooxmlParser.java @@ -44,6 +44,7 @@ import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.util.ConcurrentLog; import net.yacy.document.AbstractParser; @@ -52,6 +53,8 @@ import net.yacy.document.Parser; import net.yacy.document.VocabularyScraper; import net.yacy.document.parser.xml.ODContentHandler; import net.yacy.document.parser.xml.ODMetaHandler; +import net.yacy.document.parser.xml.OOXMLSharedStringsHandler; +import net.yacy.document.parser.xml.OOXMLSpreeadsheetHandler; import net.yacy.kelondro.io.CharBuffer; import net.yacy.kelondro.util.FileUtils; @@ -108,21 +111,53 @@ public class ooxmlParser extends AbstractParser implements Parser { // opening the file as zip file final ZipFile zipFile= new ZipFile(dest); final Enumeration<? 
extends ZipEntry> zipEnum = zipFile.entries(); + + final Set<AnchorURL> detectedURLs = new HashSet<>(); + /* Handle first any eventual spreadsheet shared strings table + * As stated by the standard : "A package shall contain exactly one Shared String Table part," */ + final ZipEntry sharedStringsEntry = zipFile.getEntry(OOXMLSharedStringsHandler.ENTRY_NAME); + final List<String> sharedStrings = new ArrayList<>(); + + if (sharedStringsEntry != null) { + // extract data + try (final InputStream zipFileEntryStream = zipFile.getInputStream(sharedStringsEntry)) { + final SAXParser saxParser = getParser(); + saxParser.parse(zipFileEntryStream, new OOXMLSharedStringsHandler(sharedStrings)); + } + } + + // looping through all containing files while (zipEnum.hasMoreElements()) { - // get next zip file entry final ZipEntry zipEntry= zipEnum.nextElement(); final String entryName = zipEntry.getName(); + + if(entryName.startsWith("xl/worksheets/sheet")) { + if(writer == null) { + // create a writer for output + writer = new CharBuffer(odtParser.MAX_DOCSIZE, (int) zipEntry.getSize()); + } + + // extract data + final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry); + try { + final SAXParser saxParser = getParser(); + saxParser.parse(zipFileEntryStream, new OOXMLSpreeadsheetHandler(sharedStrings, writer, detectedURLs)); - // content.xml contains the document content in xml format - if (entryName.equals("word/document.xml") - || entryName.startsWith("ppt/slides/slide") - || entryName.startsWith("xl/worksheets/sheet")) { + // close readers and writers + } finally { + zipFileEntryStream.close(); + } + + } else if (entryName.equals("word/document.xml") + || entryName.startsWith("ppt/slides/slide")) { - // create a writer for output - writer = new CharBuffer(odtParser.MAX_DOCSIZE, (int) zipEntry.getSize()); + if(writer == null) { + // create a writer for output + writer = new CharBuffer(odtParser.MAX_DOCSIZE, (int) zipEntry.getSize()); + } // extract data final 
InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry); @@ -186,7 +221,7 @@ public class ooxmlParser extends AbstractParser implements Parser { descriptions, 0.0d, 0.0d, contentBytes, - null, + detectedURLs, null, null, false, diff --git a/source/net/yacy/document/parser/xml/GenericXMLContentHandler.java b/source/net/yacy/document/parser/xml/GenericXMLContentHandler.java index e9154457a..260abcfde 100644 --- a/source/net/yacy/document/parser/xml/GenericXMLContentHandler.java +++ b/source/net/yacy/document/parser/xml/GenericXMLContentHandler.java @@ -35,7 +35,6 @@ import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; import net.yacy.cora.document.id.AnchorURL; -import net.yacy.cora.util.ConcurrentLog; import net.yacy.document.parser.html.ContentScraper; /** @@ -150,7 +149,7 @@ public class GenericXMLContentHandler extends DefaultHandler { public void characters(final char ch[], final int start, final int length) throws SAXException { try { if(this.currentElementTextChunks == 0 && this.documentHasText) { - /* We are but on the first text chunk of the element (not on the first text chunk of the whole document), + /* We are on the first text chunk of the element (not on the first text chunk of the whole document), * or on the first text chunk after processing nested elements : * if necessary we add a space to separate text content of different elements */ if(length > 0 && !this.lastAppendedIsSpace && !Character.isWhitespace(ch[0])) { @@ -167,14 +166,14 @@ public class GenericXMLContentHandler extends DefaultHandler { this.documentHasText = true; this.lastAppendedIsSpace = Character.isWhitespace(ch[length - 1]); } - } catch (final IOException ignored) { - ConcurrentLog.logException(ignored); + } catch (final IOException ioe) { + throw new SAXException("Error while appending characters to the output writer", ioe); } } /** - * When the eventual element text doesn't end with a terminal punctuation character, - * add a period ('.' 
character) to help future SentenceReader work. + * Perform URLs detection on the ending element text + * @throws SAXException when the maxURLs limit has been reached */ @Override public void endElement(String uri, String localName, String qName) throws SAXException { diff --git a/source/net/yacy/document/parser/xml/OOXMLSharedStringsHandler.java b/source/net/yacy/document/parser/xml/OOXMLSharedStringsHandler.java new file mode 100644 index 000000000..c67edf462 --- /dev/null +++ b/source/net/yacy/document/parser/xml/OOXMLSharedStringsHandler.java @@ -0,0 +1,120 @@ +// OOXMLSharedStringsHandler.java +// --------------------------- +// Copyright 2017 by luccioman; https://github.com/luccioman +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +package net.yacy.document.parser.xml; + +import java.io.IOException; +import java.util.List; + +import org.apache.commons.io.input.ClosedInputStream; +import org.xml.sax.Attributes; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +/** + * SAX handler for Office Open XML SpreadSheet xl/sharedStrings.xml files. 
+ * + * <p> + * One string is added to the list per shared string item (<code>si</code> + * element), in document order, so that list positions match the zero-based + * indexes used by cells of shared string type. The text elements of an item + * (several ones for rich text, one per run) are concatenated. + * </p> + * + * @author luccioman + * @see <a href= + * "http://www.ecma-international.org/publications/standards/Ecma-376.htm">Ecma + * Standard for Office Open XML File Formats</a> + * + */ +public class OOXMLSharedStringsHandler extends DefaultHandler { + + /** The entry name of a shared strings table in an OOXML container */ + public static final String ENTRY_NAME = "xl/sharedStrings.xml"; + + /** + * Name of a shared string item tag : cells of shared string type reference + * these items by their zero-based position in the table + */ + private static final String SHARED_STRING_ITEM_TAG = "si"; + + /** + * Name of a text tag : a shared string item holds one text tag when it is plain + * text, or several ones (one per run) when it is rich text + */ + private static final String SHARED_STRING_TAG = "t"; + + /** Shared strings list */ + private final List<String> sharedStrings; + + /** Text builder of the currently parsed shared string item. */ + private StringBuilder currentString; + + /** Set to true when we are currently processing a text element */ + private boolean inText; + + /** + * @param sharedStrings + * the mutable list of shared strings to fill + * @throws IllegalArgumentException + * when a parameter is null + */ + public OOXMLSharedStringsHandler(final List<String> sharedStrings) throws IllegalArgumentException { + if (sharedStrings == null) { + throw new IllegalArgumentException("sharedStrings list must not be null"); + } + this.sharedStrings = sharedStrings; + } + + /** + * @return an empty source to prevent the SAX parser opening an unwanted + * connection to resolve an external entity + */ + @Override + public InputSource resolveEntity(String publicId, String systemId) throws IOException, SAXException { + return new InputSource(new ClosedInputStream()); + } + + @Override + public void startDocument() throws SAXException { + this.currentString = new StringBuilder(); + this.inText = false; + } + + /** + * Reset the current string builder when starting a shared string item, and + * start collecting characters when starting a text element + */ + @Override + public void startElement(final String uri, final String localName, final String qName, final Attributes attributes) + throws SAXException { + if (SHARED_STRING_ITEM_TAG.equals(qName)) { + /* New item : drop anything remaining from a previous one */ + this.currentString.setLength(0); + this.inText = false; + } else if (SHARED_STRING_TAG.equals(qName)) { + this.inText = true; + } + } + + /** + * Append characters to the current string builder when inside a text element. May be called multiple times + * before obtaining the whole current element string. 
+ */ + @Override + public void characters(final char ch[], final int start, final int length) throws SAXException { + if (this.inText) { + this.currentString.append(ch, start, length); + } + } + + /** + * Add the current string content to the list when ending a shared string item + * element. Exactly one entry is added per item, even when it is empty or made of + * several text elements, to keep list positions aligned with the indexes used by cells. + */ + @Override + public void endElement(String uri, String localName, String qName) throws SAXException { + if (SHARED_STRING_TAG.equals(qName)) { + this.inText = false; + } else if (SHARED_STRING_ITEM_TAG.equals(qName)) { + final String sharedString = this.currentString.toString(); + this.sharedStrings.add(sharedString); + this.currentString.setLength(0); + } + } + + @Override + public void endDocument() throws SAXException { + /* Release the StringBuilder now useless */ + this.currentString = null; + } + +} \ No newline at end of file diff --git a/source/net/yacy/document/parser/xml/OOXMLSpreeadsheetHandler.java b/source/net/yacy/document/parser/xml/OOXMLSpreeadsheetHandler.java new file mode 100644 index 000000000..774277801 --- /dev/null +++ b/source/net/yacy/document/parser/xml/OOXMLSpreeadsheetHandler.java @@ -0,0 +1,257 @@ +// OOXMLSpreeadsheetHandler.java +// --------------------------- +// Copyright 2017 by luccioman; https://github.com/luccioman +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. 
+// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +package net.yacy.document.parser.xml; + +import java.io.IOException; +import java.io.Writer; +import java.util.Collection; +import java.util.List; + +import javax.naming.SizeLimitExceededException; + +import org.apache.commons.io.input.ClosedInputStream; +import org.xml.sax.Attributes; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +import net.yacy.cora.document.id.AnchorURL; +import net.yacy.document.parser.html.ContentScraper; + +/** + * SAX handler for Office Open XML SpreadSheet data sheet files (container + * entries whose name starts with xl/worksheets/sheet) : writes the text of cell + * values to the output writer and collects the URLs detected in that text. Only + * cell value elements are processed. + * + * @author luccioman + * @see <a href= + * "http://www.ecma-international.org/publications/standards/Ecma-376.htm">Ecma + * Standard for Office Open XML File Formats</a> + * + */ +public class OOXMLSpreeadsheetHandler extends DefaultHandler { + + /** The entry name prefix of data sheets in an OOXML container */ + public static final String ENTRY_PREFIX = "xl/worksheets/sheet"; + + /** Name of a cell tag in a data sheet */ + private static final String CELL_TAG = "c"; + + /** Attribute name indicating the type of a cell element in a data sheet */ + private static final String CELL_TYPE_ATTRIBUTE = "t"; + + /** Name of a cell value tag in a data sheet */ + private static final String CELL_VALUE_TAG = "v"; + + /** + * Cell type attribute value for a cell using a shared string + * + * @see "'Ecma Office Open XML Part 1 - Fundamentals And Markup Language Reference.pdf' - section 18.18.11 ST_CellType (Cell Type)" + */ + private static final String SHARED_STRING_CELL_TYPE = "s"; + + /** The document shared strings list */ + private final List<String> sharedStrings; + + /** Output writer for cells text */ + private final 
Collection<AnchorURL> urls; + + /** Maximum number of URLs to parse */ + private final int maxURLs; + + /** Number of parsed URLs in the data sheet */ + private long detectedURLs; + + /** + * Set to true when the last character written to the output writer is a space + */ + private boolean lastAppendedIsSpace; + + /** Currently parsed cell value content. */ + private StringBuilder cellValue; + + /** Set to true when we are currently processing an XML cell element */ + private boolean inCell; + + /** Set to true when we are currently processing an XML cell value element */ + private boolean inCellValue; + + /** + * Set to true when we are currently processing an XML cell element of Shared + * String type + */ + private boolean sharedStringCell; + + /** + * Create a handler with no practical limit (Integer.MAX_VALUE) on the number of URLs to parse. + * + * @param sharedStrings + * the list of shared strings of the parent spreadsheet document + * @param out + * the output writer to write text extracted from cells. Must not be + * null. + * @param urls + * the mutable collection of URLs to fill with detected URLs. Must not be null. + * @throws IllegalArgumentException + * when out or urls parameter is null + */ + public OOXMLSpreeadsheetHandler(final List<String> sharedStrings, final Writer out, + final Collection<AnchorURL> urls) throws IllegalArgumentException { + this(sharedStrings, out, urls, Integer.MAX_VALUE); + } + + /** + * @param sharedStrings + * the list of shared strings of the parent spreadsheet document. May + * be null : cells of shared string type are then ignored. + * @param out + * the output writer to write extracted text. Must not be null. 
+ * @param urls + * the mutable collection of URLs to fill with detected URLs + * @param maxURLs + * the maximum number of urls to parse + * @throws IllegalArgumentException + * when out or urls parameter is null + */ + public OOXMLSpreeadsheetHandler(final List<String> sharedStrings, final Writer out, + final Collection<AnchorURL> urls, final int maxURLs) throws IllegalArgumentException { + if (out == null) { + throw new IllegalArgumentException("out writer must not be null"); + } + if (urls == null) { + throw new IllegalArgumentException("urls collection must not be null"); + } + this.sharedStrings = sharedStrings; + this.out = out; + this.urls = urls; + this.maxURLs = maxURLs; + this.detectedURLs = 0; + this.lastAppendedIsSpace = false; + this.inCell = false; + this.inCellValue = false; + this.sharedStringCell = false; + } + + /** + * @return an empty source to prevent the SAX parser opening an unwanted + * connection to resolve an external entity + */ + @Override + public InputSource resolveEntity(String publicId, String systemId) throws IOException, SAXException { + return new InputSource(new ClosedInputStream()); + } + + @Override + public void startDocument() throws SAXException { + this.cellValue = new StringBuilder(); + this.detectedURLs = 0; + this.lastAppendedIsSpace = false; + this.inCell = false; + this.inCellValue = false; + this.sharedStringCell = false; + } + + @Override + public void startElement(final String uri, final String localName, final String qName, final Attributes attributes) + throws SAXException { + if (CELL_TAG.equals(qName)) { + this.cellValue.setLength(0); + this.inCell = true; + final String cellType = attributes.getValue(CELL_TYPE_ATTRIBUTE); + this.sharedStringCell = SHARED_STRING_CELL_TYPE.equals(cellType); + } else if (this.inCell && CELL_VALUE_TAG.equals(qName)) { + this.cellValue.setLength(0); + this.inCellValue = true; + } + } + + /** + * Append characters to the current string builder. 
May be called multiple times + * before obtaining the whole current element string. + */ + @Override + public void characters(final char ch[], final int start, final int length) throws SAXException { + if (this.inCellValue) { + this.cellValue.append(ch, start, length); + } + } + + /** + * Write the text of the ending cell value to the output writer and perform URLs + * detection on it + * + * @throws SAXException + * when the maxURLs limit has been reached, or when writing to the + * output writer failed + */ + @Override + public void endElement(String uri, String localName, String qName) throws SAXException { + if (CELL_VALUE_TAG.equals(qName)) { + String cellText = null; + if (this.sharedStringCell) { + /* Try to retrieve the cell text from the shared strings list */ + try { + int index = Integer.parseInt(this.cellValue.toString().trim()); + /* + * Ignore any out of range index, including a negative one which would + * otherwise make List.get() throw an unchecked IndexOutOfBoundsException + */ + if (this.sharedStrings != null && index >= 0 && index < this.sharedStrings.size()) { + cellText = this.sharedStrings.get(index); + } + } catch (NumberFormatException ignored) { + /* Do not terminate parsing if one shared strings index value is malformed */ + } + } else { + /* Use directly the cell value as text */ + cellText = this.cellValue.toString(); + } + try { + if (cellText != null && !cellText.isEmpty()) { + this.detectedURLs += ContentScraper.findAbsoluteURLs(cellText, this.urls, null, + this.maxURLs - this.detectedURLs); + + /* + * If necessary we add a space to separate text content of different elements + */ + if (!this.lastAppendedIsSpace && !Character.isWhitespace(cellText.charAt(0))) { + this.out.write(" "); + } + + this.out.write(cellText); + this.lastAppendedIsSpace = Character.isWhitespace(cellText.charAt(cellText.length() - 1)); + } + } catch (IOException ioe) { + throw new SAXException("Error while appending characters to the output writer", ioe); + } finally { + this.cellValue.setLength(0); + this.inCellValue = false; + } + + if (this.detectedURLs >= this.maxURLs) { + throw new SAXException( + new SizeLimitExceededException("Reached maximum URLs to parse : " + this.maxURLs)); + } + } else if (CELL_TAG.equals(qName)) { + 
this.inCell = false; + this.inCellValue = false; + } + } + + @Override + public void endDocument() throws SAXException { + /* Release the StringBuilder now useless */ + this.cellValue = null; + } + +} \ No newline at end of file diff --git a/test/java/net/yacy/document/ParserTest.java b/test/java/net/yacy/document/ParserTest.java index 9c2e40879..c78ca12f9 100644 --- a/test/java/net/yacy/document/ParserTest.java +++ b/test/java/net/yacy/document/ParserTest.java @@ -11,7 +11,6 @@ import java.net.MalformedURLException; import net.yacy.cora.document.id.AnchorURL; import net.yacy.document.parser.docParser; import net.yacy.document.parser.odtParser; -import net.yacy.document.parser.ooxmlParser; import net.yacy.document.parser.pdfParser; import net.yacy.document.parser.pptParser; import static org.hamcrest.CoreMatchers.containsString; @@ -21,62 +20,6 @@ import org.junit.Test; public class ParserTest { - @Test public void testooxmlParsers() throws FileNotFoundException, Parser.Failure, MalformedURLException, UnsupportedEncodingException, IOException { - final String[][] testFiles = new String[][] { - // meaning: filename in test/parsertest, mimetype, title, creator, description, - new String[]{"umlaute_windows.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen", "", ""}, - new String[]{"umlaute_windows.pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation", "Folie 1", "", ""}, - }; - - for (final String[] testFile : testFiles) { - FileInputStream inStream = null; - final String filename = "test/parsertest/" + testFile[0]; - try { - final File file = new File(filename); - final String mimetype = testFile[1]; - final AnchorURL url = new AnchorURL("http://localhost/"+filename); - - AbstractParser p = new ooxmlParser(); - inStream = new FileInputStream(file); - final Document[] docs = p.parse(url, mimetype, null, new VocabularyScraper(), 0, inStream); - 
for (final Document doc: docs) { - Reader content = null; - try { - content = new InputStreamReader(doc.getTextStream(), doc.getCharset()); - final StringBuilder str = new StringBuilder(); - int c; - while( (c = content.read()) != -1 ) - str.append((char)c); - - System.out.println("Parsed " + filename + ": " + str); - assertThat(str.toString(), containsString("In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen")); - assertThat(doc.dc_title(), containsString(testFile[2])); - assertThat(doc.dc_creator(), containsString(testFile[3])); - if (testFile[4].length() > 0) assertThat(doc.dc_description()[0], containsString(testFile[4])); - } finally { - if(content != null) { - try { - content.close(); - } catch(IOException ioe) { - System.out.println("Could not close text input stream"); - } - } - } - } - } catch (final InterruptedException ex) { - - } finally { - if(inStream != null) { - try { - inStream.close(); - } catch(IOException ioe) { - System.out.println("Could not close input stream on file " + filename); - } - } - } - } - } - @Test public void testodtParsers() throws FileNotFoundException, Parser.Failure, MalformedURLException, UnsupportedEncodingException, IOException { final String[][] testFiles = new String[][] { // meaning: filename in test/parsertest, mimetype, title, creator, description, diff --git a/test/java/net/yacy/document/parser/ooxmlParserTest.java b/test/java/net/yacy/document/parser/ooxmlParserTest.java new file mode 100644 index 000000000..1839f2fc9 --- /dev/null +++ b/test/java/net/yacy/document/parser/ooxmlParserTest.java @@ -0,0 +1,160 @@ +// ooxmlParserTest.java +// --------------------------- +// Copyright 2017 by luccioman; https://github.com/luccioman +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either 
version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +package net.yacy.document.parser; + +import static org.hamcrest.CoreMatchers.containsString; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertThat; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.util.Collection; + +import org.junit.Test; + +import net.yacy.cora.document.id.AnchorURL; +import net.yacy.document.AbstractParser; +import net.yacy.document.Document; +import net.yacy.document.VocabularyScraper; + +/** + * Unit tests for the {@link ooxmlParser} class + * + * @author luccioman + * + */ +public class ooxmlParserTest { + + /** + * Unit test for the ooxmlParser.parse() function with some small test + * documents. 
+ * The text of each parsed document must contain the same reference sentence, + * and its metadata must contain the title, creator and (when not empty) + * description listed for the file. + * + * @throws Exception + * when an unexpected error occurred + */ + @Test + public void testParse() throws Exception { + final String[][] testFiles = new String[][] { + // meaning: filename in test/parsertest, mimetype, title, creator, description + new String[] { "umlaute_windows.docx", + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen", "", "" }, + new String[] { "umlaute_mac.docx", + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "", "", "" }, + new String[] { "umlaute_windows.pptx", + "application/vnd.openxmlformats-officedocument.presentationml.presentation", "Folie 1", "", + "" }, + new String[] { "umlaute_mac.pptx", + "application/vnd.openxmlformats-officedocument.presentationml.presentation", "Slide 1", "", + "" }, + new String[] { "umlaute_linux.ppsx", + "application/vnd.openxmlformats-officedocument.presentationml.slideshow", + "Office Open XML test slideshow from LibreOffice on Linux", "", "" }, + new String[] { "umlaute_mac.xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + "", "", "" }, + new String[] { "umlaute_windows.xlsx", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "", "", "" }, + new String[] { "umlaute_linux.xlsx", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + "Office Open XML Spreadsheet test document from LibreOffice Calc on Linux", "", + "Test spreadsheet document for YaCy ooxml parser" } }; + + for (final String[] testFile : testFiles) { + FileInputStream inStream = null; + final String filename = testFile[0]; + try { + final File file = new File("test" + File.separator + "parsertest" + File.separator + filename); + final String mimetype = testFile[1]; + final AnchorURL url = new AnchorURL("http://localhost/" + filename); + + AbstractParser p = new ooxmlParser(); + inStream = new FileInputStream(file); + final Document[] docs = 
p.parse(url, mimetype, null, new VocabularyScraper(), 0, inStream); + for (final Document doc : docs) { + Reader content = null; + try { + content = new InputStreamReader(doc.getTextStream(), doc.getCharset()); + final StringBuilder str = new StringBuilder(); + int c; + while ((c = content.read()) != -1) + str.append((char) c); + + System.out.println("Parsed " + filename + ": " + str); + assertThat(str.toString(), + containsString("In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen")); + assertThat(doc.dc_title(), containsString(testFile[2])); + assertThat(doc.dc_creator(), containsString(testFile[3])); + if (testFile[4].length() > 0) + assertThat(doc.dc_description()[0], containsString(testFile[4])); + } finally { + if (content != null) { + try { + content.close(); + } catch (IOException ioe) { + System.out.println("Could not close text input stream"); + } + } + } + } + } finally { + if (inStream != null) { + try { + inStream.close(); + } catch (IOException ioe) { + System.out.println("Could not close input stream on file " + filename); + } + } + } + } + } + + /** + * Test URLs detection on the ooxmlParser.parse() function. 
+ * Exactly two URLs are expected in the anchors of the first document parsed + * from the umlaute_linux.xlsx test file. + * @throws Exception when an unexpected error occurred + */ + @Test + public void testParseURLs() throws Exception { + final String fileName = "umlaute_linux.xlsx"; + final File file = new File("test" + File.separator + "parsertest" + File.separator + fileName); + final String mimetype = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"; + final AnchorURL url = new AnchorURL("http://localhost/" + fileName); + + AbstractParser p = new ooxmlParser(); + try(InputStream inStream = new FileInputStream(file);) { + final Document[] docs = p.parse(url, mimetype, null, new VocabularyScraper(), 0, inStream); + assertNotNull("Documents result must not be null", docs); + final Collection<AnchorURL> anchors = docs[0].getAnchors(); + assertNotNull("Detected URLs must not be null", anchors); + assertEquals("2 URLs should be detected", 2, anchors.size()); + assertTrue("YaCy home page URL should have been parsed", anchors.contains(new AnchorURL("http://yacy.net"))); + assertTrue("YaCy forum URL should have been parsed", anchors.contains(new AnchorURL("http://forum.yacy-websuche.de/"))); + } + } + +} diff --git a/test/parsertest/umlaute_linux.ppsx b/test/parsertest/umlaute_linux.ppsx new file mode 100644 index 000000000..16d253a65 Binary files /dev/null and b/test/parsertest/umlaute_linux.ppsx differ diff --git a/test/parsertest/umlaute_linux.xlsx b/test/parsertest/umlaute_linux.xlsx new file mode 100644 index 000000000..909a6a213 Binary files /dev/null and b/test/parsertest/umlaute_linux.xlsx differ