yacy_search_server/test/net/yacy/document/ParserTest.java

package net.yacy.document;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.document.parser.docParser;
import net.yacy.document.parser.odtParser;
import net.yacy.document.parser.ooxmlParser;
import net.yacy.document.parser.pdfParser;
import net.yacy.document.parser.pptParser;
import static org.hamcrest.CoreMatchers.containsString;
import static org.junit.Assert.assertThat;
import org.junit.Test;


public class ParserTest {

	@Test public void testooxmlParsers() throws FileNotFoundException, Parser.Failure, MalformedURLException, UnsupportedEncodingException, IOException	{
		final String[][] testFiles = new String[][] {
			// meaning:  filename in test/parsertest, mimetype, title, creator, description,
			new String[]{"umlaute_windows.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen", "", ""},
			new String[]{"umlaute_windows.pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation", "Folie 1", "", ""},
		};

		for (final String[] testFile : testFiles) {
                    try {
                        final String filename = "test/parsertest/" + testFile[0];
                        final File file = new File(filename);
                        final String mimetype = testFile[1];
                        final AnchorURL url = new AnchorURL("http://localhost/"+filename);

                        AbstractParser p = new ooxmlParser();
                        final Document[] docs = p.parse(url, mimetype, null, new VocabularyScraper(), 0, new FileInputStream(file));
                        for (final Document doc: docs) {
                            final Reader content = new InputStreamReader(doc.getTextStream(), doc.getCharset());
                            final StringBuilder str = new StringBuilder();
                            int c;
                            while( (c = content.read()) != -1 )
                                    str.append((char)c);

                            System.out.println("Parsed " + filename + ": " + str);
                            assertThat(str.toString(), containsString("In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen"));
                            assertThat(doc.dc_title(), containsString(testFile[2]));
                            assertThat(doc.dc_creator(), containsString(testFile[3]));
                            if (testFile[4].length() > 0) assertThat(doc.dc_description()[0], containsString(testFile[4]));
                        }
                    } catch (final InterruptedException ex) {}
                    }
		}

        	@Test public void testodtParsers() throws FileNotFoundException, Parser.Failure, MalformedURLException, UnsupportedEncodingException, IOException	{
		final String[][] testFiles = new String[][] {
			// meaning:  filename in test/parsertest, mimetype, title, creator, description,
			new String[]{"umlaute_linux.odt", "application/vnd.oasis.opendocument.text", "Münchner Hofbräuhaus", "", "Kommentar zum Hofbräuhaus"},
			new String[]{"umlaute_linux.ods", "application/vnd.oasis.opendocument.spreadsheat", "", "", ""},
			new String[]{"umlaute_linux.odp", "application/vnd.oasis.opendocument.presentation", "", "", ""},
		};

		for (final String[] testFile : testFiles) {
                    try {
                        final String filename = "test/parsertest/" + testFile[0];
                        final File file = new File(filename);
                        final String mimetype = testFile[1];
                        final AnchorURL url = new AnchorURL("http://localhost/"+filename);

                        AbstractParser p = new odtParser();
                        final Document[] docs = p.parse(url, mimetype, null, new VocabularyScraper(), 0, new FileInputStream(file));
                        for (final Document doc: docs) {
                            final Reader content = new InputStreamReader(doc.getTextStream(), doc.getCharset());
                            final StringBuilder str = new StringBuilder();
                            int c;
                            while( (c = content.read()) != -1 )
                                    str.append((char)c);

                            System.out.println("Parsed " + filename + ": " + str);
                            assertThat(str.toString(), containsString("In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen"));
                            assertThat(doc.dc_title(), containsString(testFile[2]));
                            assertThat(doc.dc_creator(), containsString(testFile[3]));
                            if (testFile[4].length() > 0) assertThat(doc.dc_description()[0], containsString(testFile[4]));
                        }
                    } catch (final InterruptedException ex) {}
                    }
		}

        	@Test public void testpdfParsers() throws FileNotFoundException, Parser.Failure, MalformedURLException, UnsupportedEncodingException, IOException	{
		final String[][] testFiles = new String[][] {
                        // meaning:  filename in test/parsertest, mimetype, title, creator, description,
			new String[]{"umlaute_linux.pdf", "application/pdf", "", "", ""},
		};

		for (final String[] testFile : testFiles) {
                    try {
                        final String filename = "test/parsertest/" + testFile[0];
                        final File file = new File(filename);
                        final String mimetype = testFile[1];
                        final AnchorURL url = new AnchorURL("http://localhost/"+filename);

                        AbstractParser p = new pdfParser();
                        final Document[] docs = p.parse(url, mimetype, null, new VocabularyScraper(), 0, new FileInputStream(file));
                        for (final Document doc: docs) {
                            final Reader content = new InputStreamReader(doc.getTextStream(), doc.getCharset());
                            final StringBuilder str = new StringBuilder();
                            int c;
                            while( (c = content.read()) != -1 )
                                    str.append((char)c);

                            System.out.println("Parsed " + filename + ": " + str);
                            assertThat(str.toString(), containsString("In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen"));
                            assertThat(doc.dc_title(), containsString(testFile[2]));
                            assertThat(doc.dc_creator(), containsString(testFile[3]));
                            if (testFile[4].length() > 0) assertThat(doc.dc_description()[0], containsString(testFile[4]));
                        }
                    } catch (final InterruptedException ex) {}
                    }
		}

        	@Test public void testdocParsers() throws FileNotFoundException, Parser.Failure, MalformedURLException, UnsupportedEncodingException, IOException	{
		final String[][] testFiles = new String[][] {
			// meaning:  filename in test/parsertest, mimetype, title, creator, description,
			new String[]{"umlaute_windows.doc", "application/msword", "", "", ""},
		};

		for (final String[] testFile : testFiles) {
                    try {
                        final String filename = "test/parsertest/" + testFile[0];
                        final File file = new File(filename);
                        final String mimetype = testFile[1];
                        final AnchorURL url = new AnchorURL("http://localhost/"+filename);

                        AbstractParser p = new docParser();
                        final Document[] docs = p.parse(url, mimetype, null, new VocabularyScraper(), 0, new FileInputStream(file));
                        for (final Document doc: docs) {
                            final Reader content = new InputStreamReader(doc.getTextStream(), doc.getCharset());
                            final StringBuilder str = new StringBuilder();
                            int c;
                            while( (c = content.read()) != -1 )
                                    str.append((char)c);

                            System.out.println("Parsed " + filename + ": " + str);
                            assertThat(str.toString(), containsString("In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen"));
                            assertThat(doc.dc_title(), containsString(testFile[2]));
                            assertThat(doc.dc_creator(), containsString(testFile[3]));
                            if (testFile[4].length() > 0) assertThat(doc.dc_description()[0], containsString(testFile[4]));
                        }
                    } catch (final InterruptedException ex) {}
                    }
		}

    /**
     * Powerpoint parser test *
     */
    @Test
    public void testpptParsers() throws FileNotFoundException, Parser.Failure, MalformedURLException, UnsupportedEncodingException, IOException, InterruptedException {
        final String[][] testFiles = new String[][]{
            // meaning:  filename in test/parsertest, mimetype, title, creator, description,
            new String[]{"umlaute_linux.ppt", "application/powerpoint", "In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen", "", ""},
            new String[]{"umlaute_windows.ppt", "application/powerpoint", "In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen", "afieg", ""},
            new String[]{"umlaute_mac.ppt", "application/powerpoint", "In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen", "Bob", ""}
        };

        for (final String[] testFile : testFiles) {

            final String filename = "test/parsertest/" + testFile[0];
            final File file = new File(filename);
            final String mimetype = testFile[1];
            final AnchorURL url = new AnchorURL("http://localhost/" + filename);

            AbstractParser p = new pptParser();
            final Document[] docs = p.parse(url, mimetype, null, new VocabularyScraper(), 0, new FileInputStream(file));
            for (final Document doc : docs) {
                final Reader content = new InputStreamReader(doc.getTextStream(), doc.getCharset());
                final StringBuilder str = new StringBuilder();
                int c;
                while ((c = content.read()) != -1) {
                    str.append((char) c);
                }

                System.out.println("Parsed " + filename + ": " + str);
                assertThat(str.toString(), containsString("In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen"));
                assertThat(doc.dc_title(), containsString(testFile[2]));
                assertThat(doc.dc_creator(), containsString(testFile[3]));
                if (testFile[4].length() > 0) {
                    assertThat(doc.dc_description()[0], containsString(testFile[4]));
                }
            }
        }
    }
	}