From 936e976c23634e164274ebae1d87a42b08005b13 Mon Sep 17 00:00:00 2001 From: low012 Date: Mon, 27 Dec 2010 20:13:31 +0000 Subject: [PATCH] *) added FreeMind (http://freemind.sourceforge.net/) mindmap parser *) minor changes git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7397 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- defaults/httpd.mime | 1 + source/net/yacy/document/Classification.java | 10 +- source/net/yacy/document/TextParser.java | 4 +- source/net/yacy/document/parser/mmParser.java | 122 ++++++++++++++++++ .../net/yacy/document/parser/swfParser.java | 5 +- 5 files changed, 135 insertions(+), 7 deletions(-) create mode 100644 source/net/yacy/document/parser/mmParser.java diff --git a/defaults/httpd.mime b/defaults/httpd.mime index cd4cfb67e..c90fde148 100644 --- a/defaults/httpd.mime +++ b/defaults/httpd.mime @@ -48,6 +48,7 @@ lzh = application/x-lzh m4v = video/x-m4v mf = application/octet-stream mov = video/quicktime +mm = application/freemind mp2 = audio/mpeg mp3 = audio/mpeg mp4 = video/mp4 diff --git a/source/net/yacy/document/Classification.java b/source/net/yacy/document/Classification.java index 9f0679942..65118f692 100644 --- a/source/net/yacy/document/Classification.java +++ b/source/net/yacy/document/Classification.java @@ -32,11 +32,11 @@ import net.yacy.kelondro.logging.Log; public class Classification { - private static final HashSet mediaExtSet = new HashSet(); - private static final HashSet imageExtSet = new HashSet(); - private static final HashSet audioExtSet = new HashSet(); - private static final HashSet videoExtSet = new HashSet(); - private static final HashSet appsExtSet = new HashSet(); + private static final Set mediaExtSet = new HashSet(); + private static final Set imageExtSet = new HashSet(); + private static final Set audioExtSet = new HashSet(); + private static final Set videoExtSet = new HashSet(); + private static final Set appsExtSet = new HashSet(); private static final Properties ext2mime = new Properties(); diff 
--git a/source/net/yacy/document/TextParser.java b/source/net/yacy/document/TextParser.java index 2f20a4db1..269f0eddc 100644 --- a/source/net/yacy/document/TextParser.java +++ b/source/net/yacy/document/TextParser.java @@ -57,6 +57,7 @@ import net.yacy.document.parser.vsdParser; import net.yacy.document.parser.xlsParser; import net.yacy.document.parser.zipParser; import net.yacy.document.parser.images.genericImageParser; +import net.yacy.document.parser.mmParser; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.FileUtils; @@ -79,6 +80,7 @@ public final class TextParser { initParser(new gzipParser()); initParser(new htmlParser()); initParser(new genericImageParser()); + initParser(new mmParser()); initParser(new odtParser()); initParser(new ooxmlParser()); initParser(new pdfParser()); @@ -263,7 +265,7 @@ public final class TextParser { } if (docs == null) { - if (failedParser.size() == 0) { + if (failedParser.isEmpty()) { final String errorMsg = "Parsing content with file extension '" + location.getFileExtension() + "' and mimetype '" + mimeType + "' failed."; //log.logWarning("Unable to parse '" + location + "'. " + errorMsg); throw new Parser.Failure(errorMsg, location); diff --git a/source/net/yacy/document/parser/mmParser.java b/source/net/yacy/document/parser/mmParser.java new file mode 100644 index 000000000..ba0e2bb33 --- /dev/null +++ b/source/net/yacy/document/parser/mmParser.java @@ -0,0 +1,122 @@ +/** + * mmParser + * Copyright 2010 by Marc Nause, marc.nause@gmx.de, Braunschweig, Germany + * First released 27.12.2010 at http://yacy.net + * + * $LastChangedDate$ + * $LastChangedRevision$ + * $LastChangedBy$ + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. 
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program in the file lgpl21.txt
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+package net.yacy.document.parser;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.List;
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+import org.xml.sax.Attributes;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import net.yacy.cora.document.MultiProtocolURI;
+import net.yacy.document.AbstractParser;
+import net.yacy.document.Document;
+import net.yacy.document.Parser;
+
+// this is a new implementation of this parser idiom using multiple documents as result set
+
+/**
+ * Parser for FreeMind (http://freemind.sourceforge.net/) mind map files.
+ *
+ * Registers itself for the file extension "mm" and the MIME type
+ * "application/freemind". The XML source is read with SAX and the value of
+ * the TEXT attribute of every element that carries one is collected in
+ * document order.
+ */
+public class mmParser extends AbstractParser implements Parser {
+
+    public mmParser() {
+        super("FreeMind Parser");
+        SUPPORTED_EXTENSIONS.add("mm");
+        SUPPORTED_MIME_TYPES.add("application/freemind");
+    }
+
+    /**
+     * Extracts the text of a FreeMind mind map.
+     *
+     * The collected TEXT values are concatenated, each followed by ". ", and
+     * handed to the resulting document as UTF-8 encoded bytes. The first
+     * collected value (or the empty string if there is none) is handed over
+     * separately; presumably it serves as the document title -- confirm
+     * against the Document constructor.
+     *
+     * @param location URL of the mind map; also used in failure messages
+     * @param mimeType MIME type passed through to the resulting document
+     * @param charset  not used; the SAX parser receives the raw byte stream
+     * @param source   stream the XML is read from
+     * @return an array holding exactly one document
+     * @throws Parser.Failure if the XML parser cannot be created or the
+     *         source cannot be read or is not well-formed XML
+     */
+    public Document[] parse(final MultiProtocolURI location, final String mimeType,
+            final String charset, final InputStream source)
+            throws Parser.Failure, InterruptedException
+    {
+        final FreeMindHandler freeMindHandler = new FreeMindHandler();
+
+        try {
+            final SAXParser saxParser = SAXParserFactory.newInstance().newSAXParser();
+            saxParser.parse(source, freeMindHandler);
+
+            final List<String> nodeTextList = freeMindHandler.getNodeText();
+            final String rootElementText = nodeTextList.isEmpty() ? "" : nodeTextList.get(0);
+
+            final StringBuilder sb = new StringBuilder();
+            for (final String nodeText : nodeTextList) {
+                sb.append(nodeText).append(". ");
+            }
+
+            return new Document[]{new Document(
+                    location,
+                    mimeType,
+                    "UTF-8",
+                    null,
+                    null,
+                    rootElementText,
+                    null,
+                    null,
+                    null,
+                    null,
+                    sb.toString().getBytes("UTF-8"),
+                    null,
+                    null,
+                    null,
+                    false)};
+
+        } catch (ParserConfigurationException ex) {
+            throw failure(location, ex);
+        } catch (SAXException ex) {
+            throw failure(location, ex);
+        } catch (IOException ex) {
+            throw failure(location, ex);
+        }
+    }
+
+    /**
+     * Logs a parsing problem and wraps it into a Parser.Failure, so that the
+     * caller learns about the failure instead of receiving an empty document.
+     */
+    private Parser.Failure failure(final MultiProtocolURI location, final Exception cause) {
+        final String errorMsg = "Unable to parse the FreeMind document '" + location
+                + "': " + cause.getMessage();
+        log.logWarning(errorMsg);
+        return new Parser.Failure(errorMsg, location);
+    }
+
+    /**
+     * SAX handler collecting the TEXT attribute values in document order.
+     * Static because it needs no access to the enclosing parser instance.
+     */
+    private static class FreeMindHandler extends DefaultHandler {
+
+        private final List<String> nodeText = new ArrayList<String>();
+
+        @Override
+        public void startElement(final String uri, final String localName,
+                final String qName, final Attributes attributes) {
+
+            final String textValue = attributes.getValue("TEXT");
+            if (textValue != null) {
+                nodeText.add(textValue);
+            }
+        }
+
+        /**
+         * The source is untrusted content, so external DTDs and external
+         * entities are resolved to an empty stream; the XML parser then has
+         * no reason to open files or network connections on their behalf.
+         */
+        @Override
+        public InputSource resolveEntity(final String publicId, final String systemId) {
+            return new InputSource(new StringReader(""));
+        }
+
+        protected List<String> getNodeText() {
+            return nodeText;
+        }
+
+    }
+}
diff --git a/source/net/yacy/document/parser/swfParser.java b/source/net/yacy/document/parser/swfParser.java
index 31f48734a..079fa1c13 100644
--- a/source/net/yacy/document/parser/swfParser.java
+++ b/source/net/yacy/document/parser/swfParser.java
@@ -55,7 +55,10 @@ public class swfParser extends AbstractParser implements Parser {
      * parses the source documents and returns a plasmaParserDocument containing
      * all extracted information about the parsed document
      */
-    public Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException {
+    public Document[] parse(final MultiProtocolURI location, final String mimeType,
+            final String charset, final InputStream source)
+            throws Parser.Failure, InterruptedException
+    {
 
         try {
             final SWF2HTML swf2html = new SWF2HTML();