add svgParser to parse metadata from svg images

Reads the document-level title and description and skips the graphic content to save bandwidth.
The content of the svg metadata element is not interpreted yet.
- remove rdfParser from init (current function identical with genericParser)
pull/18/head
reger 9 years ago
parent bad34804fe
commit c647d899e3

@ -66,6 +66,7 @@ import net.yacy.document.parser.xlsParser;
import net.yacy.document.parser.zipParser;
import net.yacy.document.parser.images.genericImageParser;
import net.yacy.document.parser.images.metadataImageParser;
import net.yacy.document.parser.images.svgParser;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.MemoryControl;
@ -105,6 +106,7 @@ public final class TextParser {
initParser(new rtfParser());
initParser(new sevenzipParser());
initParser(new sidAudioParser());
initParser(new svgParser());
initParser(new swfParser());
initParser(new tarParser());
initParser(new torrentParser());

@ -0,0 +1,257 @@
/**
* svgParser.java
* Copyright 2015 by Burkhard Buelte
* First released 26.09.2015 at http://yacy.net
*
* This library is free software; you can redistribute it and/or modify it under
* the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 2.1 of the License, or (at your option)
* any later version.
*
* This library is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
* details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt If not, see
* <http://www.gnu.org/licenses/>.
*/
package net.yacy.document.parser.images;
import java.io.EOFException;
import java.io.InputStream;
import java.util.LinkedHashMap;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.NumberTools;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.ImageEntry;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
/**
 * Metadata parser for SVG image files (which are XML files), written against
 * SVG 1.1 (Second Edition), see
 * http://www.w3.org/TR/SVG/metadata.html#MetadataElement
 *
 * Only document-level metadata is read. It is expected before the picture
 * data (as proposed in the spec), like
 * <pre>{@code
 * <svg>
 *   <title></title>
 *   <desc></desc>
 *   <metadata></metadata>
 *   <... other/>
 * </svg>
 * }</pre>
 * Parsing is aborted as soon as the first metadata element has been closed or
 * the first common graphic element (g, line, path, rect) is opened, so the
 * remainder of the input is not read.
 */
public class svgParser extends AbstractParser implements Parser {

    /**
     * Registers the parser for the "svg" file extension and the
     * "image/svg+xml" mime type.
     */
    public svgParser() {
        super("SVG Image Parser");
        this.SUPPORTED_EXTENSIONS.add("svg");
        this.SUPPORTED_MIME_TYPES.add("image/svg+xml");
    }

    /** One SAX parser instance per thread, created on first use by {@link #getParser()}. */
    private static final ThreadLocal<SAXParser> tlSax = new ThreadLocal<SAXParser>();

    /**
     * Returns the SAX parser of the current thread, creating it on first use.
     * The parser is configured not to resolve external entities and not to
     * load an external DTD: svg files are untrusted input and commonly carry
     * a DOCTYPE pointing to a remote DTD that is not needed to read metadata.
     *
     * @return the SAX parser bound to the current thread
     * @throws SAXException if no parser can be created
     */
    private static SAXParser getParser() throws SAXException {
        SAXParser parser = tlSax.get();
        if (parser == null) {
            try {
                final SAXParserFactory factory = SAXParserFactory.newInstance();
                disableFeature(factory, "http://xml.org/sax/features/external-general-entities");
                disableFeature(factory, "http://xml.org/sax/features/external-parameter-entities");
                disableFeature(factory, "http://apache.org/xml/features/nonvalidating/load-external-dtd");
                parser = factory.newSAXParser();
            } catch (final ParserConfigurationException e) {
                throw new SAXException(e.getMessage(), e);
            }
            tlSax.set(parser);
        }
        return parser;
    }

    /**
     * Switches a parser feature off. Best effort: a parser implementation
     * that does not know the feature is still used, the problem is only logged.
     *
     * @param factory the factory to configure
     * @param feature the feature URI to set to false
     */
    private static void disableFeature(final SAXParserFactory factory, final String feature) {
        try {
            factory.setFeature(feature, false);
        } catch (final ParserConfigurationException | SAXException e) {
            ConcurrentLog.logException(e);
        }
    }

    /**
     * Converts a svg width/height attribute value to a pixel number.
     *
     * @param size the raw attribute value, may be null or empty
     * @return the number if the value contains "px" (not at the first position)
     *         or ends with a digit, null otherwise (e.g. for "100%")
     */
    private static Integer parsePixelSize(final String size) {
        if (size == null || size.isEmpty()) {
            // attribute missing or given as width="" - nothing to parse
            return null;
        }
        final char last = size.charAt(size.length() - 1);
        if (size.indexOf("px") > 0 || (last >= '0' && last <= '9')) {
            try {
                return NumberTools.parseIntDecSubstring(size);
            } catch (final NumberFormatException e) {
                // NOTE(review): assumes parseIntDecSubstring may reject values like "12.5px" - confirm;
                // an unreadable size must not make the whole metadata parse fail
                return null;
            }
        }
        return null;
    }

    /**
     * Reads title, description and size from the svg input and wraps them in
     * a single document. The file name (or the url tokens) is used as title
     * and the url tokens are used as text if the svg does not provide them.
     *
     * @param location url of the svg file
     * @param mimeType mime type stored in the resulting document
     * @param charset not used, the resulting document is declared as "UTF-8"
     * @param scraper not used
     * @param timezoneOffset not used
     * @param source the svg content
     * @return an array with exactly one document
     * @throws Parser.Failure if the input can not be parsed
     * @throws InterruptedException rethrown if raised while parsing
     */
    @Override
    public Document[] parse(
            final AnchorURL location,
            final String mimeType,
            final String charset,
            final VocabularyScraper scraper,
            final int timezoneOffset,
            final InputStream source) throws Parser.Failure, InterruptedException {
        try {
            final SAXParser saxParser = getParser();
            final svgMetaDataHandler metaData = new svgMetaDataHandler();
            try {
                saxParser.parse(source, metaData);
            } catch (final SAXException e) {
                // an EOFException is intentionally thrown by the handler after capturing the metadata
                // to skip further reading (not an error, just a way to get out of SAX)
                if (!(e.getException() instanceof EOFException)) {
                    throw new Parser.Failure("Unexpected error while parsing svg file. " + e.getMessage(), location);
                }
            }

            String docTitle = metaData.getTitle();
            if (docTitle == null) { // use filename like in genericParser
                docTitle = location.getFileName().isEmpty() ? location.toTokens() : MultiProtocolURL.unescape(location.getFileName());
            }

            String docDescription = metaData.getDescription();
            if (docDescription == null) { // use url token as in genericParser
                docDescription = location.toTokens();
            }

            // add this image to the map of images to register size (as in genericImageParser)
            LinkedHashMap<DigestURL, ImageEntry> images = null;
            final Integer width = metaData.getWidth();
            final Integer height = metaData.getHeight();
            if (height != null && width != null) {
                images = new LinkedHashMap<DigestURL, ImageEntry>();
                images.put(location, new ImageEntry(location, "", width, height, -1));
            }

            // create the parser document
            Document[] docs = new Document[]{new Document(
                location,
                mimeType,
                "UTF-8",
                this,
                null,
                null,
                AbstractParser.singleList(docTitle),
                null,
                "",
                null,
                null,
                0.0f, 0.0f,
                docDescription, // text - for this image description is best text we have
                null,
                null,
                images,
                false,
                null)};
            return docs;
        } catch (final Exception e) {
            if (e instanceof InterruptedException) {
                throw (InterruptedException) e;
            }
            if (e instanceof Parser.Failure) {
                throw (Parser.Failure) e;
            }
            ConcurrentLog.logException(e);
            throw new Parser.Failure("Unexpected error while parsing svg file. " + e.getMessage(), location);
        }
    }

    /**
     * SAX handler for svg metadata. Collects the size attributes of the root
     * svg element and the text of the title and desc elements that are direct
     * children of the root element. Aborts parsing by throwing a SAXException
     * wrapping an EOFException once the metadata element is closed or a
     * graphic element is opened.
     */
    public class svgMetaDataHandler extends DefaultHandler {

        private final StringBuilder buffer = new StringBuilder(); // text of the element currently read
        private boolean scrapeMetaData = false; // true if within metadata tag
        private int depth = 0; // nesting level of the current element, root element = 1
        private String docTitle = null; // document level title
        private String docDescription = null; // document level description
        private String imgWidth = null; // size in pixel
        private String imgHeight = null;

        public svgMetaDataHandler() {
        }

        @Override
        public void characters(final char ch[], final int start, final int length) {
            buffer.append(ch, start, length);
        }

        @Override
        public void startElement(final String uri, final String name, final String tag, final Attributes atts) throws SAXException {
            depth++;
            if (scrapeMetaData) {
                // not implemented yet TODO: interpret RDF content
                // may contain RDF + DC, DC, CC ...
            } else if (tag != null) {
                switch (tag) {
                    case "svg":
                        // only the root element describes the image, a nested svg must not overwrite its size
                        if (depth == 1) {
                            imgHeight = atts.getValue("height");
                            imgWidth = atts.getValue("width");
                        }
                        break;
                    case "metadata":
                        scrapeMetaData = true;
                        break;
                    // some common graph elements as stop condition (skip reading remainder of input),
                    // metadata is expected before graphic content
                    case "g":
                    case "line":
                    case "path":
                    case "rect":
                        throw new SAXException("EOF svg Metadata", new EOFException());
                    default:
                        break;
                }
            }
            buffer.setLength(0);
        }

        @Override
        public void endElement(final String uri, final String name, final String tag) throws SAXException {
            final int elementDepth = depth--; // level of the element being closed
            if (scrapeMetaData) {
                // stop condition, scrape only first metadata element
                if ("metadata".equals(tag)) {
                    scrapeMetaData = false;
                    buffer.setLength(0);
                    // we have read the metadata, other data are not of interest here, end parsing
                    throw new SAXException("EOF svg Metadata", new EOFException());
                }
            } else if (elementDepth == 2) {
                // title/desc of the document are direct children of the root element,
                // title/desc of nested graphic elements describe only that element
                if ("title".equals(tag)) {
                    this.docTitle = bufferedText();
                } else if ("desc".equals(tag)) {
                    this.docDescription = bufferedText();
                }
            }
            buffer.setLength(0);
        }

        /**
         * @return the trimmed text collected for the current element or null if it is blank
         */
        private String bufferedText() {
            final String text = buffer.toString().trim();
            return text.isEmpty() ? null : text;
        }

        /**
         * @return document level title or null
         */
        public String getTitle() {
            return docTitle;
        }

        /**
         * @return document level description or null
         */
        public String getDescription() {
            return docDescription;
        }

        /**
         * @return image width in pixel or null
         */
        public Integer getWidth() {
            return parsePixelSize(imgWidth);
        }

        /**
         * @return image height in pixel or null
         */
        public Integer getHeight() {
            return parsePixelSize(imgHeight);
        }
    }
}
Loading…
Cancel
Save