|
|
|
/**
|
|
|
|
* svgParser.java
|
|
|
|
* Copyright 2015 by Burkhard Buelte
|
|
|
|
* First released 26.09.2015 at http://yacy.net
|
|
|
|
*
|
|
|
|
* This library is free software; you can redistribute it and/or modify it under
|
|
|
|
* the terms of the GNU Lesser General Public License as published by the Free
|
|
|
|
* Software Foundation; either version 2.1 of the License, or (at your option)
|
|
|
|
* any later version.
|
|
|
|
*
|
|
|
|
* This library is distributed in the hope that it will be useful, but WITHOUT
|
|
|
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
|
|
|
* FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
|
|
|
|
* details.
|
|
|
|
*
|
|
|
|
* You should have received a copy of the GNU Lesser General Public License
|
|
|
|
* along with this program in the file lgpl21.txt If not, see
|
|
|
|
* <http://www.gnu.org/licenses/>.
|
|
|
|
*/
|
|
|
|
package net.yacy.document.parser.images;
|
|
|
|
|
|
|
|
import java.io.EOFException;
|
|
|
|
import java.io.InputStream;
|
|
|
|
import java.nio.charset.StandardCharsets;
|
|
|
|
import java.util.LinkedHashMap;
|
|
|
|
|
|
|
|
import javax.xml.parsers.ParserConfigurationException;
|
|
|
|
import javax.xml.parsers.SAXParser;
|
|
|
|
import javax.xml.parsers.SAXParserFactory;
|
|
|
|
|
|
|
|
import net.yacy.cora.document.id.DigestURL;
|
|
|
|
import net.yacy.cora.document.id.MultiProtocolURL;
|
|
|
|
import net.yacy.cora.util.ConcurrentLog;
|
|
|
|
import net.yacy.cora.util.NumberTools;
|
|
|
|
import net.yacy.document.AbstractParser;
|
|
|
|
import net.yacy.document.Document;
|
|
|
|
import net.yacy.document.Parser;
|
|
|
|
import net.yacy.document.VocabularyScraper;
|
|
|
|
import net.yacy.document.parser.html.ImageEntry;
|
|
|
|
|
|
|
|
import org.xml.sax.Attributes;
|
|
|
|
import org.xml.sax.SAXException;
|
|
|
|
import org.xml.sax.helpers.DefaultHandler;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Metadata parser for svg image files (which are xml files) SVG 1.1 (Second Edition)
|
|
|
|
* http://www.w3.org/TR/SVG/metadata.html#MetadataElement according to SVG 1.1
|
|
|
|
* parser stops parsing after the first metadata elment has been read and
|
|
|
|
* document level metadata are expected picture data (as proposed in spec) like
|
|
|
|
* <svg>
|
|
|
|
* <title></title>
|
|
|
|
* <desc></desc>
|
|
|
|
* <metadata></metadata>
|
|
|
|
* <... other/>
|
|
|
|
* </svg>
|
|
|
|
*/
|
|
|
|
public class svgParser extends AbstractParser implements Parser {
|
|
|
|
|
|
|
|
public svgParser() {
|
|
|
|
super("SVG Image Parser");
|
|
|
|
this.SUPPORTED_EXTENSIONS.add("svg");
|
|
|
|
this.SUPPORTED_MIME_TYPES.add("image/svg+xml");
|
|
|
|
}
|
|
|
|
|
|
|
|
private static final ThreadLocal<SAXParser> tlSax = new ThreadLocal<SAXParser>();
|
|
|
|
|
|
|
|
private static SAXParser getParser() throws SAXException {
|
|
|
|
SAXParser parser = tlSax.get();
|
|
|
|
if (parser == null) {
|
|
|
|
try {
|
|
|
|
parser = SAXParserFactory.newInstance().newSAXParser();
|
|
|
|
} catch (final ParserConfigurationException e) {
|
|
|
|
throw new SAXException(e.getMessage(), e);
|
|
|
|
}
|
|
|
|
tlSax.set(parser);
|
|
|
|
}
|
|
|
|
return parser;
|
|
|
|
}
|
|
|
|
|
|
|
|
@Override
|
|
|
|
public Document[] parse(
|
|
|
|
final DigestURL location,
|
|
|
|
final String mimeType,
|
|
|
|
final String charset,
|
|
|
|
final VocabularyScraper scraper,
|
|
|
|
final int timezoneOffset,
|
|
|
|
final InputStream source) throws Parser.Failure, InterruptedException {
|
|
|
|
|
|
|
|
try {
|
|
|
|
final SAXParser saxParser = getParser();
|
|
|
|
final svgMetaDataHandler metaData = new svgMetaDataHandler();
|
|
|
|
try {
|
|
|
|
saxParser.parse(source, metaData);
|
|
|
|
} catch (SAXException e) {
|
|
|
|
// catch EOFException which is intentionally thrown after capturing metadata to skip further reading (not a error, just a way to get out of SAX)
|
|
|
|
if (e.getException() == null || !(e.getException() instanceof EOFException)) {
|
|
|
|
throw new Parser.Failure("Unexpected error while parsing svg file. " + e.getMessage(), location);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
String docTitle = metaData.getTitle();
|
|
|
|
if (docTitle == null) { // use filename like in genericParser
|
|
|
|
docTitle = location.getFileName().isEmpty() ? location.toTokens() : MultiProtocolURL.unescape(location.getFileName()); //
|
|
|
|
}
|
|
|
|
String docDescription = metaData.getDescription();
|
|
|
|
if (docDescription == null) { // use url token as in genericParser
|
|
|
|
docDescription = location.toTokens();
|
|
|
|
}
|
|
|
|
|
|
|
|
LinkedHashMap<DigestURL, ImageEntry> images = null;
|
|
|
|
// add this image to the map of images to register size (as in genericImageParser)
|
|
|
|
if (metaData.getHeight() != null && metaData.getWidth() != null) {
|
|
|
|
images = new LinkedHashMap<DigestURL, ImageEntry>();
|
|
|
|
images.put(location, new ImageEntry(location, "", metaData.getWidth(), metaData.getHeight(), -1));
|
|
|
|
}
|
|
|
|
|
|
|
|
// create the parser document
|
|
|
|
Document[] docs = new Document[]{new Document(
|
|
|
|
location,
|
|
|
|
mimeType,
|
|
|
|
StandardCharsets.UTF_8.name(),
|
|
|
|
this,
|
|
|
|
null,
|
|
|
|
null,
|
|
|
|
AbstractParser.singleList(docTitle),
|
|
|
|
null,
|
|
|
|
"",
|
|
|
|
null,
|
|
|
|
null,
|
|
|
|
0.0d, 0.0d,
|
|
|
|
docDescription, // text - for this image description is best text we have
|
|
|
|
null,
|
|
|
|
null,
|
|
|
|
images,
|
|
|
|
false,
|
|
|
|
null)};
|
|
|
|
return docs;
|
|
|
|
} catch (final Exception e) {
|
|
|
|
if (e instanceof InterruptedException) {
|
|
|
|
throw (InterruptedException) e;
|
|
|
|
}
|
|
|
|
if (e instanceof Parser.Failure) {
|
|
|
|
throw (Parser.Failure) e;
|
|
|
|
}
|
|
|
|
|
|
|
|
ConcurrentLog.logException(e);
|
|
|
|
throw new Parser.Failure("Unexpected error while parsing odt file. " + e.getMessage(), location);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* SAX handler for svg metadata
|
|
|
|
*/
|
|
|
|
public class svgMetaDataHandler extends DefaultHandler {
|
|
|
|
|
|
|
|
private final StringBuilder buffer = new StringBuilder();
|
|
|
|
private boolean scrapeMetaData = false; // true if within metadata tag
|
|
|
|
private boolean svgStartTagFound = false; // switch to recognize start tag processing, to cancel parsing on wrong tag
|
|
|
|
|
|
|
|
private String docTitle = null; // document level title
|
|
|
|
private String docDescription = null; // document level description
|
|
|
|
private String imgWidth = null; // size in pixel
|
|
|
|
private String imgHeight = null;
|
|
|
|
|
|
|
|
public svgMetaDataHandler() {
|
|
|
|
}
|
|
|
|
|
|
|
|
@Override
|
|
|
|
public void characters(final char ch[], final int start, final int length) {
|
|
|
|
buffer.append(ch, start, length);
|
|
|
|
}
|
|
|
|
|
|
|
|
@Override
|
|
|
|
public void startElement(final String uri, final String name, final String tag, final Attributes atts) throws SAXException {
|
|
|
|
if (scrapeMetaData) {
|
|
|
|
// not implemented yet TODO: interprete RDF content
|
|
|
|
// may contain RDF + DC, DC, CC ...
|
|
|
|
} else {
|
|
|
|
if (tag != null) {
|
|
|
|
switch (tag) {
|
|
|
|
case "svg":
|
|
|
|
svgStartTagFound = true;
|
|
|
|
imgHeight = atts.getValue("height");
|
|
|
|
imgWidth = atts.getValue("width");
|
|
|
|
break;
|
|
|
|
case "metadata":
|
|
|
|
scrapeMetaData = true;
|
|
|
|
break;
|
|
|
|
// some common graph elements as stop condition (skip reading remainder of input), metadata is expected before graphic content
|
|
|
|
case "g":
|
|
|
|
case "line":
|
|
|
|
case "path":
|
|
|
|
case "rect":
|
|
|
|
throw new SAXException("EOF svg Metadata", new EOFException());
|
|
|
|
default : { // K.O. criteria, start tag is not svg, fail parser on none svg
|
|
|
|
if (!svgStartTagFound) {
|
|
|
|
throw new SAXException("not a svg file, start tag "+tag, new Failure());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
buffer.delete(0, buffer.length());
|
|
|
|
}
|
|
|
|
|
|
|
|
@Override
|
|
|
|
public void endElement(final String uri, final String name, final String tag) throws SAXException {
|
|
|
|
if (scrapeMetaData) {
|
|
|
|
// stop condition, scrape only first metadata element
|
|
|
|
if ("metadata".equals(tag)) {
|
|
|
|
scrapeMetaData = false;
|
|
|
|
buffer.delete(0, buffer.length());
|
|
|
|
// we have read metadate, other data are not of interest here, end parsing
|
|
|
|
throw new SAXException("EOF svg Metadata", new EOFException());
|
|
|
|
}
|
|
|
|
} else if ("title".equals(tag)) {
|
|
|
|
this.docTitle = buffer.toString();
|
|
|
|
} else if ("desc".equals(tag)) {
|
|
|
|
this.docDescription = buffer.toString();
|
|
|
|
}
|
|
|
|
buffer.delete(0, buffer.length());
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* @return document level title or null
|
|
|
|
*/
|
|
|
|
public String getTitle() {
|
|
|
|
return docTitle;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* @return document level description or null
|
|
|
|
*/
|
|
|
|
public String getDescription() {
|
|
|
|
return docDescription;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* @return image width in pixel or null
|
|
|
|
*/
|
|
|
|
public Integer getWidth() {
|
|
|
|
if (imgWidth != null) {
|
|
|
|
// return number if given in pixel or a number only, return nothing for size like "100%"
|
|
|
|
if ((imgWidth.indexOf("px") > 0) || ((imgWidth.charAt(imgWidth.length() - 1) >= '0' && imgWidth.charAt(imgWidth.length() - 1) <= '9'))) {
|
|
|
|
return NumberTools.parseIntDecSubstring(imgWidth);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return null;
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* @return image height in pixel or null
|
|
|
|
*/
|
|
|
|
public Integer getHeight() {
|
|
|
|
if (imgHeight != null) {
|
|
|
|
// return number if given in pixel or a number only, return nothing for size like "100%"
|
|
|
|
if ((imgHeight.indexOf("px") > 0) || ((imgHeight.charAt(imgHeight.length() - 1) >= '0' && imgHeight.charAt(imgHeight.length() - 1) <= '9'))) {
|
|
|
|
return NumberTools.parseIntDecSubstring(imgHeight);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return null;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|