Reads document level included title and description and skips the graphic content to save bandwidth. svg metadata element is not interpreted - remove rdfParser from init (current function identical with genericParser)pull/18/head
parent
bad34804fe
commit
c647d899e3
@ -0,0 +1,257 @@
|
||||
/**
|
||||
* svgParser.java
|
||||
* Copyright 2015 by Burkhard Buelte
|
||||
* First released 26.09.2015 at http://yacy.net
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or modify it under
|
||||
* the terms of the GNU Lesser General Public License as published by the Free
|
||||
* Software Foundation; either version 2.1 of the License, or (at your option)
|
||||
* any later version.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
|
||||
* FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
|
||||
* details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with this program in the file lgpl21.txt If not, see
|
||||
* <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
package net.yacy.document.parser.images;
|
||||
|
||||
import java.io.EOFException;
|
||||
import java.io.InputStream;
|
||||
import java.util.LinkedHashMap;
|
||||
|
||||
import javax.xml.parsers.ParserConfigurationException;
|
||||
import javax.xml.parsers.SAXParser;
|
||||
import javax.xml.parsers.SAXParserFactory;
|
||||
|
||||
import net.yacy.cora.document.id.AnchorURL;
|
||||
import net.yacy.cora.document.id.DigestURL;
|
||||
import net.yacy.cora.document.id.MultiProtocolURL;
|
||||
import net.yacy.cora.util.ConcurrentLog;
|
||||
import net.yacy.cora.util.NumberTools;
|
||||
import net.yacy.document.AbstractParser;
|
||||
import net.yacy.document.Document;
|
||||
import net.yacy.document.Parser;
|
||||
import net.yacy.document.VocabularyScraper;
|
||||
import net.yacy.document.parser.html.ImageEntry;
|
||||
|
||||
import org.xml.sax.Attributes;
|
||||
import org.xml.sax.SAXException;
|
||||
import org.xml.sax.helpers.DefaultHandler;
|
||||
|
||||
/**
|
||||
* Metadata parser for svg image files (which are xml files) SVG 1.1 (Second Edition)
|
||||
* http://www.w3.org/TR/SVG/metadata.html#MetadataElement according to SVG 1.1
|
||||
* parser stops parsing after the first metadata elment has been read and
|
||||
* document level metadata are expected picture data (as proposed in spec) like
|
||||
* <svg>
|
||||
* <title></title>
|
||||
* <desc></desc>
|
||||
* <metadata></metadata>
|
||||
* <... other/>
|
||||
* </svg>
|
||||
*/
|
||||
public class svgParser extends AbstractParser implements Parser {
|
||||
|
||||
public svgParser() {
|
||||
super("SVG Image Parser");
|
||||
this.SUPPORTED_EXTENSIONS.add("svg");
|
||||
this.SUPPORTED_MIME_TYPES.add("image/svg+xml");
|
||||
}
|
||||
|
||||
private static final ThreadLocal<SAXParser> tlSax = new ThreadLocal<SAXParser>();
|
||||
|
||||
private static SAXParser getParser() throws SAXException {
|
||||
SAXParser parser = tlSax.get();
|
||||
if (parser == null) {
|
||||
try {
|
||||
parser = SAXParserFactory.newInstance().newSAXParser();
|
||||
} catch (final ParserConfigurationException e) {
|
||||
throw new SAXException(e.getMessage(), e);
|
||||
}
|
||||
tlSax.set(parser);
|
||||
}
|
||||
return parser;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Document[] parse(
|
||||
final AnchorURL location,
|
||||
final String mimeType,
|
||||
final String charset,
|
||||
final VocabularyScraper scraper,
|
||||
final int timezoneOffset,
|
||||
final InputStream source) throws Parser.Failure, InterruptedException {
|
||||
|
||||
try {
|
||||
final SAXParser saxParser = getParser();
|
||||
final svgMetaDataHandler metaData = new svgMetaDataHandler();
|
||||
try {
|
||||
saxParser.parse(source, metaData);
|
||||
} catch (SAXException e) {
|
||||
// catch EOFException which is intentionally thrown after capturing metadata to skip further reading (not a error, just a way to get out of SAX)
|
||||
if (e.getException() == null || !(e.getException() instanceof EOFException)) {
|
||||
throw new Parser.Failure("Unexpected error while parsing svg file. " + e.getMessage(), location);
|
||||
}
|
||||
}
|
||||
|
||||
String docTitle = metaData.getTitle();
|
||||
if (docTitle == null) { // use filename like in genericParser
|
||||
docTitle = location.getFileName().isEmpty() ? location.toTokens() : MultiProtocolURL.unescape(location.getFileName()); //
|
||||
}
|
||||
String docDescription = metaData.getDescription();
|
||||
if (docDescription == null) { // use url token as in genericParser
|
||||
docDescription = location.toTokens();
|
||||
}
|
||||
|
||||
LinkedHashMap<DigestURL, ImageEntry> images = null;
|
||||
// add this image to the map of images to register size (as in genericImageParser)
|
||||
if (metaData.getHeight() != null && metaData.getWidth() != null) {
|
||||
images = new LinkedHashMap<DigestURL, ImageEntry>();
|
||||
images.put(location, new ImageEntry(location, "", metaData.getWidth(), metaData.getHeight(), -1));
|
||||
}
|
||||
|
||||
// create the parser document
|
||||
Document[] docs = new Document[]{new Document(
|
||||
location,
|
||||
mimeType,
|
||||
"UTF-8",
|
||||
this,
|
||||
null,
|
||||
null,
|
||||
AbstractParser.singleList(docTitle),
|
||||
null,
|
||||
"",
|
||||
null,
|
||||
null,
|
||||
0.0f, 0.0f,
|
||||
docDescription, // text - for this image description is best text we have
|
||||
null,
|
||||
null,
|
||||
images,
|
||||
false,
|
||||
null)};
|
||||
return docs;
|
||||
} catch (final Exception e) {
|
||||
if (e instanceof InterruptedException) {
|
||||
throw (InterruptedException) e;
|
||||
}
|
||||
if (e instanceof Parser.Failure) {
|
||||
throw (Parser.Failure) e;
|
||||
}
|
||||
|
||||
ConcurrentLog.logException(e);
|
||||
throw new Parser.Failure("Unexpected error while parsing odt file. " + e.getMessage(), location);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* SAX handler for svg metadata
|
||||
*/
|
||||
public class svgMetaDataHandler extends DefaultHandler {
|
||||
|
||||
private final StringBuilder buffer = new StringBuilder();
|
||||
private boolean scrapeMetaData = false; // true if within metadata tag
|
||||
|
||||
private String docTitle = null; // document level title
|
||||
private String docDescription = null; // document level description
|
||||
private String imgWidth = null; // size in pixel
|
||||
private String imgHeight = null;
|
||||
|
||||
public svgMetaDataHandler() {
|
||||
}
|
||||
|
||||
@Override
|
||||
public void characters(final char ch[], final int start, final int length) {
|
||||
buffer.append(ch, start, length);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void startElement(final String uri, final String name, final String tag, final Attributes atts) throws SAXException {
|
||||
if (scrapeMetaData) {
|
||||
// not implemented yet TODO: interprete RDF content
|
||||
// may contain RDF + DC, DC, CC ...
|
||||
} else {
|
||||
if (null != tag) {
|
||||
switch (tag) {
|
||||
case "svg":
|
||||
imgHeight = atts.getValue("height");
|
||||
imgWidth = atts.getValue("width");
|
||||
break;
|
||||
case "metadata":
|
||||
scrapeMetaData = true;
|
||||
break;
|
||||
// some common graph elements as stop condition (skip reading remainder of input), metadata is expected before graphic content
|
||||
case "g":
|
||||
case "line":
|
||||
case "path":
|
||||
case "rect":
|
||||
throw new SAXException("EOF svg Metadata", new EOFException());
|
||||
}
|
||||
}
|
||||
}
|
||||
buffer.delete(0, buffer.length());
|
||||
}
|
||||
|
||||
@Override
|
||||
public void endElement(final String uri, final String name, final String tag) throws SAXException {
|
||||
if (scrapeMetaData) {
|
||||
// stop condition, scrape only first metadata element
|
||||
if ("metadata".equals(tag)) {
|
||||
scrapeMetaData = false;
|
||||
buffer.delete(0, buffer.length());
|
||||
// we have read metadate, other data are not of interest here, end parsing
|
||||
throw new SAXException("EOF svg Metadata", new EOFException());
|
||||
}
|
||||
} else if ("title".equals(tag)) {
|
||||
this.docTitle = buffer.toString();
|
||||
} else if ("desc".equals(tag)) {
|
||||
this.docDescription = buffer.toString();
|
||||
}
|
||||
buffer.delete(0, buffer.length());
|
||||
}
|
||||
|
||||
/**
|
||||
* @return document level title or null
|
||||
*/
|
||||
public String getTitle() {
|
||||
return docTitle;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return document level description or null
|
||||
*/
|
||||
public String getDescription() {
|
||||
return docDescription;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return image width in pixel or null
|
||||
*/
|
||||
public Integer getWidth() {
|
||||
if (imgWidth != null) {
|
||||
// return number if given in pixel or a number only, return nothing for size like "100%"
|
||||
if ((imgWidth.indexOf("px") > 0) || ((imgWidth.charAt(imgWidth.length() - 1) >= '0' && imgWidth.charAt(imgWidth.length() - 1) <= '9'))) {
|
||||
return NumberTools.parseIntDecSubstring(imgWidth);
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return image height in pixel or null
|
||||
*/
|
||||
public Integer getHeight() {
|
||||
if (imgHeight != null) {
|
||||
// return number if given in pixel or a number only, return nothing for size like "100%"
|
||||
if ((imgHeight.indexOf("px") > 0) || ((imgHeight.charAt(imgHeight.length() - 1) >= '0' && imgHeight.charAt(imgHeight.length() - 1) <= '9'))) {
|
||||
return NumberTools.parseIntDecSubstring(imgHeight);
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Reference in new issue