added metadataImageParser for tif and psd (Photoshop) images.

This is a modified genericImageParser that adds tif (and psd) support even if the Java ImageIO plugin for tif is not installed in the JDK. It adds only tif and psd to the available parsers. It uses the same library to extract metadata, so it could eventually be merged with genericImageParser. All detected metadata are added to the parsed document (potentially more than with genericImageParser).
11 years ago · eaccce3467
parent a69f5358ff
commit eaccce3467
3 changed files with 180 additions and 2 deletions
--- a/source/net/yacy/document/TextParser.java
+++ b/source/net/yacy/document/TextParser.java
@ -64,6 +64,7 @@ import net.yacy.document.parser.xlsParser;
 import net.yacy.document.parser.zipParser;
 import net.yacy.document.parser.augment.AugmentParser;
 import net.yacy.document.parser.images.genericImageParser;
+import net.yacy.document.parser.images.metadataImageParser;
 import net.yacy.document.parser.rdfa.impl.RDFaParser;
 import net.yacy.kelondro.util.FileUtils;
 import net.yacy.kelondro.util.MemoryControl;
@ -92,6 +93,7 @@ public final class TextParser {
        if (Switchboard.getSwitchboard().getConfigBool("parserAugmentation.RDFa", true)) initParser(new RDFaParser());          
        initParser(new htmlParser()); // called within rdfa parser
        initParser(new genericImageParser());
+        initParser(new metadataImageParser());
        initParser(new linkScraperParser());
        initParser(new mmParser());
        initParser(new odtParser());
--- a/source/net/yacy/document/parser/images/genericImageParser.java
+++ b/source/net/yacy/document/parser/images/genericImageParser.java
@ -197,7 +197,6 @@ public class genericImageParser extends AbstractParser implements Parser {
        }

        final HashSet<String> languages = new HashSet<String>();
-        final List<AnchorURL> anchors = new ArrayList<AnchorURL>();
        final LinkedHashMap<DigestURL, ImageEntry> images  = new LinkedHashMap<>();
        // add this image to the map of images
        final String infoString = ii.info.toString();
@ -219,7 +218,7 @@ public class genericImageParser extends AbstractParser implements Parser {
             descriptions, // description
             gpslon, gpslat, //  location
             infoString, // content text
-             anchors, // anchors
+             null, // anchors
             null,
             images,
             false,
--- a/source/net/yacy/document/parser/images/metadataImageParser.java
+++ b/source/net/yacy/document/parser/images/metadataImageParser.java
@ -0,0 +1,177 @@
+// metadataImageParser.java
+// (C) 2014 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
+// first published 30.09.2014 on http://yacy.net
+//
+// This is a part of YaCy, a peer-to-peer based web search engine
+//
+// $LastChangedDate$
+// $LastChangedRevision$
+// $LastChangedBy$
+//
+// LICENSE
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+package net.yacy.document.parser.images;
+
+import com.drew.imaging.ImageMetadataReader;
+import com.drew.imaging.ImageProcessingException;
+import com.drew.lang.GeoLocation;
+import com.drew.metadata.Directory;
+import com.drew.metadata.Metadata;
+import com.drew.metadata.Tag;
+import com.drew.metadata.exif.GpsDirectory;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.BufferedInputStream;
+import java.util.ArrayList;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import net.yacy.cora.document.id.AnchorURL;
+import net.yacy.cora.document.id.MultiProtocolURL;
+import net.yacy.document.AbstractParser;
+import net.yacy.document.Document;
+import net.yacy.document.Parser;
+
+
+/**
+ * Image parser based on the drewnoakes.com metadata-extractor library, which supports
+ * metadata extraction from bmp, gif, jpeg, png, psd, tiff.
+ * All discovered metadata are added to the parsed document.
+ *
+ * http://www.drewnoakes.com/drewnoakes.com/code/exif/
+ *
+ * (In contrast to genericImageParser, javax ImageIO is not used,
+ * so that tiff parsing also works where ImageIO does not support it.)
+ */
+public class metadataImageParser extends AbstractParser implements Parser {
+
+    /**
+     * Creates the parser and registers the tif/psd file extensions and the
+     * tiff/photoshop mime types it is responsible for.
+     */
+    public metadataImageParser() {
+        super("Metadata Image Parser");
+
+        SUPPORTED_EXTENSIONS.add("tif");
+        SUPPORTED_EXTENSIONS.add("psd");
+        // only used for ext/mime not covered by genericImageParser's default
+        //SUPPORTED_EXTENSIONS.add("gif");
+        //SUPPORTED_EXTENSIONS.add("jpg");
+        //SUPPORTED_EXTENSIONS.add("jpeg");
+        //SUPPORTED_EXTENSIONS.add("png");
+
+        SUPPORTED_MIME_TYPES.add("image/tiff");
+        SUPPORTED_MIME_TYPES.add("image/vnd.adobe.photoshop");
+        SUPPORTED_MIME_TYPES.add("image/x-photoshop");
+        //SUPPORTED_MIME_TYPES.add("image/gif");
+        //SUPPORTED_MIME_TYPES.add("image/jpeg");
+        //SUPPORTED_MIME_TYPES.add("image/png");
+    }
+
+    /**
+     * Reads the metadata of an image and returns it as a single document.
+     * Every tag with a known name and a non-empty description is appended to
+     * the content text; title, author, keywords and descriptions are picked
+     * from a fixed list of tag names. If no title tag is found, the unescaped
+     * file name of the location is used as title.
+     *
+     * @param location        url of the image
+     * @param mimeType        mime type, passed on to the created document
+     * @param documentCharset not used by this parser
+     * @param sourceStream    image data; wrapped in a buffer, not closed here
+     * @return an array holding exactly one document
+     * @throws Parser.Failure if the metadata cannot be read or extracted
+     * @throws InterruptedException never thrown by the code in this method
+     */
+    @Override
+    public Document[] parse(
+            final AnchorURL location,
+            final String mimeType,
+            final String documentCharset,
+            final InputStream sourceStream) throws Parser.Failure, InterruptedException {
+
+        String title = null;
+        String author = null;
+        String keywords = null;
+        final List<String> descriptions = new ArrayList<String>();
+        double gpslat = 0;
+        double gpslon = 0;
+        final StringBuilder imgInfotxt = new StringBuilder();
+
+        try {
+            final Metadata metadata = ImageMetadataReader.readMetadata(new BufferedInputStream(sourceStream), false);
+
+            final HashMap<String, String> props = new HashMap<String, String>();
+            for (final Directory directory : metadata.getDirectories()) {
+                if (directory instanceof GpsDirectory) { // extracting GPS location
+                    final GeoLocation geoloc = ((GpsDirectory) directory).getGeoLocation();
+                    if (geoloc != null) {
+                        gpslat = geoloc.getLatitude();
+                        gpslon = geoloc.getLongitude();
+                    }
+                    continue;
+                }
+                for (final Tag tag : directory.getTags()) {
+                    final String name = tag.getTagName();
+                    if (name.startsWith("Unknown")) continue; // filter out returned TagName of "Unknown tag"
+                    final String value = tag.getDescription();
+                    // a tag without description carries no information; skipping it keeps the
+                    // literal text "null" out of the content and protects values collected earlier
+                    if (value == null || value.isEmpty()) continue;
+                    props.put(name, value);
+                    imgInfotxt.append(name).append(": ").append(value).append(" .\n");
+                }
+            }
+
+            title = firstNonEmpty(props, "Image Description", "Headline", "Object Name");
+            author = firstNonEmpty(props, "Artist", "Writer/Editor", "By-line", "Credit", "Make");
+            keywords = firstNonEmpty(props, "Keywords", "Category", "Supplemental Category(s)");
+
+            addDescription(descriptions, "Abstract: ", props.get("Caption/Abstract"));
+            addDescription(descriptions, "Location: ", props.get("Country/Primary Location"));
+            addDescription(descriptions, "State: ", props.get("Province/State"));
+            addDescription(descriptions, "Copyright: ", props.get("Copyright Notice"));
+
+        } catch (final ImageProcessingException e) {
+            // keep the underlying reason visible in the failure message
+            throw new Parser.Failure("could not extract image meta data: " + e.getMessage(), location);
+        } catch (final IOException e) {
+            throw new Parser.Failure("IO-Error reading: " + e.getMessage(), location);
+        }
+
+        if (title == null || title.isEmpty()) {
+            title = MultiProtocolURL.unescape(location.getFileName());
+        }
+
+        // comma separated if a comma is present after the first character, otherwise space separated
+        final String[] keywordArray = (keywords == null || keywords.isEmpty())
+                ? new String[]{}
+                : keywords.split(keywords.indexOf(',') > 0 ? "," : " ");
+
+        return new Document[]{new Document(
+            location,
+            mimeType,
+            "UTF-8",
+            this,
+            new HashSet<String>(0), // languages
+            keywordArray, // keywords
+            singleList(title), // title
+            author == null ? "" : author, // author
+            location.getHost(), // Publisher
+            new String[]{}, // sections
+            descriptions, // description
+            gpslon, gpslat, //  location
+            imgInfotxt.toString(), // content text
+            null, // anchors
+            null, // rss
+            null, // images
+            false,
+            new Date())}; // date, set to the time of parsing
+    }
+
+    /**
+     * Returns the value of the first given tag name that has a non-empty
+     * value in props, or null if none of them has one.
+     */
+    private static String firstNonEmpty(final HashMap<String, String> props, final String... tagNames) {
+        for (final String tagName : tagNames) {
+            final String value = props.get(tagName);
+            if (value != null && !value.isEmpty()) return value;
+        }
+        return null;
+    }
+
+    /**
+     * Adds label + value to the description list if value is not null and not empty.
+     */
+    private static void addDescription(final List<String> descriptions, final String label, final String value) {
+        if (value != null && value.length() > 0) descriptions.add(label + value);
+    }
+}