added exif parsing for jpg images

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6745 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 15 years ago
parent 82f76e1296
commit 24e5faee75

@ -1,42 +1,43 @@
<?xml version="1.0" encoding="UTF-8"?>
<classpath>
<classpathentry excluding="env/|htdocsdefault/|proxymsg/|yacy/|env/|yacy/user/|yacy/user/|yacy/ui/|processing/domaingraph/applet/|processing/domaingraph/|api/|api/bookmarks/posts/|api/bookmarks/|api/util/|api/bookmarks/xbel/|api/bookmarks/tags/" kind="src" path="htroot"/>
<classpathentry kind="src" path="test"/>
<classpathentry excluding="user/|user/|ui/" kind="src" path="htroot/yacy"/>
<classpathentry kind="src" path="htroot/env"/>
<classpathentry kind="src" path="source"/>
<classpathentry kind="src" path="htroot/yacy/ui"/>
<classpathentry excluding="bookmarks/posts/|bookmarks/|util/|bookmarks/xbel/|bookmarks/tags/" kind="src" path="htroot/api"/>
<classpathentry kind="src" path="htroot/api/bookmarks/posts"/>
<classpathentry excluding="posts/|xbel/|tags/" kind="src" path="htroot/api/bookmarks"/>
<classpathentry kind="src" path="htroot/api/util"/>
<classpathentry kind="src" path="htroot/api/bookmarks/xbel"/>
<classpathentry kind="src" path="htroot/api/bookmarks/tags"/>
<classpathentry exported="true" kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
<classpathentry exported="true" kind="lib" path="lib/commons-httpclient-3.1.jar"/>
<classpathentry exported="true" kind="lib" path="lib/commons-logging-1.1.1.jar"/>
<classpathentry exported="true" kind="lib" path="lib/commons-io-1.4.jar"/>
<classpathentry exported="true" kind="lib" path="lib/commons-fileupload-1.2.1.jar"/>
<classpathentry exported="true" kind="lib" path="lib/servlet-api.jar"/>
<classpathentry exported="true" kind="lib" path="lib/xerces.jar"/>
<classpathentry exported="true" kind="lib" path="lib/bzip2.jar"/>
<classpathentry exported="true" kind="lib" path="lib/J7Zip-modified.jar"/>
<classpathentry exported="true" kind="lib" path="lib/webcat-0.1-swf.jar"/>
<classpathentry exported="true" kind="lib" path="lib/activation.jar"/>
<classpathentry exported="true" kind="lib" path="lib/commons-jxpath-1.3.jar"/>
<classpathentry exported="true" kind="lib" path="libt/junit-4.7.jar"/>
<classpathentry kind="lib" path="lib/fontbox-1.0.0.jar"/>
<classpathentry kind="lib" path="lib/pdfbox-1.0.0.jar"/>
<classpathentry kind="lib" path="lib/poi-3.6-20091214.jar"/>
<classpathentry kind="lib" path="lib/poi-scratchpad-3.6-20091214.jar"/>
<classpathentry kind="lib" path="lib/bcmail-jdk15-145.jar"/>
<classpathentry kind="lib" path="lib/bcprov-jdk15-145.jar"/>
<classpathentry kind="lib" path="lib/jsch-0.1.42.jar"/>
<classpathentry kind="lib" path="lib/jakarta-oro-2.0.8.jar"/>
<classpathentry kind="lib" path="lib/commons-codec-1.4.jar"/>
<classpathentry kind="lib" path="lib/log4j-1.2.15.jar"/>
<classpathentry kind="lib" path="lib/mysql-connector-java-5.1.12-bin.jar"/>
<classpathentry kind="lib" path="lib/jcifs-1.3.14.jar"/>
<classpathentry exported="true" kind="con" path="org.eclipse.jdt.junit.JUNIT_CONTAINER/4"/>
<classpathentry kind="output" path="gen"/>
</classpath>
<?xml version="1.0" encoding="UTF-8"?>
<classpath>
<classpathentry excluding="env/|htdocsdefault/|proxymsg/|yacy/|env/|yacy/user/|yacy/user/|yacy/ui/|processing/domaingraph/applet/|processing/domaingraph/|api/|api/bookmarks/posts/|api/bookmarks/|api/util/|api/bookmarks/xbel/|api/bookmarks/tags/" kind="src" path="htroot"/>
<classpathentry kind="src" path="test"/>
<classpathentry excluding="user/|user/|ui/" kind="src" path="htroot/yacy"/>
<classpathentry kind="src" path="htroot/env"/>
<classpathentry kind="src" path="source"/>
<classpathentry kind="src" path="htroot/yacy/ui"/>
<classpathentry excluding="bookmarks/posts/|bookmarks/|util/|bookmarks/xbel/|bookmarks/tags/" kind="src" path="htroot/api"/>
<classpathentry kind="src" path="htroot/api/bookmarks/posts"/>
<classpathentry excluding="posts/|xbel/|tags/" kind="src" path="htroot/api/bookmarks"/>
<classpathentry kind="src" path="htroot/api/util"/>
<classpathentry kind="src" path="htroot/api/bookmarks/xbel"/>
<classpathentry kind="src" path="htroot/api/bookmarks/tags"/>
<classpathentry exported="true" kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
<classpathentry exported="true" kind="lib" path="lib/commons-httpclient-3.1.jar"/>
<classpathentry exported="true" kind="lib" path="lib/commons-logging-1.1.1.jar"/>
<classpathentry exported="true" kind="lib" path="lib/commons-io-1.4.jar"/>
<classpathentry exported="true" kind="lib" path="lib/commons-fileupload-1.2.1.jar"/>
<classpathentry exported="true" kind="lib" path="lib/servlet-api.jar"/>
<classpathentry exported="true" kind="lib" path="lib/xerces.jar"/>
<classpathentry exported="true" kind="lib" path="lib/bzip2.jar"/>
<classpathentry exported="true" kind="lib" path="lib/J7Zip-modified.jar"/>
<classpathentry exported="true" kind="lib" path="lib/webcat-0.1-swf.jar"/>
<classpathentry exported="true" kind="lib" path="lib/activation.jar"/>
<classpathentry exported="true" kind="lib" path="lib/commons-jxpath-1.3.jar"/>
<classpathentry exported="true" kind="lib" path="libt/junit-4.7.jar"/>
<classpathentry kind="lib" path="lib/fontbox-1.0.0.jar"/>
<classpathentry kind="lib" path="lib/pdfbox-1.0.0.jar"/>
<classpathentry kind="lib" path="lib/poi-3.6-20091214.jar"/>
<classpathentry kind="lib" path="lib/poi-scratchpad-3.6-20091214.jar"/>
<classpathentry kind="lib" path="lib/bcmail-jdk15-145.jar"/>
<classpathentry kind="lib" path="lib/bcprov-jdk15-145.jar"/>
<classpathentry kind="lib" path="lib/jsch-0.1.42.jar"/>
<classpathentry kind="lib" path="lib/jakarta-oro-2.0.8.jar"/>
<classpathentry kind="lib" path="lib/commons-codec-1.4.jar"/>
<classpathentry kind="lib" path="lib/log4j-1.2.15.jar"/>
<classpathentry kind="lib" path="lib/mysql-connector-java-5.1.12-bin.jar"/>
<classpathentry kind="lib" path="lib/jcifs-1.3.14.jar"/>
<classpathentry kind="lib" path="lib/metadata-extractor-2.4.0-beta-1.jar"/>
<classpathentry exported="true" kind="con" path="org.eclipse.jdt.junit.JUNIT_CONTAINER/4"/>
<classpathentry kind="output" path="gen"/>
</classpath>

@ -0,0 +1,16 @@
/*
* This is public domain software - that is, you can do whatever you want
* with it, and include it software that is licensed under the GNU or the
* BSD license, or whatever other licence you choose, including proprietary
* closed source licenses. I do ask that you leave this header in tact.
*
* If you make modifications to this code that you think would benefit the
* wider community, please send me a copy and I'll post it on my site.
*
* If you make use of this code, I'd appreciate hearing about it.
* metadata_extractor [at] drewnoakes [dot] com
* Latest version of this software kept at
* http://drewnoakes.com/
*
* Created by Darren Salomons & Drew Noakes.
*/

@ -86,7 +86,6 @@ public final class TextParser {
private static final Set<String> denyExtension = new TreeSet<String>(insensitiveCollator);
static {
initParser(new bmpParser());
initParser(new bzipParser());
initParser(new csvParser());
initParser(new docParser());

@ -25,27 +25,16 @@
package net.yacy.document.parser.images;
import java.awt.image.BufferedImage;
import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;
import javax.imageio.ImageIO;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Idiom;
import net.yacy.document.ParserException;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
public class bmpParser extends AbstractParser implements Idiom {
public class bmpParser {
// this is an implementation of http://de.wikipedia.org/wiki/Windows_Bitmap
@ -60,97 +49,12 @@ public class bmpParser extends AbstractParser implements Idiom {
//private static int BI_RLE4 = 2;
//private static int BI_BITFIELDS = 3;
//boolean debugmode = false;
// MIME types accepted by this parser; populated once by the static initializer below.
public static final Set<String> SUPPORTED_MIME_TYPES = new HashSet<String>();
// File-name extensions accepted by this parser; populated once by the static initializer below.
public static final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>();
static {
// register the extension / MIME type pair for Windows bitmap files
SUPPORTED_EXTENSIONS.add("bmp");
SUPPORTED_MIME_TYPES.add("image/bmp");
}
/**
 * Creates the parser and hands the fixed name string "BMP Image Parser"
 * to the superclass constructor.
 */
public bmpParser() {
super("BMP Image Parser");
}
/**
 * Returns the MIME types this parser accepts.
 *
 * @return the shared static {@code SUPPORTED_MIME_TYPES} set itself (not a copy),
 *         so changes made by a caller are visible to every user of the set
 */
public Set<String> supportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}
/**
 * Returns the file-name extensions this parser accepts.
 *
 * @return the shared static {@code SUPPORTED_EXTENSIONS} set itself (not a copy),
 *         so changes made by a caller are visible to every user of the set
 */
public Set<String> supportedExtensions() {
return SUPPORTED_EXTENSIONS;
}
/**
 * Tests whether a byte array starts with the BMP file magic "BM".
 *
 * @param source the raw file content; may be {@code null}
 * @return {@code true} only if {@code source} is non-null, holds at least two
 *         bytes and those two bytes are 'B' followed by 'M'
 */
public static final boolean isBMP(final byte[] source) {
    // too short (or absent) to carry the two-byte magic at all
    if (source == null || source.length < 2) {
        return false;
    }
    final boolean firstIsB = source[0] == 'B';
    final boolean secondIsM = source[1] == 'M';
    return firstIsB && secondIsM;
}
/**
 * Decodes the image from {@code sourceStream} with {@link ImageIO} and wraps
 * its dimensions and property list into a {@link Document}.
 *
 * The document text is one line per image property in the form
 * {@code name = value .}; the same text is used as the key under which the
 * image's {@link ImageEntry} is stored in the document's image map.
 *
 * @param location        URL of the image; used for the image entry and for error reporting
 * @param mimeType        MIME type that is passed through to the resulting document
 * @param documentCharset not used by this parser; the document is always declared as UTF-8
 * @param sourceStream    stream to read the image data from
 * @return a document describing the decoded image
 * @throws ParserException if reading fails or ImageIO yields no image
 * @throws InterruptedException declared by the signature; not thrown by this body
 */
@Override
public Document parse(
        final DigestURI location,
        final String mimeType,
        final String documentCharset,
        final InputStream sourceStream) throws ParserException, InterruptedException {
    final BufferedImage image;
    try {
        image = ImageIO.read(sourceStream);
    } catch (final IOException e) {
        // EOFException is an IOException and was handled identically, so one handler suffices
        Log.logException(e);
        throw new ParserException(e.getMessage(), location);
    }
    if (image == null) throw new ParserException("ImageIO returned NULL", location);

    // scan the image
    final int height = image.getHeight();
    final int width = image.getWidth();
    /*
    Raster raster = image.getData();
    int[] pixel = raster.getPixel(0, 0, (int[])null);
    long[] average = new long[pixel.length];
    for (int i = 0; i < average.length; i++) average[i] = 0L;
    int pc = 0;
    for (int x = width / 4; x < 3 * width / 4; x = x + 2) {
        for (int y = height / 4; y < 3 * height / 4; y = y + 2) {
            pixel = raster.getPixel(x, y, pixel);
            for (int i = 0; i < average.length; i++) average[i] += pixel[i];
            pc++;
        }
    }
    */

    // get image properties
    String[] propNames = image.getPropertyNames();
    if (propNames == null) propNames = new String[0];
    final StringBuilder sb = new StringBuilder(propNames.length * 80);
    for (final String propName : propNames) {
        sb.append(propName).append(" = ").append(image.getProperty(propName)).append(" .\n");
    }
    // build the text once; it serves as both the image-map key and the content text
    final String info = sb.toString();

    // The document is declared as "UTF-8" below, so the content bytes must be
    // UTF-8 too; a bare getBytes() would use the platform default charset.
    final byte[] contentText;
    try {
        contentText = info.getBytes("UTF-8");
    } catch (final IOException e) {
        // UnsupportedEncodingException (an IOException); not expected, since
        // every Java platform is required to support UTF-8
        Log.logException(e);
        throw new ParserException(e.getMessage(), location);
    }

    final HashSet<String> languages = new HashSet<String>();
    final HashMap<DigestURI, String> anchors = new HashMap<DigestURI, String>();
    final HashMap<String, ImageEntry> images = new HashMap<String, ImageEntry>();
    // add this image to the map of images
    // NOTE(review): the meaning of the trailing -1 argument is not visible here — confirm against ImageEntry
    images.put(info, new ImageEntry(location, "", width, height, -1));

    return new Document(
            location,
            mimeType,
            "UTF-8",
            languages,
            new String[]{}, // keywords
            "", // title
            "", // author
            new String[]{}, // sections
            "", // description
            contentText, // content text
            anchors, // anchors
            images, // images
            false); // NOTE(review): purpose of this flag is not visible here — confirm against Document
}
public static IMAGEMAP parse(final byte[] source) {
// read info-header
final int bfOffBits = DWORD(source, FILEHEADER_offset + 10);

@ -35,17 +35,30 @@ import java.io.IOException;
import java.io.InputStream;import java.net.MalformedURLException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import javax.imageio.ImageIO;
import com.drew.imaging.jpeg.JpegMetadataReader;
import com.drew.metadata.Directory;
import com.drew.metadata.Metadata;
import com.drew.metadata.MetadataException;
import com.drew.metadata.Tag;
import com.sun.image.codec.jpeg.ImageFormatException;
import com.sun.image.codec.jpeg.JPEGCodec;
import com.sun.image.codec.jpeg.JPEGDecodeParam;
import com.sun.image.codec.jpeg.JPEGImageDecoder;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Idiom;
import net.yacy.document.ParserException;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.document.parser.images.bmpParser.IMAGEMAP;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils;
public class genericImageParser extends AbstractParser implements Idiom {
@ -61,9 +74,11 @@ public class genericImageParser extends AbstractParser implements Idiom {
SUPPORTED_EXTENSIONS.add("jpg");
SUPPORTED_EXTENSIONS.add("jpeg");
SUPPORTED_EXTENSIONS.add("jpe");
SUPPORTED_EXTENSIONS.add("bmp");
SUPPORTED_MIME_TYPES.add("image/png");
SUPPORTED_MIME_TYPES.add("image/gif");
SUPPORTED_MIME_TYPES.add("image/jpg");
SUPPORTED_MIME_TYPES.add("image/bmp");
}
public genericImageParser() {
@ -76,6 +91,119 @@ public class genericImageParser extends AbstractParser implements Idiom {
final String mimeType,
final String documentCharset,
final InputStream sourceStream) throws ParserException, InterruptedException {
ImageInfo ii = null;
String title = null;
String author = null;
String keywords = null;
String description = null;
if (mimeType.equals("image/bmp") ||
location.getFileExtension().equals("bmp")) {
byte[] b;
try {
b = FileUtils.read(sourceStream);
} catch (IOException e) {
Log.logException(e);
throw new ParserException(e.getMessage(), location);
}
IMAGEMAP imap = bmpParser.parse(b);
ii = parseJavaImage(location, imap.getImage());
} else if (mimeType.equals("image/jpg") ||
location.getFileExtension().equals("jpg") ||
location.getFileExtension().equals("jpeg") ||
location.getFileExtension().equals("jpe")) {
// use the exif parser from
// http://www.drewnoakes.com/drewnoakes.com/code/exif/
// javadoc is at: http://www.drewnoakes.com/drewnoakes.com/code/exif/javadoc/
// a tutorial is at: http://www.drewnoakes.com/drewnoakes.com/code/exif/sampleUsage.html
JPEGImageDecoder jpegDecoder = JPEGCodec.createJPEGDecoder(sourceStream);
BufferedImage image = null;
try {
image = jpegDecoder.decodeAsBufferedImage();
} catch (ImageFormatException e) {
Log.logException(e);
throw new ParserException(e.getMessage(), location);
} catch (IOException e) {
Log.logException(e);
throw new ParserException(e.getMessage(), location);
}
JPEGDecodeParam decodeParam = jpegDecoder.getJPEGDecodeParam();
Metadata metadata = JpegMetadataReader.readMetadata(decodeParam);
ii = parseJavaImage(location, image);
Iterator<Directory> directories = (Iterator<Directory>) metadata.getDirectoryIterator();
HashMap<String, String> props = new HashMap<String, String>();
while (directories.hasNext()) {
Directory directory = directories.next();
Iterator<Tag> tags = (Iterator<Tag>) directory.getTagIterator();
while (tags.hasNext()) {
Tag tag = tags.next();
try {
props.put(tag.getTagName(), tag.getDescription());
ii.info.append(tag.getTagName() + ": " + tag.getDescription() + " .\n");
} catch (MetadataException e) {
Log.logException(e);
}
}
title = props.get("Image Description");
if (title == null || title.length() == 0) title = props.get("Headline");
if (title == null || title.length() == 0) title = props.get("Object Name");
author = props.get("Artist");
if (author == null || author.length() == 0) author = props.get("Writer/Editor");
if (author == null || author.length() == 0) author = props.get("By-line");
if (author == null || author.length() == 0) author = props.get("Credit");
if (author == null || author.length() == 0) author = props.get("Make");
keywords = props.get("Keywords");
if (keywords == null || keywords.length() == 0) keywords = props.get("Category");
if (keywords == null || keywords.length() == 0) keywords = props.get("Supplemental Category(s)");
description = props.get("Caption/Abstract");
if (description == null || description.length() == 0) description = props.get("Country/Primary Location");
if (description == null || description.length() == 0) description = props.get("Province/State");
if (description == null || description.length() == 0) description = props.get("Copyright Notice");
}
} else {
ii = parseJavaImage(location, sourceStream);
}
final HashSet<String> languages = new HashSet<String>();
final HashMap<DigestURI, String> anchors = new HashMap<DigestURI, String>();
final HashMap<String, ImageEntry> images = new HashMap<String, ImageEntry>();
// add this image to the map of images
String infoString = ii.info.toString();
images.put(infoString, new ImageEntry(location, "", ii.width, ii.height, -1));
if (title == null) title = location.toNormalform(true, true);
return new Document(
location,
mimeType,
"UTF-8",
languages,
keywords == null ? new String[]{} : keywords.split(keywords.indexOf(',') > 0 ? "," : " "), // keywords
title, // title
author == null ? location.getHost() : author, // author
new String[]{}, // sections
description == null ? "" : description, // description
infoString.getBytes(), // content text
anchors, // anchors
images,
false); // images
}
/**
 * Returns the MIME types this parser accepts.
 *
 * @return the shared static {@code SUPPORTED_MIME_TYPES} set itself (not a copy),
 *         so changes made by a caller are visible to every user of the set
 */
public Set<String> supportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
}
/**
 * Returns the file-name extensions this parser accepts.
 *
 * @return the shared static {@code SUPPORTED_EXTENSIONS} set itself (not a copy),
 *         so changes made by a caller are visible to every user of the set
 */
public Set<String> supportedExtensions() {
return SUPPORTED_EXTENSIONS;
}
public static ImageInfo parseJavaImage(
final DigestURI location,
final InputStream sourceStream) throws ParserException {
BufferedImage image = null;
try {
image = ImageIO.read(sourceStream);
@ -87,10 +215,18 @@ public class genericImageParser extends AbstractParser implements Idiom {
throw new ParserException(e.getMessage(), location);
}
if (image == null) throw new ParserException("ImageIO returned NULL", location);
return parseJavaImage(location, image);
}
public static ImageInfo parseJavaImage(
final DigestURI location,
final BufferedImage image) throws ParserException {
ImageInfo ii = new ImageInfo(location);
ii.image = image;
// scan the image
int height = image.getHeight();
int width = image.getWidth();
ii.height = ii.image.getHeight();
ii.width = ii.image.getWidth();
/*
Raster raster = image.getData();
int[] pixel = raster.getPixel(0, 0, (int[])null);
@ -106,53 +242,36 @@ public class genericImageParser extends AbstractParser implements Idiom {
}
*/
// get image properties
String [] propNames = image.getPropertyNames();
String [] propNames = ii.image.getPropertyNames();
if (propNames == null) propNames = new String[0];
StringBuilder sb = new StringBuilder(propNames.length * 80);
sb.append("\n");
ii.info.append("\n");
for (String propName: propNames) {
sb.append(propName).append(" = ").append(image.getProperty(propName)).append(" .\n");
ii.info.append(propName).append(" = ").append(ii.image.getProperty(propName)).append(" .\n");
}
// append also properties that we measured
sb.append("width").append(" = ").append(Integer.toString(width)).append(" .\n");
sb.append("height").append(" = ").append(Integer.toString(height)).append(" .\n");
ii.info.append("width").append(": ").append(Integer.toString(ii.width)).append(" .\n");
ii.info.append("height").append(": ").append(Integer.toString(ii.height)).append(" .\n");
final HashSet<String> languages = new HashSet<String>();
final HashMap<DigestURI, String> anchors = new HashMap<DigestURI, String>();
final HashMap<String, ImageEntry> images = new HashMap<String, ImageEntry>();
// add this image to the map of images
images.put(sb.toString(), new ImageEntry(location, "", width, height, -1));
return new Document(
location,
mimeType,
"UTF-8",
languages,
new String[]{}, // keywords
"", // title
"", // author
new String[]{}, // sections
"", // description
sb.toString().getBytes(), // content text
anchors, // anchors
images,
false); // images
}
/*
* Document(final DigestURI location, final String mimeType, final String charset, final Set<String> languages,
final String[] keywords, final String title, final String author,
final String[] sections, final String abstrct,
final Object text, final Map<DigestURI, String> anchors, final HashMap<String, ImageEntry> images) {(non-Javadoc)
* @see net.yacy.document.Idiom#supportedMimeTypes()
*/
public Set<String> supportedMimeTypes() {
return SUPPORTED_MIME_TYPES;
return ii;
}
public Set<String> supportedExtensions() {
return SUPPORTED_EXTENSIONS;
/**
 * Plain mutable holder for what is known about one parsed image: where it
 * came from, the decoded picture, a textual info listing and its dimensions.
 * All fields are public and are filled in by the code that creates the instance.
 */
public static class ImageInfo {
    /** URL the image was loaded from, as given to the constructor. */
    public DigestURI location;
    /** The decoded image; {@code null} until a caller assigns it. */
    public BufferedImage image = null;
    /** Text buffer that callers append image information to; starts empty. */
    public StringBuilder info = new StringBuilder();
    /** Image height; -1 until a caller assigns it. */
    public int height = -1;
    /** Image width; -1 until a caller assigns it. */
    public int width = -1;

    /**
     * Creates a holder with no image, an empty info buffer and both
     * dimensions set to -1.
     *
     * @param location URL of the image this holder describes
     */
    public ImageInfo(final DigestURI location) {
        this.location = location;
    }
}
public static void main(final String[] args) {
File image = new File(args[0]);
genericImageParser parser = new genericImageParser();

Loading…
Cancel
Save