diff --git a/.classpath b/.classpath
index f8d0809a5..5094c4b77 100644
--- a/.classpath
+++ b/.classpath
@@ -1,42 +1,43 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/lib/metadata-extractor-2.4.0-beta-1.License b/lib/metadata-extractor-2.4.0-beta-1.License
new file mode 100644
index 000000000..15b62f428
--- /dev/null
+++ b/lib/metadata-extractor-2.4.0-beta-1.License
@@ -0,0 +1,16 @@
+/*
+ * This is public domain software - that is, you can do whatever you want
+ * with it, and include it software that is licensed under the GNU or the
+ * BSD license, or whatever other licence you choose, including proprietary
+ * closed source licenses. I do ask that you leave this header in tact.
+ *
+ * If you make modifications to this code that you think would benefit the
+ * wider community, please send me a copy and I'll post it on my site.
+ *
+ * If you make use of this code, I'd appreciate hearing about it.
+ * metadata_extractor [at] drewnoakes [dot] com
+ * Latest version of this software kept at
+ * http://drewnoakes.com/
+ *
+ * Created by Darren Salomons & Drew Noakes.
+ */
diff --git a/lib/metadata-extractor-2.4.0-beta-1.jar b/lib/metadata-extractor-2.4.0-beta-1.jar
new file mode 100644
index 000000000..3720d649d
Binary files /dev/null and b/lib/metadata-extractor-2.4.0-beta-1.jar differ
diff --git a/source/net/yacy/document/TextParser.java b/source/net/yacy/document/TextParser.java
index fab7e589f..7cec8e934 100644
--- a/source/net/yacy/document/TextParser.java
+++ b/source/net/yacy/document/TextParser.java
@@ -86,7 +86,6 @@ public final class TextParser {
private static final Set denyExtension = new TreeSet(insensitiveCollator);
static {
- initParser(new bmpParser());
initParser(new bzipParser());
initParser(new csvParser());
initParser(new docParser());
diff --git a/source/net/yacy/document/parser/images/bmpParser.java b/source/net/yacy/document/parser/images/bmpParser.java
index a7320ee6f..391fccdb9 100644
--- a/source/net/yacy/document/parser/images/bmpParser.java
+++ b/source/net/yacy/document/parser/images/bmpParser.java
@@ -25,27 +25,16 @@
package net.yacy.document.parser.images;
import java.awt.image.BufferedImage;
-import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
-import java.io.InputStream;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Set;
import javax.imageio.ImageIO;
-import net.yacy.document.AbstractParser;
-import net.yacy.document.Document;
-import net.yacy.document.Idiom;
-import net.yacy.document.ParserException;
-import net.yacy.document.parser.html.ImageEntry;
-import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
-public class bmpParser extends AbstractParser implements Idiom {
+public class bmpParser {
// this is a implementation of http://de.wikipedia.org/wiki/Windows_Bitmap
@@ -60,97 +49,12 @@ public class bmpParser extends AbstractParser implements Idiom {
//private static int BI_RLE4 = 2;
//private static int BI_BITFIELDS = 3;
- //boolean debugmode = false;
-
- public static final Set SUPPORTED_MIME_TYPES = new HashSet();
- public static final Set SUPPORTED_EXTENSIONS = new HashSet();
- static {
- SUPPORTED_EXTENSIONS.add("bmp");
- SUPPORTED_MIME_TYPES.add("image/bmp");
- }
-
- public bmpParser() {
- super("BMP Image Parser");
- }
-
- public Set supportedMimeTypes() {
- return SUPPORTED_MIME_TYPES;
- }
-
- public Set supportedExtensions() {
- return SUPPORTED_EXTENSIONS;
- }
public static final boolean isBMP(final byte[] source) {
// check the file magic
return (source != null) && (source.length >= 2) && (source[0] == 'B') && (source[1] == 'M');
}
- @Override
- public Document parse(
- final DigestURI location,
- final String mimeType,
- final String documentCharset,
- final InputStream sourceStream) throws ParserException, InterruptedException {
- BufferedImage image = null;
- try {
- image = ImageIO.read(sourceStream);
- } catch (final EOFException e) {
- Log.logException(e);
- throw new ParserException(e.getMessage(), location);
- } catch (final IOException e) {
- Log.logException(e);
- throw new ParserException(e.getMessage(), location);
- }
- if (image == null) throw new ParserException("ImageIO returned NULL", location);
-
- // scan the image
- int height = image.getHeight();
- int width = image.getWidth();
- /*
- Raster raster = image.getData();
- int[] pixel = raster.getPixel(0, 0, (int[])null);
- long[] average = new long[pixel.length];
- for (int i = 0; i < average.length; i++) average[i] = 0L;
- int pc = 0;
- for (int x = width / 4; x < 3 * width / 4; x = x + 2) {
- for (int y = height / 4; y < 3 * height / 4; y = y + 2) {
- pixel = raster.getPixel(x, y, pixel);
- for (int i = 0; i < average.length; i++) average[i] += pixel[i];
- pc++;
- }
- }
- */
- // get image properties
- String [] propNames = image.getPropertyNames();
- if (propNames == null) propNames = new String[0];
- StringBuilder sb = new StringBuilder(propNames.length * 80);
- for (String propName: propNames) {
- sb.append(propName).append(" = ").append(image.getProperty(propName)).append(" .\n");
- }
-
- final HashSet languages = new HashSet();
- final HashMap anchors = new HashMap();
- final HashMap images = new HashMap();
- // add this image to the map of images
- images.put(sb.toString(), new ImageEntry(location, "", width, height, -1));
-
- return new Document(
- location,
- mimeType,
- "UTF-8",
- languages,
- new String[]{}, // keywords
- "", // title
- "", // author
- new String[]{}, // sections
- "", // description
- sb.toString().getBytes(), // content text
- anchors, // anchors
- images,
- false); // images
- }
-
public static IMAGEMAP parse(final byte[] source) {
// read info-header
final int bfOffBits = DWORD(source, FILEHEADER_offset + 10);
diff --git a/source/net/yacy/document/parser/images/genericImageParser.java b/source/net/yacy/document/parser/images/genericImageParser.java
index 83cec7fd5..f5f30947f 100644
--- a/source/net/yacy/document/parser/images/genericImageParser.java
+++ b/source/net/yacy/document/parser/images/genericImageParser.java
@@ -35,17 +35,30 @@ import java.io.IOException;
import java.io.InputStream;import java.net.MalformedURLException;
import java.util.HashMap;
import java.util.HashSet;
+import java.util.Iterator;
import java.util.Set;
import javax.imageio.ImageIO;
+import com.drew.imaging.jpeg.JpegMetadataReader;
+import com.drew.metadata.Directory;
+import com.drew.metadata.Metadata;
+import com.drew.metadata.MetadataException;
+import com.drew.metadata.Tag;
+import com.sun.image.codec.jpeg.ImageFormatException;
+import com.sun.image.codec.jpeg.JPEGCodec;
+import com.sun.image.codec.jpeg.JPEGDecodeParam;
+import com.sun.image.codec.jpeg.JPEGImageDecoder;
+
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Idiom;
import net.yacy.document.ParserException;
import net.yacy.document.parser.html.ImageEntry;
+import net.yacy.document.parser.images.bmpParser.IMAGEMAP;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
+import net.yacy.kelondro.util.FileUtils;
public class genericImageParser extends AbstractParser implements Idiom {
@@ -61,9 +74,11 @@ public class genericImageParser extends AbstractParser implements Idiom {
SUPPORTED_EXTENSIONS.add("jpg");
SUPPORTED_EXTENSIONS.add("jpeg");
SUPPORTED_EXTENSIONS.add("jpe");
+ SUPPORTED_EXTENSIONS.add("bmp");
SUPPORTED_MIME_TYPES.add("image/png");
SUPPORTED_MIME_TYPES.add("image/gif");
SUPPORTED_MIME_TYPES.add("image/jpg");
+ SUPPORTED_MIME_TYPES.add("image/bmp");
}
public genericImageParser() {
@@ -76,6 +91,119 @@ public class genericImageParser extends AbstractParser implements Idiom {
final String mimeType,
final String documentCharset,
final InputStream sourceStream) throws ParserException, InterruptedException {
+
+ ImageInfo ii = null;
+ String title = null;
+ String author = null;
+ String keywords = null;
+ String description = null;
+ if (mimeType.equals("image/bmp") ||
+ location.getFileExtension().equals("bmp")) {
+ byte[] b;
+ try {
+ b = FileUtils.read(sourceStream);
+ } catch (IOException e) {
+ Log.logException(e);
+ throw new ParserException(e.getMessage(), location);
+ }
+ IMAGEMAP imap = bmpParser.parse(b);
+ ii = parseJavaImage(location, imap.getImage());
+ } else if (mimeType.equals("image/jpg") ||
+ location.getFileExtension().equals("jpg") ||
+ location.getFileExtension().equals("jpeg") ||
+ location.getFileExtension().equals("jpe")) {
+ // use the exif parser from
+ // http://www.drewnoakes.com/drewnoakes.com/code/exif/
+ // javadoc is at: http://www.drewnoakes.com/drewnoakes.com/code/exif/javadoc/
+ // a tutorial is at: http://www.drewnoakes.com/drewnoakes.com/code/exif/sampleUsage.html
+ JPEGImageDecoder jpegDecoder = JPEGCodec.createJPEGDecoder(sourceStream);
+ BufferedImage image = null;
+ try {
+ image = jpegDecoder.decodeAsBufferedImage();
+ } catch (ImageFormatException e) {
+ Log.logException(e);
+ throw new ParserException(e.getMessage(), location);
+ } catch (IOException e) {
+ Log.logException(e);
+ throw new ParserException(e.getMessage(), location);
+ }
+ JPEGDecodeParam decodeParam = jpegDecoder.getJPEGDecodeParam();
+ Metadata metadata = JpegMetadataReader.readMetadata(decodeParam);
+ ii = parseJavaImage(location, image);
+
+ Iterator directories = (Iterator) metadata.getDirectoryIterator();
+ HashMap props = new HashMap();
+ while (directories.hasNext()) {
+ Directory directory = directories.next();
+ Iterator tags = (Iterator) directory.getTagIterator();
+ while (tags.hasNext()) {
+ Tag tag = tags.next();
+ try {
+ props.put(tag.getTagName(), tag.getDescription());
+ ii.info.append(tag.getTagName() + ": " + tag.getDescription() + " .\n");
+ } catch (MetadataException e) {
+ Log.logException(e);
+ }
+ }
+ title = props.get("Image Description");
+ if (title == null || title.length() == 0) title = props.get("Headline");
+ if (title == null || title.length() == 0) title = props.get("Object Name");
+
+ author = props.get("Artist");
+ if (author == null || author.length() == 0) author = props.get("Writer/Editor");
+ if (author == null || author.length() == 0) author = props.get("By-line");
+ if (author == null || author.length() == 0) author = props.get("Credit");
+ if (author == null || author.length() == 0) author = props.get("Make");
+
+ keywords = props.get("Keywords");
+ if (keywords == null || keywords.length() == 0) keywords = props.get("Category");
+ if (keywords == null || keywords.length() == 0) keywords = props.get("Supplemental Category(s)");
+
+ description = props.get("Caption/Abstract");
+ if (description == null || description.length() == 0) description = props.get("Country/Primary Location");
+ if (description == null || description.length() == 0) description = props.get("Province/State");
+ if (description == null || description.length() == 0) description = props.get("Copyright Notice");
+ }
+ } else {
+ ii = parseJavaImage(location, sourceStream);
+ }
+
+ final HashSet languages = new HashSet();
+ final HashMap anchors = new HashMap();
+ final HashMap images = new HashMap();
+ // add this image to the map of images
+ String infoString = ii.info.toString();
+ images.put(infoString, new ImageEntry(location, "", ii.width, ii.height, -1));
+
+ if (title == null) title = location.toNormalform(true, true);
+
+ return new Document(
+ location,
+ mimeType,
+ "UTF-8",
+ languages,
+ keywords == null ? new String[]{} : keywords.split(keywords.indexOf(',') > 0 ? "," : " "), // keywords
+ title, // title
+ author == null ? location.getHost() : author, // author
+ new String[]{}, // sections
+ description == null ? "" : description, // description
+ infoString.getBytes(), // content text
+ anchors, // anchors
+ images,
+ false); // images
+ }
+
+ public Set supportedMimeTypes() { // see net.yacy.document.Idiom#supportedMimeTypes()
+ return SUPPORTED_MIME_TYPES; // the shared static set itself, not a copy
+ }
+
+ public Set supportedExtensions() { // file extensions registered in the static initializer (jpg, jpeg, jpe, bmp, ...)
+ return SUPPORTED_EXTENSIONS; // the shared static set itself, not a copy
+ }
+
+ public static ImageInfo parseJavaImage(
+ final DigestURI location,
+ final InputStream sourceStream) throws ParserException {
BufferedImage image = null;
try {
image = ImageIO.read(sourceStream);
@@ -87,10 +215,18 @@ public class genericImageParser extends AbstractParser implements Idiom {
throw new ParserException(e.getMessage(), location);
}
if (image == null) throw new ParserException("ImageIO returned NULL", location);
+ return parseJavaImage(location, image);
+ }
+
+ public static ImageInfo parseJavaImage(
+ final DigestURI location,
+ final BufferedImage image) throws ParserException {
+ ImageInfo ii = new ImageInfo(location);
+ ii.image = image;
// scan the image
- int height = image.getHeight();
- int width = image.getWidth();
+ ii.height = ii.image.getHeight();
+ ii.width = ii.image.getWidth();
/*
Raster raster = image.getData();
int[] pixel = raster.getPixel(0, 0, (int[])null);
@@ -106,53 +242,36 @@ public class genericImageParser extends AbstractParser implements Idiom {
}
*/
// get image properties
- String [] propNames = image.getPropertyNames();
+ String [] propNames = ii.image.getPropertyNames();
if (propNames == null) propNames = new String[0];
- StringBuilder sb = new StringBuilder(propNames.length * 80);
- sb.append("\n");
+ ii.info.append("\n");
for (String propName: propNames) {
- sb.append(propName).append(" = ").append(image.getProperty(propName)).append(" .\n");
+ ii.info.append(propName).append(" = ").append(ii.image.getProperty(propName)).append(" .\n");
}
// append also properties that we measured
- sb.append("width").append(" = ").append(Integer.toString(width)).append(" .\n");
- sb.append("height").append(" = ").append(Integer.toString(height)).append(" .\n");
+ ii.info.append("width").append(": ").append(Integer.toString(ii.width)).append(" .\n");
+ ii.info.append("height").append(": ").append(Integer.toString(ii.height)).append(" .\n");
- final HashSet languages = new HashSet();
- final HashMap anchors = new HashMap();
- final HashMap images = new HashMap();
- // add this image to the map of images
- images.put(sb.toString(), new ImageEntry(location, "", width, height, -1));
-
- return new Document(
- location,
- mimeType,
- "UTF-8",
- languages,
- new String[]{}, // keywords
- "", // title
- "", // author
- new String[]{}, // sections
- "", // description
- sb.toString().getBytes(), // content text
- anchors, // anchors
- images,
- false); // images
- }
-/*
- * Document(final DigestURI location, final String mimeType, final String charset, final Set languages,
- final String[] keywords, final String title, final String author,
- final String[] sections, final String abstrct,
- final Object text, final Map anchors, final HashMap images) {(non-Javadoc)
- * @see net.yacy.document.Idiom#supportedMimeTypes()
- */
- public Set supportedMimeTypes() {
- return SUPPORTED_MIME_TYPES;
+ return ii;
}
- public Set supportedExtensions() {
- return SUPPORTED_EXTENSIONS;
+ public static class ImageInfo { // mutable result holder that parseJavaImage creates and fills in
+ public DigestURI location; // location handed to the constructor
+ public BufferedImage image; // decoded image; null until parseJavaImage assigns it
+ public StringBuilder info; // textual dump of image properties, one entry per line
+ public int height; // -1 until read from the image via getHeight()
+ public int width; // -1 until read from the image via getWidth()
+ public ImageInfo(final DigestURI location) { // starts with no image, empty info and unknown dimensions
+ this.location = location;
+ this.image = null;
+ this.info = new StringBuilder();
+ this.height = -1; // -1 marks "not measured yet"
+ this.width = -1;
+ }
}
+
+
public static void main(final String[] args) {
File image = new File(args[0]);
genericImageParser parser = new genericImageParser();