diff --git a/.classpath b/.classpath index f8d0809a5..5094c4b77 100644 --- a/.classpath +++ b/.classpath @@ -1,42 +1,43 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/lib/metadata-extractor-2.4.0-beta-1.License b/lib/metadata-extractor-2.4.0-beta-1.License new file mode 100644 index 000000000..15b62f428 --- /dev/null +++ b/lib/metadata-extractor-2.4.0-beta-1.License @@ -0,0 +1,16 @@ +/* + * This is public domain software - that is, you can do whatever you want + * with it, and include it software that is licensed under the GNU or the + * BSD license, or whatever other licence you choose, including proprietary + * closed source licenses. I do ask that you leave this header in tact. + * + * If you make modifications to this code that you think would benefit the + * wider community, please send me a copy and I'll post it on my site. + * + * If you make use of this code, I'd appreciate hearing about it. + * metadata_extractor [at] drewnoakes [dot] com + * Latest version of this software kept at + * http://drewnoakes.com/ + * + * Created by Darren Salomons & Drew Noakes. + */ diff --git a/lib/metadata-extractor-2.4.0-beta-1.jar b/lib/metadata-extractor-2.4.0-beta-1.jar new file mode 100644 index 000000000..3720d649d Binary files /dev/null and b/lib/metadata-extractor-2.4.0-beta-1.jar differ diff --git a/source/net/yacy/document/TextParser.java b/source/net/yacy/document/TextParser.java index fab7e589f..7cec8e934 100644 --- a/source/net/yacy/document/TextParser.java +++ b/source/net/yacy/document/TextParser.java @@ -86,7 +86,6 @@ public final class TextParser { private static final Set denyExtension = new TreeSet(insensitiveCollator); static { - initParser(new bmpParser()); initParser(new bzipParser()); initParser(new csvParser()); initParser(new docParser()); diff --git a/source/net/yacy/document/parser/images/bmpParser.java b/source/net/yacy/document/parser/images/bmpParser.java index a7320ee6f..391fccdb9 100644 --- a/source/net/yacy/document/parser/images/bmpParser.java +++ b/source/net/yacy/document/parser/images/bmpParser.java @@ -25,27 +25,16 @@ package net.yacy.document.parser.images; import java.awt.image.BufferedImage; -import java.io.EOFException; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; -import java.io.InputStream; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Set; import javax.imageio.ImageIO; -import net.yacy.document.AbstractParser; -import net.yacy.document.Document; -import net.yacy.document.Idiom; -import net.yacy.document.ParserException; -import net.yacy.document.parser.html.ImageEntry; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; -public class bmpParser extends AbstractParser implements Idiom { +public class bmpParser { // this is a implementation of http://de.wikipedia.org/wiki/Windows_Bitmap @@ -60,97 +49,12 @@ public class bmpParser extends AbstractParser implements Idiom { //private static int BI_RLE4 = 2; //private static int BI_BITFIELDS = 3; - //boolean debugmode = false; - - public static final Set SUPPORTED_MIME_TYPES = new HashSet(); - public static final Set SUPPORTED_EXTENSIONS = new HashSet(); - static { - SUPPORTED_EXTENSIONS.add("bmp"); - SUPPORTED_MIME_TYPES.add("image/bmp"); - } - - public bmpParser() { - super("BMP Image Parser"); - } - - public Set supportedMimeTypes() { - return SUPPORTED_MIME_TYPES; - } - - public Set supportedExtensions() { - return SUPPORTED_EXTENSIONS; - } public static final boolean isBMP(final byte[] source) { // check the file magic return (source != null) && (source.length >= 2) && (source[0] == 'B') && (source[1] == 'M'); } - @Override - public Document parse( - final DigestURI location, - final String mimeType, - final String documentCharset, - final InputStream sourceStream) throws ParserException, InterruptedException { - BufferedImage image = null; - try { - image = ImageIO.read(sourceStream); - } catch (final EOFException e) { - Log.logException(e); - throw new ParserException(e.getMessage(), location); - } catch (final IOException e) { - Log.logException(e); - throw new ParserException(e.getMessage(), location); - } - if (image == null) throw new ParserException("ImageIO returned NULL", location); - - // scan the image - int height = image.getHeight(); - int width = image.getWidth(); - /* - Raster raster = image.getData(); - int[] pixel = raster.getPixel(0, 0, (int[])null); - long[] average = new long[pixel.length]; - for (int i = 0; i < average.length; i++) average[i] = 0L; - int pc = 0; - for (int x = width / 4; x < 3 * width / 4; x = x + 2) { - for (int y = height / 4; y < 3 * height / 4; y = y + 2) { - pixel = raster.getPixel(x, y, pixel); - for (int i = 0; i < average.length; i++) average[i] += pixel[i]; - pc++; - } - } - */ - // get image properties - String [] propNames = image.getPropertyNames(); - if (propNames == null) propNames = new String[0]; - StringBuilder sb = new StringBuilder(propNames.length * 80); - for (String propName: propNames) { - sb.append(propName).append(" = ").append(image.getProperty(propName)).append(" .\n"); - } - - final HashSet languages = new HashSet(); - final HashMap anchors = new HashMap(); - final HashMap images = new HashMap(); - // add this image to the map of images - images.put(sb.toString(), new ImageEntry(location, "", width, height, -1)); - - return new Document( - location, - mimeType, - "UTF-8", - languages, - new String[]{}, // keywords - "", // title - "", // author - new String[]{}, // sections - "", // description - sb.toString().getBytes(), // content text - anchors, // anchors - images, - false); // images - } - public static IMAGEMAP parse(final byte[] source) { // read info-header final int bfOffBits = DWORD(source, FILEHEADER_offset + 10); diff --git a/source/net/yacy/document/parser/images/genericImageParser.java b/source/net/yacy/document/parser/images/genericImageParser.java index 83cec7fd5..f5f30947f 100644 --- a/source/net/yacy/document/parser/images/genericImageParser.java +++ b/source/net/yacy/document/parser/images/genericImageParser.java @@ -35,17 +35,30 @@ import java.io.IOException; import java.io.InputStream;import java.net.MalformedURLException; import java.util.HashMap; import java.util.HashSet; +import java.util.Iterator; import java.util.Set; import javax.imageio.ImageIO; +import com.drew.imaging.jpeg.JpegMetadataReader; +import com.drew.metadata.Directory; +import com.drew.metadata.Metadata; +import com.drew.metadata.MetadataException; +import com.drew.metadata.Tag; +import com.sun.image.codec.jpeg.ImageFormatException; +import com.sun.image.codec.jpeg.JPEGCodec; +import com.sun.image.codec.jpeg.JPEGDecodeParam; +import com.sun.image.codec.jpeg.JPEGImageDecoder; + import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Idiom; import net.yacy.document.ParserException; import net.yacy.document.parser.html.ImageEntry; +import net.yacy.document.parser.images.bmpParser.IMAGEMAP; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; +import net.yacy.kelondro.util.FileUtils; public class genericImageParser extends AbstractParser implements Idiom { @@ -61,9 +74,11 @@ public class genericImageParser extends AbstractParser implements Idiom { SUPPORTED_EXTENSIONS.add("jpg"); SUPPORTED_EXTENSIONS.add("jpeg"); SUPPORTED_EXTENSIONS.add("jpe"); + SUPPORTED_EXTENSIONS.add("bmp"); SUPPORTED_MIME_TYPES.add("image/png"); SUPPORTED_MIME_TYPES.add("image/gif"); SUPPORTED_MIME_TYPES.add("image/jpg"); + SUPPORTED_MIME_TYPES.add("image/bmp"); } public genericImageParser() { @@ -76,6 +91,119 @@ public class genericImageParser extends AbstractParser implements Idiom { final String mimeType, final String documentCharset, final InputStream sourceStream) throws ParserException, InterruptedException { + + ImageInfo ii = null; + String title = null; + String author = null; + String keywords = null; + String description = null; + if (mimeType.equals("image/bmp") || + location.getFileExtension().equals("bmp")) { + byte[] b; + try { + b = FileUtils.read(sourceStream); + } catch (IOException e) { + Log.logException(e); + throw new ParserException(e.getMessage(), location); + } + IMAGEMAP imap = bmpParser.parse(b); + ii = parseJavaImage(location, imap.getImage()); + } else if (mimeType.equals("image/jpg") || + location.getFileExtension().equals("jpg") || + location.getFileExtension().equals("jpeg") || + location.getFileExtension().equals("jpe")) { + // use the exif parser from + // http://www.drewnoakes.com/drewnoakes.com/code/exif/ + // javadoc is at: http://www.drewnoakes.com/drewnoakes.com/code/exif/javadoc/ + // a tutorial is at: http://www.drewnoakes.com/drewnoakes.com/code/exif/sampleUsage.html + JPEGImageDecoder jpegDecoder = JPEGCodec.createJPEGDecoder(sourceStream); + BufferedImage image = null; + try { + image = jpegDecoder.decodeAsBufferedImage(); + } catch (ImageFormatException e) { + Log.logException(e); + throw new ParserException(e.getMessage(), location); + } catch (IOException e) { + Log.logException(e); + throw new ParserException(e.getMessage(), location); + } + JPEGDecodeParam decodeParam = jpegDecoder.getJPEGDecodeParam(); + Metadata metadata = JpegMetadataReader.readMetadata(decodeParam); + ii = parseJavaImage(location, image); + + Iterator directories = (Iterator) metadata.getDirectoryIterator(); + HashMap props = new HashMap(); + while (directories.hasNext()) { + Directory directory = directories.next(); + Iterator tags = (Iterator) directory.getTagIterator(); + while (tags.hasNext()) { + Tag tag = tags.next(); + try { + props.put(tag.getTagName(), tag.getDescription()); + ii.info.append(tag.getTagName() + ": " + tag.getDescription() + " .\n"); + } catch (MetadataException e) { + Log.logException(e); + } + } + title = props.get("Image Description"); + if (title == null || title.length() == 0) title = props.get("Headline"); + if (title == null || title.length() == 0) title = props.get("Object Name"); + + author = props.get("Artist"); + if (author == null || author.length() == 0) author = props.get("Writer/Editor"); + if (author == null || author.length() == 0) author = props.get("By-line"); + if (author == null || author.length() == 0) author = props.get("Credit"); + if (author == null || author.length() == 0) author = props.get("Make"); + + keywords = props.get("Keywords"); + if (keywords == null || keywords.length() == 0) keywords = props.get("Category"); + if (keywords == null || keywords.length() == 0) keywords = props.get("Supplemental Category(s)"); + + description = props.get("Caption/Abstract"); + if (description == null || description.length() == 0) description = props.get("Country/Primary Location"); + if (description == null || description.length() == 0) description = props.get("Province/State"); + if (description == null || description.length() == 0) description = props.get("Copyright Notice"); + } + } else { + ii = parseJavaImage(location, sourceStream); + } + + final HashSet languages = new HashSet(); + final HashMap anchors = new HashMap(); + final HashMap images = new HashMap(); + // add this image to the map of images + String infoString = ii.info.toString(); + images.put(infoString, new ImageEntry(location, "", ii.width, ii.height, -1)); + + if (title == null) title = location.toNormalform(true, true); + + return new Document( + location, + mimeType, + "UTF-8", + languages, + keywords == null ? new String[]{} : keywords.split(keywords.indexOf(',') > 0 ? "," : " "), // keywords + title, // title + author == null ? location.getHost() : author, // author + new String[]{}, // sections + description == null ? "" : description, // description + infoString.getBytes(), // content text + anchors, // anchors + images, + false); // images + } + + public Set supportedMimeTypes() { + return SUPPORTED_MIME_TYPES; + } + + public Set supportedExtensions() { + return SUPPORTED_EXTENSIONS; + } + + public static ImageInfo parseJavaImage( + final DigestURI location, + final InputStream sourceStream) throws ParserException { BufferedImage image = null; try { image = ImageIO.read(sourceStream); @@ -87,10 +215,18 @@ public class genericImageParser extends AbstractParser implements Idiom { throw new ParserException(e.getMessage(), location); } if (image == null) throw new ParserException("ImageIO returned NULL", location); + return parseJavaImage(location, image); + } + + public static ImageInfo parseJavaImage( + final DigestURI location, + final BufferedImage image) throws ParserException { + ImageInfo ii = new ImageInfo(location); + ii.image = image; // scan the image - int height = image.getHeight(); - int width = image.getWidth(); + ii.height = ii.image.getHeight(); + ii.width = ii.image.getWidth(); /* Raster raster = image.getData(); int[] pixel = raster.getPixel(0, 0, (int[])null); @@ -106,53 +242,36 @@ public class genericImageParser extends AbstractParser implements Idiom { } */ // get image properties - String [] propNames = image.getPropertyNames(); + String [] propNames = ii.image.getPropertyNames(); if (propNames == null) propNames = new String[0]; - StringBuilder sb = new StringBuilder(propNames.length * 80); - sb.append("\n"); + ii.info.append("\n"); for (String propName: propNames) { - sb.append(propName).append(" = ").append(image.getProperty(propName)).append(" .\n"); + ii.info.append(propName).append(" = ").append(ii.image.getProperty(propName)).append(" .\n"); } // append also properties that we measured - sb.append("width").append(" = ").append(Integer.toString(width)).append(" .\n"); - sb.append("height").append(" = ").append(Integer.toString(height)).append(" .\n"); + ii.info.append("width").append(": ").append(Integer.toString(ii.width)).append(" .\n"); + ii.info.append("height").append(": ").append(Integer.toString(ii.height)).append(" .\n"); - final HashSet languages = new HashSet(); - final HashMap anchors = new HashMap(); - final HashMap images = new HashMap(); - // add this image to the map of images - images.put(sb.toString(), new ImageEntry(location, "", width, height, -1)); - - return new Document( - location, - mimeType, - "UTF-8", - languages, - new String[]{}, // keywords - "", // title - "", // author - new String[]{}, // sections - "", // description - sb.toString().getBytes(), // content text - anchors, // anchors - images, - false); // images - } -/* - * Document(final DigestURI location, final String mimeType, final String charset, final Set languages, - final String[] keywords, final String title, final String author, - final String[] sections, final String abstrct, - final Object text, final Map anchors, final HashMap images) {(non-Javadoc) - * @see net.yacy.document.Idiom#supportedMimeTypes() - */ - public Set supportedMimeTypes() { - return SUPPORTED_MIME_TYPES; + return ii; } - public Set supportedExtensions() { - return SUPPORTED_EXTENSIONS; + public static class ImageInfo { + public DigestURI location; + public BufferedImage image; + public StringBuilder info; + public int height; + public int width; + public ImageInfo(final DigestURI location) { + this.location = location; + this.image = null; + this.info = new StringBuilder(); + this.height = -1; + this.width = -1; + } } + + public static void main(final String[] args) { File image = new File(args[0]); genericImageParser parser = new genericImageParser();