From 56e0d9bd01737993f231389fc06a08d4bf370b7b Mon Sep 17 00:00:00 2001 From: orbiter Date: Tue, 19 Jan 2010 14:59:58 +0000 Subject: [PATCH] - testings with image parser - added image size as part of parsed text in images - avoid unnecessary error messages if parsing of documents failed but one succeeded git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6597 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- source/net/yacy/document/AbstractParser.java | 8 ++++ source/net/yacy/document/Document.java | 38 ++++++++++++++----- source/net/yacy/document/Idiom.java | 5 +++ source/net/yacy/document/TextParser.java | 20 ++++++++-- .../parser/images/genericImageParser.java | 29 +++++++++++++- 5 files changed, 85 insertions(+), 15 deletions(-) diff --git a/source/net/yacy/document/AbstractParser.java b/source/net/yacy/document/AbstractParser.java index 6f98c7811..125732125 100644 --- a/source/net/yacy/document/AbstractParser.java +++ b/source/net/yacy/document/AbstractParser.java @@ -232,4 +232,12 @@ public abstract class AbstractParser implements Idiom { public void reset() { this.contentLength = -1; } + + public boolean equals(Object o) { + return this.getName().equals(((Idiom) o).getName()); + } + + public int hashCode() { + return this.getName().hashCode(); + } } diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java index 911429fac..a18753146 100644 --- a/source/net/yacy/document/Document.java +++ b/source/net/yacy/document/Document.java @@ -25,6 +25,7 @@ package net.yacy.document; import java.io.BufferedInputStream; import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; import java.io.File; import java.io.FileInputStream; import java.io.IOException; @@ -561,21 +562,38 @@ dc_rights public void writeXML(OutputStreamWriter os, Date date) throws IOException { os.write("\n"); - os.write("\n"); + String title = this.dc_title(); + if (title != null && title.length() > 0) os.write("\n"); os.write("" + this.dc_identifier() + "\n"); - os.write("\n"); - os.write(" 0) os.write(new String(buffer, 0, c)); - is.close(); - os.write("]]>\n"); - os.write("" + this.dc_language() + "\n"); + String creator = this.dc_creator(); + if (creator != null && creator.length() > 0) os.write("\n"); + if (this.text != null) { + os.write(" 0) os.write(new String(buffer, 0, c)); + is.close(); + os.write("]]>\n"); + } + String language = this.dc_language(); + if (language != null && language.length() > 0) os.write("" + this.dc_language() + "\n"); os.write("" + DateFormatter.formatISO8601(date) + "\n"); os.write("\n"); } + public String toString() { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + OutputStreamWriter osw = new OutputStreamWriter(baos); + try { + writeXML(osw, new Date()); + osw.close(); + return new String(baos.toByteArray(), "UTF-8"); + } catch (IOException e) { + return ""; + } + } + public void close() { // try close the output stream if (this.textStream != null) { diff --git a/source/net/yacy/document/Idiom.java b/source/net/yacy/document/Idiom.java index 37f99d6a6..22afd1e34 100644 --- a/source/net/yacy/document/Idiom.java +++ b/source/net/yacy/document/Idiom.java @@ -109,6 +109,11 @@ public interface Idiom { * @return parser name */ public String getName(); + + public boolean equals(Object o); + + public int hashCode(); + } diff --git a/source/net/yacy/document/TextParser.java b/source/net/yacy/document/TextParser.java index bb5e7737c..53de21c44 100644 --- a/source/net/yacy/document/TextParser.java +++ b/source/net/yacy/document/TextParser.java @@ -33,6 +33,7 @@ import java.io.FileInputStream; import java.io.InputStream; import java.text.Collator; import java.util.ArrayList; +import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Locale; @@ -224,20 +225,31 @@ public final class TextParser { if (log.isFine()) log.logInfo("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + fileExt + "'."); Document doc = null; + HashMap failedParser = new HashMap(); for (Idiom parser: idioms) { parser.setContentLength(contentLength); try { doc = parser.parse(location, mimeType, documentCharset, sourceStream); } catch (ParserException e) { - log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e); + failedParser.put(parser, e); + //log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e); } if (doc != null) break; } if (doc == null) { - final String errorMsg = "Parsing content with file extension '" + location.getFileExtension() + "' and mimetype '" + mimeType + "' failed."; - log.logWarning("Unable to parse '" + location + "'. " + errorMsg); - throw new ParserException(errorMsg, location); + if (failedParser.size() == 0) { + final String errorMsg = "Parsing content with file extension '" + location.getFileExtension() + "' and mimetype '" + mimeType + "' failed."; + //log.logWarning("Unable to parse '" + location + "'. " + errorMsg); + throw new ParserException(errorMsg, location); + } else { + String failedParsers = ""; + for (Map.Entry error: failedParser.entrySet()) { + log.logWarning("tried parser '" + error.getKey().getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + error.getValue().getMessage(), error.getValue()); + failedParsers += error.getKey().getName() + " "; + } + throw new ParserException("All parser failed: " + failedParsers, location); + } } return doc; } diff --git a/source/net/yacy/document/parser/images/genericImageParser.java b/source/net/yacy/document/parser/images/genericImageParser.java index 0a0214967..13bb85f66 100644 --- a/source/net/yacy/document/parser/images/genericImageParser.java +++ b/source/net/yacy/document/parser/images/genericImageParser.java @@ -28,8 +28,12 @@ package net.yacy.document.parser.images; import java.awt.image.BufferedImage; import java.io.EOFException; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; import java.io.IOException; -import java.io.InputStream;import java.util.HashMap; +import java.io.InputStream;import java.net.MalformedURLException; +import java.util.HashMap; import java.util.HashSet; import java.util.Set; @@ -105,9 +109,13 @@ public class genericImageParser extends AbstractParser implements Idiom { String [] propNames = image.getPropertyNames(); if (propNames == null) propNames = new String[0]; StringBuilder sb = new StringBuilder(propNames.length * 80); + sb.append("\n"); for (String propName: propNames) { sb.append(propName).append(" = ").append(image.getProperty(propName)).append(" .\n"); } + // append also properties that we measured + sb.append("width").append(" = ").append(Integer.toString(width)).append(" .\n"); + sb.append("height").append(" = ").append(Integer.toString(height)).append(" .\n"); final HashSet languages = new HashSet(); final HashMap anchors = new HashMap(); @@ -144,4 +152,23 @@ public class genericImageParser extends AbstractParser implements Idiom { return SUPPORTED_EXTENSIONS; } + public static void main(final String[] args) { + File image = new File(args[0]); + genericImageParser parser = new genericImageParser(); + DigestURI uri; + try { + uri = new DigestURI("http://localhost/" + image.getName()); + Document document = parser.parse(uri, "image/" + uri.getFileExtension(), "UTF-8", new FileInputStream(image)); + System.out.println(document.toString()); + } catch (MalformedURLException e) { + e.printStackTrace(); + } catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (ParserException e) { + e.printStackTrace(); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + }