- tests with the image parser

- added image size as part of parsed text in images
- avoid unnecessary error messages when some parsers fail on a document but another parser succeeds


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6597 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 15 years ago
parent e80e060ca6
commit 56e0d9bd01

@ -232,4 +232,12 @@ public abstract class AbstractParser implements Idiom {
/**
 * Puts the stored content length back to {@code -1}.
 * NOTE(review): -1 presumably means "length unknown" - confirm against the users of contentLength.
 */
public void reset() {
    contentLength = -1;
}
/**
 * Two parsers are considered equal when they report the same name.
 *
 * @param o the object to compare with; may be {@code null} or of any type
 * @return {@code true} if {@code o} is an {@link Idiom} whose {@code getName()} equals this
 *         parser's name, {@code false} otherwise (including for {@code null} and foreign types)
 */
@Override
public boolean equals(Object o) {
    if (this == o) return true;
    // instanceof also rejects null, so neither a ClassCastException nor a
    // NullPointerException can escape from the cast below
    if (!(o instanceof Idiom)) return false;
    return this.getName().equals(((Idiom) o).getName());
}
/**
 * Hash code derived solely from the parser name.
 *
 * @return the hash code of the string returned by {@code getName()}
 */
public int hashCode() {
    final String parserName = getName();
    return parserName.hashCode();
}
}

@ -25,6 +25,7 @@ package net.yacy.document;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
@ -561,21 +562,38 @@ dc_rights
/**
 * Writes this document as one {@code <record>} element to the given writer.
 * <p>
 * The identifier and the date are always written. Title, creator and language are written
 * only when their value is non-null and non-empty; the description is written only when
 * the document has a text.
 *
 * @param os   the writer that receives the XML; it is neither flushed nor closed here
 * @param date the date written into the {@code <dc:Date>} element (ISO 8601 formatted)
 * @throws IOException if writing to {@code os} or reading the document text fails
 */
public void writeXML(OutputStreamWriter os, Date date) throws IOException {
    os.write("<record>\n");
    final String title = this.dc_title();
    if (title != null && title.length() > 0) os.write("<dc:Title><![CDATA[" + title + "]]></dc:Title>\n");
    os.write("<dc:Identifier>" + this.dc_identifier() + "</dc:Identifier>\n");
    final String creator = this.dc_creator();
    if (creator != null && creator.length() > 0) os.write("<dc:Creator><![CDATA[" + creator + "]]></dc:Creator>\n");
    if (this.text != null) {
        os.write("<dc:Description><![CDATA[");
        // Decode through a Reader (platform default charset, as before) instead of turning
        // each byte chunk into a String on its own: a multi-byte character that straddles
        // a chunk boundary would otherwise be corrupted.
        final java.io.Reader reader = new java.io.InputStreamReader(this.getText());
        try {
            final char[] buffer = new char[1000];
            int c;
            while ((c = reader.read(buffer)) > 0) os.write(buffer, 0, c);
        } finally {
            // closes the underlying text stream as well, even if copying failed
            reader.close();
        }
        // NOTE(review): a "]]>" sequence inside the text would end the CDATA section early
        os.write("]]></dc:Description>\n");
    }
    final String language = this.dc_language();
    if (language != null && language.length() > 0) os.write("<dc:Language>" + language + "</dc:Language>\n");
    os.write("<dc:Date>" + DateFormatter.formatISO8601(date) + "</dc:Date>\n");
    os.write("</record>\n");
}
/**
 * Renders this document as the XML record produced by {@code writeXML}, using the current
 * time as the record date.
 *
 * @return the XML text, or an empty string if writing the record failed with an IOException
 */
@Override
public String toString() {
    final ByteArrayOutputStream baos = new ByteArrayOutputStream();
    try {
        // encode with the same charset that is used for decoding below; the
        // single-argument constructor would use the platform default instead
        final OutputStreamWriter osw = new OutputStreamWriter(baos, "UTF-8");
        writeXML(osw, new Date());
        osw.close();
        return new String(baos.toByteArray(), "UTF-8");
    } catch (IOException e) {
        return "";
    }
}
public void close() {
// try close the output stream
if (this.textStream != null) {

@ -109,6 +109,11 @@ public interface Idiom {
* @return parser name
*/
public String getName();
/**
 * Equality between idioms, redeclared on the interface.
 * AbstractParser implements it by comparing the {@code getName()} values.
 * @param o the object to compare with
 * @return true if the given object is considered equal to this idiom
 */
public boolean equals(Object o);
/**
 * Hash code matching {@code equals}; TextParser uses idioms as HashMap keys.
 * AbstractParser implements it as the hash code of {@code getName()}.
 * @return the hash code of this idiom
 */
public int hashCode();
}

@ -33,6 +33,7 @@ import java.io.FileInputStream;
import java.io.InputStream;
import java.text.Collator;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
@ -224,20 +225,31 @@ public final class TextParser {
if (log.isFine()) log.logInfo("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + fileExt + "'.");
Document doc = null;
HashMap<Idiom, ParserException> failedParser = new HashMap<Idiom, ParserException>();
for (Idiom parser: idioms) {
parser.setContentLength(contentLength);
try {
doc = parser.parse(location, mimeType, documentCharset, sourceStream);
} catch (ParserException e) {
log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e);
failedParser.put(parser, e);
//log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e);
}
if (doc != null) break;
}
if (doc == null) {
final String errorMsg = "Parsing content with file extension '" + location.getFileExtension() + "' and mimetype '" + mimeType + "' failed.";
log.logWarning("Unable to parse '" + location + "'. " + errorMsg);
throw new ParserException(errorMsg, location);
if (failedParser.size() == 0) {
final String errorMsg = "Parsing content with file extension '" + location.getFileExtension() + "' and mimetype '" + mimeType + "' failed.";
//log.logWarning("Unable to parse '" + location + "'. " + errorMsg);
throw new ParserException(errorMsg, location);
} else {
String failedParsers = "";
for (Map.Entry<Idiom, ParserException> error: failedParser.entrySet()) {
log.logWarning("tried parser '" + error.getKey().getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + error.getValue().getMessage(), error.getValue());
failedParsers += error.getKey().getName() + " ";
}
throw new ParserException("All parser failed: " + failedParsers, location);
}
}
return doc;
}

@ -28,8 +28,12 @@ package net.yacy.document.parser.images;
import java.awt.image.BufferedImage;
import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;import java.util.HashMap;
import java.io.InputStream;import java.net.MalformedURLException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;
@ -105,9 +109,13 @@ public class genericImageParser extends AbstractParser implements Idiom {
String [] propNames = image.getPropertyNames();
if (propNames == null) propNames = new String[0];
StringBuilder sb = new StringBuilder(propNames.length * 80);
sb.append("\n");
for (String propName: propNames) {
sb.append(propName).append(" = ").append(image.getProperty(propName)).append(" .\n");
}
// append also properties that we measured
sb.append("width").append(" = ").append(Integer.toString(width)).append(" .\n");
sb.append("height").append(" = ").append(Integer.toString(height)).append(" .\n");
final HashSet<String> languages = new HashSet<String>();
final HashMap<DigestURI, String> anchors = new HashMap<DigestURI, String>();
@ -144,4 +152,23 @@ public class genericImageParser extends AbstractParser implements Idiom {
return SUPPORTED_EXTENSIONS;
}
/**
 * Command-line test helper: parses one image file and prints the resulting document.
 * <p>
 * The file is presented to the parser under the URL {@code http://localhost/<file name>}
 * with the mime type {@code image/<file extension>}.
 *
 * @param args {@code args[0]} is the path of the image file to parse
 */
public static void main(final String[] args) {
    if (args == null || args.length == 0) {
        // previously this ended in an ArrayIndexOutOfBoundsException
        System.err.println("usage: genericImageParser <image-file>");
        return;
    }
    final File image = new File(args[0]);
    final genericImageParser parser = new genericImageParser();
    FileInputStream imageStream = null;
    try {
        final DigestURI uri = new DigestURI("http://localhost/" + image.getName());
        imageStream = new FileInputStream(image);
        final Document document = parser.parse(uri, "image/" + uri.getFileExtension(), "UTF-8", imageStream);
        System.out.println(document.toString());
    } catch (MalformedURLException e) {
        e.printStackTrace();
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } catch (ParserException e) {
        e.printStackTrace();
    } catch (InterruptedException e) {
        // restore the interrupt flag so that it is not lost by catching the exception
        Thread.currentThread().interrupt();
        e.printStackTrace();
    } finally {
        // release the file handle on every path
        if (imageStream != null) {
            try {
                imageStream.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
}

Loading…
Cancel
Save