- enhanced logging and exception details for parsers

- removed inconsistencies in the MIME type declarations (each MIME type should be declared by only one parser)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6192 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 16 years ago
parent 4b74ad0a46
commit caedd72400

@ -81,7 +81,7 @@ public final class Parser {
initParser(new docParser());
initParser(new gzipParser());
initParser(new htmlParser());
initParser(new mimeTypeParser());
//initParser(new mimeTypeParser()); // what does that thing do?
initParser(new odtParser());
initParser(new pdfParser());
initParser(new pptParser());
@ -107,9 +107,9 @@ public final class Parser {
private static void initParser(Idiom parser) {
for (Map.Entry<String, String> e: parser.getSupportedMimeTypes().entrySet()) {
// process the mime types
final String mimeType = e.getKey();
final String mimeType = normalizeMimeType(e.getKey());
Idiom p0 = mime2parser.get(mimeType);
if (p0 != null) log.logSevere("parser for mime '" + mimeType + "' was set to '" + p0.getName() + "', overwriting with new parser.");
if (p0 != null) log.logSevere("parser for mime '" + mimeType + "' was set to '" + p0.getName() + "', overwriting with new parser '" + parser.getName() + "'.");
mime2parser.put(mimeType, parser);
Log.logInfo("PARSER", "Parser for mime type '" + mimeType + "': " + parser.getName());
@ -134,7 +134,7 @@ public final class Parser {
if (sourceArray == null || sourceArray.length == 0) {
final String errorMsg = "No resource content available (1) " + (((sourceArray == null) ? "source == null" : "source.length() == 0") + ", url = " + location.toNormalform(true, false));
log.logInfo("Unable to parse '" + location + "'. " + errorMsg);
throw new ParserException(errorMsg, location, errorMsg);
throw new ParserException(errorMsg, location);
}
byteIn = new ByteArrayInputStream(sourceArray);
return parseSource(location, mimeType, charset, sourceArray.length, byteIn);
@ -142,7 +142,7 @@ public final class Parser {
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof ParserException) throw (ParserException) e;
log.logSevere("Unexpected exception in parseSource from byte-array: " + e.getMessage(), e);
throw new ParserException("Unexpected exception while parsing " + location, location, e);
throw new ParserException("Unexpected exception: " + e.getMessage(), location);
} finally {
if (byteIn != null) try {
byteIn.close();
@ -160,7 +160,7 @@ public final class Parser {
if (!(sourceFile.exists() && sourceFile.canRead() && sourceFile.length() > 0)) {
final String errorMsg = sourceFile.exists() ? "Empty resource file." : "No resource content available (2).";
log.logInfo("Unable to parse '" + location + "'. " + errorMsg);
throw new ParserException(errorMsg, location, "document has no content");
throw new ParserException(errorMsg, location);
}
sourceStream = new BufferedInputStream(new FileInputStream(sourceFile));
return parseSource(location, mimeType, charset, sourceFile.length(), sourceStream);
@ -168,7 +168,7 @@ public final class Parser {
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof ParserException) throw (ParserException) e;
log.logSevere("Unexpected exception in parseSource from File: " + e.getMessage(), e);
throw new ParserException("Unexpected exception while parsing " + location, location, e);
throw new ParserException("Unexpected exception: " + e.getMessage(), location);
} finally {
if (sourceStream != null)try {
sourceStream.close();
@ -188,12 +188,12 @@ public final class Parser {
if (!supportsMime(mimeType)) {
final String errorMsg = "No parser available to parse mimetype '" + mimeType + "'";
log.logInfo("Unable to parse '" + location + "'. " + errorMsg);
throw new ParserException(errorMsg, location, "wrong mime type");
throw new ParserException(errorMsg, location);
}
if (!supportsExtension(location)) {
final String errorMsg = "No parser available to parse extension of url path";
log.logInfo("Unable to parse '" + location + "'. " + errorMsg);
throw new ParserException(errorMsg, location, "wrong extension");
throw new ParserException(errorMsg, location);
}
if (log.isFine()) log.logInfo("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + fileExt + "'.");
Idiom parser = mime2parser.get(normalizeMimeType(mimeType));
@ -204,7 +204,7 @@ public final class Parser {
} else {
final String errorMsg = "No parser available to parse mimetype '" + mimeType + "' (2)";
log.logInfo("Unable to parse '" + location + "'. " + errorMsg);
throw new ParserException(errorMsg, location, "wrong mime type or wrong extension");
throw new ParserException(errorMsg, location);
}
if (doc == null) {
final String errorMsg = "Unexpected error. Parser returned null.";
@ -217,11 +217,12 @@ public final class Parser {
if (e instanceof ParserException) throw (ParserException) e;
final String errorMsg = "Unexpected exception. " + e.getMessage();
log.logSevere("Unable to parse '" + location + "'. " + errorMsg, e);
throw new ParserException(errorMsg, location, e);
throw new ParserException(errorMsg, location);
}
}
/**
 * Tells whether the given mime type can be handled: it must not be on the
 * deny list and a parser must be registered for it.
 *
 * @param mimeType mime type to check; it is passed through
 *        normalizeMimeType before the lookups
 * @return true if the normalized type is absent from denyMime and present
 *         as a key in mime2parser, false otherwise
 */
public static boolean supportsMime(String mimeType) {
    final String normalized = normalizeMimeType(mimeType);
    if (denyMime.contains(normalized)) {
        return false;
    }
    // the parser table is queried with the key normalized once more
    return mime2parser.containsKey(normalizeMimeType(normalized));
}
@ -249,7 +250,7 @@ public final class Parser {
public static void setDenyMime(String denyList) {
denyMime.clear();
for (String s: denyList.split(",")) denyMime.add(s);
for (String s: denyList.split(",")) denyMime.add(normalizeMimeType(s));
}
public static String getDenyMime() {
@ -260,6 +261,6 @@ public final class Parser {
}
public static void grantMime(String mime, boolean grant) {
if (grant) denyMime.remove(mime); else denyMime.add(mime);
if (grant) denyMime.remove(normalizeMimeType(mime)); else denyMime.add(normalizeMimeType(mime));
}
}

@ -26,9 +26,7 @@ package de.anomic.document;
import de.anomic.yacy.yacyURL;
public class ParserException extends Exception
{
private String errorCode = null;
public class ParserException extends Exception {
private yacyURL url = null;
private static final long serialVersionUID = 1L;
@ -38,28 +36,9 @@ public class ParserException extends Exception
}
public ParserException(final String message, final yacyURL url) {
this(message,url, "parser error for url " + url.toString());
}
public ParserException(final String message, final yacyURL url, final String errorCode) {
super(message);
this.errorCode = errorCode;
this.url = url;
}
public ParserException(final String message, final yacyURL url, final Throwable cause) {
this(message,url,cause, "parser error for url " + url.toString());
}
public ParserException(final String message, final yacyURL url, final Throwable cause, final String errorCode) {
super(message, cause);
this.errorCode = errorCode;
super(message + "; url = " + url.toNormalform(true, false));
this.url = url;
}
public String getErrorCode() {
return this.errorCode;
}
public yacyURL getURL() {
return this.url;

@ -52,10 +52,8 @@ public class bzipParser extends AbstractParser implements Idiom {
static {
SUPPORTED_MIME_TYPES.put("application/x-bzip2",fileExtensions);
SUPPORTED_MIME_TYPES.put("application/bzip2", fileExtensions);
SUPPORTED_MIME_TYPES.put("application/x-bz2", fileExtensions);
SUPPORTED_MIME_TYPES.put("application/octet-stream",fileExtensions);
SUPPORTED_MIME_TYPES.put("application/x-bz2", fileExtensions);
SUPPORTED_MIME_TYPES.put("application/x-bzip",fileExtensions);
SUPPORTED_MIME_TYPES.put("application/x-compressed",fileExtensions);
SUPPORTED_MIME_TYPES.put("application/x-stuffit",fileExtensions);
}

@ -27,7 +27,9 @@
package de.anomic.document.parser;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.HashMap;
import org.textmining.extraction.TextExtractor;
import org.textmining.extraction.word.WordTextExtractorFactory;
@ -62,23 +64,31 @@ public class docParser extends AbstractParser implements Idiom {
super("Word Document Parser");
}
public Document parse(final yacyURL location, final String mimeType, final String charset,
final InputStream source) throws ParserException, InterruptedException {
try {
final WordTextExtractorFactory extractorFactory = new WordTextExtractorFactory();
final TextExtractor extractor = extractorFactory.textExtractor(source);
final String contents = extractor.getText().trim();
String title = contents.replaceAll("\r"," ").replaceAll("\n"," ").replaceAll("\t"," ").trim();
if (title.length() > 80) title = title.substring(0, 80);
int l = title.length();
while (true) {
title = title.replaceAll(" ", " ");
if (title.length() == l) break;
l = title.length();
}
final Document theDoc = new Document(
public Document parse(final yacyURL location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
final WordTextExtractorFactory extractorFactory = new WordTextExtractorFactory();
TextExtractor extractor = null;
try {
extractor = extractorFactory.textExtractor(source);
} catch (Exception e) {
throw new ParserException("error in docParser, WordTextExtractorFactory: " + e.getMessage(), location);
}
String contents = null;
try {
contents = extractor.getText().trim();
} catch (IOException e) {
throw new ParserException("error in docParser, getText: " + e.getMessage(), location);
}
String title = contents.replaceAll("\r"," ").replaceAll("\n"," ").replaceAll("\t"," ").trim();
if (title.length() > 80) title = title.substring(0, 80);
int l = title.length();
while (true) {
title = title.replaceAll(" ", " ");
if (title.length() == l) break;
l = title.length();
}
Document theDoc;
try {
theDoc = new Document(
location,
mimeType,
"UTF-8",
@ -91,15 +101,11 @@ public class docParser extends AbstractParser implements Idiom {
contents.getBytes("UTF-8"),
null,
null);
return theDoc;
} catch (final Exception e) {
e.printStackTrace();
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof ParserException) throw (ParserException) e;
throw new ParserException("Unexpected error while parsing doc file. " + e.getMessage(),location);
}
} catch (UnsupportedEncodingException e) {
throw new ParserException("error in docParser, getBytes: " + e.getMessage(), location);
}
return theDoc;
}
public HashMap<String, String> getSupportedMimeTypes() {

@ -59,7 +59,6 @@ public class gzipParser extends AbstractParser implements Idiom {
SUPPORTED_MIME_TYPES.put("application/x-compress",ext);
SUPPORTED_MIME_TYPES.put("gzip/document",ext);
SUPPORTED_MIME_TYPES.put("application/octet-stream",ext);
SUPPORTED_MIME_TYPES.put("application/x-tar",ext);
}
public gzipParser() {

@ -50,7 +50,7 @@ public class htmlParser extends AbstractParser implements Idiom {
*/
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>();
static {
String ext = "htm,html,shtml,xhtml,php,asp,aspx,txt,jsp";
String ext = "htm,html,shtml,xhtml,php,asp,aspx,txt,jsp,csv";
SUPPORTED_MIME_TYPES.put("application/xhtml+xml", ext);
SUPPORTED_MIME_TYPES.put("text/html", ext);
SUPPORTED_MIME_TYPES.put("text/plain", ext);

@ -60,9 +60,6 @@ public class mimeTypeParser extends AbstractParser implements Idiom {
SUPPORTED_MIME_TYPES.put("text/xml","xml");
SUPPORTED_MIME_TYPES.put("application/xml","xml");
SUPPORTED_MIME_TYPES.put("application/x-xml","xml");
SUPPORTED_MIME_TYPES.put("application/octet-stream","xml");
SUPPORTED_MIME_TYPES.put("application/x-compress","xml");
SUPPORTED_MIME_TYPES.put("application/x-compressed","xml");
}
/**

@ -65,8 +65,8 @@ public class odtParser extends AbstractParser implements Idiom {
*/
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>();
static {
SUPPORTED_MIME_TYPES.put("application/vnd.oasis.opendocument.text","odt");
SUPPORTED_MIME_TYPES.put("application/x-vnd.oasis.opendocument.text","odt");
SUPPORTED_MIME_TYPES.put("application/vnd.oasis.opendocument.text","odt,ods,odp");
SUPPORTED_MIME_TYPES.put("application/x-vnd.oasis.opendocument.text","odt,ods,odp");
}
public odtParser() {

@ -107,7 +107,7 @@ public class pdfParser extends AbstractParser implements Idiom {
theDocument.openProtection(new StandardDecryptionMaterial(""));
final AccessPermission perm = theDocument.getCurrentAccessPermission();
if (perm == null || !perm.canExtractContent())
throw new ParserException("Document is encrypted",location, "document is exncrypted");
throw new ParserException("Document is encrypted", location);
}
// extracting some metadata

@ -45,7 +45,7 @@ public class pptParser extends AbstractParser implements Idiom {
* @see #getSupportedMimeTypes()
*/
public static final HashMap<String, String> SUPPORTED_MIME_TYPES = new HashMap<String, String>();
static final String ext = "ppt,pps";
static final String ext = "ppt,pptx,pps";
static {
SUPPORTED_MIME_TYPES.put("application/mspowerpoint",ext);
SUPPORTED_MIME_TYPES.put("application/powerpoint",ext);

@ -282,7 +282,7 @@ public class psParser extends AbstractParser implements Idiom {
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof ParserException) throw (ParserException) e;
throw new ParserException("Unable to parse the ps file. " + e.getMessage(),location, e);
throw new ParserException("Unable to parse the ps file. " + e.getMessage(), location);
} finally {
if (tempFile != null) FileUtils.deletedelete(tempFile);
}

@ -50,8 +50,6 @@ public class rtfParser extends AbstractParser implements Idiom {
SUPPORTED_MIME_TYPES.put("text/rtf","rtf");
SUPPORTED_MIME_TYPES.put("application/x-rtf","rtf");
SUPPORTED_MIME_TYPES.put("text/richtext","rtf");
SUPPORTED_MIME_TYPES.put("application/msword","rtf");
SUPPORTED_MIME_TYPES.put("application/doc","rtf");
SUPPORTED_MIME_TYPES.put("application/x-soffice","rtf");
}

@ -72,7 +72,7 @@ public class sevenzipParser extends AbstractParser implements Idiom {
try {
archive = new Handler(source);
} catch (final IOException e) {
throw new ParserException("error opening 7zip archive", location, e);
throw new ParserException("error opening 7zip archive: " + e.getMessage(), location);
}
checkInterruption();
final SZParserExtractCallback aec = new SZParserExtractCallback(super.theLogger, archive,
@ -87,8 +87,8 @@ public class sevenzipParser extends AbstractParser implements Idiom {
if (e.getCause() instanceof ParserException)
throw (ParserException)e.getCause();
throw new ParserException(
"error processing 7zip archive at internal file: " + aec.getCurrentFilePath(),
location, e);
"error processing 7zip archive at internal file " + aec.getCurrentFilePath() + ": " + e.getMessage(),
location);
} finally {
try { archive.close(); } catch (final IOException e) { }
}
@ -106,7 +106,7 @@ public class sevenzipParser extends AbstractParser implements Idiom {
try {
return parse(location, mimeType, charset, new MyRandomAccessFile(sourceFile, "r"), Idiom.MAX_KEEP_IN_MEMORY_SIZE);
} catch (final IOException e) {
throw new ParserException("error processing 7zip archive", location, e);
throw new ParserException("error processing 7zip archive: " + e.getMessage(), location);
}
}
@ -120,7 +120,7 @@ public class sevenzipParser extends AbstractParser implements Idiom {
}
return parse(location, mimeType, charset, cfos.getContentBAOS());
} catch (final IOException e) {
throw new ParserException("error processing 7zip archive", location, e);
throw new ParserException("error processing 7zip archive: " + e.getMessage(), location);
}
}

@ -64,8 +64,6 @@ public class tarParser extends AbstractParser implements Idiom {
SUPPORTED_MIME_TYPES.put("application/tar","tar");
// GNU tar archives: the key was misspelled "applicaton/x-gtar", so the correctly spelled type was not registered by this line
SUPPORTED_MIME_TYPES.put("application/x-gtar","tar");
SUPPORTED_MIME_TYPES.put("multipart/x-tar","tar");
SUPPORTED_MIME_TYPES.put("application/x-compress","tar");
SUPPORTED_MIME_TYPES.put("application/x-compressed","tar");
}
public tarParser() {

@ -1664,7 +1664,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
private Document parseDocument(final IndexingStack.QueueEntry entry) throws InterruptedException {
Document document = null;
boolean parserException = false;
final int processCase = entry.processCase();
if (this.log.isFine()) log.logFine("processResourceStack processCase=" + processCase +
@ -1684,20 +1683,13 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
document = Parser.parseSource(entry.url(), entry.getMimeType(), entry.getCharacterEncoding(), plasmaHTCache.getResourceContent(entry.url()));
assert(document != null) : "Unexpected error. Parser returned null.";
} catch (final ParserException e) {
parserException = true;
this.log.logWarning("Unable to parse the resource '" + entry.url() + "'. " + e.getMessage());
addURLtoErrorDB(entry.url(), entry.referrerHash(), entry.initiator(), entry.anchorName(), e.getErrorCode());
addURLtoErrorDB(entry.url(), entry.referrerHash(), entry.initiator(), entry.anchorName(), e.getMessage());
if (document != null) {
document.close();
document = null;
}
return null;
} finally {
if (document == null && !parserException) { // if you get here, comment this part out and you will possibly see a OOM in the log
this.log.logWarning("Unable to parse the resource '" + entry.url() + "'. " + "no parser result");
addURLtoErrorDB(entry.url(), entry.referrerHash(), entry.initiator(), entry.anchorName(), "no parser result");
return null;
}
}
final long parsingEndTime = System.currentTimeMillis();

Loading…
Cancel
Save