diff --git a/source/de/anomic/document/Parser.java b/source/de/anomic/document/Parser.java index 5baf9eecf..4c739b6e1 100644 --- a/source/de/anomic/document/Parser.java +++ b/source/de/anomic/document/Parser.java @@ -81,7 +81,7 @@ public final class Parser { initParser(new docParser()); initParser(new gzipParser()); initParser(new htmlParser()); - initParser(new mimeTypeParser()); + //initParser(new mimeTypeParser()); // what does that thing do? initParser(new odtParser()); initParser(new pdfParser()); initParser(new pptParser()); @@ -107,9 +107,9 @@ public final class Parser { private static void initParser(Idiom parser) { for (Map.Entry e: parser.getSupportedMimeTypes().entrySet()) { // process the mime types - final String mimeType = e.getKey(); + final String mimeType = normalizeMimeType(e.getKey()); Idiom p0 = mime2parser.get(mimeType); - if (p0 != null) log.logSevere("parser for mime '" + mimeType + "' was set to '" + p0.getName() + "', overwriting with new parser."); + if (p0 != null) log.logSevere("parser for mime '" + mimeType + "' was set to '" + p0.getName() + "', overwriting with new parser '" + parser.getName() + "'."); mime2parser.put(mimeType, parser); Log.logInfo("PARSER", "Parser for mime type '" + mimeType + "': " + parser.getName()); @@ -134,7 +134,7 @@ public final class Parser { if (sourceArray == null || sourceArray.length == 0) { final String errorMsg = "No resource content available (1) " + (((sourceArray == null) ? "source == null" : "source.length() == 0") + ", url = " + location.toNormalform(true, false)); log.logInfo("Unable to parse '" + location + "'. " + errorMsg); - throw new ParserException(errorMsg, location, errorMsg); + throw new ParserException(errorMsg, location); } byteIn = new ByteArrayInputStream(sourceArray); return parseSource(location, mimeType, charset, sourceArray.length, byteIn); @@ -142,7 +142,7 @@ public final class Parser { if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof ParserException) throw (ParserException) e; log.logSevere("Unexpected exception in parseSource from byte-array: " + e.getMessage(), e); - throw new ParserException("Unexpected exception while parsing " + location, location, e); + throw new ParserException("Unexpected exception: " + e.getMessage(), location); } finally { if (byteIn != null) try { byteIn.close(); @@ -160,7 +160,7 @@ public final class Parser { if (!(sourceFile.exists() && sourceFile.canRead() && sourceFile.length() > 0)) { final String errorMsg = sourceFile.exists() ? "Empty resource file." : "No resource content available (2)."; log.logInfo("Unable to parse '" + location + "'. " + errorMsg); - throw new ParserException(errorMsg, location, "document has no content"); + throw new ParserException(errorMsg, location); } sourceStream = new BufferedInputStream(new FileInputStream(sourceFile)); return parseSource(location, mimeType, charset, sourceFile.length(), sourceStream); @@ -168,7 +168,7 @@ public final class Parser { if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof ParserException) throw (ParserException) e; log.logSevere("Unexpected exception in parseSource from File: " + e.getMessage(), e); - throw new ParserException("Unexpected exception while parsing " + location, location, e); + throw new ParserException("Unexpected exception: " + e.getMessage(), location); } finally { if (sourceStream != null)try { sourceStream.close(); @@ -188,12 +188,12 @@ public final class Parser { if (!supportsMime(mimeType)) { final String errorMsg = "No parser available to parse mimetype '" + mimeType + "'"; log.logInfo("Unable to parse '" + location + "'. " + errorMsg); - throw new ParserException(errorMsg, location, "wrong mime type"); + throw new ParserException(errorMsg, location); } if (!supportsExtension(location)) { final String errorMsg = "No parser available to parse extension of url path"; log.logInfo("Unable to parse '" + location + "'. " + errorMsg); - throw new ParserException(errorMsg, location, "wrong extension"); + throw new ParserException(errorMsg, location); } if (log.isFine()) log.logInfo("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + fileExt + "'."); Idiom parser = mime2parser.get(normalizeMimeType(mimeType)); @@ -204,7 +204,7 @@ public final class Parser { } else { final String errorMsg = "No parser available to parse mimetype '" + mimeType + "' (2)"; log.logInfo("Unable to parse '" + location + "'. " + errorMsg); - throw new ParserException(errorMsg, location, "wrong mime type or wrong extension"); + throw new ParserException(errorMsg, location); } if (doc == null) { final String errorMsg = "Unexpected error. Parser returned null."; @@ -217,11 +217,12 @@ public final class Parser { if (e instanceof ParserException) throw (ParserException) e; final String errorMsg = "Unexpected exception. " + e.getMessage(); log.logSevere("Unable to parse '" + location + "'. " + errorMsg, e); - throw new ParserException(errorMsg, location, e); + throw new ParserException(errorMsg, location); } } public static boolean supportsMime(String mimeType) { + mimeType = normalizeMimeType(mimeType); return !denyMime.contains(mimeType) && mime2parser.containsKey(normalizeMimeType(mimeType)); } @@ -249,7 +250,7 @@ public final class Parser { public static void setDenyMime(String denyList) { denyMime.clear(); - for (String s: denyList.split(",")) denyMime.add(s); + for (String s: denyList.split(",")) denyMime.add(normalizeMimeType(s)); } public static String getDenyMime() { @@ -260,6 +261,6 @@ public final class Parser { } public static void grantMime(String mime, boolean grant) { - if (grant) denyMime.remove(mime); else denyMime.add(mime); + if (grant) denyMime.remove(normalizeMimeType(mime)); else denyMime.add(normalizeMimeType(mime)); } } diff --git a/source/de/anomic/document/ParserException.java b/source/de/anomic/document/ParserException.java index 01a6a20cb..ae90688c0 100644 --- a/source/de/anomic/document/ParserException.java +++ b/source/de/anomic/document/ParserException.java @@ -26,9 +26,7 @@ package de.anomic.document; import de.anomic.yacy.yacyURL; -public class ParserException extends Exception -{ - private String errorCode = null; +public class ParserException extends Exception { private yacyURL url = null; private static final long serialVersionUID = 1L; @@ -38,28 +36,9 @@ public class ParserException extends Exception } public ParserException(final String message, final yacyURL url) { - this(message,url, "parser error for url " + url.toString()); - } - - public ParserException(final String message, final yacyURL url, final String errorCode) { - super(message); - this.errorCode = errorCode; - this.url = url; - } - - public ParserException(final String message, final yacyURL url, final Throwable cause) { - this(message,url,cause, "parser error for url " + url.toString()); - } - - public ParserException(final String message, final yacyURL url, final Throwable cause, final String errorCode) { - super(message, cause); - this.errorCode = errorCode; + super(message + "; url = " + url.toNormalform(true, false)); this.url = url; } - - public String getErrorCode() { - return this.errorCode; - } public yacyURL getURL() { return this.url; diff --git a/source/de/anomic/document/parser/bzipParser.java b/source/de/anomic/document/parser/bzipParser.java index 2b5321cc0..0fec1c9e1 100644 --- a/source/de/anomic/document/parser/bzipParser.java +++ b/source/de/anomic/document/parser/bzipParser.java @@ -52,10 +52,8 @@ public class bzipParser extends AbstractParser implements Idiom { static { SUPPORTED_MIME_TYPES.put("application/x-bzip2",fileExtensions); SUPPORTED_MIME_TYPES.put("application/bzip2", fileExtensions); - SUPPORTED_MIME_TYPES.put("application/x-bz2", fileExtensions); - SUPPORTED_MIME_TYPES.put("application/octet-stream",fileExtensions); + SUPPORTED_MIME_TYPES.put("application/x-bz2", fileExtensions); SUPPORTED_MIME_TYPES.put("application/x-bzip",fileExtensions); - SUPPORTED_MIME_TYPES.put("application/x-compressed",fileExtensions); SUPPORTED_MIME_TYPES.put("application/x-stuffit",fileExtensions); } diff --git a/source/de/anomic/document/parser/docParser.java b/source/de/anomic/document/parser/docParser.java index 41b47d136..54413fca0 100644 --- a/source/de/anomic/document/parser/docParser.java +++ b/source/de/anomic/document/parser/docParser.java @@ -27,7 +27,9 @@ package de.anomic.document.parser; +import java.io.IOException; import java.io.InputStream; +import java.io.UnsupportedEncodingException; import java.util.HashMap; import org.textmining.extraction.TextExtractor; import org.textmining.extraction.word.WordTextExtractorFactory; @@ -62,23 +64,31 @@ public class docParser extends AbstractParser implements Idiom { super("Word Document Parser"); } - public Document parse(final yacyURL location, final String mimeType, final String charset, - final InputStream source) throws ParserException, InterruptedException { - - - try { - final WordTextExtractorFactory extractorFactory = new WordTextExtractorFactory(); - final TextExtractor extractor = extractorFactory.textExtractor(source); - final String contents = extractor.getText().trim(); - String title = contents.replaceAll("\r"," ").replaceAll("\n"," ").replaceAll("\t"," ").trim(); - if (title.length() > 80) title = title.substring(0, 80); - int l = title.length(); - while (true) { - title = title.replaceAll(" ", " "); - if (title.length() == l) break; - l = title.length(); - } - final Document theDoc = new Document( + public Document parse(final yacyURL location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException { + final WordTextExtractorFactory extractorFactory = new WordTextExtractorFactory(); + TextExtractor extractor = null; + try { + extractor = extractorFactory.textExtractor(source); + } catch (Exception e) { + throw new ParserException("error in docParser, WordTextExtractorFactory: " + e.getMessage(), location); + } + String contents = null; + try { + contents = extractor.getText().trim(); + } catch (IOException e) { + throw new ParserException("error in docParser, getText: " + e.getMessage(), location); + } + String title = contents.replaceAll("\r"," ").replaceAll("\n"," ").replaceAll("\t"," ").trim(); + if (title.length() > 80) title = title.substring(0, 80); + int l = title.length(); + while (true) { + title = title.replaceAll(" ", " "); + if (title.length() == l) break; + l = title.length(); + } + Document theDoc; + try { + theDoc = new Document( location, mimeType, "UTF-8", @@ -91,15 +101,11 @@ public class docParser extends AbstractParser implements Idiom { contents.getBytes("UTF-8"), null, null); - - return theDoc; - } catch (final Exception e) { - e.printStackTrace(); - if (e instanceof InterruptedException) throw (InterruptedException) e; - if (e instanceof ParserException) throw (ParserException) e; - - throw new ParserException("Unexpected error while parsing doc file. " + e.getMessage(),location); - } + } catch (UnsupportedEncodingException e) { + throw new ParserException("error in docParser, getBytes: " + e.getMessage(), location); + } + + return theDoc; } public HashMap getSupportedMimeTypes() { diff --git a/source/de/anomic/document/parser/gzipParser.java b/source/de/anomic/document/parser/gzipParser.java index 78aa2f491..d3e7446c7 100644 --- a/source/de/anomic/document/parser/gzipParser.java +++ b/source/de/anomic/document/parser/gzipParser.java @@ -59,7 +59,6 @@ public class gzipParser extends AbstractParser implements Idiom { SUPPORTED_MIME_TYPES.put("application/x-compress",ext); SUPPORTED_MIME_TYPES.put("gzip/document",ext); SUPPORTED_MIME_TYPES.put("application/octet-stream",ext); - SUPPORTED_MIME_TYPES.put("application/x-tar",ext); } public gzipParser() { diff --git a/source/de/anomic/document/parser/htmlParser.java b/source/de/anomic/document/parser/htmlParser.java index 743226dda..b38b06802 100644 --- a/source/de/anomic/document/parser/htmlParser.java +++ b/source/de/anomic/document/parser/htmlParser.java @@ -50,7 +50,7 @@ public class htmlParser extends AbstractParser implements Idiom { */ public static final HashMap SUPPORTED_MIME_TYPES = new HashMap(); static { - String ext = "htm,html,shtml,xhtml,php,asp,aspx,txt,jsp"; + String ext = "htm,html,shtml,xhtml,php,asp,aspx,txt,jsp,csv"; SUPPORTED_MIME_TYPES.put("application/xhtml+xml", ext); SUPPORTED_MIME_TYPES.put("text/html", ext); SUPPORTED_MIME_TYPES.put("text/plain", ext); diff --git a/source/de/anomic/document/parser/mimeTypeParser.java b/source/de/anomic/document/parser/mimeTypeParser.java index c4c568e17..5f066c95b 100644 --- a/source/de/anomic/document/parser/mimeTypeParser.java +++ b/source/de/anomic/document/parser/mimeTypeParser.java @@ -60,9 +60,6 @@ public class mimeTypeParser extends AbstractParser implements Idiom { SUPPORTED_MIME_TYPES.put("text/xml","xml"); SUPPORTED_MIME_TYPES.put("application/xml","xml"); SUPPORTED_MIME_TYPES.put("application/x-xml","xml"); - SUPPORTED_MIME_TYPES.put("application/octet-stream","xml"); - SUPPORTED_MIME_TYPES.put("application/x-compress","xml"); - SUPPORTED_MIME_TYPES.put("application/x-compressed","xml"); } /** diff --git a/source/de/anomic/document/parser/odtParser.java b/source/de/anomic/document/parser/odtParser.java index c7119289c..ab91ead4e 100644 --- a/source/de/anomic/document/parser/odtParser.java +++ b/source/de/anomic/document/parser/odtParser.java @@ -65,8 +65,8 @@ public class odtParser extends AbstractParser implements Idiom { */ public static final HashMap SUPPORTED_MIME_TYPES = new HashMap(); static { - SUPPORTED_MIME_TYPES.put("application/vnd.oasis.opendocument.text","odt"); - SUPPORTED_MIME_TYPES.put("application/x-vnd.oasis.opendocument.text","odt"); + SUPPORTED_MIME_TYPES.put("application/vnd.oasis.opendocument.text","odt,ods,odp"); + SUPPORTED_MIME_TYPES.put("application/x-vnd.oasis.opendocument.text","odt,ods,odp"); } public odtParser() { diff --git a/source/de/anomic/document/parser/pdfParser.java b/source/de/anomic/document/parser/pdfParser.java index 78a5a589f..1f3fad031 100644 --- a/source/de/anomic/document/parser/pdfParser.java +++ b/source/de/anomic/document/parser/pdfParser.java @@ -107,7 +107,7 @@ public class pdfParser extends AbstractParser implements Idiom { theDocument.openProtection(new StandardDecryptionMaterial("")); final AccessPermission perm = theDocument.getCurrentAccessPermission(); if (perm == null || !perm.canExtractContent()) - throw new ParserException("Document is encrypted",location, "document is exncrypted"); + throw new ParserException("Document is encrypted", location); } // extracting some metadata diff --git a/source/de/anomic/document/parser/pptParser.java b/source/de/anomic/document/parser/pptParser.java index 3729182ed..84067e1c7 100644 --- a/source/de/anomic/document/parser/pptParser.java +++ b/source/de/anomic/document/parser/pptParser.java @@ -45,7 +45,7 @@ public class pptParser extends AbstractParser implements Idiom { * @see #getSupportedMimeTypes() */ public static final HashMap SUPPORTED_MIME_TYPES = new HashMap(); - static final String ext = "ppt,pps"; + static final String ext = "ppt,pptx,pps"; static { SUPPORTED_MIME_TYPES.put("application/mspowerpoint",ext); SUPPORTED_MIME_TYPES.put("application/powerpoint",ext); diff --git a/source/de/anomic/document/parser/psParser.java b/source/de/anomic/document/parser/psParser.java index be6674ce9..7a1652bf9 100644 --- a/source/de/anomic/document/parser/psParser.java +++ b/source/de/anomic/document/parser/psParser.java @@ -282,7 +282,7 @@ public class psParser extends AbstractParser implements Idiom { if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof ParserException) throw (ParserException) e; - throw new ParserException("Unable to parse the ps file. " + e.getMessage(),location, e); + throw new ParserException("Unable to parse the ps file. " + e.getMessage(), location); } finally { if (tempFile != null) FileUtils.deletedelete(tempFile); } diff --git a/source/de/anomic/document/parser/rtfParser.java b/source/de/anomic/document/parser/rtfParser.java index 30e70894c..f88cfb2f0 100644 --- a/source/de/anomic/document/parser/rtfParser.java +++ b/source/de/anomic/document/parser/rtfParser.java @@ -50,8 +50,6 @@ public class rtfParser extends AbstractParser implements Idiom { SUPPORTED_MIME_TYPES.put("text/rtf","rtf"); SUPPORTED_MIME_TYPES.put("application/x-rtf","rtf"); SUPPORTED_MIME_TYPES.put("text/richtext","rtf"); - SUPPORTED_MIME_TYPES.put("application/msword","rtf"); - SUPPORTED_MIME_TYPES.put("application/doc","rtf"); SUPPORTED_MIME_TYPES.put("application/x-soffice","rtf"); } diff --git a/source/de/anomic/document/parser/sevenzipParser.java b/source/de/anomic/document/parser/sevenzipParser.java index 2d3fa7af4..a4da103fa 100644 --- a/source/de/anomic/document/parser/sevenzipParser.java +++ b/source/de/anomic/document/parser/sevenzipParser.java @@ -72,7 +72,7 @@ public class sevenzipParser extends AbstractParser implements Idiom { try { archive = new Handler(source); } catch (final IOException e) { - throw new ParserException("error opening 7zip archive", location, e); + throw new ParserException("error opening 7zip archive: " + e.getMessage(), location); } checkInterruption(); final SZParserExtractCallback aec = new SZParserExtractCallback(super.theLogger, archive, @@ -87,8 +87,8 @@ public class sevenzipParser extends AbstractParser implements Idiom { if (e.getCause() instanceof ParserException) throw (ParserException)e.getCause(); throw new ParserException( - "error processing 7zip archive at internal file: " + aec.getCurrentFilePath(), - location, e); + "error processing 7zip archive at internal file " + aec.getCurrentFilePath() + ": " + e.getMessage(), + location); } finally { try { archive.close(); } catch (final IOException e) { } } @@ -106,7 +106,7 @@ public class sevenzipParser extends AbstractParser implements Idiom { try { return parse(location, mimeType, charset, new MyRandomAccessFile(sourceFile, "r"), Idiom.MAX_KEEP_IN_MEMORY_SIZE); } catch (final IOException e) { - throw new ParserException("error processing 7zip archive", location, e); + throw new ParserException("error processing 7zip archive: " + e.getMessage(), location); } } @@ -120,7 +120,7 @@ public class sevenzipParser extends AbstractParser implements Idiom { } return parse(location, mimeType, charset, cfos.getContentBAOS()); } catch (final IOException e) { - throw new ParserException("error processing 7zip archive", location, e); + throw new ParserException("error processing 7zip archive: " + e.getMessage(), location); } } diff --git a/source/de/anomic/document/parser/tarParser.java b/source/de/anomic/document/parser/tarParser.java index 64113b8b7..e7a8027ed 100644 --- a/source/de/anomic/document/parser/tarParser.java +++ b/source/de/anomic/document/parser/tarParser.java @@ -64,8 +64,6 @@ public class tarParser extends AbstractParser implements Idiom { SUPPORTED_MIME_TYPES.put("application/tar","tar"); SUPPORTED_MIME_TYPES.put("applicaton/x-gtar","tar"); SUPPORTED_MIME_TYPES.put("multipart/x-tar","tar"); - SUPPORTED_MIME_TYPES.put("application/x-compress","tar"); - SUPPORTED_MIME_TYPES.put("application/x-compressed","tar"); } public tarParser() { diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 2613f4c31..e5bdcee7e 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -1664,7 +1664,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch