From b2263bc720c473854b22c2fccb8c124cff0f7a81 Mon Sep 17 00:00:00 2001 From: orbiter Date: Tue, 14 Jul 2009 11:01:05 +0000 Subject: [PATCH] enhanced document type recognition git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6209 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- defaults/yacy.init | 1 + htroot/ConfigParser.java | 6 +- source/de/anomic/crawler/FTPLoader.java | 16 +- source/de/anomic/crawler/HTTPLoader.java | 10 +- source/de/anomic/crawler/IndexingStack.java | 9 +- source/de/anomic/document/Idiom.java | 15 +- source/de/anomic/document/Parser.java | 151 ++++++++++++------ .../de/anomic/document/parser/bzipParser.java | 29 ++-- .../de/anomic/document/parser/docParser.java | 37 +++-- .../de/anomic/document/parser/gzipParser.java | 32 ++-- .../de/anomic/document/parser/htmlParser.java | 37 +++-- .../de/anomic/document/parser/odtParser.java | 23 ++- .../de/anomic/document/parser/pdfParser.java | 26 +-- .../de/anomic/document/parser/pptParser.java | 35 ++-- .../de/anomic/document/parser/psParser.java | 24 ++- .../de/anomic/document/parser/rpmParser.java | 21 ++- .../de/anomic/document/parser/rssParser.java | 27 ++-- .../de/anomic/document/parser/rtfParser.java | 30 ++-- .../document/parser/sevenzipParser.java | 18 ++- .../de/anomic/document/parser/swfParser.java | 24 +-- .../de/anomic/document/parser/tarParser.java | 22 ++- .../de/anomic/document/parser/vcfParser.java | 28 ++-- .../de/anomic/document/parser/vsdParser.java | 35 ++-- .../de/anomic/document/parser/xlsParser.java | 32 ++-- .../de/anomic/document/parser/zipParser.java | 31 ++-- source/de/anomic/http/httpdProxyHandler.java | 6 +- source/de/anomic/plasma/plasmaHTCache.java | 5 - .../de/anomic/plasma/plasmaSwitchboard.java | 8 +- source/de/anomic/search/SnippetCache.java | 16 +- 29 files changed, 480 insertions(+), 274 deletions(-) diff --git a/defaults/yacy.init b/defaults/yacy.init index 791381c24..5abb2b88f 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -250,6 +250,7 @@ 
minimumGlobalDelta = 500 # the following mime-types are a blacklist for indexing: # parser.mime.deny: specifies mime-types that shall not be indexed parser.mime.deny= +parser.extensions.deny= # Promotion Strings # These strings appear in the Web Mask of the YACY search client diff --git a/htroot/ConfigParser.java b/htroot/ConfigParser.java index 7d1775a5d..c4bfc5002 100644 --- a/htroot/ConfigParser.java +++ b/htroot/ConfigParser.java @@ -49,7 +49,7 @@ public class ConfigParser { post.remove("parserSettings"); for (Idiom parser: Parser.idioms()) { - for (String mimeType: parser.getSupportedMimeTypes().keySet()) { + for (String mimeType: parser.supportedMimeTypes()) { Parser.grantMime(mimeType, post.get("mimename_" + mimeType, "").equals("on")); } } @@ -62,9 +62,9 @@ public class ConfigParser { prop.put("parser_" + i + "_name", parser.getName()); int mimeIdx = 0; - for (String mimeType: parser.getSupportedMimeTypes().keySet()) { + for (String mimeType: parser.supportedMimeTypes()) { prop.put("parser_" + i + "_mime_" + mimeIdx + "_mimetype", mimeType); - prop.put("parser_" + i + "_mime_" + mimeIdx + "_status", (Parser.supportsMime(mimeType)) ? 1 : 0); + prop.put("parser_" + i + "_mime_" + mimeIdx + "_status", (Parser.supportsMime(mimeType) == null) ? 
1 : 0); mimeIdx++; } prop.put("parser_" + i + "_mime", mimeIdx); diff --git a/source/de/anomic/crawler/FTPLoader.java b/source/de/anomic/crawler/FTPLoader.java index f501c247b..e0eec059e 100644 --- a/source/de/anomic/crawler/FTPLoader.java +++ b/source/de/anomic/crawler/FTPLoader.java @@ -224,16 +224,12 @@ public class FTPLoader { // if the mimetype and file extension is supported we start to download // the file httpDocument htCache = null; - if (!Parser.supportsExtension(entryUrl)) { - // if the response has not the right file type then reject file - log.logInfo("REJECTED WRONG EXTENSION TYPE " + mimeType + " for URL " + entry.url().toString()); - sb.crawlQueues.errorURL.newEntry(entry, this.sb.peers.mySeed().hash, new Date(), 1, "wrong extension"); - throw new Exception("response has not the right extension type -> rejected"); - } else if (!Parser.supportsMime(mimeType)) { - // if the response has not the right file type then reject file - log.logInfo("REJECTED WRONG MIME TYPE " + mimeType + " for URL " + entry.url().toString()); - sb.crawlQueues.errorURL.newEntry(entry, this.sb.peers.mySeed().hash, new Date(), 1, "wrong mime type"); - throw new Exception("response has not the right mime type -> rejected"); + String supportError = Parser.supports(entryUrl, mimeType); + if (supportError != null) { + // reject file + log.logInfo("PARSER REJECTED URL " + entry.url().toString() + ": " + supportError); + sb.crawlQueues.errorURL.newEntry(entry, this.sb.peers.mySeed().hash, new Date(), 1, supportError); + throw new Exception(supportError); } else { // abort the download if content is too long final int size = ftpClient.fileSize(path); diff --git a/source/de/anomic/crawler/HTTPLoader.java b/source/de/anomic/crawler/HTTPLoader.java index cb083537e..acdadf150 100644 --- a/source/de/anomic/crawler/HTTPLoader.java +++ b/source/de/anomic/crawler/HTTPLoader.java @@ -120,8 +120,9 @@ public final class HTTPLoader { if (port < 0) port = (ssl) ? 
443 : 80; // if not the right file type then reject file - if (!Parser.supportsExtension(entry.url())) { - sb.crawlQueues.errorURL.newEntry(entry, sb.peers.mySeed().hash, new Date(), 1, "wrong extension"); + String supportError = Parser.supportsExtension(entry.url()); + if (supportError != null) { + sb.crawlQueues.errorURL.newEntry(entry, sb.peers.mySeed().hash, new Date(), 1, supportError); throw new IOException("REJECTED WRONG EXTENSION TYPE " + entry.url().getFileExtension()+ " for URL " + entry.url().toString()); } @@ -166,8 +167,9 @@ public final class HTTPLoader { //try { // if the response has not the right file type then reject file - if (!Parser.supportsMime(res.getResponseHeader().mime())) { - sb.crawlQueues.errorURL.newEntry(entry, sb.peers.mySeed().hash, new Date(), 1, "wrong mime type"); + supportError = Parser.supports(entry.url(), res.getResponseHeader().mime()); + if (supportError != null) { + sb.crawlQueues.errorURL.newEntry(entry, sb.peers.mySeed().hash, new Date(), 1, supportError); throw new IOException("REJECTED WRONG MIME TYPE " + res.getResponseHeader().mime() + " for URL " + entry.url().toString()); } diff --git a/source/de/anomic/crawler/IndexingStack.java b/source/de/anomic/crawler/IndexingStack.java index 47b1aa19b..969dec3cb 100644 --- a/source/de/anomic/crawler/IndexingStack.java +++ b/source/de/anomic/crawler/IndexingStack.java @@ -34,6 +34,7 @@ import java.util.Date; import java.util.Iterator; import java.util.concurrent.ConcurrentHashMap; +import de.anomic.document.Parser; import de.anomic.http.httpHeader; import de.anomic.http.httpResponseHeader; import de.anomic.kelondro.index.Row; @@ -480,8 +481,9 @@ public class IndexingStack { if (plasmaHTCache.isPicture(mimeType)) { return "Media_Content_(Picture)"; } - if (!plasmaHTCache.isText(mimeType)) { - return "Media_Content_(not_text)"; + String parserError = Parser.supportsMime(mimeType); + if (parserError != null) { + return "Media_Content, no parser: " + parserError; } // 
-if-modified-since in request @@ -598,7 +600,8 @@ public class IndexingStack { if (responseHeader != null) { final String mimeType = responseHeader.mime(); if (plasmaHTCache.isPicture(mimeType)) { return "Media_Content_(Picture)"; } - if (!plasmaHTCache.isText(mimeType)) { return "Media_Content_(not_text)"; } + String parserError = Parser.supportsMime(mimeType); + if (parserError != null) { return "Media_Content, parser error: " + parserError; } } if (plasmaHTCache.noIndexingURL(url())) { return "Media_Content_(forbidden)"; } diff --git a/source/de/anomic/document/Idiom.java b/source/de/anomic/document/Idiom.java index 5ab8405ee..de54e34f9 100644 --- a/source/de/anomic/document/Idiom.java +++ b/source/de/anomic/document/Idiom.java @@ -27,8 +27,7 @@ package de.anomic.document; import java.io.File; import java.io.InputStream; -import java.util.HashMap; -import java.util.Hashtable; +import java.util.Set; import de.anomic.yacy.yacyURL; @@ -87,11 +86,15 @@ public interface Idiom { /** * Get the MimeType(s) that are supported by the parser - * @return a {@link Hashtable} containing a mapping from a mime type string - * to a comma-separated String of file extensions - * that are supported by the idiom parser + * @return a set of strings denoting the supported mime types */ - public HashMap getSupportedMimeTypes(); + public Set supportedMimeTypes(); + + /** + * Get the File extension(s) that are supported by the parser + * @return a set of strings denoting the supported file extensions + */ + public Set supportedExtensions(); /** * This function should be called before reusing the parser object. 
diff --git a/source/de/anomic/document/Parser.java b/source/de/anomic/document/Parser.java index 002974d1d..c316f0137 100644 --- a/source/de/anomic/document/Parser.java +++ b/source/de/anomic/document/Parser.java @@ -72,8 +72,9 @@ public final class Parser { } private static final Map mime2parser = new TreeMap(insensitiveCollator); - private static final Map> ext2mime = new TreeMap>(insensitiveCollator); + private static final Map ext2mime = new TreeMap(insensitiveCollator); private static final Set denyMime = new TreeSet(insensitiveCollator); + private static final Set denyExtension = new TreeSet(insensitiveCollator); static { initParser(new bzipParser()); @@ -103,21 +104,20 @@ public final class Parser { } private static void initParser(Idiom parser) { - for (Map.Entry e: parser.getSupportedMimeTypes().entrySet()) { + String prototypeMime = null; + for (String mime: parser.supportedMimeTypes()) { // process the mime types - final String mimeType = normalizeMimeType(e.getKey()); + final String mimeType = normalizeMimeType(mime); + if (prototypeMime == null) prototypeMime = mimeType; Idiom p0 = mime2parser.get(mimeType); if (p0 != null) log.logSevere("parser for mime '" + mimeType + "' was set to '" + p0.getName() + "', overwriting with new parser '" + parser.getName() + "'."); mime2parser.put(mimeType, parser); Log.logInfo("PARSER", "Parser for mime type '" + mimeType + "': " + parser.getName()); - // process the extensions - String[] exts = e.getValue().split(","); - for (String ext: exts) { - Set s = ext2mime.get(ext); - if (s == null) s = new HashSet(); - s.add(mimeType); - ext2mime.put(ext, s); + if (prototypeMime != null) for (String ext: parser.supportedExtensions()) { + String s = ext2mime.get(ext); + if (s != null) log.logSevere("parser for extension '" + ext + "' was set to mime '" + s + "', overwriting with new mime '" + prototypeMime + "'."); + ext2mime.put(ext, prototypeMime); } } } @@ -148,9 +148,12 @@ public final class Parser { } } - public static 
Document parseSource(final yacyURL location, - final String mimeType, final String charset, - final File sourceFile) throws InterruptedException, ParserException { + public static Document parseSource( + final yacyURL location, + final String mimeType, + final String charset, + final File sourceFile + ) throws InterruptedException, ParserException { BufferedInputStream sourceStream = null; try { @@ -174,39 +177,33 @@ public final class Parser { } } - public static Document parseSource(final yacyURL location, - String mimeType, final String charset, - final long contentLength, final InputStream sourceStream) - throws InterruptedException, ParserException { + public static Document parseSource( + final yacyURL location, + String mimeType, + final String charset, + final long contentLength, + final InputStream sourceStream + ) throws InterruptedException, ParserException { try { if (log.isFine()) log.logFine("Parsing '" + location + "' from stream"); mimeType = normalizeMimeType(mimeType); final String fileExt = location.getFileExtension(); final String documentCharset = htmlParser.patchCharsetEncoding(charset); - if (!supportsMime(mimeType)) { - final String errorMsg = "No parser available to parse mimetype '" + mimeType + "'"; - log.logInfo("Unable to parse '" + location + "'. " + errorMsg); - throw new ParserException(errorMsg, location); - } - if (!supportsExtension(location)) { - final String errorMsg = "No parser available to parse extension of url path"; + Idiom parser = idiomParser(location, mimeType); + + if (parser == null) { + final String errorMsg = "No parser available to parse extension '" + location.getFileExtension() + "' or mimetype '" + mimeType + "'"; log.logInfo("Unable to parse '" + location + "'. 
" + errorMsg); throw new ParserException(errorMsg, location); } + if (log.isFine()) log.logInfo("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + fileExt + "'."); - Idiom parser = mime2parser.get(normalizeMimeType(mimeType)); - Document doc = null; - if (parser != null) { - parser.setContentLength(contentLength); - doc = parser.parse(location, mimeType, documentCharset, sourceStream); - } else { - final String errorMsg = "No parser available to parse mimetype '" + mimeType + "' (2)"; - log.logInfo("Unable to parse '" + location + "'. " + errorMsg); - throw new ParserException(errorMsg, location); - } + parser.setContentLength(contentLength); + Document doc = parser.parse(location, mimeType, documentCharset, sourceStream); + if (doc == null) { - final String errorMsg = "Unexpected error. Parser returned null."; - log.logInfo("Unable to parse '" + location + "'. " + errorMsg); + final String errorMsg = "Parsing content with file extension '" + location.getFileExtension() + "' and mimetype '" + mimeType + "' failed: document == null"; + log.logWarning("Unable to parse '" + location + "'. " + errorMsg); throw new ParserException(errorMsg, location); } return doc; @@ -218,16 +215,66 @@ public final class Parser { throw new ParserException(errorMsg, location); } } - - public static boolean supportsMime(String mimeType) { + + /** + * check if the parser supports the given content. + * @param url + * @param mimeType + * @return returns null if the content is supportet. If the content is not supported, return a error string. + */ + public static String supports(final yacyURL url, String mimeType) { + try { + // try to get a parser. If this works, we don't need the parser itself, we just return null to show that everything is ok. + idiomParser(url, mimeType); + return null; + } catch (ParserException e) { + // in case that a parser is not available, return a error string describing the problem. 
+ return e.getMessage(); + } + } + + private static Idiom idiomParser(final yacyURL url, String mimeType) throws ParserException { + // check mime type + if (mimeType != null) { + mimeType = normalizeMimeType(mimeType); + if (denyMime.contains(mimeType)) throw new ParserException("mime type '" + mimeType + "' is denied", url); + } else { + mimeType = normalizeMimeType(mimeType); + } + + Idiom idiom = mime2parser.get(mimeType); + if (idiom != null) return idiom; + + // check extension + String ext = url.getFileExtension(); + if (ext == null || ext.length() == 0) throw new ParserException("no file extension", url); + if (denyExtension.contains(ext)) throw new ParserException("file extension '" + ext + "' is denied", url); + mimeType = ext2mime.get(ext); + if (mimeType == null) throw new ParserException("no parser available", url); + idiom = mime2parser.get(mimeType); + assert idiom != null; + if (idiom == null) throw new ParserException("no parser available (internal error!)", url); + return idiom; + } + + public static String supportsMime(String mimeType) { + if (mimeType == null) return null; mimeType = normalizeMimeType(mimeType); - return !denyMime.contains(mimeType) && mime2parser.containsKey(normalizeMimeType(mimeType)); + if (denyMime.contains(mimeType)) return "mime type '" + mimeType + "' is denied"; + if (mime2parser.get(mimeType) == null) return "no parser for mime '" + mimeType + "' available"; + return null; } - public static boolean supportsExtension(final yacyURL url) { + public static String supportsExtension(final yacyURL url) { String ext = url.getFileExtension(); - if (ext.length() == 0) return true; // may be anything; thats ok if the mime type is ok - return ext2mime.containsKey(ext); + if (ext == null || ext.length() == 0) return null; + if (denyExtension.contains(ext)) return "file extension '" + ext + "' is denied"; + String mimeType = ext2mime.get(ext); + if (mimeType == null) return "no parser available"; + Idiom idiom = 
mime2parser.get(mimeType); + assert idiom != null; + if (idiom == null) return "no parser available (internal error!)"; + return null; } public static String mimeOf(yacyURL url) { @@ -235,9 +282,7 @@ public final class Parser { } public static String mimeOf(String ext) { - Set mimes = ext2mime.get(ext); - if (mimes == null) return null; - return mimes.iterator().next(); + return ext2mime.get(ext); } private static String normalizeMimeType(String mimeType) { @@ -261,4 +306,20 @@ public final class Parser { public static void grantMime(String mime, boolean grant) { if (grant) denyMime.remove(normalizeMimeType(mime)); else denyMime.add(normalizeMimeType(mime)); } + + public static void setDenyExtension(String denyList) { + denyExtension.clear(); + for (String s: denyList.split(",")) denyExtension.add(s); + } + + public static String getDenyExtension() { + String s = ""; + for (String d: denyExtension) s += d + ","; + s = s.substring(0, s.length() - 1); + return s; + } + + public static void grantExtension(String ext, boolean grant) { + if (grant) denyExtension.remove(ext); else denyExtension.add(ext); + } } diff --git a/source/de/anomic/document/parser/bzipParser.java b/source/de/anomic/document/parser/bzipParser.java index 0fec1c9e1..a0b4a3056 100644 --- a/source/de/anomic/document/parser/bzipParser.java +++ b/source/de/anomic/document/parser/bzipParser.java @@ -30,7 +30,9 @@ package de.anomic.document.parser; import java.io.File; import java.io.FileOutputStream; import java.io.InputStream; -import java.util.HashMap; +import java.util.HashSet; +import java.util.Set; + import org.apache.tools.bzip2.CBZip2InputStream; import de.anomic.document.AbstractParser; @@ -47,24 +49,31 @@ public class bzipParser extends AbstractParser implements Idiom { * a list of mime types that are supported by this parser class * @see #getSupportedMimeTypes() */ - public static final HashMap SUPPORTED_MIME_TYPES = new HashMap(); - static final String fileExtensions = "bz2,tbz,tbz2"; - static 
{ - SUPPORTED_MIME_TYPES.put("application/x-bzip2",fileExtensions); - SUPPORTED_MIME_TYPES.put("application/bzip2", fileExtensions); - SUPPORTED_MIME_TYPES.put("application/x-bz2", fileExtensions); - SUPPORTED_MIME_TYPES.put("application/x-bzip",fileExtensions); - SUPPORTED_MIME_TYPES.put("application/x-stuffit",fileExtensions); + public static final Set SUPPORTED_MIME_TYPES = new HashSet(); + public static final Set SUPPORTED_EXTENSIONS = new HashSet(); + static { + SUPPORTED_EXTENSIONS.add("bz2"); + SUPPORTED_EXTENSIONS.add("tbz"); + SUPPORTED_EXTENSIONS.add("tbz2"); + SUPPORTED_MIME_TYPES.add("application/x-bzip2"); + SUPPORTED_MIME_TYPES.add("application/bzip2"); + SUPPORTED_MIME_TYPES.add("application/x-bz2"); + SUPPORTED_MIME_TYPES.add("application/x-bzip"); + SUPPORTED_MIME_TYPES.add("application/x-stuffit"); } public bzipParser() { super("Bzip 2 UNIX Compressed File Parser"); } - public HashMap getSupportedMimeTypes() { + public Set supportedMimeTypes() { return SUPPORTED_MIME_TYPES; } + public Set supportedExtensions() { + return SUPPORTED_EXTENSIONS; + } + public Document parse(final yacyURL location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException { File tempFile = null; diff --git a/source/de/anomic/document/parser/docParser.java b/source/de/anomic/document/parser/docParser.java index fafd07133..423b3197f 100644 --- a/source/de/anomic/document/parser/docParser.java +++ b/source/de/anomic/document/parser/docParser.java @@ -30,7 +30,8 @@ package de.anomic.document.parser; import java.io.IOException; import java.io.InputStream; import java.io.UnsupportedEncodingException; -import java.util.HashMap; +import java.util.HashSet; +import java.util.Set; import de.anomic.document.AbstractParser; import de.anomic.document.Idiom; @@ -45,18 +46,20 @@ public class docParser extends AbstractParser implements Idiom { * a list of mime types that are supported by this parser class * @see 
#getSupportedMimeTypes() */ - public static final HashMap SUPPORTED_MIME_TYPES = new HashMap(); + public static final Set SUPPORTED_MIME_TYPES = new HashSet(); + public static final Set SUPPORTED_EXTENSIONS = new HashSet(); static { - String ext = "doc,docx"; - SUPPORTED_MIME_TYPES.put("application/msword",ext); - SUPPORTED_MIME_TYPES.put("application/doc",ext); - SUPPORTED_MIME_TYPES.put("appl/text",ext); - SUPPORTED_MIME_TYPES.put("application/vnd.msword",ext); - SUPPORTED_MIME_TYPES.put("application/vnd.ms-word",ext); - SUPPORTED_MIME_TYPES.put("application/winword",ext); - SUPPORTED_MIME_TYPES.put("application/word",ext); - SUPPORTED_MIME_TYPES.put("application/x-msw6",ext); - SUPPORTED_MIME_TYPES.put("application/x-msword",ext); + SUPPORTED_EXTENSIONS.add("doc"); + SUPPORTED_EXTENSIONS.add("docx"); + SUPPORTED_MIME_TYPES.add("application/msword"); + SUPPORTED_MIME_TYPES.add("application/doc"); + SUPPORTED_MIME_TYPES.add("appl/text"); + SUPPORTED_MIME_TYPES.add("application/vnd.msword"); + SUPPORTED_MIME_TYPES.add("application/vnd.ms-word"); + SUPPORTED_MIME_TYPES.add("application/winword"); + SUPPORTED_MIME_TYPES.add("application/word"); + SUPPORTED_MIME_TYPES.add("application/x-msw6"); + SUPPORTED_MIME_TYPES.add("application/x-msword"); } public docParser() { @@ -115,9 +118,13 @@ public class docParser extends AbstractParser implements Idiom { return theDoc; } - public HashMap getSupportedMimeTypes() { - return docParser.SUPPORTED_MIME_TYPES; - } + public Set supportedMimeTypes() { + return SUPPORTED_MIME_TYPES; + } + + public Set supportedExtensions() { + return SUPPORTED_EXTENSIONS; + } @Override public void reset() { diff --git a/source/de/anomic/document/parser/gzipParser.java b/source/de/anomic/document/parser/gzipParser.java index d3e7446c7..2963677b2 100644 --- a/source/de/anomic/document/parser/gzipParser.java +++ b/source/de/anomic/document/parser/gzipParser.java @@ -30,7 +30,8 @@ package de.anomic.document.parser; import java.io.File; import 
java.io.FileOutputStream; import java.io.InputStream; -import java.util.HashMap; +import java.util.HashSet; +import java.util.Set; import java.util.zip.GZIPInputStream; import de.anomic.document.AbstractParser; @@ -47,28 +48,31 @@ public class gzipParser extends AbstractParser implements Idiom { * a list of mime types that are supported by this parser class * @see #getSupportedMimeTypes() */ - public static final HashMap SUPPORTED_MIME_TYPES = new HashMap(); - static final String ext = "gz,tgz"; - static { - SUPPORTED_MIME_TYPES.put("application/x-gzip",ext); - SUPPORTED_MIME_TYPES.put("application/gzip",ext); - SUPPORTED_MIME_TYPES.put("application/x-gunzip",ext); - SUPPORTED_MIME_TYPES.put("application/gzipped",ext); - SUPPORTED_MIME_TYPES.put("application/gzip-compressed",ext); - SUPPORTED_MIME_TYPES.put("application/x-compressed",ext); - SUPPORTED_MIME_TYPES.put("application/x-compress",ext); - SUPPORTED_MIME_TYPES.put("gzip/document",ext); - SUPPORTED_MIME_TYPES.put("application/octet-stream",ext); + public static final Set SUPPORTED_MIME_TYPES = new HashSet(); + public static final Set SUPPORTED_EXTENSIONS = new HashSet(); + static { + SUPPORTED_EXTENSIONS.add("gz"); + SUPPORTED_EXTENSIONS.add("tgz"); + SUPPORTED_MIME_TYPES.add("application/x-gzip"); + SUPPORTED_MIME_TYPES.add("application/gzip"); + SUPPORTED_MIME_TYPES.add("application/x-gunzip"); + SUPPORTED_MIME_TYPES.add("application/gzipped"); + SUPPORTED_MIME_TYPES.add("application/gzip-compressed"); + SUPPORTED_MIME_TYPES.add("gzip/document"); } public gzipParser() { super("GNU Zip Compressed Archive Parser"); } - public HashMap getSupportedMimeTypes() { + public Set supportedMimeTypes() { return SUPPORTED_MIME_TYPES; } + public Set supportedExtensions() { + return SUPPORTED_EXTENSIONS; + } + public Document parse(final yacyURL location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException { File tempFile = null; diff --git 
a/source/de/anomic/document/parser/htmlParser.java b/source/de/anomic/document/parser/htmlParser.java index f7e6fadb8..3e686eb93 100644 --- a/source/de/anomic/document/parser/htmlParser.java +++ b/source/de/anomic/document/parser/htmlParser.java @@ -31,7 +31,9 @@ import java.io.InputStream; import java.nio.charset.Charset; import java.nio.charset.IllegalCharsetNameException; import java.nio.charset.UnsupportedCharsetException; -import java.util.HashMap; +import java.util.HashSet; +import java.util.Set; + import de.anomic.document.AbstractParser; import de.anomic.document.Document; import de.anomic.document.Idiom; @@ -48,17 +50,29 @@ public class htmlParser extends AbstractParser implements Idiom { * a list of mime types that are supported by this parser class * @see #getSupportedMimeTypes() */ - public static final HashMap SUPPORTED_MIME_TYPES = new HashMap(); + public static final Set SUPPORTED_MIME_TYPES = new HashSet(); + public static final Set SUPPORTED_EXTENSIONS = new HashSet(); static { - String ext = "htm,html,shtml,xhtml,php,asp,aspx,txt,jsp,csv,pl,py"; - SUPPORTED_MIME_TYPES.put("application/xhtml+xml", ext); - SUPPORTED_MIME_TYPES.put("text/html", ext); - SUPPORTED_MIME_TYPES.put("text/plain", ext); - SUPPORTED_MIME_TYPES.put("text/sgml",ext); + SUPPORTED_EXTENSIONS.add("htm"); + SUPPORTED_EXTENSIONS.add("html"); + SUPPORTED_EXTENSIONS.add("shtml"); + SUPPORTED_EXTENSIONS.add("xhtml"); + SUPPORTED_EXTENSIONS.add("php"); + SUPPORTED_EXTENSIONS.add("asp"); + SUPPORTED_EXTENSIONS.add("aspx"); + SUPPORTED_EXTENSIONS.add("txt"); + SUPPORTED_EXTENSIONS.add("jsp"); + SUPPORTED_EXTENSIONS.add("csv"); + SUPPORTED_EXTENSIONS.add("pl"); + SUPPORTED_EXTENSIONS.add("py"); + SUPPORTED_MIME_TYPES.add("application/xhtml+xml"); + SUPPORTED_MIME_TYPES.add("text/html"); + SUPPORTED_MIME_TYPES.add("text/plain"); + SUPPORTED_MIME_TYPES.add("text/sgml"); } public htmlParser() { - super("streaming html parser"); + super("HTML Parser"); } @Override @@ -213,9 +227,12 @@ public 
class htmlParser extends AbstractParser implements Idiom { return encoding; } - - public HashMap getSupportedMimeTypes() { + public Set supportedMimeTypes() { return SUPPORTED_MIME_TYPES; } + public Set supportedExtensions() { + return SUPPORTED_EXTENSIONS; + } + } diff --git a/source/de/anomic/document/parser/odtParser.java b/source/de/anomic/document/parser/odtParser.java index ab91ead4e..a65244f99 100644 --- a/source/de/anomic/document/parser/odtParser.java +++ b/source/de/anomic/document/parser/odtParser.java @@ -35,7 +35,6 @@ import java.io.OutputStreamWriter; import java.io.Writer; import java.nio.charset.Charset; import java.util.Enumeration; -import java.util.HashMap; import java.util.HashSet; import java.util.Set; import java.util.zip.ZipEntry; @@ -63,20 +62,30 @@ public class odtParser extends AbstractParser implements Idiom { * a list of mime types that are supported by this parser class * @see #getSupportedMimeTypes() */ - public static final HashMap SUPPORTED_MIME_TYPES = new HashMap(); - static { - SUPPORTED_MIME_TYPES.put("application/vnd.oasis.opendocument.text","odt,ods,odp"); - SUPPORTED_MIME_TYPES.put("application/x-vnd.oasis.opendocument.text","odt,ods,odp"); - } + public static final Set SUPPORTED_MIME_TYPES = new HashSet(); + public static final Set SUPPORTED_EXTENSIONS = new HashSet(); + static { + SUPPORTED_EXTENSIONS.add("odt"); + SUPPORTED_EXTENSIONS.add("ods"); + SUPPORTED_EXTENSIONS.add("odp"); + SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.text"); + SUPPORTED_MIME_TYPES.add("application/x-vnd.oasis.opendocument.text"); + SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.presentation"); + SUPPORTED_MIME_TYPES.add("application/vnd.oasis.opendocument.spreadsheet"); + } public odtParser() { super("OASIS OpenDocument V2 Text Document Parser"); } - public HashMap getSupportedMimeTypes() { + public Set supportedMimeTypes() { return SUPPORTED_MIME_TYPES; } + public Set supportedExtensions() { + return 
SUPPORTED_EXTENSIONS; + } + @Override public Document parse(final yacyURL location, final String mimeType, final String charset, final File dest) throws ParserException, InterruptedException { diff --git a/source/de/anomic/document/parser/pdfParser.java b/source/de/anomic/document/parser/pdfParser.java index 1f3fad031..e5dd5b089 100644 --- a/source/de/anomic/document/parser/pdfParser.java +++ b/source/de/anomic/document/parser/pdfParser.java @@ -33,7 +33,9 @@ import java.io.IOException; import java.io.InputStream; import java.io.OutputStreamWriter; import java.io.Writer; -import java.util.HashMap; +import java.util.HashSet; +import java.util.Set; + import org.pdfbox.pdfparser.PDFParser; import org.pdfbox.pdmodel.PDDocument; import org.pdfbox.pdmodel.PDDocumentInformation; @@ -55,24 +57,30 @@ public class pdfParser extends AbstractParser implements Idiom { * a list of mime types that are supported by this parser class * @see #getSupportedMimeTypes() */ - public static final HashMap SUPPORTED_MIME_TYPES = new HashMap(); + public static final Set SUPPORTED_MIME_TYPES = new HashSet(); + public static final Set SUPPORTED_EXTENSIONS = new HashSet(); static { - SUPPORTED_MIME_TYPES.put("application/pdf","pdf"); - SUPPORTED_MIME_TYPES.put("application/x-pdf","pdf"); - SUPPORTED_MIME_TYPES.put("application/acrobat","pdf"); - SUPPORTED_MIME_TYPES.put("applications/vnd.pdf","pdf"); - SUPPORTED_MIME_TYPES.put("text/pdf","pdf"); - SUPPORTED_MIME_TYPES.put("text/x-pdf","pdf"); + SUPPORTED_EXTENSIONS.add("pdf"); + SUPPORTED_MIME_TYPES.add("application/pdf"); + SUPPORTED_MIME_TYPES.add("application/x-pdf"); + SUPPORTED_MIME_TYPES.add("application/acrobat"); + SUPPORTED_MIME_TYPES.add("applications/vnd.pdf"); + SUPPORTED_MIME_TYPES.add("text/pdf"); + SUPPORTED_MIME_TYPES.add("text/x-pdf"); } public pdfParser() { super("Acrobat Portable Document Parser"); } - public HashMap getSupportedMimeTypes() { + public Set supportedMimeTypes() { return SUPPORTED_MIME_TYPES; } + public Set 
supportedExtensions() { + return SUPPORTED_EXTENSIONS; + } + public Document parse(final yacyURL location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException { PDDocument theDocument = null; diff --git a/source/de/anomic/document/parser/pptParser.java b/source/de/anomic/document/parser/pptParser.java index 84067e1c7..b6d004d89 100644 --- a/source/de/anomic/document/parser/pptParser.java +++ b/source/de/anomic/document/parser/pptParser.java @@ -29,7 +29,9 @@ package de.anomic.document.parser; import java.io.BufferedInputStream; import java.io.InputStream; -import java.util.HashMap; +import java.util.HashSet; +import java.util.Set; + import org.apache.poi.hslf.extractor.PowerPointExtractor; import de.anomic.document.AbstractParser; @@ -44,17 +46,20 @@ public class pptParser extends AbstractParser implements Idiom { * a list of mime types that are supported by this parser class * @see #getSupportedMimeTypes() */ - public static final HashMap SUPPORTED_MIME_TYPES = new HashMap(); - static final String ext = "ppt,pptx,pps"; - static { - SUPPORTED_MIME_TYPES.put("application/mspowerpoint",ext); - SUPPORTED_MIME_TYPES.put("application/powerpoint",ext); - SUPPORTED_MIME_TYPES.put("application/vnd.ms-powerpoint",ext); - SUPPORTED_MIME_TYPES.put("application/ms-powerpoint",ext); - SUPPORTED_MIME_TYPES.put("application/mspowerpnt",ext); - SUPPORTED_MIME_TYPES.put("application/vnd-mspowerpoint",ext); - SUPPORTED_MIME_TYPES.put("application/x-powerpoint",ext); - SUPPORTED_MIME_TYPES.put("application/x-m",ext); + public static final Set SUPPORTED_MIME_TYPES = new HashSet(); + public static final Set SUPPORTED_EXTENSIONS = new HashSet(); + static { + SUPPORTED_EXTENSIONS.add("ppt"); + SUPPORTED_EXTENSIONS.add("pptx"); + SUPPORTED_EXTENSIONS.add("pps"); + SUPPORTED_MIME_TYPES.add("application/mspowerpoint"); + SUPPORTED_MIME_TYPES.add("application/powerpoint"); + 
SUPPORTED_MIME_TYPES.add("application/vnd.ms-powerpoint"); + SUPPORTED_MIME_TYPES.add("application/ms-powerpoint"); + SUPPORTED_MIME_TYPES.add("application/mspowerpnt"); + SUPPORTED_MIME_TYPES.add("application/vnd-mspowerpoint"); + SUPPORTED_MIME_TYPES.add("application/x-powerpoint"); + SUPPORTED_MIME_TYPES.add("application/x-m"); } public pptParser(){ @@ -114,9 +119,13 @@ public class pptParser extends AbstractParser implements Idiom { } } - public HashMap getSupportedMimeTypes() { + public Set supportedMimeTypes() { return SUPPORTED_MIME_TYPES; } + + public Set supportedExtensions() { + return SUPPORTED_EXTENSIONS; + } @Override public void reset(){ diff --git a/source/de/anomic/document/parser/psParser.java b/source/de/anomic/document/parser/psParser.java index 7a1652bf9..c6c8fdf09 100644 --- a/source/de/anomic/document/parser/psParser.java +++ b/source/de/anomic/document/parser/psParser.java @@ -34,7 +34,9 @@ import java.io.FileReader; import java.io.FileWriter; import java.io.InputStream; import java.io.InputStreamReader; -import java.util.HashMap; +import java.util.HashSet; +import java.util.Set; + import de.anomic.document.AbstractParser; import de.anomic.document.Idiom; import de.anomic.document.ParserException; @@ -48,12 +50,14 @@ public class psParser extends AbstractParser implements Idiom { * a list of mime types that are supported by this parser class * @see #getSupportedMimeTypes() */ - public static final HashMap SUPPORTED_MIME_TYPES = new HashMap(); - static { - SUPPORTED_MIME_TYPES.put("application/ps","ps"); - SUPPORTED_MIME_TYPES.put("application/x-postscript","ps"); - SUPPORTED_MIME_TYPES.put("application/x-ps","ps"); - SUPPORTED_MIME_TYPES.put("application/x-postscript-not-eps","ps"); + public static final Set SUPPORTED_MIME_TYPES = new HashSet(); + public static final Set SUPPORTED_EXTENSIONS = new HashSet(); + static { + SUPPORTED_EXTENSIONS.add("ps"); + SUPPORTED_MIME_TYPES.add("application/ps"); + 
SUPPORTED_MIME_TYPES.add("application/x-postscript"); + SUPPORTED_MIME_TYPES.add("application/x-ps"); + SUPPORTED_MIME_TYPES.add("application/x-postscript-not-eps"); } private final static Object modeScan = new Object(); @@ -69,10 +73,14 @@ public class psParser extends AbstractParser implements Idiom { } } - public HashMap getSupportedMimeTypes() { + public Set supportedMimeTypes() { return SUPPORTED_MIME_TYPES; } + public Set supportedExtensions() { + return SUPPORTED_EXTENSIONS; + } + public boolean testForPs2Ascii() { try { String procOutputLine = null; diff --git a/source/de/anomic/document/parser/rpmParser.java b/source/de/anomic/document/parser/rpmParser.java index 452bc1572..8acec6e59 100644 --- a/source/de/anomic/document/parser/rpmParser.java +++ b/source/de/anomic/document/parser/rpmParser.java @@ -31,6 +31,9 @@ import java.io.ByteArrayInputStream; import java.io.File; import java.io.InputStream; import java.util.HashMap; +import java.util.HashSet; +import java.util.Set; + import com.jguild.jrpm.io.RPMFile; import com.jguild.jrpm.io.datatype.DataTypeIf; @@ -55,21 +58,27 @@ public class rpmParser extends AbstractParser implements Idiom { * a list of mime types that are supported by this parser class * @see #getSupportedMimeTypes() */ - public static final HashMap SUPPORTED_MIME_TYPES = new HashMap(); - static { - SUPPORTED_MIME_TYPES.put("application/x-rpm","rpm"); - SUPPORTED_MIME_TYPES.put("application/x-redhat packet manager","rpm"); - SUPPORTED_MIME_TYPES.put("application/x-redhat-package-manager","rpm"); + public static final Set SUPPORTED_MIME_TYPES = new HashSet(); + public static final Set SUPPORTED_EXTENSIONS = new HashSet(); + static { + SUPPORTED_EXTENSIONS.add("rpm"); + SUPPORTED_MIME_TYPES.add("application/x-rpm"); + SUPPORTED_MIME_TYPES.add("application/x-redhat packet manager"); + SUPPORTED_MIME_TYPES.add("application/x-redhat-package-manager"); } public rpmParser() { super("rpm Parser"); } - public HashMap getSupportedMimeTypes() { + 
public Set supportedMimeTypes() { return SUPPORTED_MIME_TYPES; } + public Set supportedExtensions() { + return SUPPORTED_EXTENSIONS; + } + public Document parse(final yacyURL location, final String mimeType, final String charset, final InputStream source) throws ParserException { File dstFile = null; diff --git a/source/de/anomic/document/parser/rssParser.java b/source/de/anomic/document/parser/rssParser.java index d893b6ca8..834d16879 100644 --- a/source/de/anomic/document/parser/rssParser.java +++ b/source/de/anomic/document/parser/rssParser.java @@ -33,8 +33,10 @@ import java.io.InputStream; import java.io.Writer; import java.nio.charset.Charset; import java.util.HashMap; +import java.util.HashSet; import java.util.LinkedList; import java.util.Map; +import java.util.Set; import de.anomic.content.RSSMessage; import de.anomic.document.AbstractParser; @@ -58,13 +60,16 @@ public class rssParser extends AbstractParser implements Idiom { * a list of mime types that are supported by this parser class * @see #getSupportedMimeTypes() */ - public static final HashMap SUPPORTED_MIME_TYPES = new HashMap(); - static final String fileExtensions = "xml,rss,rdf"; + public static final Set SUPPORTED_MIME_TYPES = new HashSet(); + public static final Set SUPPORTED_EXTENSIONS = new HashSet(); static { - SUPPORTED_MIME_TYPES.put("text/rss",fileExtensions); - SUPPORTED_MIME_TYPES.put("application/rdf+xml",fileExtensions); - SUPPORTED_MIME_TYPES.put("application/rss+xml",fileExtensions); - SUPPORTED_MIME_TYPES.put("application/atom+xml",fileExtensions); + SUPPORTED_EXTENSIONS.add("xml"); + SUPPORTED_EXTENSIONS.add("rss"); + SUPPORTED_EXTENSIONS.add("rdf"); + SUPPORTED_MIME_TYPES.add("text/rss"); + SUPPORTED_MIME_TYPES.add("application/rdf+xml"); + SUPPORTED_MIME_TYPES.add("application/rss+xml"); + SUPPORTED_MIME_TYPES.add("application/atom+xml"); } public rssParser() { @@ -174,9 +179,13 @@ public class rssParser extends AbstractParser implements Idiom { } } - public HashMap 
getSupportedMimeTypes() { - return SUPPORTED_MIME_TYPES; - } + public Set supportedMimeTypes() { + return SUPPORTED_MIME_TYPES; + } + + public Set supportedExtensions() { + return SUPPORTED_EXTENSIONS; + } @Override public void reset() { diff --git a/source/de/anomic/document/parser/rtfParser.java b/source/de/anomic/document/parser/rtfParser.java index f88cfb2f0..f795904c2 100644 --- a/source/de/anomic/document/parser/rtfParser.java +++ b/source/de/anomic/document/parser/rtfParser.java @@ -28,7 +28,9 @@ package de.anomic.document.parser; import java.io.InputStream; -import java.util.HashMap; +import java.util.HashSet; +import java.util.Set; + import javax.swing.text.DefaultStyledDocument; import javax.swing.text.rtf.RTFEditorKit; @@ -44,13 +46,15 @@ public class rtfParser extends AbstractParser implements Idiom { * a list of mime types that are supported by this parser class * @see #getSupportedMimeTypes() */ - public static final HashMap SUPPORTED_MIME_TYPES = new HashMap(); - static { - SUPPORTED_MIME_TYPES.put("application/rtf","rtf"); - SUPPORTED_MIME_TYPES.put("text/rtf","rtf"); - SUPPORTED_MIME_TYPES.put("application/x-rtf","rtf"); - SUPPORTED_MIME_TYPES.put("text/richtext","rtf"); - SUPPORTED_MIME_TYPES.put("application/x-soffice","rtf"); + public static final Set SUPPORTED_MIME_TYPES = new HashSet(); + public static final Set SUPPORTED_EXTENSIONS = new HashSet(); + static { + SUPPORTED_EXTENSIONS.add("rtf"); + SUPPORTED_MIME_TYPES.add("application/rtf"); + SUPPORTED_MIME_TYPES.add("text/rtf"); + SUPPORTED_MIME_TYPES.add("application/x-rtf"); + SUPPORTED_MIME_TYPES.add("text/richtext"); + SUPPORTED_MIME_TYPES.add("application/x-soffice"); } public rtfParser() { @@ -96,9 +100,13 @@ public class rtfParser extends AbstractParser implements Idiom { } } - public HashMap getSupportedMimeTypes() { - return rtfParser.SUPPORTED_MIME_TYPES; - } + public Set supportedMimeTypes() { + return SUPPORTED_MIME_TYPES; + } + + public Set supportedExtensions() { + return 
SUPPORTED_EXTENSIONS; + } public void reset() { // Nothing todo here at the moment diff --git a/source/de/anomic/document/parser/sevenzipParser.java b/source/de/anomic/document/parser/sevenzipParser.java index a4da103fa..2d4d2fd05 100644 --- a/source/de/anomic/document/parser/sevenzipParser.java +++ b/source/de/anomic/document/parser/sevenzipParser.java @@ -32,7 +32,9 @@ import java.io.File; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; -import java.util.HashMap; +import java.util.HashSet; +import java.util.Set; + import SevenZip.ArchiveExtractCallback; import SevenZip.IInStream; import SevenZip.MyRandomAccessFile; @@ -55,9 +57,11 @@ public class sevenzipParser extends AbstractParser implements Idiom { * a list of mime types that are supported by this parser class * @see #getSupportedMimeTypes() */ - public static final HashMap SUPPORTED_MIME_TYPES = new HashMap(); - static { - SUPPORTED_MIME_TYPES.put("application/x-7z-compressed", "7z"); + public static final Set SUPPORTED_MIME_TYPES = new HashSet(); + public static final Set SUPPORTED_EXTENSIONS = new HashSet(); + static { + SUPPORTED_EXTENSIONS.add("7z"); + SUPPORTED_MIME_TYPES.add("application/x-7z-compressed"); } public sevenzipParser() { @@ -124,10 +128,14 @@ public class sevenzipParser extends AbstractParser implements Idiom { } } - public HashMap getSupportedMimeTypes() { + public Set supportedMimeTypes() { return SUPPORTED_MIME_TYPES; } + public Set supportedExtensions() { + return SUPPORTED_EXTENSIONS; + } + // wrapper class to redirect output of standard ArchiveExtractCallback to serverLog // and parse the extracted content diff --git a/source/de/anomic/document/parser/swfParser.java b/source/de/anomic/document/parser/swfParser.java index 35caacab7..a2a60cc88 100644 --- a/source/de/anomic/document/parser/swfParser.java +++ b/source/de/anomic/document/parser/swfParser.java @@ -29,6 +29,9 @@ package de.anomic.document.parser; import java.io.InputStream; import 
java.util.HashMap; +import java.util.HashSet; +import java.util.Set; + import pt.tumba.parser.swf.SWF2HTML; import de.anomic.document.AbstractParser; import de.anomic.document.Idiom; @@ -42,24 +45,27 @@ public class swfParser extends AbstractParser implements Idiom { * a list of mime types that are supported by this parser class * @see #getSupportedMimeTypes() */ - public static final HashMap SUPPORTED_MIME_TYPES = new HashMap(); + public static final Set SUPPORTED_MIME_TYPES = new HashSet(); + public static final Set SUPPORTED_EXTENSIONS = new HashSet(); static { - SUPPORTED_MIME_TYPES.put("application/x-shockwave-flash","swf"); - SUPPORTED_MIME_TYPES.put("application/x-shockwave-flash2-preview","swf"); - SUPPORTED_MIME_TYPES.put("application/futuresplash","swf"); - SUPPORTED_MIME_TYPES.put("image/vnd.rn-realflash","swf"); + SUPPORTED_EXTENSIONS.add("swf"); + SUPPORTED_MIME_TYPES.add("application/x-shockwave-flash"); + SUPPORTED_MIME_TYPES.add("application/x-shockwave-flash2-preview"); + SUPPORTED_MIME_TYPES.add("application/futuresplash"); + SUPPORTED_MIME_TYPES.add("image/vnd.rn-realflash"); } public swfParser() { super("Adobe Flash Parser"); } - /** - * returns a hashtable containing the mimetypes that are supported by this class - */ - public HashMap getSupportedMimeTypes() { + public Set supportedMimeTypes() { return SUPPORTED_MIME_TYPES; } + + public Set supportedExtensions() { + return SUPPORTED_EXTENSIONS; + } /* * parses the source documents and returns a plasmaParserDocument containing diff --git a/source/de/anomic/document/parser/tarParser.java b/source/de/anomic/document/parser/tarParser.java index e7a8027ed..2b8ed3b34 100644 --- a/source/de/anomic/document/parser/tarParser.java +++ b/source/de/anomic/document/parser/tarParser.java @@ -34,8 +34,10 @@ import java.io.InputStream; import java.io.OutputStream; import java.util.Arrays; import java.util.HashMap; +import java.util.HashSet; import java.util.LinkedList; import java.util.Map; +import 
java.util.Set; import java.util.zip.GZIPInputStream; import com.ice.tar.TarEntry; @@ -58,22 +60,28 @@ public class tarParser extends AbstractParser implements Idiom { * a list of mime types that are supported by this parser class * @see #getSupportedMimeTypes() */ - public static final HashMap SUPPORTED_MIME_TYPES = new HashMap(); - static { - SUPPORTED_MIME_TYPES.put("application/x-tar","tar"); - SUPPORTED_MIME_TYPES.put("application/tar","tar"); - SUPPORTED_MIME_TYPES.put("applicaton/x-gtar","tar"); - SUPPORTED_MIME_TYPES.put("multipart/x-tar","tar"); + public static final Set SUPPORTED_MIME_TYPES = new HashSet(); + public static final Set SUPPORTED_EXTENSIONS = new HashSet(); + static { + SUPPORTED_EXTENSIONS.add("tar"); + SUPPORTED_MIME_TYPES.add("application/x-tar"); + SUPPORTED_MIME_TYPES.add("application/tar"); + SUPPORTED_MIME_TYPES.add("application/x-gtar"); + SUPPORTED_MIME_TYPES.add("multipart/x-tar"); } public tarParser() { super("Tape Archive File Parser"); } - public HashMap getSupportedMimeTypes() { + public Set supportedMimeTypes() { return SUPPORTED_MIME_TYPES; } + public Set supportedExtensions() { + return SUPPORTED_EXTENSIONS; + } + public Document parse(final yacyURL location, final String mimeType, final String charset, InputStream source) throws ParserException, InterruptedException { long docTextLength = 0; diff --git a/source/de/anomic/document/parser/vcfParser.java b/source/de/anomic/document/parser/vcfParser.java index f2ad16267..5967e0bbb 100644 --- a/source/de/anomic/document/parser/vcfParser.java +++ b/source/de/anomic/document/parser/vcfParser.java @@ -33,8 +33,10 @@ import java.io.InputStream; import java.io.InputStreamReader; import java.net.MalformedURLException; import java.util.HashMap; +import java.util.HashSet; import java.util.Iterator; import java.util.LinkedList; +import java.util.Set; import de.anomic.crawler.HTTPLoader; import de.anomic.document.AbstractParser; @@ -60,25 +62,31 @@ public class vcfParser extends 
AbstractParser implements Idiom { * * TODO: support of x-mozilla-cpt and x-mozilla-html tags */ - public static final HashMap SUPPORTED_MIME_TYPES = new HashMap(); - static { - SUPPORTED_MIME_TYPES.put("text/x-vcard","vcf"); - SUPPORTED_MIME_TYPES.put("application/vcard","vcf"); - SUPPORTED_MIME_TYPES.put("text/anytext","vcf"); - SUPPORTED_MIME_TYPES.put("text/directory","vcf"); - SUPPORTED_MIME_TYPES.put("application/x-versit","vcf"); - SUPPORTED_MIME_TYPES.put("text/x-versit","vcf"); - SUPPORTED_MIME_TYPES.put("text/x-vcalendar","vcf"); + public static final Set SUPPORTED_MIME_TYPES = new HashSet(); + public static final Set SUPPORTED_EXTENSIONS = new HashSet(); + static { + SUPPORTED_EXTENSIONS.add("vcf"); + SUPPORTED_MIME_TYPES.add("text/x-vcard"); + SUPPORTED_MIME_TYPES.add("application/vcard"); + SUPPORTED_MIME_TYPES.add("text/anytext"); + SUPPORTED_MIME_TYPES.add("text/directory"); + SUPPORTED_MIME_TYPES.add("application/x-versit"); + SUPPORTED_MIME_TYPES.add("text/x-versit"); + SUPPORTED_MIME_TYPES.add("text/x-vcalendar"); } public vcfParser() { super("vCard Parser"); } - public HashMap getSupportedMimeTypes() { + public Set supportedMimeTypes() { return SUPPORTED_MIME_TYPES; } + public Set supportedExtensions() { + return SUPPORTED_EXTENSIONS; + } + public Document parse(final yacyURL url, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException { try { diff --git a/source/de/anomic/document/parser/vsdParser.java b/source/de/anomic/document/parser/vsdParser.java index 0bea160cc..e50604532 100644 --- a/source/de/anomic/document/parser/vsdParser.java +++ b/source/de/anomic/document/parser/vsdParser.java @@ -28,7 +28,9 @@ package de.anomic.document.parser; import java.io.InputStream; -import java.util.HashMap; +import java.util.HashSet; +import java.util.Set; + import de.anomic.document.AbstractParser; import de.anomic.document.Idiom; import de.anomic.document.ParserException; @@ -43,29 +45,32 @@ 
public class vsdParser extends AbstractParser implements Idiom { * a list of mime types that are supported by this parser class * @see #getSupportedMimeTypes() */ - public static final HashMap SUPPORTED_MIME_TYPES = new HashMap(); + public static final Set SUPPORTED_MIME_TYPES = new HashSet(); + public static final Set SUPPORTED_EXTENSIONS = new HashSet(); static { - SUPPORTED_MIME_TYPES.put("application/visio","vsd"); - SUPPORTED_MIME_TYPES.put("application/x-visio","vsd"); - SUPPORTED_MIME_TYPES.put("application/vnd.visio","vsd"); - SUPPORTED_MIME_TYPES.put("application/visio.drawing","vsd"); - SUPPORTED_MIME_TYPES.put("application/vsd","vsd"); - SUPPORTED_MIME_TYPES.put("application/x-vsd","vsd"); - SUPPORTED_MIME_TYPES.put("image/x-vsd","vsd"); - SUPPORTED_MIME_TYPES.put("zz-application/zz-winassoc-vsd","vsd"); + SUPPORTED_EXTENSIONS.add("vsd"); + SUPPORTED_MIME_TYPES.add("application/visio"); + SUPPORTED_MIME_TYPES.add("application/x-visio"); + SUPPORTED_MIME_TYPES.add("application/vnd.visio"); + SUPPORTED_MIME_TYPES.add("application/visio.drawing"); + SUPPORTED_MIME_TYPES.add("application/vsd"); + SUPPORTED_MIME_TYPES.add("application/x-vsd"); + SUPPORTED_MIME_TYPES.add("image/x-vsd"); + SUPPORTED_MIME_TYPES.add("zz-application/zz-winassoc-vsd"); } public vsdParser() { super("Microsoft Visio Parser"); } - /** - * returns a hashtable containing the mimetypes that are supported by this class - */ - public HashMap getSupportedMimeTypes() { + public Set supportedMimeTypes() { return SUPPORTED_MIME_TYPES; } - + + public Set supportedExtensions() { + return SUPPORTED_EXTENSIONS; + } + /* * parses the source documents and returns a plasmaParserDocument containing * all extracted information about the parsed document diff --git a/source/de/anomic/document/parser/xlsParser.java b/source/de/anomic/document/parser/xlsParser.java index 0330677e8..2cdb32ed1 100644 --- a/source/de/anomic/document/parser/xlsParser.java +++ b/source/de/anomic/document/parser/xlsParser.java 
@@ -28,7 +28,9 @@ package de.anomic.document.parser; import java.io.InputStream; -import java.util.HashMap; +import java.util.HashSet; +import java.util.Set; + import org.apache.poi.hssf.eventusermodel.HSSFEventFactory; import org.apache.poi.hssf.eventusermodel.HSSFListener; import org.apache.poi.hssf.eventusermodel.HSSFRequest; @@ -56,17 +58,19 @@ public class xlsParser extends AbstractParser implements Idiom, HSSFListener { * a list of mime types that are supported by this parser class * @see #getSupportedMimeTypes() */ - public static final HashMap SUPPORTED_MIME_TYPES = new HashMap(); + public static final Set SUPPORTED_MIME_TYPES = new HashSet(); + public static final Set SUPPORTED_EXTENSIONS = new HashSet(); static { - String ext = "xls,xlsx"; - SUPPORTED_MIME_TYPES.put("application/msexcel",ext); - SUPPORTED_MIME_TYPES.put("application/excel",ext); - SUPPORTED_MIME_TYPES.put("application/vnd.ms-excel",ext); - SUPPORTED_MIME_TYPES.put("application/x-excel",ext); - SUPPORTED_MIME_TYPES.put("application/x-msexcel",ext); - SUPPORTED_MIME_TYPES.put("application/x-ms-excel",ext); - SUPPORTED_MIME_TYPES.put("application/x-dos_ms_excel",ext); - SUPPORTED_MIME_TYPES.put("application/xls",ext); + SUPPORTED_EXTENSIONS.add("xls"); + SUPPORTED_EXTENSIONS.add("xlsx"); + SUPPORTED_MIME_TYPES.add("application/msexcel"); + SUPPORTED_MIME_TYPES.add("application/excel"); + SUPPORTED_MIME_TYPES.add("application/vnd.ms-excel"); + SUPPORTED_MIME_TYPES.add("application/x-excel"); + SUPPORTED_MIME_TYPES.add("application/x-msexcel"); + SUPPORTED_MIME_TYPES.add("application/x-ms-excel"); + SUPPORTED_MIME_TYPES.add("application/x-dos_ms_excel"); + SUPPORTED_MIME_TYPES.add("application/xls"); } public xlsParser(){ @@ -134,9 +138,13 @@ public class xlsParser extends AbstractParser implements Idiom, HSSFListener { } } - public HashMap getSupportedMimeTypes() { + public Set supportedMimeTypes() { return SUPPORTED_MIME_TYPES; } + + public Set supportedExtensions() { + return 
SUPPORTED_EXTENSIONS; + } @Override public void reset(){ diff --git a/source/de/anomic/document/parser/zipParser.java b/source/de/anomic/document/parser/zipParser.java index 29a2ac431..0a874e7e6 100644 --- a/source/de/anomic/document/parser/zipParser.java +++ b/source/de/anomic/document/parser/zipParser.java @@ -34,8 +34,10 @@ import java.io.InputStream; import java.io.OutputStream; import java.util.Arrays; import java.util.HashMap; +import java.util.HashSet; import java.util.LinkedList; import java.util.Map; +import java.util.Set; import java.util.zip.ZipEntry; import java.util.zip.ZipInputStream; @@ -56,26 +58,31 @@ public class zipParser extends AbstractParser implements Idiom { * a list of mime types that are supported by this parser class * @see #getSupportedMimeTypes() */ - public static final HashMap SUPPORTED_MIME_TYPES = new HashMap(); - static { - SUPPORTED_MIME_TYPES.put("application/zip","zip"); - SUPPORTED_MIME_TYPES.put("application/x-zip","zip"); - SUPPORTED_MIME_TYPES.put("application/x-zip-compressed","zip"); - SUPPORTED_MIME_TYPES.put("application/octet-stream","zip"); - SUPPORTED_MIME_TYPES.put("application/x-compress","zip"); - SUPPORTED_MIME_TYPES.put("application/x-compressed","zip"); - SUPPORTED_MIME_TYPES.put("multipart/x-zip","zip"); - SUPPORTED_MIME_TYPES.put("application/java-archive","jar"); + public static final Set SUPPORTED_MIME_TYPES = new HashSet(); + public static final Set SUPPORTED_EXTENSIONS = new HashSet(); + static { + SUPPORTED_EXTENSIONS.add("zip"); + SUPPORTED_MIME_TYPES.add("application/zip"); + SUPPORTED_MIME_TYPES.add("application/x-zip"); + SUPPORTED_MIME_TYPES.add("application/x-zip-compressed"); + SUPPORTED_MIME_TYPES.add("application/x-compress"); + SUPPORTED_MIME_TYPES.add("application/x-compressed"); + SUPPORTED_MIME_TYPES.add("multipart/x-zip"); + SUPPORTED_MIME_TYPES.add("application/java-archive"); } public zipParser() { - super("Compressed Archive File Parser"); + super("ZIP File Parser"); } - public HashMap 
getSupportedMimeTypes() { + public Set supportedMimeTypes() { return SUPPORTED_MIME_TYPES; } + public Set supportedExtensions() { + return SUPPORTED_EXTENSIONS; + } + public Document parse(final yacyURL location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException { long docTextLength = 0; diff --git a/source/de/anomic/http/httpdProxyHandler.java b/source/de/anomic/http/httpdProxyHandler.java index e590182c2..33c793693 100644 --- a/source/de/anomic/http/httpdProxyHandler.java +++ b/source/de/anomic/http/httpdProxyHandler.java @@ -528,7 +528,7 @@ public final class httpdProxyHandler { final String storeError = cacheEntry.shallStoreCacheForProxy(); final boolean storeHTCache = cacheEntry.profile().storeHTCache(); - final boolean isSupportedContent = Parser.supportsExtension(cacheEntry.url()) && Parser.supportsMime(cacheEntry.getMimeType()); + final String supportError = Parser.supports(cacheEntry.url(), cacheEntry.getMimeType()); if ( /* * Now we store the response into the htcache directory if @@ -539,7 +539,7 @@ public final class httpdProxyHandler { * b) the user has configured to use the htcache OR * c) the content should be indexed */ - ((storeHTCache) || (isSupportedContent)) + ((storeHTCache) || (supportError == null /* null = no support error, i.e. a parser accepts this content */)) ) { // we don't write actually into a file, only to RAM, and schedule writing the file. int l = res.getResponseHeader().size(); @@ -580,7 +580,7 @@ public final class httpdProxyHandler { if (theLogger.isFine()) theLogger.logFine(reqID +" "+ url.toString() + " not cached." 
+ " StoreError=" + ((storeError==null)?"None":storeError) + " StoreHTCache=" + storeHTCache + - " SupportetContent=" + isSupportedContent); + " SupportError=" + supportError); FileUtils.copy(res.getDataAsStream(), outStream); diff --git a/source/de/anomic/plasma/plasmaHTCache.java b/source/de/anomic/plasma/plasmaHTCache.java index 66c8bfd6f..8da80776c 100644 --- a/source/de/anomic/plasma/plasmaHTCache.java +++ b/source/de/anomic/plasma/plasmaHTCache.java @@ -42,7 +42,6 @@ import java.util.HashMap; import java.util.Map; import de.anomic.document.Classification; -import de.anomic.document.Parser; import de.anomic.http.httpResponseHeader; import de.anomic.http.httpDocument; import de.anomic.kelondro.blob.ArrayStack; @@ -181,10 +180,6 @@ public final class plasmaHTCache { return mimeType.toUpperCase().startsWith("IMAGE"); } - public static boolean isText(final String mimeType) { - return Parser.supportsMime(mimeType); - } - public static boolean noIndexingURL(final yacyURL url) { if (url == null) return false; String urlString = url.toString().toLowerCase(); diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index d0d8ed50d..37fb4186c 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -1087,8 +1087,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch