diff --git a/source/net/yacy/document/TextParser.java b/source/net/yacy/document/TextParser.java index a5d980794..66f679f69 100644 --- a/source/net/yacy/document/TextParser.java +++ b/source/net/yacy/document/TextParser.java @@ -375,9 +375,10 @@ public final class TextParser { * because mime types returned by web severs are sometimes wrong, we also compute the mime type again * from the extension that can be extracted from the url path. That means that there are 3 criteria * that can be used to select a parser: - * - the given extension - * - the given mime type - * - the mime type computed from the extension + * - the given mime type (1.) + * - the extension of url (2.) + * - the mime type computed from the extension (3.) + * finally the generic parser is added as backup if all above fail * @param url the given url * @param mimeType the given mime type * @return a list of Idiom parsers that may be appropriate for the given criteria @@ -386,26 +387,30 @@ public final class TextParser { private static Set<Parser> parsers(final MultiProtocolURL url, String mimeType1) throws Parser.Failure { final Set<Parser> idioms = new LinkedHashSet<Parser>(2); // LinkedSet to maintain order (genericParser should be last) - // check extension - String ext = MultiProtocolURL.getFileExtension(url.getFileName()); + // check given mime type, place this first because this is the most likely to work and the best fit to the supplied mime Set<Parser> idiom; - if (ext != null && ext.length() > 0) { - if (denyExtensionx.containsKey(ext)) throw new Parser.Failure("file extension '" + ext + "' is denied (1)", url); - idiom = ext2parser.get(ext); - if (idiom != null) idioms.addAll(idiom); - } - - // check given mime type if (mimeType1 != null) { mimeType1 = normalizeMimeType(mimeType1); if (denyMime.containsKey(mimeType1)) throw new Parser.Failure("mime type '" + mimeType1 + "' is denied (1)", url); idiom = mime2parser.get(mimeType1); - if (idiom != null && !idioms.contains(idiom)) 
idioms.addAll(idiom); + if (idiom != null) idioms.addAll(idiom); + } + + // check extension and add as backup (in case no, wrong or unknown/unsupported mime was supplied) + String ext = MultiProtocolURL.getFileExtension(url.getFileName()); + if (ext != null && ext.length() > 0) { + if (denyExtensionx.containsKey(ext)) throw new Parser.Failure("file extension '" + ext + "' is denied (1)", url); + idiom = ext2parser.get(ext); + if (idiom != null && !idioms.containsAll(idiom)) { // use containsAll -> idiom is a Set of parser + idioms.addAll(idiom); + } } // check mime type computed from extension final String mimeType2 = ext2mime.get(ext); - if (mimeType2 != null && (idiom = mime2parser.get(mimeType2)) != null && !idioms.contains(idiom)) idioms.addAll(idiom); + if (mimeType2 != null && (idiom = mime2parser.get(mimeType2)) != null && !idioms.containsAll(idiom)) { // use containsAll -> idiom is a Set of parser + idioms.addAll(idiom); + } // always add the generic parser (make sure it is the last in access order) idioms.add(genericIdiom);