added iso,apk,dmg to extension-deny list

See also https://github.com/yacy/yacy_search_server/issues/510 - zip is not on the list because it can be parsed.
2 years ago · d49f937b98
parent 761dbdf06d
commit d49f937b98
2 changed files with 239 additions and 239 deletions
--- a/defaults/yacy.init
+++ b/defaults/yacy.init
@ -328,7 +328,7 @@ releases = DATA/RELEASE
 # the following mime-types are a blacklist for indexing:
 # parser.mime.deny: specifies mime-types that shall not be indexed
 parser.mime.deny=
-parser.extensions.deny=
+parser.extensions.deny=iso,apk,dmg
 # The audioTagParser is disabled by default as it needs to create a temporary file each time an audio resource is parsed
 # Audio file extensions and media types can be enabled in the ConfigParser_p.html page if this is not a problem with your install 
 parser.enableAudioTags=false
--- a/source/net/yacy/document/TextParser.java
+++ b/source/net/yacy/document/TextParser.java
@ -85,11 +85,11 @@ public final class TextParser {
    private static final Parser genericXMLIdiom = new GenericXMLParser();

    //use LinkedHashSet for parser collection to use (init) order to prefered parser for same ext or mime
-    private static final Map<String, LinkedHashSet<Parser>> mime2parser = new ConcurrentHashMap<String, LinkedHashSet<Parser>>();
-    private static final ConcurrentHashMap<String, LinkedHashSet<Parser>> ext2parser = new ConcurrentHashMap<String, LinkedHashSet<Parser>>();
-    private static final Map<String, String> ext2mime = new ConcurrentHashMap<String, String>();
-    private static final Map<String, Object> denyMime = new ConcurrentHashMap<String, Object>();
-    private static final Map<String, Object> denyExtensionx = new ConcurrentHashMap<String, Object>();
+    private static final Map<String, LinkedHashSet<Parser>> mime2parser = new ConcurrentHashMap<>();
+    private static final ConcurrentHashMap<String, LinkedHashSet<Parser>> ext2parser = new ConcurrentHashMap<>();
+    private static final Map<String, String> ext2mime = new ConcurrentHashMap<>();
+    private static final Map<String, Object> denyMime = new ConcurrentHashMap<>();
+    private static final Map<String, Object> denyExtensionx = new ConcurrentHashMap<>();

    static {
        initParser(new apkParser());
@ -130,9 +130,9 @@ public final class TextParser {
    }

    public static Set<Parser> parsers() {
-        final Set<Parser> c = new HashSet<Parser>();
-        for (Set<Parser> pl: ext2parser.values()) c.addAll(pl);
-        for (Set<Parser> pl: mime2parser.values()) c.addAll(pl);
+        final Set<Parser> c = new HashSet<>();
+        for (final Set<Parser> pl: ext2parser.values()) c.addAll(pl);
+        for (final Set<Parser> pl: mime2parser.values()) c.addAll(pl);
        return c;
    }

@ -153,7 +153,7 @@ public final class TextParser {
            if (prototypeMime == null) prototypeMime = mimeType;
            LinkedHashSet<Parser> p0 = mime2parser.get(mimeType);
            if (p0 == null) {
-                p0 = new LinkedHashSet<Parser>();
+                p0 = new LinkedHashSet<>();
                mime2parser.put(mimeType, p0);
            }
            p0.add(parser);
@ -172,7 +172,7 @@ public final class TextParser {
            ext = ext.toLowerCase(Locale.ROOT);
            LinkedHashSet<Parser> p0 = ext2parser.get(ext);
            if (p0 == null) {
-                p0 = new LinkedHashSet<Parser>();
+                p0 = new LinkedHashSet<>();
                ext2parser.put(ext, p0);
            }
            p0.add(parser);
@ -236,7 +236,7 @@ public final class TextParser {
        }
        assert !idioms.isEmpty() : "no parsers applied for url " + location.toNormalform(true);

-        Document[] docs = parseSource(location, mimeType, idioms, charset, ignore_class_name, scraper, timezoneOffset, depth, content, Integer.MAX_VALUE, Long.MAX_VALUE);
+        final Document[] docs = parseSource(location, mimeType, idioms, charset, ignore_class_name, scraper, timezoneOffset, depth, content, Integer.MAX_VALUE, Long.MAX_VALUE);

        return docs;
    }
@ -258,7 +258,7 @@ public final class TextParser {
            AbstractParser.log.fine("Parsing '" + location + "' from byte-array, applying only the generic parser");
        }
        mimeType = normalizeMimeType(mimeType);
-        Set<Parser> idioms = new HashSet<>();
+        final Set<Parser> idioms = new HashSet<>();
        idioms.add(TextParser.genericIdiom);

        return parseSource(location, mimeType, idioms, charset, ignoreClassNames, scraper, timezoneOffset, depth, content, Integer.MAX_VALUE, Long.MAX_VALUE);
@ -294,7 +294,7 @@ public final class TextParser {
            canStream = true;
        } else if(idioms.size() == 2) {
            /* When there are only 2 available parsers, stream oriented parsing can still be applied when one of the 2 parsers is the generic one */
-        	for(Parser idiom : idioms) {
+            for(final Parser idiom : idioms) {
                if(idiom instanceof genericParser) {
                    canStream = true;
                }
@ -311,7 +311,7 @@ public final class TextParser {
            try {
                /* The size of the buffer on the stream must be large enough to allow parser implementations to start parsing the resource
                 * and eventually fail, but must also be larger than eventual parsers internal buffers such as BufferedInputStream.DEFAULT_BUFFER_SIZE (8192 bytes) */
-				int rewindSize = 10 * 1024;
+                final int rewindSize = 10 * 1024;
                final InputStream markableStream;
                if(sourceStream instanceof ByteArrayInputStream) {
                    /* No nead to use a wrapping buffered stream when the source is already entirely in memory.
@ -324,7 +324,7 @@ public final class TextParser {
                markableStream.mark(rewindSize);

                /* Loop on parser : they are supposed to be sorted in order to start with the most specific and end with the most generic */
-				for(Parser parser : idioms) {
+                for(final Parser parser : idioms) {
                    /* Wrap in a CloseShieldInputStream to prevent SAX parsers closing the sourceStream
                     * and so let us eventually reuse the same opened stream with other parsers on parser failure */
                    CloseShieldInputStream nonCloseInputStream = new CloseShieldInputStream(markableStream);
@ -332,7 +332,7 @@ public final class TextParser {
                    try {
                        return parseSource(location, mimeType, parser, charset, ignore_class_name, scraper, timezoneOffset,
                                nonCloseInputStream, maxLinks, maxBytes);
-					} catch (Parser.Failure e) {
+                    } catch (final Parser.Failure e) {
                        /* Try to reset the marked stream. If the failed parser has consumed too many bytes :
                         * too bad, the marks is invalid and process fails now with an IOException */
                        markableStream.reset();
@ -346,28 +346,28 @@ public final class TextParser {
                             * In that case our HTTP client (see GzipResponseInterceptor) is already uncompressing the stream on the fly,
                             * that's why the gzipparser fails opening the stream.
                             * (see RFC 7231 section 3.1.2.2 for "Content-Encoding" header specification https://tools.ietf.org/html/rfc7231#section-3.1.2.2)*/
-							gzipParser gzParser = (gzipParser)parser; 
+                            final gzipParser gzParser = (gzipParser)parser;

                            nonCloseInputStream = new CloseShieldInputStream(markableStream);

-							Document maindoc = gzipParser.createMainDocument(location, mimeType, charset, gzParser);
+                            final Document maindoc = gzipParser.createMainDocument(location, mimeType, charset, gzParser);

                            try {
-								Document[] docs = gzParser.parseCompressedInputStream(location,
+                                final Document[] docs = gzParser.parseCompressedInputStream(location,
                                        charset, timezoneOffset, depth,
                                        nonCloseInputStream, maxLinks, maxBytes);
                                if (docs != null) {
                                    maindoc.addSubDocuments(docs);
                                }
                                return new Document[] { maindoc };
-							} catch(Exception e1) {
+                            } catch(final Exception e1) {
                                /* Try again to reset the marked stream if the failed parser has not consumed too many bytes */
                                markableStream.reset();
                            }
                        }
                    }
                }
-			} catch (IOException e) {
+            } catch (final IOException e) {
                throw new Parser.Failure("Error reading source", location);
            }
        }
@ -392,7 +392,7 @@ public final class TextParser {
        } catch (final IOException e) {
            throw new Parser.Failure(e.getMessage(), location);
        }
-        Document[] docs = parseSource(location, mimeType, idioms, charset, ignore_class_name, scraper, timezoneOffset, depth, b, maxLinks, maxBytes);
+        final Document[] docs = parseSource(location, mimeType, idioms, charset, ignore_class_name, scraper, timezoneOffset, depth, b, maxLinks, maxBytes);

        return docs;
    }
@ -494,11 +494,11 @@ public final class TextParser {
                docs = parser.parseWithLimits(location, mimeType, documentCharset, ignore_class_name, scraper, timezoneOffset, sourceStream, maxLinks, maxBytes);
            } else {
                /* Parser do not support partial parsing within limits : let's control it here*/
-    			InputStream limitedSource = new StrictLimitInputStream(sourceStream, maxBytes);
+                final InputStream limitedSource = new StrictLimitInputStream(sourceStream, maxBytes);
                docs = parser.parse(location, mimeType, documentCharset, ignore_class_name, scraper, timezoneOffset, limitedSource);
            }
            return docs;
-        } catch(Parser.Failure e) {
+        } catch(final Parser.Failure e) {
            throw e;
        } catch (final Exception e) {
            throw new Parser.Failure("parser failed: " + parser.getName(), location);
@ -538,8 +538,8 @@ public final class TextParser {
        assert !parsers.isEmpty();

        Document[] docs = null;
-        final Map<Parser, Parser.Failure> failedParser = new HashMap<Parser, Parser.Failure>();
-        String origName = Thread.currentThread().getName();
+        final Map<Parser, Parser.Failure> failedParser = new HashMap<>();
+        final String origName = Thread.currentThread().getName();
        Thread.currentThread().setName("parsing + " + location.toString()); // set a name to get the address in Thread Dump
        for (final Parser parser: parsers) {
            if (MemoryControl.request(sourceArray.length * 6, false)) {
@ -570,11 +570,11 @@ public final class TextParser {
                         * In that case our HTTP client (see GzipResponseInterceptor) is already uncompressing the stream on the fly,
                         * that's why the gzipparser fails opening the stream.
                         * (see RFC 7231 section 3.1.2.2 for "Content-Encoding" header specification https://tools.ietf.org/html/rfc7231#section-3.1.2.2)*/
-						gzipParser gzParser = (gzipParser)parser;
+                        final gzipParser gzParser = (gzipParser)parser;

                        bis = new ByteArrayInputStream(sourceArray);

-						Document maindoc = gzipParser.createMainDocument(location, mimeType, charset, gzParser);
+                        final Document maindoc = gzipParser.createMainDocument(location, mimeType, charset, gzParser);

                        try {
                            docs = gzParser.parseCompressedInputStream(location,
@ -585,9 +585,9 @@ public final class TextParser {
                            }
                            docs = new Document[] { maindoc };
                            break;
-						} catch(Parser.Failure e1) {
+                        } catch(final Parser.Failure e1) {
                            failedParser.put(parser, e1);
-						} catch(Exception e2) {
+                        } catch(final Exception e2) {
                            failedParser.put(parser, new Parser.Failure(e2.getMessage(), location));
                        }
                    } else {
@ -599,7 +599,7 @@ public final class TextParser {
                } finally {
                    try {
                        bis.close();
-                	} catch(IOException ioe) {
+                    } catch(final IOException ioe) {
                        // Ignore.
                    }
                }
@ -622,14 +622,14 @@ public final class TextParser {
            throw new Parser.Failure("All parser failed: " + failedParsers, location);
        }
        for (final Document d: docs) {
-        	InputStream textStream = d.getTextStream();
+            final InputStream textStream = d.getTextStream();
            assert textStream != null : "mimeType = " + mimeType;
            try {
                if(textStream != null) {
                    /* textStream can be a FileInputStream : we must close it to ensure releasing system resource */
                    textStream.close();
                }
-			} catch (IOException e) {
+            } catch (final IOException e) {
                AbstractParser.log.warn("Could not close text input stream");
            }
            d.setDepth(depth);
@ -670,7 +670,7 @@ public final class TextParser {
     * @throws Parser.Failure when the file extension or the MIME type is denied
     */
    private static Set<Parser> parsers(final MultiProtocolURL url, String mimeType1) throws Parser.Failure {
-        final Set<Parser> idioms = new LinkedHashSet<Parser>(2); // LinkedSet to maintain order (genericParser should be last)
+        final Set<Parser> idioms = new LinkedHashSet<>(2); // LinkedSet to maintain order (genericParser should be last)

        // check given mime type, place this first because this is the most likely to work and the best fit to the supplied mime
        Set<Parser> idiom;
@ -682,7 +682,7 @@ public final class TextParser {
        }

        // check extension and add as backup (in case no, wrong or unknown/unsupported mime was supplied)
-        String ext = MultiProtocolURL.getFileExtension(url.getFileName());
+        final String ext = MultiProtocolURL.getFileExtension(url.getFileName());
        if (ext != null && ext.length() > 0) {
            /* We do not throw here an exception when the media type is provided and inconsistent with the extension (if it is not supported an exception has already beeen thrown).
             * Otherwise we would reject URLs with an apparently unsupported extension but whose actual Media Type is supported (for example text/html).
@ -818,7 +818,7 @@ public final class TextParser {

    public static void setDenyExtension(final String denyList) {
        denyExtensionx.clear();
-        for (final String s: CommonPattern.COMMA.split(denyList)) denyExtensionx.put(s, v);
+        for (final String s: CommonPattern.COMMA.split(denyList)) denyExtensionx.put(s.trim(), v);
    }

    public static String getDenyExtension() {