added iso,apk,dmg to extension-deny list

see also https://github.com/yacy/yacy_search_server/issues/510
zip is not on the list because YaCy can parse it
pull/533/head
Michael Peter Christen 2 years ago
parent 761dbdf06d
commit d49f937b98

@@ -328,7 +328,7 @@ releases = DATA/RELEASE
 # the following mime-types are a blacklist for indexing:
 # parser.mime.deny: specifies mime-types that shall not be indexed
 parser.mime.deny=
-parser.extensions.deny=
+parser.extensions.deny=iso,apk,dmg
 # The audioTagParser is disabled by default as it needs to create a temporary file each time an audio resource is parsed
 # Audio file extensions and media types can be enabled in the ConfigParser_p.html page if this is not a problem with your install
 parser.enableAudioTags=false
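
With the changed default, a stock install skips disk images (iso, dmg) and Android packages (apk) at indexing time; zip stays out of the deny list because it can be parsed. The value feeds the extension deny set in TextParser via setDenyExtension() (last hunk below); on a running peer the active copy of this setting typically lives in DATA/SETTINGS/yacy.conf. As a rough standalone sketch of how such a deny list works (illustrative names and case-insensitive suffix matching are assumptions here, not the exact YaCy code path):

// Standalone sketch only: comma-separated deny list -> set lookup by file suffix.
// YaCy's real implementation is the denyExtensionx map in TextParser (diff below).
import java.util.Locale;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;

public class DenyListSketch {
    private static final Set<String> denyExtension = ConcurrentHashMap.newKeySet();

    public static void setDenyExtension(final String denyList) {
        denyExtension.clear();
        for (final String s : denyList.split(",")) {
            denyExtension.add(s.trim().toLowerCase(Locale.ROOT));
        }
    }

    public static boolean isDenied(final String fileName) {
        final int dot = fileName.lastIndexOf('.');
        return dot >= 0 && denyExtension.contains(fileName.substring(dot + 1).toLowerCase(Locale.ROOT));
    }

    public static void main(final String[] args) {
        setDenyExtension("iso,apk,dmg");                  // the new default from this commit
        System.out.println(isDenied("ubuntu-22.04.iso")); // true: rejected before any parse attempt
        System.out.println(isDenied("archive.zip"));      // false: zip can still be parsed
    }
}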

@@ -85,11 +85,11 @@ public final class TextParser {
     private static final Parser genericXMLIdiom = new GenericXMLParser();
     //use LinkedHashSet for parser collection to use (init) order to prefered parser for same ext or mime
-    private static final Map<String, LinkedHashSet<Parser>> mime2parser = new ConcurrentHashMap<String, LinkedHashSet<Parser>>();
-    private static final ConcurrentHashMap<String, LinkedHashSet<Parser>> ext2parser = new ConcurrentHashMap<String, LinkedHashSet<Parser>>();
-    private static final Map<String, String> ext2mime = new ConcurrentHashMap<String, String>();
-    private static final Map<String, Object> denyMime = new ConcurrentHashMap<String, Object>();
-    private static final Map<String, Object> denyExtensionx = new ConcurrentHashMap<String, Object>();
+    private static final Map<String, LinkedHashSet<Parser>> mime2parser = new ConcurrentHashMap<>();
+    private static final ConcurrentHashMap<String, LinkedHashSet<Parser>> ext2parser = new ConcurrentHashMap<>();
+    private static final Map<String, String> ext2mime = new ConcurrentHashMap<>();
+    private static final Map<String, Object> denyMime = new ConcurrentHashMap<>();
+    private static final Map<String, Object> denyExtensionx = new ConcurrentHashMap<>();
     static {
         initParser(new apkParser());
@@ -130,9 +130,9 @@ public final class TextParser {
     }
     public static Set<Parser> parsers() {
-        final Set<Parser> c = new HashSet<Parser>();
-        for (Set<Parser> pl: ext2parser.values()) c.addAll(pl);
-        for (Set<Parser> pl: mime2parser.values()) c.addAll(pl);
+        final Set<Parser> c = new HashSet<>();
+        for (final Set<Parser> pl: ext2parser.values()) c.addAll(pl);
+        for (final Set<Parser> pl: mime2parser.values()) c.addAll(pl);
         return c;
     }
@@ -153,7 +153,7 @@ public final class TextParser {
         if (prototypeMime == null) prototypeMime = mimeType;
         LinkedHashSet<Parser> p0 = mime2parser.get(mimeType);
         if (p0 == null) {
-            p0 = new LinkedHashSet<Parser>();
+            p0 = new LinkedHashSet<>();
             mime2parser.put(mimeType, p0);
         }
         p0.add(parser);
@@ -172,7 +172,7 @@ public final class TextParser {
         ext = ext.toLowerCase(Locale.ROOT);
         LinkedHashSet<Parser> p0 = ext2parser.get(ext);
         if (p0 == null) {
-            p0 = new LinkedHashSet<Parser>();
+            p0 = new LinkedHashSet<>();
             ext2parser.put(ext, p0);
         }
         p0.add(parser);
@@ -236,7 +236,7 @@ public final class TextParser {
         }
         assert !idioms.isEmpty() : "no parsers applied for url " + location.toNormalform(true);
-        Document[] docs = parseSource(location, mimeType, idioms, charset, ignore_class_name, scraper, timezoneOffset, depth, content, Integer.MAX_VALUE, Long.MAX_VALUE);
+        final Document[] docs = parseSource(location, mimeType, idioms, charset, ignore_class_name, scraper, timezoneOffset, depth, content, Integer.MAX_VALUE, Long.MAX_VALUE);
         return docs;
     }
@@ -258,7 +258,7 @@ public final class TextParser {
            AbstractParser.log.fine("Parsing '" + location + "' from byte-array, applying only the generic parser");
        }
        mimeType = normalizeMimeType(mimeType);
-       Set<Parser> idioms = new HashSet<>();
+       final Set<Parser> idioms = new HashSet<>();
        idioms.add(TextParser.genericIdiom);
        return parseSource(location, mimeType, idioms, charset, ignoreClassNames, scraper, timezoneOffset, depth, content, Integer.MAX_VALUE, Long.MAX_VALUE);
@@ -294,7 +294,7 @@ public final class TextParser {
            canStream = true;
        } else if(idioms.size() == 2) {
            /* When there are only 2 available parsers, stream oriented parsing can still be applied when one of the 2 parsers is the generic one */
-           for(Parser idiom : idioms) {
+           for(final Parser idiom : idioms) {
                if(idiom instanceof genericParser) {
                    canStream = true;
                }
@@ -311,7 +311,7 @@ public final class TextParser {
        try {
            /* The size of the buffer on the stream must be large enough to allow parser implementations to start parsing the resource
             * and eventually fail, but must also be larger than eventual parsers internal buffers such as BufferedInputStream.DEFAULT_BUFFER_SIZE (8192 bytes) */
-           int rewindSize = 10 * 1024;
+           final int rewindSize = 10 * 1024;
            final InputStream markableStream;
            if(sourceStream instanceof ByteArrayInputStream) {
                /* No nead to use a wrapping buffered stream when the source is already entirely in memory.
@@ -324,7 +324,7 @@ public final class TextParser {
            markableStream.mark(rewindSize);
            /* Loop on parser : they are supposed to be sorted in order to start with the most specific and end with the most generic */
-           for(Parser parser : idioms) {
+           for(final Parser parser : idioms) {
                /* Wrap in a CloseShieldInputStream to prevent SAX parsers closing the sourceStream
                 * and so let us eventually reuse the same opened stream with other parsers on parser failure */
                CloseShieldInputStream nonCloseInputStream = new CloseShieldInputStream(markableStream);
@@ -332,7 +332,7 @@ public final class TextParser {
                try {
                    return parseSource(location, mimeType, parser, charset, ignore_class_name, scraper, timezoneOffset,
                            nonCloseInputStream, maxLinks, maxBytes);
-               } catch (Parser.Failure e) {
+               } catch (final Parser.Failure e) {
                    /* Try to reset the marked stream. If the failed parser has consumed too many bytes :
                     * too bad, the marks is invalid and process fails now with an IOException */
                    markableStream.reset();
@@ -346,28 +346,28 @@ public final class TextParser {
                         * In that case our HTTP client (see GzipResponseInterceptor) is already uncompressing the stream on the fly,
                         * that's why the gzipparser fails opening the stream.
                         * (see RFC 7231 section 3.1.2.2 for "Content-Encoding" header specification https://tools.ietf.org/html/rfc7231#section-3.1.2.2)*/
-                       gzipParser gzParser = (gzipParser)parser;
+                       final gzipParser gzParser = (gzipParser)parser;
                        nonCloseInputStream = new CloseShieldInputStream(markableStream);
-                       Document maindoc = gzipParser.createMainDocument(location, mimeType, charset, gzParser);
+                       final Document maindoc = gzipParser.createMainDocument(location, mimeType, charset, gzParser);
                        try {
-                           Document[] docs = gzParser.parseCompressedInputStream(location,
+                           final Document[] docs = gzParser.parseCompressedInputStream(location,
                                    charset, timezoneOffset, depth,
                                    nonCloseInputStream, maxLinks, maxBytes);
                            if (docs != null) {
                                maindoc.addSubDocuments(docs);
                            }
                            return new Document[] { maindoc };
-                       } catch(Exception e1) {
+                       } catch(final Exception e1) {
                            /* Try again to reset the marked stream if the failed parser has not consumed too many bytes */
                            markableStream.reset();
                        }
                    }
                }
            }
-       } catch (IOException e) {
+       } catch (final IOException e) {
            throw new Parser.Failure("Error reading source", location);
        }
    }
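
The stream-oriented path above relies on InputStream.mark()/reset(): the stream is marked once behind a rewindSize buffer, each candidate parser is handed a CloseShieldInputStream so a failing SAX parser cannot close the shared stream, and each Parser.Failure rewinds to the mark for the next candidate. A minimal self-contained sketch of that fallback pattern, with a hypothetical Idiom interface standing in for YaCy's Parser:

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.List;

// Hypothetical stand-in for YaCy's Parser: one attempt that returns a result or throws.
interface Idiom {
    String parse(InputStream in) throws IOException;
}

final class FallbackParsing {

    static String parseWithFallback(final InputStream source, final List<Idiom> idioms) throws IOException {
        // The mark limit must cover every byte a failing parser may consume,
        // including its internal read-ahead buffers; YaCy uses 10 * 1024.
        final int rewindSize = 10 * 1024;
        final InputStream in = new BufferedInputStream(source, rewindSize);
        in.mark(rewindSize);
        IOException lastFailure = null;
        for (final Idiom idiom : idioms) {
            try {
                return idiom.parse(in); // first parser that succeeds wins
            } catch (final IOException e) {
                lastFailure = e;
                // Rewind for the next candidate; this throws if the failed
                // parser read past rewindSize and invalidated the mark.
                in.reset();
            }
        }
        throw lastFailure != null ? lastFailure : new IOException("no parser succeeded");
    }
}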
@@ -392,7 +392,7 @@ public final class TextParser {
        } catch (final IOException e) {
            throw new Parser.Failure(e.getMessage(), location);
        }
-       Document[] docs = parseSource(location, mimeType, idioms, charset, ignore_class_name, scraper, timezoneOffset, depth, b, maxLinks, maxBytes);
+       final Document[] docs = parseSource(location, mimeType, idioms, charset, ignore_class_name, scraper, timezoneOffset, depth, b, maxLinks, maxBytes);
        return docs;
    }
@@ -494,11 +494,11 @@ public final class TextParser {
            docs = parser.parseWithLimits(location, mimeType, documentCharset, ignore_class_name, scraper, timezoneOffset, sourceStream, maxLinks, maxBytes);
        } else {
            /* Parser do not support partial parsing within limits : let's control it here*/
-           InputStream limitedSource = new StrictLimitInputStream(sourceStream, maxBytes);
+           final InputStream limitedSource = new StrictLimitInputStream(sourceStream, maxBytes);
            docs = parser.parse(location, mimeType, documentCharset, ignore_class_name, scraper, timezoneOffset, limitedSource);
        }
        return docs;
-   } catch(Parser.Failure e) {
+   } catch(final Parser.Failure e) {
        throw e;
    } catch (final Exception e) {
        throw new Parser.Failure("parser failed: " + parser.getName(), location);
@@ -538,8 +538,8 @@ public final class TextParser {
        assert !parsers.isEmpty();
        Document[] docs = null;
-       final Map<Parser, Parser.Failure> failedParser = new HashMap<Parser, Parser.Failure>();
-       String origName = Thread.currentThread().getName();
+       final Map<Parser, Parser.Failure> failedParser = new HashMap<>();
+       final String origName = Thread.currentThread().getName();
        Thread.currentThread().setName("parsing + " + location.toString()); // set a name to get the address in Thread Dump
        for (final Parser parser: parsers) {
            if (MemoryControl.request(sourceArray.length * 6, false)) {
@@ -570,11 +570,11 @@ public final class TextParser {
                     * In that case our HTTP client (see GzipResponseInterceptor) is already uncompressing the stream on the fly,
                     * that's why the gzipparser fails opening the stream.
                     * (see RFC 7231 section 3.1.2.2 for "Content-Encoding" header specification https://tools.ietf.org/html/rfc7231#section-3.1.2.2)*/
-                   gzipParser gzParser = (gzipParser)parser;
+                   final gzipParser gzParser = (gzipParser)parser;
                    bis = new ByteArrayInputStream(sourceArray);
-                   Document maindoc = gzipParser.createMainDocument(location, mimeType, charset, gzParser);
+                   final Document maindoc = gzipParser.createMainDocument(location, mimeType, charset, gzParser);
                    try {
                        docs = gzParser.parseCompressedInputStream(location,
@@ -585,9 +585,9 @@ public final class TextParser {
                        }
                        docs = new Document[] { maindoc };
                        break;
-                   } catch(Parser.Failure e1) {
+                   } catch(final Parser.Failure e1) {
                        failedParser.put(parser, e1);
-                   } catch(Exception e2) {
+                   } catch(final Exception e2) {
                        failedParser.put(parser, new Parser.Failure(e2.getMessage(), location));
                    }
                } else {
@@ -599,7 +599,7 @@ public final class TextParser {
            } finally {
                try {
                    bis.close();
-               } catch(IOException ioe) {
+               } catch(final IOException ioe) {
                    // Ignore.
                }
            }
@@ -622,14 +622,14 @@ public final class TextParser {
            throw new Parser.Failure("All parser failed: " + failedParsers, location);
        }
        for (final Document d: docs) {
-           InputStream textStream = d.getTextStream();
+           final InputStream textStream = d.getTextStream();
            assert textStream != null : "mimeType = " + mimeType;
            try {
                if(textStream != null) {
                    /* textStream can be a FileInputStream : we must close it to ensure releasing system resource */
                    textStream.close();
                }
-           } catch (IOException e) {
+           } catch (final IOException e) {
                AbstractParser.log.warn("Could not close text input stream");
            }
            d.setDepth(depth);
@@ -670,7 +670,7 @@ public final class TextParser {
     * @throws Parser.Failure when the file extension or the MIME type is denied
     */
    private static Set<Parser> parsers(final MultiProtocolURL url, String mimeType1) throws Parser.Failure {
-       final Set<Parser> idioms = new LinkedHashSet<Parser>(2); // LinkedSet to maintain order (genericParser should be last)
+       final Set<Parser> idioms = new LinkedHashSet<>(2); // LinkedSet to maintain order (genericParser should be last)
        // check given mime type, place this first because this is the most likely to work and the best fit to the supplied mime
        Set<Parser> idiom;
@@ -682,7 +682,7 @@ public final class TextParser {
        }
        // check extension and add as backup (in case no, wrong or unknown/unsupported mime was supplied)
-       String ext = MultiProtocolURL.getFileExtension(url.getFileName());
+       final String ext = MultiProtocolURL.getFileExtension(url.getFileName());
        if (ext != null && ext.length() > 0) {
            /* We do not throw here an exception when the media type is provided and inconsistent with the extension (if it is not supported an exception has already beeen thrown).
             * Otherwise we would reject URLs with an apparently unsupported extension but whose actual Media Type is supported (for example text/html).
@@ -818,7 +818,7 @@ public final class TextParser {
    public static void setDenyExtension(final String denyList) {
        denyExtensionx.clear();
-       for (final String s: CommonPattern.COMMA.split(denyList)) denyExtensionx.put(s, v);
+       for (final String s: CommonPattern.COMMA.split(denyList)) denyExtensionx.put(s.trim(), v);
    }
    public static String getDenyExtension() {
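
Besides the new default value, the one behavioral fix is the trim() in setDenyExtension(): a hand-edited list such as "iso, apk, dmg" (spaces after the commas) previously stored keys like " apk" that never match a real extension. A quick illustration of that failure mode, assuming plain comma-splitting as above:

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

public class TrimDemo {
    public static void main(final String[] args) {
        final String denyList = "iso, apk, dmg"; // user-edited list with stray spaces

        // Without trim(): the split keeps the leading space in each entry.
        final Set<String> untrimmed = new HashSet<>(Arrays.asList(denyList.split(",")));
        System.out.println(untrimmed.contains("apk")); // false: the stored key is " apk"

        // With trim(), as in the fixed setDenyExtension():
        final Set<String> trimmed = new HashSet<>();
        for (final String s : denyList.split(",")) trimmed.add(s.trim());
        System.out.println(trimmed.contains("apk"));   // true
    }
}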
