@@ -85,11 +85,11 @@ public final class TextParser {
     private static final Parser genericXMLIdiom = new GenericXMLParser();
 
     //use LinkedHashSet for parser collection to use (init) order to prefered parser for same ext or mime
-    private static final Map<String, LinkedHashSet<Parser>> mime2parser = new ConcurrentHashMap<String, LinkedHashSet<Parser>>();
-    private static final ConcurrentHashMap<String, LinkedHashSet<Parser>> ext2parser = new ConcurrentHashMap<String, LinkedHashSet<Parser>>();
-    private static final Map<String, String> ext2mime = new ConcurrentHashMap<String, String>();
-    private static final Map<String, Object> denyMime = new ConcurrentHashMap<String, Object>();
-    private static final Map<String, Object> denyExtensionx = new ConcurrentHashMap<String, Object>();
+    private static final Map<String, LinkedHashSet<Parser>> mime2parser = new ConcurrentHashMap<>();
+    private static final ConcurrentHashMap<String, LinkedHashSet<Parser>> ext2parser = new ConcurrentHashMap<>();
+    private static final Map<String, String> ext2mime = new ConcurrentHashMap<>();
+    private static final Map<String, Object> denyMime = new ConcurrentHashMap<>();
+    private static final Map<String, Object> denyExtensionx = new ConcurrentHashMap<>();
 
     static {
         initParser(new apkParser());
@@ -130,9 +130,9 @@ public final class TextParser {
     }
 
     public static Set<Parser> parsers() {
-        final Set<Parser> c = new HashSet<Parser>();
-        for (Set<Parser> pl: ext2parser.values()) c.addAll(pl);
-        for (Set<Parser> pl: mime2parser.values()) c.addAll(pl);
+        final Set<Parser> c = new HashSet<>();
+        for (final Set<Parser> pl: ext2parser.values()) c.addAll(pl);
+        for (final Set<Parser> pl: mime2parser.values()) c.addAll(pl);
         return c;
     }
 
@@ -153,7 +153,7 @@ public final class TextParser {
         if (prototypeMime == null) prototypeMime = mimeType;
         LinkedHashSet<Parser> p0 = mime2parser.get(mimeType);
         if (p0 == null) {
-            p0 = new LinkedHashSet<Parser>();
+            p0 = new LinkedHashSet<>();
             mime2parser.put(mimeType, p0);
         }
         p0.add(parser);
@@ -172,7 +172,7 @@ public final class TextParser {
         ext = ext.toLowerCase(Locale.ROOT);
         LinkedHashSet<Parser> p0 = ext2parser.get(ext);
         if (p0 == null) {
-            p0 = new LinkedHashSet<Parser>();
+            p0 = new LinkedHashSet<>();
             ext2parser.put(ext, p0);
         }
         p0.add(parser);
@@ -236,7 +236,7 @@ public final class TextParser {
         }
         assert !idioms.isEmpty() : "no parsers applied for url " + location.toNormalform(true);
 
-        Document[] docs = parseSource(location, mimeType, idioms, charset, ignore_class_name, scraper, timezoneOffset, depth, content, Integer.MAX_VALUE, Long.MAX_VALUE);
+        final Document[] docs = parseSource(location, mimeType, idioms, charset, ignore_class_name, scraper, timezoneOffset, depth, content, Integer.MAX_VALUE, Long.MAX_VALUE);
 
         return docs;
     }
@@ -258,7 +258,7 @@ public final class TextParser {
             AbstractParser.log.fine("Parsing '" + location + "' from byte-array, applying only the generic parser");
         }
         mimeType = normalizeMimeType(mimeType);
-        Set<Parser> idioms = new HashSet<>();
+        final Set<Parser> idioms = new HashSet<>();
         idioms.add(TextParser.genericIdiom);
 
         return parseSource(location, mimeType, idioms, charset, ignoreClassNames, scraper, timezoneOffset, depth, content, Integer.MAX_VALUE, Long.MAX_VALUE);
@@ -294,7 +294,7 @@ public final class TextParser {
             canStream = true;
         } else if(idioms.size() == 2) {
             /* When there are only 2 available parsers, stream oriented parsing can still be applied when one of the 2 parsers is the generic one */
-            for(Parser idiom : idioms) {
+            for(final Parser idiom : idioms) {
                 if(idiom instanceof genericParser) {
                     canStream = true;
                 }
@@ -311,7 +311,7 @@ public final class TextParser {
        try {
            /* The size of the buffer on the stream must be large enough to allow parser implementations to start parsing the resource
             * and eventually fail, but must also be larger than eventual parsers internal buffers such as BufferedInputStream.DEFAULT_BUFFER_SIZE (8192 bytes) */
-           int rewindSize = 10 * 1024;
+           final int rewindSize = 10 * 1024;
            final InputStream markableStream;
            if(sourceStream instanceof ByteArrayInputStream) {
                /* No nead to use a wrapping buffered stream when the source is already entirely in memory.
@@ -324,7 +324,7 @@ public final class TextParser {
            markableStream.mark(rewindSize);
 
            /* Loop on parser : they are supposed to be sorted in order to start with the most specific and end with the most generic */
-           for(Parser parser : idioms) {
+           for(final Parser parser : idioms) {
                /* Wrap in a CloseShieldInputStream to prevent SAX parsers closing the sourceStream
                 * and so let us eventually reuse the same opened stream with other parsers on parser failure */
                CloseShieldInputStream nonCloseInputStream = new CloseShieldInputStream(markableStream);
@@ -332,7 +332,7 @@ public final class TextParser {
                try {
                    return parseSource(location, mimeType, parser, charset, ignore_class_name, scraper, timezoneOffset,
                            nonCloseInputStream, maxLinks, maxBytes);
-               } catch (Parser.Failure e) {
+               } catch (final Parser.Failure e) {
                    /* Try to reset the marked stream. If the failed parser has consumed too many bytes :
                     * too bad, the marks is invalid and process fails now with an IOException */
                    markableStream.reset();
@@ -346,28 +346,28 @@ public final class TextParser {
                         * In that case our HTTP client (see GzipResponseInterceptor) is already uncompressing the stream on the fly,
                         * that's why the gzipparser fails opening the stream.
                         * (see RFC 7231 section 3.1.2.2 for "Content-Encoding" header specification https://tools.ietf.org/html/rfc7231#section-3.1.2.2)*/
-                       gzipParser gzParser = (gzipParser)parser;
+                       final gzipParser gzParser = (gzipParser)parser;
 
                        nonCloseInputStream = new CloseShieldInputStream(markableStream);
 
-                       Document maindoc = gzipParser.createMainDocument(location, mimeType, charset, gzParser);
+                       final Document maindoc = gzipParser.createMainDocument(location, mimeType, charset, gzParser);
 
                        try {
-                           Document[] docs = gzParser.parseCompressedInputStream(location,
+                           final Document[] docs = gzParser.parseCompressedInputStream(location,
                                    charset, timezoneOffset, depth,
                                    nonCloseInputStream, maxLinks, maxBytes);
                            if (docs != null) {
                                maindoc.addSubDocuments(docs);
                            }
                            return new Document[] { maindoc };
-                       } catch(Exception e1) {
+                       } catch(final Exception e1) {
                            /* Try again to reset the marked stream if the failed parser has not consumed too many bytes */
                            markableStream.reset();
                        }
                    }
                }
            }
-       } catch (IOException e) {
+       } catch (final IOException e) {
            throw new Parser.Failure("Error reading source", location);
        }
    }
@@ -392,7 +392,7 @@ public final class TextParser {
        } catch (final IOException e) {
            throw new Parser.Failure(e.getMessage(), location);
        }
-       Document[] docs = parseSource(location, mimeType, idioms, charset, ignore_class_name, scraper, timezoneOffset, depth, b, maxLinks, maxBytes);
+       final Document[] docs = parseSource(location, mimeType, idioms, charset, ignore_class_name, scraper, timezoneOffset, depth, b, maxLinks, maxBytes);
 
        return docs;
    }
@@ -494,11 +494,11 @@ public final class TextParser {
                docs = parser.parseWithLimits(location, mimeType, documentCharset, ignore_class_name, scraper, timezoneOffset, sourceStream, maxLinks, maxBytes);
            } else {
                /* Parser do not support partial parsing within limits : let's control it here*/
-               InputStream limitedSource = new StrictLimitInputStream(sourceStream, maxBytes);
+               final InputStream limitedSource = new StrictLimitInputStream(sourceStream, maxBytes);
                docs = parser.parse(location, mimeType, documentCharset, ignore_class_name, scraper, timezoneOffset, limitedSource);
            }
            return docs;
-       } catch(Parser.Failure e) {
+       } catch(final Parser.Failure e) {
            throw e;
        } catch (final Exception e) {
            throw new Parser.Failure("parser failed: " + parser.getName(), location);
@@ -538,8 +538,8 @@ public final class TextParser {
        assert !parsers.isEmpty();
 
        Document[] docs = null;
-       final Map<Parser, Parser.Failure> failedParser = new HashMap<Parser, Parser.Failure>();
-       String origName = Thread.currentThread().getName();
+       final Map<Parser, Parser.Failure> failedParser = new HashMap<>();
+       final String origName = Thread.currentThread().getName();
        Thread.currentThread().setName("parsing + " + location.toString()); // set a name to get the address in Thread Dump
        for (final Parser parser: parsers) {
            if (MemoryControl.request(sourceArray.length * 6, false)) {
@@ -570,11 +570,11 @@ public final class TextParser {
                         * In that case our HTTP client (see GzipResponseInterceptor) is already uncompressing the stream on the fly,
                         * that's why the gzipparser fails opening the stream.
                         * (see RFC 7231 section 3.1.2.2 for "Content-Encoding" header specification https://tools.ietf.org/html/rfc7231#section-3.1.2.2)*/
-                       gzipParser gzParser = (gzipParser)parser;
+                       final gzipParser gzParser = (gzipParser)parser;
 
                        bis = new ByteArrayInputStream(sourceArray);
 
-                       Document maindoc = gzipParser.createMainDocument(location, mimeType, charset, gzParser);
+                       final Document maindoc = gzipParser.createMainDocument(location, mimeType, charset, gzParser);
 
                        try {
                            docs = gzParser.parseCompressedInputStream(location,
@@ -585,9 +585,9 @@ public final class TextParser {
                            }
                            docs = new Document[] { maindoc };
                            break;
-                       } catch(Parser.Failure e1) {
+                       } catch(final Parser.Failure e1) {
                            failedParser.put(parser, e1);
-                       } catch(Exception e2) {
+                       } catch(final Exception e2) {
                            failedParser.put(parser, new Parser.Failure(e2.getMessage(), location));
                        }
                    } else {
@@ -599,7 +599,7 @@ public final class TextParser {
                } finally {
                    try {
                        bis.close();
-                   } catch(IOException ioe) {
+                   } catch(final IOException ioe) {
                        // Ignore.
                    }
                }
@@ -622,14 +622,14 @@ public final class TextParser {
            throw new Parser.Failure("All parser failed: " + failedParsers, location);
        }
        for (final Document d: docs) {
-           InputStream textStream = d.getTextStream();
+           final InputStream textStream = d.getTextStream();
            assert textStream != null : "mimeType = " + mimeType;
            try {
                if(textStream != null) {
                    /* textStream can be a FileInputStream : we must close it to ensure releasing system resource */
                    textStream.close();
                }
-           } catch (IOException e) {
+           } catch (final IOException e) {
                AbstractParser.log.warn("Could not close text input stream");
            }
            d.setDepth(depth);
@@ -670,7 +670,7 @@ public final class TextParser {
     * @throws Parser.Failure when the file extension or the MIME type is denied
     */
    private static Set<Parser> parsers(final MultiProtocolURL url, String mimeType1) throws Parser.Failure {
-       final Set<Parser> idioms = new LinkedHashSet<Parser>(2); // LinkedSet to maintain order (genericParser should be last)
+       final Set<Parser> idioms = new LinkedHashSet<>(2); // LinkedSet to maintain order (genericParser should be last)
 
        // check given mime type, place this first because this is the most likely to work and the best fit to the supplied mime
        Set<Parser> idiom;
@@ -682,7 +682,7 @@ public final class TextParser {
        }
 
        // check extension and add as backup (in case no, wrong or unknown/unsupported mime was supplied)
-       String ext = MultiProtocolURL.getFileExtension(url.getFileName());
+       final String ext = MultiProtocolURL.getFileExtension(url.getFileName());
        if (ext != null && ext.length() > 0) {
            /* We do not throw here an exception when the media type is provided and inconsistent with the extension (if it is not supported an exception has already beeen thrown).
             * Otherwise we would reject URLs with an apparently unsupported extension but whose actual Media Type is supported (for example text/html).
@@ -818,7 +818,7 @@ public final class TextParser {
 
    public static void setDenyExtension(final String denyList) {
        denyExtensionx.clear();
-       for (final String s: CommonPattern.COMMA.split(denyList)) denyExtensionx.put(s, v);
+       for (final String s: CommonPattern.COMMA.split(denyList)) denyExtensionx.put(s.trim(), v);
    }
 
    public static String getDenyExtension() {