diff --git a/source/de/anomic/crawler/retrieval/Response.java b/source/de/anomic/crawler/retrieval/Response.java index 6809b499e..82c539fef 100755 --- a/source/de/anomic/crawler/retrieval/Response.java +++ b/source/de/anomic/crawler/retrieval/Response.java @@ -26,7 +26,6 @@ package de.anomic.crawler.retrieval; -import java.io.ByteArrayInputStream; import java.util.Date; import net.yacy.cora.date.GenericFormatter; @@ -63,7 +62,7 @@ public class Response { private final RequestHeader requestHeader; private final ResponseHeader responseHeader; private final String responseStatus; - private final CrawlProfile profile; + private final CrawlProfile profile; private byte[] content; private int status; // tracker indexing status, see status defs below @@ -824,7 +823,7 @@ public class Response { String supportError = TextParser.supports(url(), this.responseHeader == null ? null : this.responseHeader.mime()); if (supportError != null) throw new Parser.Failure("no parser support:" + supportError, url()); try { - return TextParser.parseSource(url(), this.responseHeader == null ? null : this.responseHeader.mime(), this.responseHeader == null ? "UTF-8" : this.responseHeader.getCharacterEncoding(), this.content.length, new ByteArrayInputStream(this.content)); + return TextParser.parseSource(url(), this.responseHeader == null ? null : this.responseHeader.mime(), this.responseHeader == null ? "UTF-8" : this.responseHeader.getCharacterEncoding(), this.content); } catch (Exception e) { return null; } diff --git a/source/de/anomic/search/TextSnippet.java b/source/de/anomic/search/TextSnippet.java index f8634a173..d68b7dcc1 100644 --- a/source/de/anomic/search/TextSnippet.java +++ b/source/de/anomic/search/TextSnippet.java @@ -166,69 +166,11 @@ public class TextSnippet implements Comparable, Comparator, Comparator idioms = null; + try { + idioms = parsers(location, mimeType); + } catch (final Parser.Failure e) { + final String errorMsg = "Parser Failure for extension '" + location.getFileExtension() + "' or mimetype '" + mimeType + "': " + e.getMessage(); + log.logWarning(errorMsg); + throw new Parser.Failure(errorMsg, location); + } + assert !idioms.isEmpty() : "no parsers applied for url " + location.toNormalform(true, false); + + return parseSource(location, mimeType, idioms, charset, content); } public static Document[] parseSource( @@ -199,9 +211,7 @@ public final class TextParser { // then we use only one stream-oriented parser. if (idioms.size() == 1 || contentLength > Integer.MAX_VALUE) { // use a specific stream-oriented parser - final Document[] docs = parseSource(location, mimeType, idioms.get(0), charset, contentLength, sourceStream); - for (final Document d: docs) { assert d.getText() != null; } // verify docs - return docs; + return parseSource(location, mimeType, idioms.get(0), charset, contentLength, sourceStream); } // in case that we know more parsers we first transform the content into a byte[] and use that as base @@ -212,9 +222,7 @@ public final class TextParser { } catch (final IOException e) { throw new Parser.Failure(e.getMessage(), location); } - final Document[] docs = parseSource(location, mimeType, idioms, charset, b); - for (final Document d: docs) { assert d.getText() != null; } // verify docs - return docs; + return parseSource(location, mimeType, idioms, charset, b); } private static Document[] parseSource( @@ -254,7 +262,7 @@ public final class TextParser { Document[] docs = null; final HashMap failedParser = new HashMap(); - if (MemoryControl.request(sourceArray.length * 2, false)) { + if (MemoryControl.request(sourceArray.length * 6, false)) { for (final Parser parser: parsers) { try { docs = parser.parse(location, mimeType, documentCharset, new ByteArrayInputStream(sourceArray));