fewer byte-arrays of response content, less byte-array <-> stream conversion

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7856 6c8d7289-2bf4-0310-a012-ef5d649a1542
sixcooler 13 years ago
parent 59b767eebd
commit ce248cc8dd

@@ -26,7 +26,6 @@
 package de.anomic.crawler.retrieval;
-import java.io.ByteArrayInputStream;
 import java.util.Date;
 import net.yacy.cora.date.GenericFormatter;
@@ -63,7 +62,7 @@ public class Response {
     private final RequestHeader requestHeader;
     private final ResponseHeader responseHeader;
     private final String responseStatus;
-    private final CrawlProfile profile;
+    private final CrawlProfile profile;
     private byte[] content;
     private int status; // tracker indexing status, see status defs below
@@ -824,7 +823,7 @@ public class Response {
         String supportError = TextParser.supports(url(), this.responseHeader == null ? null : this.responseHeader.mime());
         if (supportError != null) throw new Parser.Failure("no parser support:" + supportError, url());
         try {
-            return TextParser.parseSource(url(), this.responseHeader == null ? null : this.responseHeader.mime(), this.responseHeader == null ? "UTF-8" : this.responseHeader.getCharacterEncoding(), this.content.length, new ByteArrayInputStream(this.content));
+            return TextParser.parseSource(url(), this.responseHeader == null ? null : this.responseHeader.mime(), this.responseHeader == null ? "UTF-8" : this.responseHeader.getCharacterEncoding(), this.content);
         } catch (Exception e) {
             return null;
         }

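What this hunk buys: Response already holds the body as an in-memory byte[], so wrapping it in a ByteArrayInputStream only forced TextParser to read the bytes back into a second array. A minimal standalone sketch of the round trip that the new byte[] overload avoids (illustrative code, not YaCy's):

    import java.io.ByteArrayInputStream;
    import java.io.ByteArrayOutputStream;
    import java.io.IOException;
    import java.io.InputStream;

    final class RoundTripDemo {
        // old path: the callee receives a stream and must buffer it into a byte[] again
        static byte[] viaStream(final byte[] content) throws IOException {
            final InputStream in = new ByteArrayInputStream(content);
            final ByteArrayOutputStream out = new ByteArrayOutputStream(content.length);
            final byte[] buffer = new byte[4096];
            int n;
            while ((n = in.read(buffer)) != -1) out.write(buffer, 0, n); // extra copy
            return out.toByteArray();                                    // and another
        }

        // new path: hand the existing array over unchanged, zero extra copies
        static byte[] direct(final byte[] content) {
            return content;
        }
    }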
@@ -166,69 +166,11 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnippet> {
             return;
         }
-        /* ===========================================================================
-         * LOAD RESOURCE DATA
-         * =========================================================================== */
-        // if the snippet is not in the cache, we can try to get it from the htcache
-        final Response response;
-        try {
-            // first try to get the snippet from metadata
-            String loc;
-            final boolean noCacheUsage = url.isFile() || url.isSMB() || cacheStrategy == null;
-            if (containsAllHashes(loc = comp.dc_title(), queryhashes)) {
-                // try to create the snippet from information given in the url itself
-                init(url.hash(), loc, ResultClass.SOURCE_METADATA, null);
-                return;
-            } else if (containsAllHashes(loc = comp.dc_creator(), queryhashes)) {
-                // try to create the snippet from information given in the creator metadata
-                init(url.hash(), loc, ResultClass.SOURCE_METADATA, null);
-                return;
-            } else if (containsAllHashes(loc = comp.dc_subject(), queryhashes)) {
-                // try to create the snippet from information given in the subject metadata
-                init(url.hash(), loc, ResultClass.SOURCE_METADATA, null);
-                return;
-            } else if (containsAllHashes(loc = comp.url().toNormalform(true, true).replace('-', ' '), queryhashes)) {
-                // try to create the snippet from information given in the url
-                init(url.hash(), loc, ResultClass.SOURCE_METADATA, null);
-                return;
-            } else {
-                // try to load the resource from the cache
-                response = loader == null ? null : loader.load(loader.request(url, true, reindexing), noCacheUsage ? CacheStrategy.NOCACHE : cacheStrategy, Long.MAX_VALUE, true);
-                if (response == null) {
-                    // in case that we did not get any result we can still return a success when we are not allowed to go online
-                    if (cacheStrategy == null || cacheStrategy.mustBeOffline()) {
-                        init(url.hash(), null, ResultClass.ERROR_SOURCE_LOADING, "omitted network load (not allowed), no cache entry");
-                        return;
-                    }
-                    // if it is still not available, report an error
-                    init(url.hash(), null, ResultClass.ERROR_RESOURCE_LOADING, "error loading resource from net, no cache entry");
-                    return;
-                } else {
-                    // place entry on indexing queue
-                    Switchboard.getSwitchboard().toIndexer(response);
-                    source = ResultClass.SOURCE_WEB;
-                }
-            }
-        } catch (final Exception e) {
-            //Log.logException(e);
-            init(url.hash(), null, ResultClass.ERROR_SOURCE_LOADING, "error loading resource: " + e.getMessage());
-            return;
-        }
-        /* ===========================================================================
-         * PARSE RESOURCE
-         * =========================================================================== */
-        Document document = null;
-        try {
-            document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse());
-        } catch (final Parser.Failure e) {
-            init(url.hash(), null, ResultClass.ERROR_PARSER_FAILED, e.getMessage()); // cannot be parsed
-            return;
-        }
+        Document document = loadDocument(loader, comp, queryhashes, cacheStrategy, url, reindexing, source);
         if (document == null) {
-            init(url.hash(), null, ResultClass.ERROR_PARSER_FAILED, "parser error/failed"); // cannot be parsed
+            if (this.error == null) {
+                init(url.hash(), null, ResultClass.ERROR_PARSER_FAILED, "parser error/failed"); // cannot be parsed
+            }
             return;
         }
@@ -281,6 +223,77 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnippet> {
         document.close();
         init(url.hash(), snippetLine, source, null);
     }
+
+    private Document loadDocument(
+            final LoaderDispatcher loader,
+            final URIMetadataRow.Components comp,
+            final HandleSet queryhashes,
+            final CacheStrategy cacheStrategy,
+            final DigestURI url,
+            final boolean reindexing,
+            ResultClass source) {
+        /* ===========================================================================
+         * LOAD RESOURCE DATA
+         * =========================================================================== */
+        // if the snippet is not in the cache, we can try to get it from the htcache
+        final Response response;
+        try {
+            // first try to get the snippet from metadata
+            String loc;
+            final boolean noCacheUsage = url.isFile() || url.isSMB() || cacheStrategy == null;
+            if (containsAllHashes(loc = comp.dc_title(), queryhashes)) {
+                // try to create the snippet from information given in the url itself
+                init(url.hash(), loc, ResultClass.SOURCE_METADATA, null);
+                return null;
+            } else if (containsAllHashes(loc = comp.dc_creator(), queryhashes)) {
+                // try to create the snippet from information given in the creator metadata
+                init(url.hash(), loc, ResultClass.SOURCE_METADATA, null);
+                return null;
+            } else if (containsAllHashes(loc = comp.dc_subject(), queryhashes)) {
+                // try to create the snippet from information given in the subject metadata
+                init(url.hash(), loc, ResultClass.SOURCE_METADATA, null);
+                return null;
+            } else if (containsAllHashes(loc = comp.url().toNormalform(true, true).replace('-', ' '), queryhashes)) {
+                // try to create the snippet from information given in the url
+                init(url.hash(), loc, ResultClass.SOURCE_METADATA, null);
+                return null;
+            } else {
+                // try to load the resource from the cache
+                response = loader == null ? null : loader.load(loader.request(url, true, reindexing), noCacheUsage ? CacheStrategy.NOCACHE : cacheStrategy, Integer.MAX_VALUE, true);
+                if (response == null) {
+                    // in case that we did not get any result we can still return a success when we are not allowed to go online
+                    if (cacheStrategy == null || cacheStrategy.mustBeOffline()) {
+                        init(url.hash(), null, ResultClass.ERROR_SOURCE_LOADING, "omitted network load (not allowed), no cache entry");
+                        return null;
+                    }
+                    // if it is still not available, report an error
+                    init(url.hash(), null, ResultClass.ERROR_RESOURCE_LOADING, "error loading resource from net, no cache entry");
+                    return null;
+                } else {
+                    // place entry on indexing queue
+                    Switchboard.getSwitchboard().toIndexer(response);
+                    source = ResultClass.SOURCE_WEB;
+                }
+            }
+        } catch (final Exception e) {
+            //Log.logException(e);
+            init(url.hash(), null, ResultClass.ERROR_SOURCE_LOADING, "error loading resource: " + e.getMessage());
+            return null;
+        }
+        /* ===========================================================================
+         * PARSE RESOURCE
+         * =========================================================================== */
+        Document document = null;
+        try {
+            document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse());
+        } catch (final Parser.Failure e) {
+            init(url.hash(), null, ResultClass.ERROR_PARSER_FAILED, e.getMessage()); // cannot be parsed
+            return null;
+        }
+        return document;
+    }
+
     private void init(final byte[] urlhash, final String line, final ResultClass errorCode, final String errortext) {
         this.urlhash = urlhash;

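The TextSnippet change is an extract-method refactor: the load-and-parse block removed above reappears almost verbatim in the private loadDocument() helper, with each return; becoming return null; (note the load size limit also changes from Long.MAX_VALUE to Integer.MAX_VALUE). The helper returns null both when it could already build the snippet from metadata and when loading or parsing failed; in the failure cases it records the error itself via init(...), which is why the caller only reports a generic parser failure when this.error is still unset. A generic sketch of that null-plus-error-field contract, with hypothetical names:

    // Illustration only, not YaCy code.
    abstract class NullReturnContract {
        protected String error;             // last failure text, null if none

        protected abstract Object load();   // returns null after recording an error,
                                            // or after handling the request itself

        public final void process() {
            final Object document = load();
            if (document == null) {
                if (this.error == null) {   // helper stayed silent: report generically
                    this.error = "parser error/failed";
                }
                return;
            }
            // ... build the snippet from the document ...
        }
    }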
@@ -169,11 +169,23 @@ public final class TextParser {
     public static Document[] parseSource(
             final MultiProtocolURI location,
-            final String mimeType,
+            String mimeType,
             final String charset,
             final byte[] content
         ) throws Parser.Failure {
-        return parseSource(location, mimeType, charset, content.length, new ByteArrayInputStream(content));
+        if (log.isFine()) log.logFine("Parsing '" + location + "' from byte-array");
+        mimeType = normalizeMimeType(mimeType);
+        List<Parser> idioms = null;
+        try {
+            idioms = parsers(location, mimeType);
+        } catch (final Parser.Failure e) {
+            final String errorMsg = "Parser Failure for extension '" + location.getFileExtension() + "' or mimetype '" + mimeType + "': " + e.getMessage();
+            log.logWarning(errorMsg);
+            throw new Parser.Failure(errorMsg, location);
+        }
+        assert !idioms.isEmpty() : "no parsers applied for url " + location.toNormalform(true, false);
+        return parseSource(location, mimeType, idioms, charset, content);
     }
 
     public static Document[] parseSource(
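A hedged usage sketch of the reworked byte[] overload; the package paths follow YaCy's source layout of that period and may differ:

    import net.yacy.cora.document.MultiProtocolURI;
    import net.yacy.document.Document;
    import net.yacy.document.Parser;
    import net.yacy.document.TextParser;

    final class InMemoryParse {
        // body is a response that is already fully in memory; no stream wrapper needed
        static Document[] parse(final MultiProtocolURI location, final String mime,
                final String charset, final byte[] body) throws Parser.Failure {
            return TextParser.parseSource(location, mime, charset, body);
        }
    }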
@@ -199,9 +211,7 @@ public final class TextParser {
         // then we use only one stream-oriented parser.
         if (idioms.size() == 1 || contentLength > Integer.MAX_VALUE) {
             // use a specific stream-oriented parser
-            final Document[] docs = parseSource(location, mimeType, idioms.get(0), charset, contentLength, sourceStream);
-            for (final Document d: docs) { assert d.getText() != null; } // verify docs
-            return docs;
+            return parseSource(location, mimeType, idioms.get(0), charset, contentLength, sourceStream);
         }
 
         // in case that we know more parsers we first transform the content into a byte[] and use that as base
@@ -212,9 +222,7 @@ public final class TextParser {
         } catch (final IOException e) {
             throw new Parser.Failure(e.getMessage(), location);
         }
-        final Document[] docs = parseSource(location, mimeType, idioms, charset, b);
-        for (final Document d: docs) { assert d.getText() != null; } // verify docs
-        return docs;
+        return parseSource(location, mimeType, idioms, charset, b);
     }
 
     private static Document[] parseSource(
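Both hunks above replace a local docs array plus per-document assert loop with a direct delegation. The surrounding dispatch stays the same: a single candidate parser (or content too large for an int-sized array) keeps the stream path, while several candidates require buffering the stream into one byte[] so every parser can re-read the same bytes; each retry wraps the array in a fresh ByteArrayInputStream, which shares the array rather than copying it. A simplified, hypothetical sketch of that dispatch:

    // Illustration only, not the YaCy implementation.
    import java.io.ByteArrayInputStream;
    import java.io.IOException;
    import java.io.InputStream;
    import java.util.List;

    final class ParserDispatch {
        interface Idiom { Object parse(InputStream source) throws IOException; }

        static Object parse(final List<Idiom> idioms, final InputStream source,
                final long contentLength) throws IOException {
            if (idioms.size() == 1 || contentLength > Integer.MAX_VALUE) {
                return idioms.get(0).parse(source);  // stream once, never buffered
            }
            final byte[] b = source.readAllBytes();  // buffer once (Java 9+)
            for (final Idiom idiom : idioms) {
                try {
                    return idiom.parse(new ByteArrayInputStream(b)); // fresh view, no copy
                } catch (final IOException e) {
                    // fall through and try the next candidate parser
                }
            }
            throw new IOException("no parser succeeded");
        }
    }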
@@ -254,7 +262,7 @@ public final class TextParser {
         Document[] docs = null;
         final HashMap<Parser, Parser.Failure> failedParser = new HashMap<Parser, Parser.Failure>();
-        if (MemoryControl.request(sourceArray.length * 2, false)) {
+        if (MemoryControl.request(sourceArray.length * 6, false)) {
             for (final Parser parser: parsers) {
                 try {
                     docs = parser.parse(location, mimeType, documentCharset, new ByteArrayInputStream(sourceArray));
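The final hunk raises the pre-flight memory request from two to six times the source size, presumably to leave headroom for several parsers each building their in-memory document structures over the same byte[]. YaCy's MemoryControl is more elaborate, but the idea reduces to a free-heap check along these lines (rough sketch, hypothetical class):

    final class MemoryGuard {
        // true if roughly the requested number of bytes can still be allocated
        static boolean request(final long bytes) {
            final Runtime rt = Runtime.getRuntime();
            final long used = rt.totalMemory() - rt.freeMemory();
            final long available = rt.maxMemory() - used;
            return available >= bytes;
        }
    }

    // usage mirroring the hunk: MemoryGuard.request(sourceArray.length * 6L)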
