fixed bugs in parser and ftp client

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7360 6c8d7289-2bf4-0310-a012-ef5d649a1542
14 years ago · 4e2c14efbb
parent d78e322e84
commit 4e2c14efbb
11 changed files with 94 additions and 64 deletions
--- a/source/de/anomic/crawler/retrieval/FTPLoader.java
+++ b/source/de/anomic/crawler/retrieval/FTPLoader.java
@@ -140,6 +140,7 @@ public class FTPLoader {
                    response = getFile(ftpClient, request, acceptOnlyParseable);
                } catch (final Exception e) {
                    // add message to errorLog
+                    e.printStackTrace();
                    (new PrintStream(berr)).print(e.getMessage());
                }
            }
@@ -149,9 +150,9 @@ public class FTPLoader {
        // pass the downloaded resource to the cache manager
        if (berr.size() > 0 || response == null) {
            // some error logging
-            final String detail = (berr.size() > 0) ? "\n    Errorlog: " + berr.toString() : "";
-            sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "server download" + detail);
-            throw new IOException("FTPLoader: Unable to download URL " + request.url().toString() + detail);
+            final String detail = (berr.size() > 0) ? "Errorlog: " + berr.toString() : "";
+            sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, " ftp server download, " + detail);
+            throw new IOException("FTPLoader: Unable to download URL '" + request.url().toString() + "': " + detail);
        }
        
        Latency.update(request.url(), System.currentTimeMillis() - start);
@@ -207,7 +208,7 @@ public class FTPLoader {
        return true;
    }

-    private Response getFile(final FTPClient ftpClient, final Request request, boolean acceptOnlyParseable) throws Exception {
+    private Response getFile(final FTPClient ftpClient, final Request request, boolean acceptOnlyParseable) throws IOException {
        // determine the mimetype of the resource
        final DigestURI url = request.url();
        final String mime = TextParser.mimeOf(url);
@@ -218,7 +219,10 @@ public class FTPLoader {
        
        // create response header
        RequestHeader requestHeader = new RequestHeader();
-        if (request.referrerhash() != null) requestHeader.put(RequestHeader.REFERER, sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash()).toNormalform(true, false));
+        if (request.referrerhash() != null) {
+            DigestURI refurl = sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash());
+            if (refurl != null) requestHeader.put(RequestHeader.REFERER, refurl.toNormalform(true, false));
+        }
        ResponseHeader responseHeader = new ResponseHeader();
        responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(fileDate));
        responseHeader.put(HeaderFramework.CONTENT_TYPE, mime);
--- a/source/de/anomic/search/Switchboard.java
+++ b/source/de/anomic/search/Switchboard.java
@@ -1779,7 +1779,7 @@ public final class Switchboard extends serverSwitch {
                return null;
            }
        }
-        
+        assert b != null;
        try {
            // parse the document
            documents = TextParser.parseSource(response.url(), response.getMimeType(), response.getCharacterEncoding(), b);
--- a/source/net/yacy/cora/protocol/ftp/FTPClient.java
+++ b/source/net/yacy/cora/protocol/ftp/FTPClient.java
@@ -2294,7 +2294,7 @@ public class FTPClient {
                // boolean success = !isNotPositiveCompletion(reply);
            } finally {
                // shutdown connection
-                if(ClientStream != null) {
+                if (ClientStream != null) {
                    ClientStream.close();
                }
                closeDataSocket();
--- a/source/net/yacy/document/Condenser.java
+++ b/source/net/yacy/document/Condenser.java
@@ -132,6 +132,7 @@ public final class Condenser {
        
        Map.Entry<MultiProtocolURI, String> entry;
        if (indexText) {
+            assert document.getText() != null : document.dc_identifier();
            createCondensement(document.getText(), meaningLib);
            // the phrase counter:
            // phrase   0 are words taken from the URL
--- a/source/net/yacy/document/Document.java
+++ b/source/net/yacy/document/Document.java
@@ -77,7 +77,6 @@ public class Document {
    private Map<String, String> emaillinks;
    private MultiProtocolURI favicon;
    private boolean resorted;
-    private InputStream textStream;
    private int inboundLinks, outboundLinks; // counters for inbound and outbound links, are counted after calling notifyWebStructure
    private Set<String> languages;
    private boolean indexingDenied;
@@ -228,15 +227,19 @@ dc_rights
    public InputStream getText() {
        try {
            if (this.text == null) return new ByteArrayInputStream("".getBytes());
-
-            if (this.text instanceof File) {
-                this.textStream = new BufferedInputStream(new FileInputStream((File)this.text));
+            if (this.text instanceof String) {
+                return new ByteArrayInputStream(((String) this.text).getBytes("UTF-8"));
+            } else if (this.text instanceof InputStream) {
+                return (InputStream) this.text;
+            } else if (this.text instanceof File) {
+                return new BufferedInputStream(new FileInputStream((File)this.text));
            } else if (this.text instanceof byte[]) {
-                this.textStream =  new ByteArrayInputStream((byte[]) this.text);
+                return new ByteArrayInputStream((byte[]) this.text);
            } else if (this.text instanceof ByteArrayOutputStream) {
-                this.textStream =  new ByteArrayInputStream(((ByteArrayOutputStream) this.text).toByteArray());
+                return new ByteArrayInputStream(((ByteArrayOutputStream) this.text).toByteArray());
            }
-            return this.textStream;
+            assert false : this.text.getClass().toString();
+            return null;
        } catch (final Exception e) {
            Log.logException(e);
        }
@@ -246,28 +249,44 @@ dc_rights
    public byte[] getTextBytes() {
        try {
            if (this.text == null) return new byte[0];
-
-            if (this.text instanceof File) {
-                return FileUtils.read((File)this.text);
+            if (this.text instanceof String) {
+                return ((String) this.text).getBytes("UTF-8");
+            } else if (this.text instanceof InputStream) {
+                return FileUtils.read((InputStream) this.text);
+            } else if (this.text instanceof File) {
+                return FileUtils.read((File) this.text);
            } else if (this.text instanceof byte[]) {
-                return (byte[])this.text;
+                return (byte[]) this.text;
            } else if (this.text instanceof ByteArrayOutputStream) {
                return ((ByteArrayOutputStream) this.text).toByteArray();
            }
+            assert false : this.text.getClass().toString();
+            return null;
        } catch (final Exception e) {
            Log.logException(e);
        }
-        return new byte[0];             
+        return new byte[0];
    }
    
    public long getTextLength() {
-        if (this.text == null) return 0;
-        if (this.text instanceof File) return ((File) this.text).length();
-        else if (this.text instanceof byte[]) return ((byte[]) this.text).length;
-        else if (this.text instanceof ByteArrayOutputStream) {
-            return ((ByteArrayOutputStream)this.text).size();
+        try {
+            if (this.text == null) return -1;
+            if (this.text instanceof String) {
+                return ((String) this.text).length();
+            } else if (this.text instanceof InputStream) {
+                return ((InputStream) this.text).available();
+            } else if (this.text instanceof File) {
+                return ((File) this.text).length();
+            } else if (this.text instanceof byte[]) {
+                return ((byte[]) this.text).length;
+            } else if (this.text instanceof ByteArrayOutputStream) {
+                return ((ByteArrayOutputStream) this.text).size();
+            }
+            assert false : this.text.getClass().toString();
+            return -1;
+        } catch (final Exception e) {
+            Log.logException(e);
        }
-        
        return -1; 
    }
    
@@ -590,27 +609,21 @@ dc_rights
    }
    
    public void close() {
+        if (this.text == null) return;
+        
        // try close the output stream
-        if (this.textStream != null) {
-            try {
-                this.textStream.close();
-            } catch (final Exception e) { 
-                /* ignore this */
-            } finally {
-                this.textStream = null;
-            }
+        if (this.text instanceof InputStream) try {
+            ((InputStream) this.text).close();
+        } catch (final Exception e) {} finally {
+            this.text = null;
        }
        
        // delete the temp file
-        if ((this.text != null) && (this.text instanceof File)) {
-            try { 
-                FileUtils.deletedelete((File) this.text); 
-            } catch (final Exception e) {
-                /* ignore this */
-            } finally {
-                this.text = null;
-            }
-        }        
+        if (this.text instanceof File) try { 
+            FileUtils.deletedelete((File) this.text); 
+        } catch (final Exception e) {} finally {
+            this.text = null;
+        }
    }
    
    /**
--- a/source/net/yacy/document/TextParser.java
+++ b/source/net/yacy/document/TextParser.java
@@ -140,6 +140,7 @@ public final class TextParser {
        ) throws InterruptedException, Parser.Failure {

        BufferedInputStream sourceStream = null;
+        Document[] docs = null;
        try {
            if (log.isFine()) log.logFine("Parsing '" + location + "' from file");
            if (!sourceFile.exists() || !sourceFile.canRead() || sourceFile.length() == 0) {
@@ -148,17 +149,17 @@ public final class TextParser {
                throw new Parser.Failure(errorMsg, location);
            }
            sourceStream = new BufferedInputStream(new FileInputStream(sourceFile));
-            return parseSource(location, mimeType, charset, sourceFile.length(), sourceStream);
+            docs = parseSource(location, mimeType, charset, sourceFile.length(), sourceStream);
        } catch (final Exception e) {
            if (e instanceof InterruptedException) throw (InterruptedException) e;
            if (e instanceof Parser.Failure) throw (Parser.Failure) e;
            log.logSevere("Unexpected exception in parseSource from File: " + e.getMessage(), e);
            throw new Parser.Failure("Unexpected exception: " + e.getMessage(), location);
        } finally {
-            if (sourceStream != null)try {
-                sourceStream.close();
-            } catch (final Exception ex) {}
+            if (sourceStream != null) try { sourceStream.close(); } catch (final Exception ex) {}
        }
+        for (Document d: docs) { assert d.getText() != null; } // verify docs
+        return docs;
    }
    
    public static Document[] parseSource(
@@ -193,7 +194,9 @@ public final class TextParser {
        // then we use only one stream-oriented parser.
        if (idioms.size() == 1 || contentLength > Integer.MAX_VALUE) {
            // use a specific stream-oriented parser
-            return parseSource(location, mimeType, idioms.get(0), charset, contentLength, sourceStream);
+            Document[] docs = parseSource(location, mimeType, idioms.get(0), charset, contentLength, sourceStream);
+            for (Document d: docs) { assert d.getText() != null; } // verify docs
+            return docs;
        }
        
        // in case that we know more parsers we first transform the content into a byte[] and use that as base
@@ -204,7 +207,9 @@ public final class TextParser {
        } catch (IOException e) {
            throw new Parser.Failure(e.getMessage(), location);
        }
-        return parseSource(location, mimeType, idioms, charset, b);
+        Document[] docs = parseSource(location, mimeType, idioms, charset, b);
+        for (Document d: docs) { assert d.getText() != null; } // verify docs
+        return docs;
    }

    private static Document[] parseSource(
@@ -222,7 +227,9 @@ public final class TextParser {

        if (log.isFine()) log.logInfo("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + fileExt + "'.");
        try {
-            return parser.parse(location, mimeType, documentCharset, sourceStream);
+            Document[] docs = parser.parse(location, mimeType, documentCharset, sourceStream);
+            for (Document d: docs) { assert d.getText() != null; } // verify docs
+            return docs;
        } catch (Exception e) {
            throw new Parser.Failure("parser failed: " + parser.getName(), location);
        }
@@ -240,11 +247,11 @@ public final class TextParser {
        final String documentCharset = htmlParser.patchCharsetEncoding(charset);
        assert !parsers.isEmpty();

-        Document[] doc = null;
+        Document[] docs = null;
        HashMap<Parser, Parser.Failure> failedParser = new HashMap<Parser, Parser.Failure>();
        for (Parser parser: parsers) {
            try {
-                doc = parser.parse(location, mimeType, documentCharset, new ByteArrayInputStream(sourceArray));
+                docs = parser.parse(location, mimeType, documentCharset, new ByteArrayInputStream(sourceArray));
            } catch (Parser.Failure e) {
                failedParser.put(parser, e);
                //log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e);
@@ -252,10 +259,10 @@ public final class TextParser {
                failedParser.put(parser, new Parser.Failure(e.getMessage(), location));
                //log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e);
            }
-            if (doc != null) break;
+            if (docs != null) break;
        }
        
-        if (doc == null) {
+        if (docs == null) {
            if (failedParser.size() == 0) {
                final String errorMsg = "Parsing content with file extension '" + location.getFileExtension() + "' and mimetype '" + mimeType + "' failed.";
                //log.logWarning("Unable to parse '" + location + "'. " + errorMsg);
@@ -269,7 +276,8 @@ public final class TextParser {
                throw new Parser.Failure("All parser failed: " + failedParsers, location);
            }
        }
-        return doc;
+        for (Document d: docs) { assert d.getText() != null : "mimeType = " + mimeType; } // verify docs
+        return docs;
    }
    
    /**
@@ -335,7 +343,6 @@ public final class TextParser {
        
        return idioms;
    }
-    
    public static String supportsMime(String mimeType) {
        if (mimeType == null) return null;
        mimeType = normalizeMimeType(mimeType);
@@ -343,7 +350,7 @@ public final class TextParser {
        if (mime2parser.get(mimeType) == null) return "no parser for mime '" + mimeType + "' available";
        return null;
    }
-    
+
    public static String supportsExtension(final MultiProtocolURI url) {
        String ext = url.getFileExtension().toLowerCase();
        if (ext == null || ext.length() == 0) return null;
--- a/source/net/yacy/document/parser/bzipParser.java
+++ b/source/net/yacy/document/parser/bzipParser.java
@@ -58,6 +58,7 @@ public class bzipParser extends AbstractParser implements Parser {
    public Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException {
        
        File tempFile = null;
+        Document[] docs;
        try {           
            /*
             * First we have to consume the first two char from the stream. Otherwise
@@ -90,7 +91,7 @@ public class bzipParser extends AbstractParser implements Parser {
            out.close();
            
            // creating a new parser class to parse the unzipped content
-            return TextParser.parseSource(location, null, null, tempFile);
+            docs = TextParser.parseSource(location, null, null, tempFile);
        } catch (final Exception e) {  
            if (e instanceof InterruptedException) throw (InterruptedException) e;
            if (e instanceof Parser.Failure) throw (Parser.Failure) e;
@@ -99,5 +100,6 @@ public class bzipParser extends AbstractParser implements Parser {
        } finally {
            if (tempFile != null) FileUtils.deletedelete(tempFile);
        }
+        return docs;
    }
 }
--- a/source/net/yacy/document/parser/genericParser.java
+++ b/source/net/yacy/document/parser/genericParser.java
@@ -40,7 +40,7 @@ public class genericParser extends AbstractParser implements Parser {
    
    public Document[] parse(MultiProtocolURI location, String mimeType, String charset, InputStream source1) throws Parser.Failure, InterruptedException {

-        return new Document[]{new Document(
+        Document[] docs = new Document[]{new Document(
                location,
                mimeType,
                charset,
@@ -56,5 +56,7 @@ public class genericParser extends AbstractParser implements Parser {
                null,
                null,
                false)};
+        for (Document d: docs) { assert d.getText() != null : "mimeType = " + mimeType; } // verify docs
+        return docs;
    }
 }
--- a/source/net/yacy/document/parser/gzipParser.java
+++ b/source/net/yacy/document/parser/gzipParser.java
@@ -57,6 +57,7 @@ public class gzipParser extends AbstractParser implements Parser {
    public Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException {
        
        File tempFile = null;
+        Document[] docs = null;
        try {           
            int read = 0;
            final byte[] data = new byte[1024];
@@ -77,7 +78,7 @@ public class gzipParser extends AbstractParser implements Parser {
            out.close();
            
            // creating a new parser class to parse the unzipped content
-            return TextParser.parseSource(location,null,null,tempFile);
+            docs = TextParser.parseSource(location,null,null,tempFile);
        } catch (final Exception e) {    
            if (e instanceof InterruptedException) throw (InterruptedException) e;
            if (e instanceof Parser.Failure) throw (Parser.Failure) e;
@@ -86,6 +87,7 @@ public class gzipParser extends AbstractParser implements Parser {
        } finally {
            if (tempFile != null) FileUtils.deletedelete(tempFile);
        }
+        return docs;
    }
 
 }
--- a/source/net/yacy/document/parser/tarParser.java
+++ b/source/net/yacy/document/parser/tarParser.java
@@ -80,14 +80,13 @@ public class tarParser extends AbstractParser implements Parser {
                    tmp = FileUtils.createTempFile(this.getClass(), name);
                    FileUtils.copy(tis, tmp, entry.getSize());
                    subDocs = TextParser.parseSource(MultiProtocolURI.newURL(url,"#" + name), mime, null, tmp);
+                    if (subDocs == null) continue;
+                    for (Document d: subDocs) docacc.add(d);
                } catch (final Parser.Failure e) {
                    log.logWarning("tar parser entry " + name + ": " + e.getMessage());
                } finally {
                    if (tmp != null) FileUtils.deletedelete(tmp);
                }
-                if (subDocs == null) continue;
-                
-                for (Document d: subDocs) docacc.add(d);
            } catch (IOException e) {
                log.logWarning("tar parser:" + e.getMessage());
                break;
--- a/source/net/yacy/document/parser/zipParser.java
+++ b/source/net/yacy/document/parser/zipParser.java
@@ -75,13 +75,13 @@ public class zipParser extends AbstractParser implements Parser {
                    tmp = FileUtils.createTempFile(this.getClass(), name);
                    FileUtils.copy(zis, tmp, entry.getSize());  
                    docs = TextParser.parseSource(MultiProtocolURI.newURL(url, "#" + name), mime, null, tmp);
+                    if (docs == null) continue;
+                    for (Document d: docs) docacc.add(d);
                } catch (final Parser.Failure e) {
                    log.logWarning("ZIP parser entry " + name + ": " + e.getMessage());
                } finally {
                    if (tmp != null) FileUtils.deletedelete(tmp);
                }
-                if (docs == null) continue;
-                for (Document d: docs) docacc.add(d);
            } catch (IOException e) {
                log.logWarning("ZIP parser:" + e.getMessage());
                break;