From 4e2c14efbbea0d188e2b105a5afd706737312377 Mon Sep 17 00:00:00 2001
From: orbiter <orbiter@6c8d7289-2bf4-0310-a012-ef5d649a1542>
Date: Thu, 2 Dec 2010 11:05:04 +0000
Subject: [PATCH] fixed bugs in parser and ftp client

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7360 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 .../anomic/crawler/retrieval/FTPLoader.java   | 14 ++--
 source/de/anomic/search/Switchboard.java      |  2 +-
 .../net/yacy/cora/protocol/ftp/FTPClient.java |  2 +-
 source/net/yacy/document/Condenser.java       |  1 +
 source/net/yacy/document/Document.java        | 83 +++++++++++--------
 source/net/yacy/document/TextParser.java      | 35 ++++----
 .../net/yacy/document/parser/bzipParser.java  |  4 +-
 .../yacy/document/parser/genericParser.java   |  4 +-
 .../net/yacy/document/parser/gzipParser.java  |  4 +-
 .../net/yacy/document/parser/tarParser.java   |  5 +-
 .../net/yacy/document/parser/zipParser.java   |  4 +-
 11 files changed, 94 insertions(+), 64 deletions(-)

diff --git a/source/de/anomic/crawler/retrieval/FTPLoader.java b/source/de/anomic/crawler/retrieval/FTPLoader.java
index 1eda7c0a3..d19373b8a 100644
--- a/source/de/anomic/crawler/retrieval/FTPLoader.java
+++ b/source/de/anomic/crawler/retrieval/FTPLoader.java
@@ -140,6 +140,7 @@ public class FTPLoader {
                     response = getFile(ftpClient, request, acceptOnlyParseable);
                 } catch (final Exception e) {
                     // add message to errorLog
+                    e.printStackTrace();
                     (new PrintStream(berr)).print(e.getMessage());
                 }
             }
@@ -149,9 +150,9 @@ public class FTPLoader {
         // pass the downloaded resource to the cache manager
         if (berr.size() > 0 || response == null) {
             // some error logging
-            final String detail = (berr.size() > 0) ? "\n    Errorlog: " + berr.toString() : "";
-            sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "server download" + detail);
-            throw new IOException("FTPLoader: Unable to download URL " + request.url().toString() + detail);
+            final String detail = (berr.size() > 0) ? "Errorlog: " + berr.toString() : "";
+            sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, " ftp server download, " + detail);
+            throw new IOException("FTPLoader: Unable to download URL '" + request.url().toString() + "': " + detail);
         }
         
         Latency.update(request.url(), System.currentTimeMillis() - start);
@@ -207,7 +208,7 @@ public class FTPLoader {
         return true;
     }
 
-    private Response getFile(final FTPClient ftpClient, final Request request, boolean acceptOnlyParseable) throws Exception {
+    private Response getFile(final FTPClient ftpClient, final Request request, boolean acceptOnlyParseable) throws IOException {
         // determine the mimetype of the resource
         final DigestURI url = request.url();
         final String mime = TextParser.mimeOf(url);
@@ -218,7 +219,10 @@ public class FTPLoader {
         
         // create response header
         RequestHeader requestHeader = new RequestHeader();
-        if (request.referrerhash() != null) requestHeader.put(RequestHeader.REFERER, sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash()).toNormalform(true, false));
+        if (request.referrerhash() != null) {
+            DigestURI refurl = sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash());
+            if (refurl != null) requestHeader.put(RequestHeader.REFERER, refurl.toNormalform(true, false));
+        }
         ResponseHeader responseHeader = new ResponseHeader();
         responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(fileDate));
         responseHeader.put(HeaderFramework.CONTENT_TYPE, mime);
diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java
index 253f5e577..a2e3c578d 100644
--- a/source/de/anomic/search/Switchboard.java
+++ b/source/de/anomic/search/Switchboard.java
@@ -1779,7 +1779,7 @@ public final class Switchboard extends serverSwitch {
                 return null;
             }
         }
-        
+        assert b != null;
         try {
             // parse the document
             documents = TextParser.parseSource(response.url(), response.getMimeType(), response.getCharacterEncoding(), b);
diff --git a/source/net/yacy/cora/protocol/ftp/FTPClient.java b/source/net/yacy/cora/protocol/ftp/FTPClient.java
index a3fe423b1..8fdf4478d 100644
--- a/source/net/yacy/cora/protocol/ftp/FTPClient.java
+++ b/source/net/yacy/cora/protocol/ftp/FTPClient.java
@@ -2294,7 +2294,7 @@ public class FTPClient {
                 // boolean success = !isNotPositiveCompletion(reply);
             } finally {
                 // shutdown connection
-                if(ClientStream != null) {
+                if (ClientStream != null) {
                     ClientStream.close();
                 }
                 closeDataSocket();
diff --git a/source/net/yacy/document/Condenser.java b/source/net/yacy/document/Condenser.java
index 446907af0..54ecb0237 100644
--- a/source/net/yacy/document/Condenser.java
+++ b/source/net/yacy/document/Condenser.java
@@ -132,6 +132,7 @@ public final class Condenser {
         
         Map.Entry<MultiProtocolURI, String> entry;
         if (indexText) {
+            assert document.getText() != null : document.dc_identifier();
             createCondensement(document.getText(), meaningLib);
             // the phrase counter:
             // phrase   0 are words taken from the URL
diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java
index 572cf0ecb..d6efdd64f 100644
--- a/source/net/yacy/document/Document.java
+++ b/source/net/yacy/document/Document.java
@@ -77,7 +77,6 @@ public class Document {
     private Map<String, String> emaillinks;
     private MultiProtocolURI favicon;
     private boolean resorted;
-    private InputStream textStream;
     private int inboundLinks, outboundLinks; // counters for inbound and outbound links, are counted after calling notifyWebStructure
     private Set<String> languages;
     private boolean indexingDenied;
@@ -228,15 +227,19 @@ dc_rights
     public InputStream getText() {
         try {
             if (this.text == null) return new ByteArrayInputStream("".getBytes());
-
-            if (this.text instanceof File) {
-                this.textStream = new BufferedInputStream(new FileInputStream((File)this.text));
+            if (this.text instanceof String) {
+                return new ByteArrayInputStream(((String) this.text).getBytes("UTF-8"));
+            } else if (this.text instanceof InputStream) {
+                return (InputStream) this.text;
+            } else if (this.text instanceof File) {
+                return new BufferedInputStream(new FileInputStream((File)this.text));
             } else if (this.text instanceof byte[]) {
-                this.textStream =  new ByteArrayInputStream((byte[]) this.text);
+                return new ByteArrayInputStream((byte[]) this.text);
             } else if (this.text instanceof ByteArrayOutputStream) {
-                this.textStream =  new ByteArrayInputStream(((ByteArrayOutputStream) this.text).toByteArray());
+                return new ByteArrayInputStream(((ByteArrayOutputStream) this.text).toByteArray());
             }
-            return this.textStream;
+            assert false : this.text.getClass().toString();
+            return null;
         } catch (final Exception e) {
             Log.logException(e);
         }
@@ -246,28 +249,44 @@ dc_rights
     public byte[] getTextBytes() {
         try {
             if (this.text == null) return new byte[0];
-
-            if (this.text instanceof File) {
-                return FileUtils.read((File)this.text);
+            if (this.text instanceof String) { // dispatch on the runtime type of the stored text object
+                return ((String) this.text).getBytes("UTF-8");
+            } else if (this.text instanceof InputStream) {
+                return FileUtils.read((InputStream) this.text); // NOTE(review): presumably drains the stream, so a repeat call would see no data - confirm
+            } else if (this.text instanceof File) {
+                return FileUtils.read((File) this.text);
             } else if (this.text instanceof byte[]) {
-                return (byte[])this.text;
+                return (byte[]) this.text;
             } else if (this.text instanceof ByteArrayOutputStream) {
                 return ((ByteArrayOutputStream) this.text).toByteArray();
             }
+            assert false : this.text.getClass().toString();
+            return new byte[0]; // unsupported text type: keep the empty-array result this case had before, never null
         } catch (final Exception e) {
             Log.logException(e);
         }
-        return new byte[0];             
+        return new byte[0];
     }
     
     public long getTextLength() {
-        if (this.text == null) return 0;
-        if (this.text instanceof File) return ((File) this.text).length();
-        else if (this.text instanceof byte[]) return ((byte[]) this.text).length;
-        else if (this.text instanceof ByteArrayOutputStream) {
-            return ((ByteArrayOutputStream)this.text).size();
+        try {
+            if (this.text == null) return -1; // NOTE(review): was 0 before this change, and getTextBytes() still yields an empty array for null text - confirm callers accept -1
+            if (this.text instanceof String) {
+                return ((String) this.text).length(); // counts UTF-16 chars, not the UTF-8 bytes that getTextBytes() returns
+            } else if (this.text instanceof InputStream) {
+                return ((InputStream) this.text).available(); // only an estimate of bytes readable without blocking, not the full stream length
+            } else if (this.text instanceof File) {
+                return ((File) this.text).length();
+            } else if (this.text instanceof byte[]) {
+                return ((byte[]) this.text).length;
+            } else if (this.text instanceof ByteArrayOutputStream) {
+                return ((ByteArrayOutputStream) this.text).size();
+            }
+            assert false : this.text.getClass().toString();
+            return -1; // unsupported text type: length unknown
+        } catch (final Exception e) {
+            Log.logException(e);
         }
-        
         return -1; 
     }
     
@@ -590,27 +609,21 @@ dc_rights
     }
     
     public void close() {
+        if (this.text == null) return;
+        
         // try close the output stream
-        if (this.textStream != null) {
-            try {
-                this.textStream.close();
-            } catch (final Exception e) { 
-                /* ignore this */
-            } finally {
-                this.textStream = null;
-            }
+        if (this.text instanceof InputStream) try {
+            ((InputStream) this.text).close();
+        } catch (final Exception e) { /* ignore this: closing is best effort */ } finally {
+            this.text = null;
         }
         
         // delete the temp file
-        if ((this.text != null) && (this.text instanceof File)) {
-            try { 
-                FileUtils.deletedelete((File) this.text); 
-            } catch (final Exception e) {
-                /* ignore this */
-            } finally {
-                this.text = null;
-            }
-        }        
+        if (this.text instanceof File) try { 
+            FileUtils.deletedelete((File) this.text); 
+        } catch (final Exception e) { /* ignore this: deletion is best effort */ } finally {
+            this.text = null;
+        }
     }
     
     /**
diff --git a/source/net/yacy/document/TextParser.java b/source/net/yacy/document/TextParser.java
index f63c72681..0a988cb0b 100644
--- a/source/net/yacy/document/TextParser.java
+++ b/source/net/yacy/document/TextParser.java
@@ -140,6 +140,7 @@ public final class TextParser {
         ) throws InterruptedException, Parser.Failure {
 
         BufferedInputStream sourceStream = null;
+        Document[] docs = null;
         try {
             if (log.isFine()) log.logFine("Parsing '" + location + "' from file");
             if (!sourceFile.exists() || !sourceFile.canRead() || sourceFile.length() == 0) {
@@ -148,17 +149,17 @@ public final class TextParser {
                 throw new Parser.Failure(errorMsg, location);
             }
             sourceStream = new BufferedInputStream(new FileInputStream(sourceFile));
-            return parseSource(location, mimeType, charset, sourceFile.length(), sourceStream);
+            docs = parseSource(location, mimeType, charset, sourceFile.length(), sourceStream);
         } catch (final Exception e) {
             if (e instanceof InterruptedException) throw (InterruptedException) e;
             if (e instanceof Parser.Failure) throw (Parser.Failure) e;
             log.logSevere("Unexpected exception in parseSource from File: " + e.getMessage(), e);
             throw new Parser.Failure("Unexpected exception: " + e.getMessage(), location);
         } finally {
-            if (sourceStream != null)try {
-                sourceStream.close();
-            } catch (final Exception ex) {}
+            if (sourceStream != null) try { sourceStream.close(); } catch (final Exception ex) {}
         }
+        for (Document d: docs) { assert d.getText() != null; } // verify docs
+        return docs;
     }
     
     public static Document[] parseSource(
@@ -193,7 +194,9 @@ public final class TextParser {
         // then we use only one stream-oriented parser.
         if (idioms.size() == 1 || contentLength > Integer.MAX_VALUE) {
             // use a specific stream-oriented parser
-            return parseSource(location, mimeType, idioms.get(0), charset, contentLength, sourceStream);
+            Document[] docs = parseSource(location, mimeType, idioms.get(0), charset, contentLength, sourceStream);
+            for (Document d: docs) { assert d.getText() != null; } // verify docs
+            return docs;
         }
         
         // in case that we know more parsers we first transform the content into a byte[] and use that as base
@@ -204,7 +207,9 @@ public final class TextParser {
         } catch (IOException e) {
             throw new Parser.Failure(e.getMessage(), location);
         }
-        return parseSource(location, mimeType, idioms, charset, b);
+        Document[] docs = parseSource(location, mimeType, idioms, charset, b);
+        for (Document d: docs) { assert d.getText() != null; } // verify docs
+        return docs;
     }
 
     private static Document[] parseSource(
@@ -222,7 +227,9 @@ public final class TextParser {
 
         if (log.isFine()) log.logInfo("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + fileExt + "'.");
         try {
-            return parser.parse(location, mimeType, documentCharset, sourceStream);
+            Document[] docs = parser.parse(location, mimeType, documentCharset, sourceStream);
+            for (Document d: docs) { assert d.getText() != null; } // verify docs
+            return docs;
         } catch (Exception e) {
             throw new Parser.Failure("parser failed: " + parser.getName(), location);
         }
@@ -240,11 +247,11 @@ public final class TextParser {
         final String documentCharset = htmlParser.patchCharsetEncoding(charset);
         assert !parsers.isEmpty();
 
-        Document[] doc = null;
+        Document[] docs = null;
         HashMap<Parser, Parser.Failure> failedParser = new HashMap<Parser, Parser.Failure>();
         for (Parser parser: parsers) {
             try {
-                doc = parser.parse(location, mimeType, documentCharset, new ByteArrayInputStream(sourceArray));
+                docs = parser.parse(location, mimeType, documentCharset, new ByteArrayInputStream(sourceArray));
             } catch (Parser.Failure e) {
                 failedParser.put(parser, e);
                 //log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e);
@@ -252,10 +259,10 @@ public final class TextParser {
                 failedParser.put(parser, new Parser.Failure(e.getMessage(), location));
                 //log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e);
             }
-            if (doc != null) break;
+            if (docs != null) break;
         }
         
-        if (doc == null) {
+        if (docs == null) {
             if (failedParser.size() == 0) {
                 final String errorMsg = "Parsing content with file extension '" + location.getFileExtension() + "' and mimetype '" + mimeType + "' failed.";
                 //log.logWarning("Unable to parse '" + location + "'. " + errorMsg);
@@ -269,7 +276,8 @@ public final class TextParser {
                 throw new Parser.Failure("All parser failed: " + failedParsers, location);
             }
         }
-        return doc;
+        for (Document d: docs) { assert d.getText() != null : "mimeType = " + mimeType; } // verify docs
+        return docs;
     }
     
     /**
@@ -335,7 +343,6 @@ public final class TextParser {
         
         return idioms;
     }
-    
     public static String supportsMime(String mimeType) {
         if (mimeType == null) return null;
         mimeType = normalizeMimeType(mimeType);
@@ -343,7 +350,7 @@ public final class TextParser {
         if (mime2parser.get(mimeType) == null) return "no parser for mime '" + mimeType + "' available";
         return null;
     }
-    
+
     public static String supportsExtension(final MultiProtocolURI url) {
         String ext = url.getFileExtension().toLowerCase();
         if (ext == null || ext.length() == 0) return null;
diff --git a/source/net/yacy/document/parser/bzipParser.java b/source/net/yacy/document/parser/bzipParser.java
index 2ef4454e0..271f80691 100644
--- a/source/net/yacy/document/parser/bzipParser.java
+++ b/source/net/yacy/document/parser/bzipParser.java
@@ -58,6 +58,7 @@ public class bzipParser extends AbstractParser implements Parser {
     public Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException {
         
         File tempFile = null;
+        Document[] docs;
         try {           
             /*
              * First we have to consume the first two char from the stream. Otherwise
@@ -90,7 +91,7 @@ public class bzipParser extends AbstractParser implements Parser {
             out.close();
             
             // creating a new parser class to parse the unzipped content
-            return TextParser.parseSource(location, null, null, tempFile);
+            docs = TextParser.parseSource(location, null, null, tempFile);
         } catch (final Exception e) {  
             if (e instanceof InterruptedException) throw (InterruptedException) e;
             if (e instanceof Parser.Failure) throw (Parser.Failure) e;
@@ -99,5 +100,6 @@ public class bzipParser extends AbstractParser implements Parser {
         } finally {
             if (tempFile != null) FileUtils.deletedelete(tempFile);
         }
+        return docs;
     }
 }
diff --git a/source/net/yacy/document/parser/genericParser.java b/source/net/yacy/document/parser/genericParser.java
index eb0259603..8082d5778 100644
--- a/source/net/yacy/document/parser/genericParser.java
+++ b/source/net/yacy/document/parser/genericParser.java
@@ -40,7 +40,7 @@ public class genericParser extends AbstractParser implements Parser {
     
     public Document[] parse(MultiProtocolURI location, String mimeType, String charset, InputStream source1) throws Parser.Failure, InterruptedException {
 
-        return new Document[]{new Document(
+        Document[] docs = new Document[]{new Document(
                 location,
                 mimeType,
                 charset,
@@ -56,5 +56,7 @@ public class genericParser extends AbstractParser implements Parser {
                 null,
                 null,
                 false)};
+        for (Document d: docs) { assert d.getText() != null : "mimeType = " + mimeType; } // verify docs
+        return docs;
     }
 }
diff --git a/source/net/yacy/document/parser/gzipParser.java b/source/net/yacy/document/parser/gzipParser.java
index 4367437c9..f3452b6c3 100644
--- a/source/net/yacy/document/parser/gzipParser.java
+++ b/source/net/yacy/document/parser/gzipParser.java
@@ -57,6 +57,7 @@ public class gzipParser extends AbstractParser implements Parser {
     public Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException {
         
         File tempFile = null;
+        Document[] docs = null;
         try {           
             int read = 0;
             final byte[] data = new byte[1024];
@@ -77,7 +78,7 @@ public class gzipParser extends AbstractParser implements Parser {
             out.close();
             
             // creating a new parser class to parse the unzipped content
-            return TextParser.parseSource(location,null,null,tempFile);
+            docs = TextParser.parseSource(location,null,null,tempFile);
         } catch (final Exception e) {    
             if (e instanceof InterruptedException) throw (InterruptedException) e;
             if (e instanceof Parser.Failure) throw (Parser.Failure) e;
@@ -86,6 +87,7 @@ public class gzipParser extends AbstractParser implements Parser {
         } finally {
             if (tempFile != null) FileUtils.deletedelete(tempFile);
         }
+        return docs;
     }
  
 }
diff --git a/source/net/yacy/document/parser/tarParser.java b/source/net/yacy/document/parser/tarParser.java
index 3bf0081da..ee20156b6 100644
--- a/source/net/yacy/document/parser/tarParser.java
+++ b/source/net/yacy/document/parser/tarParser.java
@@ -80,14 +80,13 @@ public class tarParser extends AbstractParser implements Parser {
                     tmp = FileUtils.createTempFile(this.getClass(), name);
                     FileUtils.copy(tis, tmp, entry.getSize());
                     subDocs = TextParser.parseSource(MultiProtocolURI.newURL(url,"#" + name), mime, null, tmp);
+                    if (subDocs == null) continue;
+                    for (Document d: subDocs) docacc.add(d);
                 } catch (final Parser.Failure e) {
                     log.logWarning("tar parser entry " + name + ": " + e.getMessage());
                 } finally {
                     if (tmp != null) FileUtils.deletedelete(tmp);
                 }
-                if (subDocs == null) continue;
-                
-                for (Document d: subDocs) docacc.add(d);
             } catch (IOException e) {
                 log.logWarning("tar parser:" + e.getMessage());
                 break;
diff --git a/source/net/yacy/document/parser/zipParser.java b/source/net/yacy/document/parser/zipParser.java
index 548d52d77..e1fecc8c2 100644
--- a/source/net/yacy/document/parser/zipParser.java
+++ b/source/net/yacy/document/parser/zipParser.java
@@ -75,13 +75,13 @@ public class zipParser extends AbstractParser implements Parser {
                     tmp = FileUtils.createTempFile(this.getClass(), name);
                     FileUtils.copy(zis, tmp, entry.getSize());  
                     docs = TextParser.parseSource(MultiProtocolURI.newURL(url, "#" + name), mime, null, tmp);
+                    if (docs == null) continue;
+                    for (Document d: docs) docacc.add(d);
                 } catch (final Parser.Failure e) {
                     log.logWarning("ZIP parser entry " + name + ": " + e.getMessage());
                 } finally {
                     if (tmp != null) FileUtils.deletedelete(tmp);
                 }
-                if (docs == null) continue;
-                for (Document d: docs) docacc.add(d);
             } catch (IOException e) {
                 log.logWarning("ZIP parser:" + e.getMessage());
                 break;