From 4e2c14efbbea0d188e2b105a5afd706737312377 Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 2 Dec 2010 11:05:04 +0000 Subject: [PATCH] fixed bugs in parser and ftp client git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7360 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- .../anomic/crawler/retrieval/FTPLoader.java | 14 ++-- source/de/anomic/search/Switchboard.java | 2 +- .../net/yacy/cora/protocol/ftp/FTPClient.java | 2 +- source/net/yacy/document/Condenser.java | 1 + source/net/yacy/document/Document.java | 83 +++++++++++-------- source/net/yacy/document/TextParser.java | 35 ++++---- .../net/yacy/document/parser/bzipParser.java | 4 +- .../yacy/document/parser/genericParser.java | 4 +- .../net/yacy/document/parser/gzipParser.java | 4 +- .../net/yacy/document/parser/tarParser.java | 5 +- .../net/yacy/document/parser/zipParser.java | 4 +- 11 files changed, 94 insertions(+), 64 deletions(-) diff --git a/source/de/anomic/crawler/retrieval/FTPLoader.java b/source/de/anomic/crawler/retrieval/FTPLoader.java index 1eda7c0a3..d19373b8a 100644 --- a/source/de/anomic/crawler/retrieval/FTPLoader.java +++ b/source/de/anomic/crawler/retrieval/FTPLoader.java @@ -140,6 +140,7 @@ public class FTPLoader { response = getFile(ftpClient, request, acceptOnlyParseable); } catch (final Exception e) { // add message to errorLog + e.printStackTrace(); (new PrintStream(berr)).print(e.getMessage()); } } @@ -149,9 +150,9 @@ public class FTPLoader { // pass the downloaded resource to the cache manager if (berr.size() > 0 || response == null) { // some error logging - final String detail = (berr.size() > 0) ? "\n Errorlog: " + berr.toString() : ""; - sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "server download" + detail); - throw new IOException("FTPLoader: Unable to download URL " + request.url().toString() + detail); + final String detail = (berr.size() > 0) ? "Errorlog: " + berr.toString() : ""; + sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, " ftp server download, " + detail); + throw new IOException("FTPLoader: Unable to download URL '" + request.url().toString() + "': " + detail); } Latency.update(request.url(), System.currentTimeMillis() - start); @@ -207,7 +208,7 @@ public class FTPLoader { return true; } - private Response getFile(final FTPClient ftpClient, final Request request, boolean acceptOnlyParseable) throws Exception { + private Response getFile(final FTPClient ftpClient, final Request request, boolean acceptOnlyParseable) throws IOException { // determine the mimetype of the resource final DigestURI url = request.url(); final String mime = TextParser.mimeOf(url); @@ -218,7 +219,10 @@ public class FTPLoader { // create response header RequestHeader requestHeader = new RequestHeader(); - if (request.referrerhash() != null) requestHeader.put(RequestHeader.REFERER, sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash()).toNormalform(true, false)); + if (request.referrerhash() != null) { + DigestURI refurl = sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash()); + if (refurl != null) requestHeader.put(RequestHeader.REFERER, refurl.toNormalform(true, false)); + } ResponseHeader responseHeader = new ResponseHeader(); responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(fileDate)); responseHeader.put(HeaderFramework.CONTENT_TYPE, mime); diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java index 253f5e577..a2e3c578d 100644 --- a/source/de/anomic/search/Switchboard.java +++ b/source/de/anomic/search/Switchboard.java @@ -1779,7 +1779,7 @@ public final class Switchboard extends serverSwitch { return null; } } - + assert b != null; try { // parse the document documents = TextParser.parseSource(response.url(), response.getMimeType(), response.getCharacterEncoding(), b); diff --git a/source/net/yacy/cora/protocol/ftp/FTPClient.java b/source/net/yacy/cora/protocol/ftp/FTPClient.java index a3fe423b1..8fdf4478d 100644 --- a/source/net/yacy/cora/protocol/ftp/FTPClient.java +++ b/source/net/yacy/cora/protocol/ftp/FTPClient.java @@ -2294,7 +2294,7 @@ public class FTPClient { // boolean success = !isNotPositiveCompletion(reply); } finally { // shutdown connection - if(ClientStream != null) { + if (ClientStream != null) { ClientStream.close(); } closeDataSocket(); diff --git a/source/net/yacy/document/Condenser.java b/source/net/yacy/document/Condenser.java index 446907af0..54ecb0237 100644 --- a/source/net/yacy/document/Condenser.java +++ b/source/net/yacy/document/Condenser.java @@ -132,6 +132,7 @@ public final class Condenser { Map.Entry entry; if (indexText) { + assert document.getText() != null : document.dc_identifier(); createCondensement(document.getText(), meaningLib); // the phrase counter: // phrase 0 are words taken from the URL diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java index 572cf0ecb..d6efdd64f 100644 --- a/source/net/yacy/document/Document.java +++ b/source/net/yacy/document/Document.java @@ -77,7 +77,6 @@ public class Document { private Map emaillinks; private MultiProtocolURI favicon; private boolean resorted; - private InputStream textStream; private int inboundLinks, outboundLinks; // counters for inbound and outbound links, are counted after calling notifyWebStructure private Set languages; private boolean indexingDenied; @@ -228,15 +227,19 @@ dc_rights public InputStream getText() { try { if (this.text == null) return new ByteArrayInputStream("".getBytes()); - - if (this.text instanceof File) { - this.textStream = new BufferedInputStream(new FileInputStream((File)this.text)); + if (this.text instanceof String) { + return new ByteArrayInputStream(((String) this.text).getBytes("UTF-8")); + } else if (this.text instanceof InputStream) { + return (InputStream) this.text; + } else if (this.text instanceof File) { + return new BufferedInputStream(new FileInputStream((File)this.text)); } else if (this.text instanceof byte[]) { - this.textStream = new ByteArrayInputStream((byte[]) this.text); + return new ByteArrayInputStream((byte[]) this.text); } else if (this.text instanceof ByteArrayOutputStream) { - this.textStream = new ByteArrayInputStream(((ByteArrayOutputStream) this.text).toByteArray()); + return new ByteArrayInputStream(((ByteArrayOutputStream) this.text).toByteArray()); } - return this.textStream; + assert false : this.text.getClass().toString(); + return null; } catch (final Exception e) { Log.logException(e); } @@ -246,28 +249,44 @@ dc_rights public byte[] getTextBytes() { try { if (this.text == null) return new byte[0]; - - if (this.text instanceof File) { - return FileUtils.read((File)this.text); + if (this.text instanceof String) { + return ((String) this.text).getBytes("UTF-8"); + } else if (this.text instanceof InputStream) { + return FileUtils.read((InputStream) this.text); + } else if (this.text instanceof File) { + return FileUtils.read((File) this.text); } else if (this.text instanceof byte[]) { - return (byte[])this.text; + return (byte[]) this.text; } else if (this.text instanceof ByteArrayOutputStream) { return ((ByteArrayOutputStream) this.text).toByteArray(); } + assert false : this.text.getClass().toString(); + return null; } catch (final Exception e) { Log.logException(e); } - return new byte[0]; + return new byte[0]; } public long getTextLength() { - if (this.text == null) return 0; - if (this.text instanceof File) return ((File) this.text).length(); - else if (this.text instanceof byte[]) return ((byte[]) this.text).length; - else if (this.text instanceof ByteArrayOutputStream) { - return ((ByteArrayOutputStream)this.text).size(); + try { + if (this.text == null) return -1; + if (this.text instanceof String) { + return ((String) this.text).length(); + } else if (this.text instanceof InputStream) { + return ((InputStream) this.text).available(); + } else if (this.text instanceof File) { + return ((File) this.text).length(); + } else if (this.text instanceof byte[]) { + return ((byte[]) this.text).length; + } else if (this.text instanceof ByteArrayOutputStream) { + return ((ByteArrayOutputStream) this.text).size(); + } + assert false : this.text.getClass().toString(); + return -1; + } catch (final Exception e) { + Log.logException(e); } - return -1; } @@ -590,27 +609,21 @@ dc_rights } public void close() { + if (this.text == null) return; + // try close the output stream - if (this.textStream != null) { - try { - this.textStream.close(); - } catch (final Exception e) { - /* ignore this */ - } finally { - this.textStream = null; - } + if (this.text instanceof InputStream) try { + ((InputStream) this.text).close(); + } catch (final Exception e) {} finally { + this.text = null; } // delete the temp file - if ((this.text != null) && (this.text instanceof File)) { - try { - FileUtils.deletedelete((File) this.text); - } catch (final Exception e) { - /* ignore this */ - } finally { - this.text = null; - } - } + if (this.text instanceof File) try { + FileUtils.deletedelete((File) this.text); + } catch (final Exception e) {} finally { + this.text = null; + } } /** diff --git a/source/net/yacy/document/TextParser.java b/source/net/yacy/document/TextParser.java index f63c72681..0a988cb0b 100644 --- a/source/net/yacy/document/TextParser.java +++ b/source/net/yacy/document/TextParser.java @@ -140,6 +140,7 @@ public final class TextParser { ) throws InterruptedException, Parser.Failure { BufferedInputStream sourceStream = null; + Document[] docs = null; try { if (log.isFine()) log.logFine("Parsing '" + location + "' from file"); if (!sourceFile.exists() || !sourceFile.canRead() || sourceFile.length() == 0) { @@ -148,17 +149,17 @@ public final class TextParser { throw new Parser.Failure(errorMsg, location); } sourceStream = new BufferedInputStream(new FileInputStream(sourceFile)); - return parseSource(location, mimeType, charset, sourceFile.length(), sourceStream); + docs = parseSource(location, mimeType, charset, sourceFile.length(), sourceStream); } catch (final Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof Parser.Failure) throw (Parser.Failure) e; log.logSevere("Unexpected exception in parseSource from File: " + e.getMessage(), e); throw new Parser.Failure("Unexpected exception: " + e.getMessage(), location); } finally { - if (sourceStream != null)try { - sourceStream.close(); - } catch (final Exception ex) {} + if (sourceStream != null) try { sourceStream.close(); } catch (final Exception ex) {} } + for (Document d: docs) { assert d.getText() != null; } // verify docs + return docs; } public static Document[] parseSource( @@ -193,7 +194,9 @@ public final class TextParser { // then we use only one stream-oriented parser. if (idioms.size() == 1 || contentLength > Integer.MAX_VALUE) { // use a specific stream-oriented parser - return parseSource(location, mimeType, idioms.get(0), charset, contentLength, sourceStream); + Document[] docs = parseSource(location, mimeType, idioms.get(0), charset, contentLength, sourceStream); + for (Document d: docs) { assert d.getText() != null; } // verify docs + return docs; } // in case that we know more parsers we first transform the content into a byte[] and use that as base @@ -204,7 +207,9 @@ public final class TextParser { } catch (IOException e) { throw new Parser.Failure(e.getMessage(), location); } - return parseSource(location, mimeType, idioms, charset, b); + Document[] docs = parseSource(location, mimeType, idioms, charset, b); + for (Document d: docs) { assert d.getText() != null; } // verify docs + return docs; } private static Document[] parseSource( @@ -222,7 +227,9 @@ public final class TextParser { if (log.isFine()) log.logInfo("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + fileExt + "'."); try { - return parser.parse(location, mimeType, documentCharset, sourceStream); + Document[] docs = parser.parse(location, mimeType, documentCharset, sourceStream); + for (Document d: docs) { assert d.getText() != null; } // verify docs + return docs; } catch (Exception e) { throw new Parser.Failure("parser failed: " + parser.getName(), location); } @@ -240,11 +247,11 @@ public final class TextParser { final String documentCharset = htmlParser.patchCharsetEncoding(charset); assert !parsers.isEmpty(); - Document[] doc = null; + Document[] docs = null; HashMap failedParser = new HashMap(); for (Parser parser: parsers) { try { - doc = parser.parse(location, mimeType, documentCharset, new ByteArrayInputStream(sourceArray)); + docs = parser.parse(location, mimeType, documentCharset, new ByteArrayInputStream(sourceArray)); } catch (Parser.Failure e) { failedParser.put(parser, e); //log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e); @@ -252,10 +259,10 @@ public final class TextParser { failedParser.put(parser, new Parser.Failure(e.getMessage(), location)); //log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e); } - if (doc != null) break; + if (docs != null) break; } - if (doc == null) { + if (docs == null) { if (failedParser.size() == 0) { final String errorMsg = "Parsing content with file extension '" + location.getFileExtension() + "' and mimetype '" + mimeType + "' failed."; //log.logWarning("Unable to parse '" + location + "'. " + errorMsg); @@ -269,7 +276,8 @@ public final class TextParser { throw new Parser.Failure("All parser failed: " + failedParsers, location); } } - return doc; + for (Document d: docs) { assert d.getText() != null : "mimeType = " + mimeType; } // verify docs + return docs; } /** @@ -335,7 +343,6 @@ public final class TextParser { return idioms; } - public static String supportsMime(String mimeType) { if (mimeType == null) return null; mimeType = normalizeMimeType(mimeType); @@ -343,7 +350,7 @@ public final class TextParser { if (mime2parser.get(mimeType) == null) return "no parser for mime '" + mimeType + "' available"; return null; } - + public static String supportsExtension(final MultiProtocolURI url) { String ext = url.getFileExtension().toLowerCase(); if (ext == null || ext.length() == 0) return null; diff --git a/source/net/yacy/document/parser/bzipParser.java b/source/net/yacy/document/parser/bzipParser.java index 2ef4454e0..271f80691 100644 --- a/source/net/yacy/document/parser/bzipParser.java +++ b/source/net/yacy/document/parser/bzipParser.java @@ -58,6 +58,7 @@ public class bzipParser extends AbstractParser implements Parser { public Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { File tempFile = null; + Document[] docs; try { /* * First we have to consume the first two char from the stream. Otherwise @@ -90,7 +91,7 @@ public class bzipParser extends AbstractParser implements Parser { out.close(); // creating a new parser class to parse the unzipped content - return TextParser.parseSource(location, null, null, tempFile); + docs = TextParser.parseSource(location, null, null, tempFile); } catch (final Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof Parser.Failure) throw (Parser.Failure) e; @@ -99,5 +100,6 @@ public class bzipParser extends AbstractParser implements Parser { } finally { if (tempFile != null) FileUtils.deletedelete(tempFile); } + return docs; } } diff --git a/source/net/yacy/document/parser/genericParser.java b/source/net/yacy/document/parser/genericParser.java index eb0259603..8082d5778 100644 --- a/source/net/yacy/document/parser/genericParser.java +++ b/source/net/yacy/document/parser/genericParser.java @@ -40,7 +40,7 @@ public class genericParser extends AbstractParser implements Parser { public Document[] parse(MultiProtocolURI location, String mimeType, String charset, InputStream source1) throws Parser.Failure, InterruptedException { - return new Document[]{new Document( + Document[] docs = new Document[]{new Document( location, mimeType, charset, @@ -56,5 +56,7 @@ public class genericParser extends AbstractParser implements Parser { null, null, false)}; + for (Document d: docs) { assert d.getText() != null : "mimeType = " + mimeType; } // verify docs + return docs; } } diff --git a/source/net/yacy/document/parser/gzipParser.java b/source/net/yacy/document/parser/gzipParser.java index 4367437c9..f3452b6c3 100644 --- a/source/net/yacy/document/parser/gzipParser.java +++ b/source/net/yacy/document/parser/gzipParser.java @@ -57,6 +57,7 @@ public class gzipParser extends AbstractParser implements Parser { public Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { File tempFile = null; + Document[] docs = null; try { int read = 0; final byte[] data = new byte[1024]; @@ -77,7 +78,7 @@ public class gzipParser extends AbstractParser implements Parser { out.close(); // creating a new parser class to parse the unzipped content - return TextParser.parseSource(location,null,null,tempFile); + docs = TextParser.parseSource(location,null,null,tempFile); } catch (final Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof Parser.Failure) throw (Parser.Failure) e; @@ -86,6 +87,7 @@ public class gzipParser extends AbstractParser implements Parser { } finally { if (tempFile != null) FileUtils.deletedelete(tempFile); } + return docs; } } diff --git a/source/net/yacy/document/parser/tarParser.java b/source/net/yacy/document/parser/tarParser.java index 3bf0081da..ee20156b6 100644 --- a/source/net/yacy/document/parser/tarParser.java +++ b/source/net/yacy/document/parser/tarParser.java @@ -80,14 +80,13 @@ public class tarParser extends AbstractParser implements Parser { tmp = FileUtils.createTempFile(this.getClass(), name); FileUtils.copy(tis, tmp, entry.getSize()); subDocs = TextParser.parseSource(MultiProtocolURI.newURL(url,"#" + name), mime, null, tmp); + if (subDocs == null) continue; + for (Document d: subDocs) docacc.add(d); } catch (final Parser.Failure e) { log.logWarning("tar parser entry " + name + ": " + e.getMessage()); } finally { if (tmp != null) FileUtils.deletedelete(tmp); } - if (subDocs == null) continue; - - for (Document d: subDocs) docacc.add(d); } catch (IOException e) { log.logWarning("tar parser:" + e.getMessage()); break; diff --git a/source/net/yacy/document/parser/zipParser.java b/source/net/yacy/document/parser/zipParser.java index 548d52d77..e1fecc8c2 100644 --- a/source/net/yacy/document/parser/zipParser.java +++ b/source/net/yacy/document/parser/zipParser.java @@ -75,13 +75,13 @@ public class zipParser extends AbstractParser implements Parser { tmp = FileUtils.createTempFile(this.getClass(), name); FileUtils.copy(zis, tmp, entry.getSize()); docs = TextParser.parseSource(MultiProtocolURI.newURL(url, "#" + name), mime, null, tmp); + if (docs == null) continue; + for (Document d: docs) docacc.add(d); } catch (final Parser.Failure e) { log.logWarning("ZIP parser entry " + name + ": " + e.getMessage()); } finally { if (tmp != null) FileUtils.deletedelete(tmp); } - if (docs == null) continue; - for (Document d: docs) docacc.add(d); } catch (IOException e) { log.logWarning("ZIP parser:" + e.getMessage()); break;