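Fixed bugs in the parser and the FTP client: FTPLoader now skips the Referer header when the referrer URL cannot be resolved, and Document.getText() no longer caches its stream in a field.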

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7360 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 14 years ago
parent d78e322e84
commit 4e2c14efbb

@ -140,6 +140,7 @@ public class FTPLoader {
response = getFile(ftpClient, request, acceptOnlyParseable); response = getFile(ftpClient, request, acceptOnlyParseable);
} catch (final Exception e) { } catch (final Exception e) {
// add message to errorLog // add message to errorLog
e.printStackTrace();
(new PrintStream(berr)).print(e.getMessage()); (new PrintStream(berr)).print(e.getMessage());
} }
} }
@ -149,9 +150,9 @@ public class FTPLoader {
// pass the downloaded resource to the cache manager // pass the downloaded resource to the cache manager
if (berr.size() > 0 || response == null) { if (berr.size() > 0 || response == null) {
// some error logging // some error logging
final String detail = (berr.size() > 0) ? "\n Errorlog: " + berr.toString() : ""; final String detail = (berr.size() > 0) ? "Errorlog: " + berr.toString() : "";
sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "server download" + detail); sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, " ftp server download, " + detail);
throw new IOException("FTPLoader: Unable to download URL " + request.url().toString() + detail); throw new IOException("FTPLoader: Unable to download URL '" + request.url().toString() + "': " + detail);
} }
Latency.update(request.url(), System.currentTimeMillis() - start); Latency.update(request.url(), System.currentTimeMillis() - start);
@ -207,7 +208,7 @@ public class FTPLoader {
return true; return true;
} }
private Response getFile(final FTPClient ftpClient, final Request request, boolean acceptOnlyParseable) throws Exception { private Response getFile(final FTPClient ftpClient, final Request request, boolean acceptOnlyParseable) throws IOException {
// determine the mimetype of the resource // determine the mimetype of the resource
final DigestURI url = request.url(); final DigestURI url = request.url();
final String mime = TextParser.mimeOf(url); final String mime = TextParser.mimeOf(url);
@ -218,7 +219,10 @@ public class FTPLoader {
// create response header // create response header
RequestHeader requestHeader = new RequestHeader(); RequestHeader requestHeader = new RequestHeader();
if (request.referrerhash() != null) requestHeader.put(RequestHeader.REFERER, sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash()).toNormalform(true, false)); if (request.referrerhash() != null) {
DigestURI refurl = sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash());
if (refurl != null) requestHeader.put(RequestHeader.REFERER, refurl.toNormalform(true, false));
}
ResponseHeader responseHeader = new ResponseHeader(); ResponseHeader responseHeader = new ResponseHeader();
responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(fileDate)); responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(fileDate));
responseHeader.put(HeaderFramework.CONTENT_TYPE, mime); responseHeader.put(HeaderFramework.CONTENT_TYPE, mime);

@ -1779,7 +1779,7 @@ public final class Switchboard extends serverSwitch {
return null; return null;
} }
} }
assert b != null;
try { try {
// parse the document // parse the document
documents = TextParser.parseSource(response.url(), response.getMimeType(), response.getCharacterEncoding(), b); documents = TextParser.parseSource(response.url(), response.getMimeType(), response.getCharacterEncoding(), b);

@ -2294,7 +2294,7 @@ public class FTPClient {
// boolean success = !isNotPositiveCompletion(reply); // boolean success = !isNotPositiveCompletion(reply);
} finally { } finally {
// shutdown connection // shutdown connection
if(ClientStream != null) { if (ClientStream != null) {
ClientStream.close(); ClientStream.close();
} }
closeDataSocket(); closeDataSocket();

@ -132,6 +132,7 @@ public final class Condenser {
Map.Entry<MultiProtocolURI, String> entry; Map.Entry<MultiProtocolURI, String> entry;
if (indexText) { if (indexText) {
assert document.getText() != null : document.dc_identifier();
createCondensement(document.getText(), meaningLib); createCondensement(document.getText(), meaningLib);
// the phrase counter: // the phrase counter:
// phrase 0 are words taken from the URL // phrase 0 are words taken from the URL

@ -77,7 +77,6 @@ public class Document {
private Map<String, String> emaillinks; private Map<String, String> emaillinks;
private MultiProtocolURI favicon; private MultiProtocolURI favicon;
private boolean resorted; private boolean resorted;
private InputStream textStream;
private int inboundLinks, outboundLinks; // counters for inbound and outbound links, are counted after calling notifyWebStructure private int inboundLinks, outboundLinks; // counters for inbound and outbound links, are counted after calling notifyWebStructure
private Set<String> languages; private Set<String> languages;
private boolean indexingDenied; private boolean indexingDenied;
@ -228,15 +227,19 @@ dc_rights
public InputStream getText() { public InputStream getText() {
try { try {
if (this.text == null) return new ByteArrayInputStream("".getBytes()); if (this.text == null) return new ByteArrayInputStream("".getBytes());
if (this.text instanceof String) {
if (this.text instanceof File) { return new ByteArrayInputStream(((String) this.text).getBytes("UTF-8"));
this.textStream = new BufferedInputStream(new FileInputStream((File)this.text)); } else if (this.text instanceof InputStream) {
return (InputStream) this.text;
} else if (this.text instanceof File) {
return new BufferedInputStream(new FileInputStream((File)this.text));
} else if (this.text instanceof byte[]) { } else if (this.text instanceof byte[]) {
this.textStream = new ByteArrayInputStream((byte[]) this.text); return new ByteArrayInputStream((byte[]) this.text);
} else if (this.text instanceof ByteArrayOutputStream) { } else if (this.text instanceof ByteArrayOutputStream) {
this.textStream = new ByteArrayInputStream(((ByteArrayOutputStream) this.text).toByteArray()); return new ByteArrayInputStream(((ByteArrayOutputStream) this.text).toByteArray());
} }
return this.textStream; assert false : this.text.getClass().toString();
return null;
} catch (final Exception e) { } catch (final Exception e) {
Log.logException(e); Log.logException(e);
} }
@ -246,14 +249,19 @@ dc_rights
public byte[] getTextBytes() { public byte[] getTextBytes() {
try { try {
if (this.text == null) return new byte[0]; if (this.text == null) return new byte[0];
if (this.text instanceof String) {
if (this.text instanceof File) { return ((String) this.text).getBytes("UTF-8");
return FileUtils.read((File)this.text); } else if (this.text instanceof InputStream) {
return FileUtils.read((InputStream) this.text);
} else if (this.text instanceof File) {
return FileUtils.read((File) this.text);
} else if (this.text instanceof byte[]) { } else if (this.text instanceof byte[]) {
return (byte[])this.text; return (byte[]) this.text;
} else if (this.text instanceof ByteArrayOutputStream) { } else if (this.text instanceof ByteArrayOutputStream) {
return ((ByteArrayOutputStream) this.text).toByteArray(); return ((ByteArrayOutputStream) this.text).toByteArray();
} }
assert false : this.text.getClass().toString();
return null;
} catch (final Exception e) { } catch (final Exception e) {
Log.logException(e); Log.logException(e);
} }
@ -261,13 +269,24 @@ dc_rights
} }
public long getTextLength() { public long getTextLength() {
if (this.text == null) return 0; try {
if (this.text instanceof File) return ((File) this.text).length(); if (this.text == null) return -1;
else if (this.text instanceof byte[]) return ((byte[]) this.text).length; if (this.text instanceof String) {
else if (this.text instanceof ByteArrayOutputStream) { return ((String) this.text).length();
return ((ByteArrayOutputStream)this.text).size(); } else if (this.text instanceof InputStream) {
return ((InputStream) this.text).available();
} else if (this.text instanceof File) {
return ((File) this.text).length();
} else if (this.text instanceof byte[]) {
return ((byte[]) this.text).length;
} else if (this.text instanceof ByteArrayOutputStream) {
return ((ByteArrayOutputStream) this.text).size();
}
assert false : this.text.getClass().toString();
return -1;
} catch (final Exception e) {
Log.logException(e);
} }
return -1; return -1;
} }
@ -590,26 +609,20 @@ dc_rights
} }
public void close() { public void close() {
if (this.text == null) return;
// try close the output stream // try close the output stream
if (this.textStream != null) { if (this.text instanceof InputStream) try {
try { ((InputStream) this.text).close();
this.textStream.close(); } catch (final Exception e) {} finally {
} catch (final Exception e) { this.text = null;
/* ignore this */
} finally {
this.textStream = null;
}
} }
// delete the temp file // delete the temp file
if ((this.text != null) && (this.text instanceof File)) { if (this.text instanceof File) try {
try { FileUtils.deletedelete((File) this.text);
FileUtils.deletedelete((File) this.text); } catch (final Exception e) {} finally {
} catch (final Exception e) { this.text = null;
/* ignore this */
} finally {
this.text = null;
}
} }
} }

@ -140,6 +140,7 @@ public final class TextParser {
) throws InterruptedException, Parser.Failure { ) throws InterruptedException, Parser.Failure {
BufferedInputStream sourceStream = null; BufferedInputStream sourceStream = null;
Document[] docs = null;
try { try {
if (log.isFine()) log.logFine("Parsing '" + location + "' from file"); if (log.isFine()) log.logFine("Parsing '" + location + "' from file");
if (!sourceFile.exists() || !sourceFile.canRead() || sourceFile.length() == 0) { if (!sourceFile.exists() || !sourceFile.canRead() || sourceFile.length() == 0) {
@ -148,17 +149,17 @@ public final class TextParser {
throw new Parser.Failure(errorMsg, location); throw new Parser.Failure(errorMsg, location);
} }
sourceStream = new BufferedInputStream(new FileInputStream(sourceFile)); sourceStream = new BufferedInputStream(new FileInputStream(sourceFile));
return parseSource(location, mimeType, charset, sourceFile.length(), sourceStream); docs = parseSource(location, mimeType, charset, sourceFile.length(), sourceStream);
} catch (final Exception e) { } catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof Parser.Failure) throw (Parser.Failure) e; if (e instanceof Parser.Failure) throw (Parser.Failure) e;
log.logSevere("Unexpected exception in parseSource from File: " + e.getMessage(), e); log.logSevere("Unexpected exception in parseSource from File: " + e.getMessage(), e);
throw new Parser.Failure("Unexpected exception: " + e.getMessage(), location); throw new Parser.Failure("Unexpected exception: " + e.getMessage(), location);
} finally { } finally {
if (sourceStream != null)try { if (sourceStream != null) try { sourceStream.close(); } catch (final Exception ex) {}
sourceStream.close();
} catch (final Exception ex) {}
} }
for (Document d: docs) { assert d.getText() != null; } // verify docs
return docs;
} }
public static Document[] parseSource( public static Document[] parseSource(
@ -193,7 +194,9 @@ public final class TextParser {
// then we use only one stream-oriented parser. // then we use only one stream-oriented parser.
if (idioms.size() == 1 || contentLength > Integer.MAX_VALUE) { if (idioms.size() == 1 || contentLength > Integer.MAX_VALUE) {
// use a specific stream-oriented parser // use a specific stream-oriented parser
return parseSource(location, mimeType, idioms.get(0), charset, contentLength, sourceStream); Document[] docs = parseSource(location, mimeType, idioms.get(0), charset, contentLength, sourceStream);
for (Document d: docs) { assert d.getText() != null; } // verify docs
return docs;
} }
// in case that we know more parsers we first transform the content into a byte[] and use that as base // in case that we know more parsers we first transform the content into a byte[] and use that as base
@ -204,7 +207,9 @@ public final class TextParser {
} catch (IOException e) { } catch (IOException e) {
throw new Parser.Failure(e.getMessage(), location); throw new Parser.Failure(e.getMessage(), location);
} }
return parseSource(location, mimeType, idioms, charset, b); Document[] docs = parseSource(location, mimeType, idioms, charset, b);
for (Document d: docs) { assert d.getText() != null; } // verify docs
return docs;
} }
private static Document[] parseSource( private static Document[] parseSource(
@ -222,7 +227,9 @@ public final class TextParser {
if (log.isFine()) log.logInfo("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + fileExt + "'."); if (log.isFine()) log.logInfo("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + fileExt + "'.");
try { try {
return parser.parse(location, mimeType, documentCharset, sourceStream); Document[] docs = parser.parse(location, mimeType, documentCharset, sourceStream);
for (Document d: docs) { assert d.getText() != null; } // verify docs
return docs;
} catch (Exception e) { } catch (Exception e) {
throw new Parser.Failure("parser failed: " + parser.getName(), location); throw new Parser.Failure("parser failed: " + parser.getName(), location);
} }
@ -240,11 +247,11 @@ public final class TextParser {
final String documentCharset = htmlParser.patchCharsetEncoding(charset); final String documentCharset = htmlParser.patchCharsetEncoding(charset);
assert !parsers.isEmpty(); assert !parsers.isEmpty();
Document[] doc = null; Document[] docs = null;
HashMap<Parser, Parser.Failure> failedParser = new HashMap<Parser, Parser.Failure>(); HashMap<Parser, Parser.Failure> failedParser = new HashMap<Parser, Parser.Failure>();
for (Parser parser: parsers) { for (Parser parser: parsers) {
try { try {
doc = parser.parse(location, mimeType, documentCharset, new ByteArrayInputStream(sourceArray)); docs = parser.parse(location, mimeType, documentCharset, new ByteArrayInputStream(sourceArray));
} catch (Parser.Failure e) { } catch (Parser.Failure e) {
failedParser.put(parser, e); failedParser.put(parser, e);
//log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e); //log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e);
@ -252,10 +259,10 @@ public final class TextParser {
failedParser.put(parser, new Parser.Failure(e.getMessage(), location)); failedParser.put(parser, new Parser.Failure(e.getMessage(), location));
//log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e); //log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e);
} }
if (doc != null) break; if (docs != null) break;
} }
if (doc == null) { if (docs == null) {
if (failedParser.size() == 0) { if (failedParser.size() == 0) {
final String errorMsg = "Parsing content with file extension '" + location.getFileExtension() + "' and mimetype '" + mimeType + "' failed."; final String errorMsg = "Parsing content with file extension '" + location.getFileExtension() + "' and mimetype '" + mimeType + "' failed.";
//log.logWarning("Unable to parse '" + location + "'. " + errorMsg); //log.logWarning("Unable to parse '" + location + "'. " + errorMsg);
@ -269,7 +276,8 @@ public final class TextParser {
throw new Parser.Failure("All parser failed: " + failedParsers, location); throw new Parser.Failure("All parser failed: " + failedParsers, location);
} }
} }
return doc; for (Document d: docs) { assert d.getText() != null : "mimeType = " + mimeType; } // verify docs
return docs;
} }
/** /**
@ -335,7 +343,6 @@ public final class TextParser {
return idioms; return idioms;
} }
public static String supportsMime(String mimeType) { public static String supportsMime(String mimeType) {
if (mimeType == null) return null; if (mimeType == null) return null;
mimeType = normalizeMimeType(mimeType); mimeType = normalizeMimeType(mimeType);

@ -58,6 +58,7 @@ public class bzipParser extends AbstractParser implements Parser {
public Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { public Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException {
File tempFile = null; File tempFile = null;
Document[] docs;
try { try {
/* /*
* First we have to consume the first two char from the stream. Otherwise * First we have to consume the first two char from the stream. Otherwise
@ -90,7 +91,7 @@ public class bzipParser extends AbstractParser implements Parser {
out.close(); out.close();
// creating a new parser class to parse the unzipped content // creating a new parser class to parse the unzipped content
return TextParser.parseSource(location, null, null, tempFile); docs = TextParser.parseSource(location, null, null, tempFile);
} catch (final Exception e) { } catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof Parser.Failure) throw (Parser.Failure) e; if (e instanceof Parser.Failure) throw (Parser.Failure) e;
@ -99,5 +100,6 @@ public class bzipParser extends AbstractParser implements Parser {
} finally { } finally {
if (tempFile != null) FileUtils.deletedelete(tempFile); if (tempFile != null) FileUtils.deletedelete(tempFile);
} }
return docs;
} }
} }

@ -40,7 +40,7 @@ public class genericParser extends AbstractParser implements Parser {
public Document[] parse(MultiProtocolURI location, String mimeType, String charset, InputStream source1) throws Parser.Failure, InterruptedException { public Document[] parse(MultiProtocolURI location, String mimeType, String charset, InputStream source1) throws Parser.Failure, InterruptedException {
return new Document[]{new Document( Document[] docs = new Document[]{new Document(
location, location,
mimeType, mimeType,
charset, charset,
@ -56,5 +56,7 @@ public class genericParser extends AbstractParser implements Parser {
null, null,
null, null,
false)}; false)};
for (Document d: docs) { assert d.getText() != null : "mimeType = " + mimeType; } // verify docs
return docs;
} }
} }

@ -57,6 +57,7 @@ public class gzipParser extends AbstractParser implements Parser {
public Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { public Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException {
File tempFile = null; File tempFile = null;
Document[] docs = null;
try { try {
int read = 0; int read = 0;
final byte[] data = new byte[1024]; final byte[] data = new byte[1024];
@ -77,7 +78,7 @@ public class gzipParser extends AbstractParser implements Parser {
out.close(); out.close();
// creating a new parser class to parse the unzipped content // creating a new parser class to parse the unzipped content
return TextParser.parseSource(location,null,null,tempFile); docs = TextParser.parseSource(location,null,null,tempFile);
} catch (final Exception e) { } catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof Parser.Failure) throw (Parser.Failure) e; if (e instanceof Parser.Failure) throw (Parser.Failure) e;
@ -86,6 +87,7 @@ public class gzipParser extends AbstractParser implements Parser {
} finally { } finally {
if (tempFile != null) FileUtils.deletedelete(tempFile); if (tempFile != null) FileUtils.deletedelete(tempFile);
} }
return docs;
} }
} }

@ -80,14 +80,13 @@ public class tarParser extends AbstractParser implements Parser {
tmp = FileUtils.createTempFile(this.getClass(), name); tmp = FileUtils.createTempFile(this.getClass(), name);
FileUtils.copy(tis, tmp, entry.getSize()); FileUtils.copy(tis, tmp, entry.getSize());
subDocs = TextParser.parseSource(MultiProtocolURI.newURL(url,"#" + name), mime, null, tmp); subDocs = TextParser.parseSource(MultiProtocolURI.newURL(url,"#" + name), mime, null, tmp);
if (subDocs == null) continue;
for (Document d: subDocs) docacc.add(d);
} catch (final Parser.Failure e) { } catch (final Parser.Failure e) {
log.logWarning("tar parser entry " + name + ": " + e.getMessage()); log.logWarning("tar parser entry " + name + ": " + e.getMessage());
} finally { } finally {
if (tmp != null) FileUtils.deletedelete(tmp); if (tmp != null) FileUtils.deletedelete(tmp);
} }
if (subDocs == null) continue;
for (Document d: subDocs) docacc.add(d);
} catch (IOException e) { } catch (IOException e) {
log.logWarning("tar parser:" + e.getMessage()); log.logWarning("tar parser:" + e.getMessage());
break; break;

@ -75,13 +75,13 @@ public class zipParser extends AbstractParser implements Parser {
tmp = FileUtils.createTempFile(this.getClass(), name); tmp = FileUtils.createTempFile(this.getClass(), name);
FileUtils.copy(zis, tmp, entry.getSize()); FileUtils.copy(zis, tmp, entry.getSize());
docs = TextParser.parseSource(MultiProtocolURI.newURL(url, "#" + name), mime, null, tmp); docs = TextParser.parseSource(MultiProtocolURI.newURL(url, "#" + name), mime, null, tmp);
if (docs == null) continue;
for (Document d: docs) docacc.add(d);
} catch (final Parser.Failure e) { } catch (final Parser.Failure e) {
log.logWarning("ZIP parser entry " + name + ": " + e.getMessage()); log.logWarning("ZIP parser entry " + name + ": " + e.getMessage());
} finally { } finally {
if (tmp != null) FileUtils.deletedelete(tmp); if (tmp != null) FileUtils.deletedelete(tmp);
} }
if (docs == null) continue;
for (Document d: docs) docacc.add(d);
} catch (IOException e) { } catch (IOException e) {
log.logWarning("ZIP parser:" + e.getMessage()); log.logWarning("ZIP parser:" + e.getMessage());
break; break;

Loading…
Cancel
Save