diff --git a/source/de/anomic/net/URL.java b/source/de/anomic/net/URL.java index 4654d2b89..ea4d170c7 100644 --- a/source/de/anomic/net/URL.java +++ b/source/de/anomic/net/URL.java @@ -116,6 +116,7 @@ public class URL { this.protocol = baseURL.protocol; this.host = baseURL.host; this.port = baseURL.port; + this.userInfo = baseURL.userInfo; if (relPath.toLowerCase().startsWith("javascript:")) { this.path = baseURL.path; } else if (relPath.startsWith("/")) { diff --git a/source/de/anomic/plasma/crawler/ftp/CrawlWorker.java b/source/de/anomic/plasma/crawler/ftp/CrawlWorker.java index 0af2f5734..a49575728 100644 --- a/source/de/anomic/plasma/crawler/ftp/CrawlWorker.java +++ b/source/de/anomic/plasma/crawler/ftp/CrawlWorker.java @@ -51,7 +51,9 @@ import java.io.ByteArrayOutputStream; import java.io.File; import java.io.IOException; import java.io.PrintStream; +import java.io.PrintWriter; +import de.anomic.net.URL; import de.anomic.net.ftpc; import de.anomic.plasma.plasmaHTCache; import de.anomic.plasma.plasmaSwitchboard; @@ -82,8 +84,7 @@ public class CrawlWorker extends AbstractCrawlWorker implements public Entry load() throws IOException { - File cacheFile = cacheManager.getCachePath(url); - cacheFile.getParentFile().mkdirs(); + ByteArrayOutputStream bout = new ByteArrayOutputStream(); PrintStream out = new PrintStream(bout); @@ -104,27 +105,61 @@ public class CrawlWorker extends AbstractCrawlWorker implements userPwd = "anonymous"; } - ftpClient.exec("open " + this.url.getHost(), false); + String host = this.url.getHost(); + String fullPath = this.url.getPath(); + int port = this.url.getPort(); + + if (port == -1) { + ftpClient.exec("open " + this.url.getHost(), false); + } else { + ftpClient.exec("open " + this.url.getHost() + " " + port, false); + } ftpClient.exec("user " + userName + " " + userPwd, false); ftpClient.exec("binary", false); // cd - String file = ""; - String path = this.url.getPath(); - int pos = path.lastIndexOf("/"); - if (pos == -1) { - file = 
path; - path = "/"; + String file, path; + + if (fullPath.endsWith("/")) { + file = ""; + path = fullPath; } else { - file = path.substring(pos+1); - path = path.substring(0,pos); + int pos = fullPath.lastIndexOf("/"); + if (pos == -1) { + file = fullPath; + path = "/"; + } else { + path = fullPath.substring(0,pos+1); + file = fullPath.substring(pos+1); + } + } + + if (file.length() > 0) { + ftpClient.exec("cd \"" + path + "\"", false); + + // testing if the current name is a directory + boolean isFolder = ftpClient.isFolder(file); + if (isFolder) { + fullPath = fullPath + "/"; + file = ""; + this.url = new URL(this.url,fullPath); + } } - ftpClient.exec("cd \"" + path + "\"", false); - if (ftpClient.isFolder(file)) { - ftpClient.exec("cd \"" + file + "\"", false); + // creating a cache file object + File cacheFile = cacheManager.getCachePath(this.url); + cacheFile.getParentFile().mkdirs(); + + ftpClient.exec("cd \"" + fullPath + "\"", false); + if (file.length() == 0) { + + // getting the dirlist + StringBuffer dirList = ftpClient.dirhtml(host, (port==-1)?21:port, fullPath, userName, userPwd); - // TODO: dirlist + // write it into a file + PrintWriter writer = new PrintWriter(cacheFile); + writer.write(dirList.toString()); + writer.close(); } else { // download the remote file ftpClient.exec("get \"" + file + "\" \"" + cacheFile.getAbsolutePath() + "\"", false); @@ -132,6 +167,8 @@ public class CrawlWorker extends AbstractCrawlWorker implements ftpClient.exec("close", false); ftpClient.exec("exit", false); + + // TODO: do mimetype detection based on file extension // TODO: create a new htCache entry ....