*) URL.java: userinfo was not carried over when generating a new URL from a base URL and a relative path

*) CrawlWorker.java: using new dirhtml function of ftpc

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2492 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
theli 18 years ago
parent 17ba468165
commit 7930839594

@ -116,6 +116,7 @@ public class URL {
this.protocol = baseURL.protocol; this.protocol = baseURL.protocol;
this.host = baseURL.host; this.host = baseURL.host;
this.port = baseURL.port; this.port = baseURL.port;
this.userInfo = baseURL.userInfo;
if (relPath.toLowerCase().startsWith("javascript:")) { if (relPath.toLowerCase().startsWith("javascript:")) {
this.path = baseURL.path; this.path = baseURL.path;
} else if (relPath.startsWith("/")) { } else if (relPath.startsWith("/")) {

@ -51,7 +51,9 @@ import java.io.ByteArrayOutputStream;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.io.PrintStream; import java.io.PrintStream;
import java.io.PrintWriter;
import de.anomic.net.URL;
import de.anomic.net.ftpc; import de.anomic.net.ftpc;
import de.anomic.plasma.plasmaHTCache; import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboard;
@ -82,8 +84,7 @@ public class CrawlWorker extends AbstractCrawlWorker implements
public Entry load() throws IOException { public Entry load() throws IOException {
File cacheFile = cacheManager.getCachePath(url);
cacheFile.getParentFile().mkdirs();
ByteArrayOutputStream bout = new ByteArrayOutputStream(); ByteArrayOutputStream bout = new ByteArrayOutputStream();
PrintStream out = new PrintStream(bout); PrintStream out = new PrintStream(bout);
@ -104,27 +105,61 @@ public class CrawlWorker extends AbstractCrawlWorker implements
userPwd = "anonymous"; userPwd = "anonymous";
} }
String host = this.url.getHost();
String fullPath = this.url.getPath();
int port = this.url.getPort();
if (port == -1) {
ftpClient.exec("open " + this.url.getHost(), false); ftpClient.exec("open " + this.url.getHost(), false);
} else {
ftpClient.exec("open " + this.url.getHost() + " " + port, false);
}
ftpClient.exec("user " + userName + " " + userPwd, false); ftpClient.exec("user " + userName + " " + userPwd, false);
ftpClient.exec("binary", false); ftpClient.exec("binary", false);
// cd // cd
String file = ""; String file, path;
String path = this.url.getPath();
int pos = path.lastIndexOf("/"); if (fullPath.endsWith("/")) {
file = "";
path = fullPath;
} else {
int pos = fullPath.lastIndexOf("/");
if (pos == -1) { if (pos == -1) {
file = path; file = fullPath;
path = "/"; path = "/";
} else { } else {
file = path.substring(pos+1); path = fullPath.substring(0,pos+1);
path = path.substring(0,pos); file = fullPath.substring(pos+1);
}
} }
if (file.length() > 0) {
ftpClient.exec("cd \"" + path + "\"", false); ftpClient.exec("cd \"" + path + "\"", false);
if (ftpClient.isFolder(file)) { // testing if the current name is a directory
ftpClient.exec("cd \"" + file + "\"", false); boolean isFolder = ftpClient.isFolder(file);
if (isFolder) {
fullPath = fullPath + "/";
file = "";
this.url = new URL(this.url,fullPath);
}
}
// creating a cache file object
File cacheFile = cacheManager.getCachePath(this.url);
cacheFile.getParentFile().mkdirs();
ftpClient.exec("cd \"" + fullPath + "\"", false);
if (file.length() == 0) {
// TODO: dirlist // getting the dirlist
StringBuffer dirList = ftpClient.dirhtml(host, (port==-1)?21:port, fullPath, userName, userPwd);
// write it into a file
PrintWriter writer = new PrintWriter(cacheFile);
writer.write(dirList.toString());
writer.close();
} else { } else {
// download the remote file // download the remote file
ftpClient.exec("get \"" + file + "\" \"" + cacheFile.getAbsolutePath() + "\"", false); ftpClient.exec("get \"" + file + "\" \"" + cacheFile.getAbsolutePath() + "\"", false);
@ -133,6 +168,8 @@ public class CrawlWorker extends AbstractCrawlWorker implements
ftpClient.exec("close", false); ftpClient.exec("close", false);
ftpClient.exec("exit", false); ftpClient.exec("exit", false);
// TODO: do mimetype detection based on file extension
// TODO: create a new htCache entry .... // TODO: create a new htCache entry ....
return null; return null;

Loading…
Cancel
Save