Another fix to the FTP crawler: directory listings are now handled correctly according to RFC 2640 (paths with spaces), and such files now get better title names.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7386 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 14 years ago
parent 9b25a33fd9
commit 59b70a5a92

@ -65,7 +65,6 @@ import java.util.concurrent.LinkedBlockingQueue;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.Domains;
import org.apache.log4j.Logger; import org.apache.log4j.Logger;
@ -1045,7 +1044,7 @@ public class FTPClient {
private static entryInfo parseListData(final String line) { private static entryInfo parseListData(final String line) {
// groups: 1: rights, 2: size, 3: month, 4: day, 5: time or year, 6: name // groups: 1: rights, 2: size, 3: month, 4: day, 5: time or year, 6: name
final Matcher tokens = lsStyle.matcher(line); final Matcher tokens = lsStyle.matcher(line);
if (tokens.matches()) { if (tokens.matches() && tokens.groupCount() == 6) {
filetype type = filetype.file; filetype type = filetype.file;
if (tokens.group(1).startsWith("d")) type = filetype.directory; if (tokens.group(1).startsWith("d")) type = filetype.directory;
if (tokens.group(1).startsWith("l")) type = filetype.link; if (tokens.group(1).startsWith("l")) type = filetype.link;
@ -1341,18 +1340,26 @@ public class FTPClient {
createDataSocket(); createDataSocket();
send("CWD " + path);
String reply = receive();
// get status code
int status = getStatus(reply);
if (status > 2) {
throw new IOException(reply);
}
// send command to the control port // send command to the control port
if (extended) { if (extended) {
send("LIST " + path); send("LIST");
} else { } else {
send("NLST " + path); send("NLST");
} }
// read status of the command from the control port // read status of the command from the control port
String reply = receive(); reply = receive();
// get status code // get status code
final int status = getStatus(reply); status = getStatus(reply);
if (status != 1) { if (status != 1) {
throw new IOException(reply); throw new IOException(reply);
} }
@ -1383,6 +1390,7 @@ public class FTPClient {
reply = receive(); reply = receive();
//System.out.println("reply of LIST: " + reply); //System.out.println("reply of LIST: " + reply);
// boolean success = !isNotPositiveCompletion(reply); // boolean success = !isNotPositiveCompletion(reply);
//for (String s: files) System.out.println("FILES of '" + path + "': " + s);
files.trimToSize(); files.trimToSize();
return files; return files;
@ -2547,7 +2555,7 @@ public class FTPClient {
for (final String line : list) { for (final String line : list) {
info = parseListData(line); info = parseListData(line);
if (info != null && info.type == filetype.file && !info.name.endsWith(".") && !info.name.startsWith(".")) { if (info != null && info.type == filetype.file && !info.name.endsWith(".") && !info.name.startsWith(".")) {
if (!info.name.startsWith("/")) info.name = path + MultiProtocolURI.escape(info.name); if (!info.name.startsWith("/")) info.name = path + info.name;
queue.add(info); queue.add(info);
} }
} }
@ -2556,13 +2564,13 @@ public class FTPClient {
info = parseListData(line); info = parseListData(line);
if (info != null && !info.name.endsWith(".") && !info.name.startsWith(".")) { if (info != null && !info.name.endsWith(".") && !info.name.startsWith(".")) {
if (info.type == filetype.directory) { if (info.type == filetype.directory) {
sitelist(ftpClient, path + MultiProtocolURI.escape(info.name), queue); sitelist(ftpClient, path + info.name, queue);
} }
if (info.type == filetype.link) { if (info.type == filetype.link) {
int q = info.name.indexOf("->"); int q = info.name.indexOf("->");
if (q >= 0) { if (q >= 0) {
info.name = info.name.substring(0, q).trim(); info.name = info.name.substring(0, q).trim();
sitelist(ftpClient, path + MultiProtocolURI.escape(info.name), queue); sitelist(ftpClient, path + info.name, queue);
} }
} }
} }
@ -2603,7 +2611,7 @@ public class FTPClient {
final boolean metaRobotNoindex) { final boolean metaRobotNoindex) {
// this creates the html output from collected strings // this creates the html output from collected strings
final StringBuilder page = new StringBuilder(1024); final StringBuilder page = new StringBuilder(1024);
final String title = "Index of " + MultiProtocolURI.unescape(base); final String title = "Index of " + base;
page.append("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 3.2 Final//EN\">\n"); page.append("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 3.2 Final//EN\">\n");
page.append("<html><head>\n"); page.append("<html><head>\n");

@ -46,7 +46,7 @@ public class genericParser extends AbstractParser implements Parser {
charset, charset,
null, null,
null, null,
MultiProtocolURI.unescape(location.getFileName()), // title location.getFileName().length() == 0 ? location.toTokens() : MultiProtocolURI.unescape(location.getFileName()), // title
"", // author "", // author
location.getHost(), location.getHost(),
null, null,

Loading…
Cancel
Save