From 3ca695390cbac15a5716373b5617f649a8e608c6 Mon Sep 17 00:00:00 2001 From: luccioman Date: Thu, 22 Dec 2016 16:25:09 +0100 Subject: [PATCH] FTP crawl start URLs : applied crawl profile depth control Applied rules : - when the FTP URL denotes a file resource, stack it as any start URL : eventually embedded links can be followed applying the usual depth rules - when the FTP URL denotes a directory, list files under this directory and stack them for crawl, and repeat the process on sub folders until crawl depth is reached --- .../net/yacy/cora/protocol/ftp/FTPClient.java | 260 ++++++++++-------- source/net/yacy/crawler/CrawlStacker.java | 36 ++- source/net/yacy/search/Switchboard.java | 14 +- 3 files changed, 170 insertions(+), 140 deletions(-) diff --git a/source/net/yacy/cora/protocol/ftp/FTPClient.java b/source/net/yacy/cora/protocol/ftp/FTPClient.java index 74bb90b27..7bddd6d9c 100644 --- a/source/net/yacy/cora/protocol/ftp/FTPClient.java +++ b/source/net/yacy/cora/protocol/ftp/FTPClient.java @@ -76,7 +76,7 @@ public class FTPClient { public static final String ANONYMOUS = "anonymous"; private static final ConcurrentLog log = new ConcurrentLog("FTPClient"); - private static final String vDATE = "20100823"; + private static final String vDATE = "20161222"; private boolean glob = true; // glob = false -> filenames are taken // literally for mget, .. @@ -2535,12 +2535,17 @@ public class FTPClient { } /** - * generate a list of all files on a ftp server using the anonymous account - * @param host - * @return a list of entryInfo from all files of the ftp server - * @throws IOException + * Asynchronously generate a list of all files on a ftp server using the anonymous account. + * @param host host name or address + * @param port ftp port + * @param user user name + * @param pw user password + * @param path path on the ftp site + * @param depth the maximum depth of the sub folders exploration. 
+ * @return a queue asynchronously filled with entryInfo from all files of the ftp server + * @throws IOException when an error occurred */ - public static BlockingQueue sitelist(final String host, final int port, final String user, final String pw) throws IOException { + public static BlockingQueue sitelist(final String host, final int port, final String user, final String pw, final String path, final int depth) throws IOException { final FTPClient ftpClient = new FTPClient(); ftpClient.open(host, port); ftpClient.login(user, pw); @@ -2550,7 +2555,7 @@ public class FTPClient { public void run() { try { Thread.currentThread().setName("FTP.sitelist(" + host + ":" + port + ")"); - sitelist(ftpClient, "/", queue); + sitelist(ftpClient, path, queue, depth); ftpClient.quit(); } catch (final Exception e) {} finally { queue.add(POISON_entryInfo); @@ -2559,12 +2564,43 @@ public class FTPClient { }.start(); return queue; } - private static void sitelist(final FTPClient ftpClient, String path, final LinkedBlockingQueue queue) { + + /** + * Feed the queue with files under a given path on a ftp server using + * the anonymous account. When path is a file path, only one entry is added + * to the queue. + * + * @param ftpClient + * ftpClient initialized with a host and login information + * @param path + * path on the host + * @param queue + * the entries queue to feed + * @param depth + * the maximum depth of the sub folders exploration. 
+ * @throws IOException + * when a error occurred + */ + private static void sitelist(final FTPClient ftpClient, String path, final LinkedBlockingQueue queue, int depth) { List list; try { list = ftpClient.list(path, true); } catch (final IOException e) { - ConcurrentLog.logException(e); + /* path might be a file path */ + if (!path.endsWith("/")) { + entryInfo info = ftpClient.fileInfo(path); + if (info != null) { + queue.add(info); + } else { + /* We could not get file information, but this doesn't mean the file does not exist : + * we add it anyway to the queue */ + info = new entryInfo(); + info.name = path; + queue.add(info); + } + } else { + ConcurrentLog.logException(e); + } return; } if (!path.endsWith("/")) path += "/"; @@ -2577,24 +2613,25 @@ public class FTPClient { queue.add(info); } } - // then find all directories and add them recursively - for (final String line : list) { - //System.out.println("LIST:" + line); - info = parseListData(line); - if (info != null && !info.name.endsWith(".") && !info.name.startsWith(".")) { - if (info.type == filetype.directory) { - sitelist(ftpClient, path + info.name, queue); - } - if (info.type == filetype.link) { - final int q = info.name.indexOf("->",0); - if (q >= 0 && info.name.indexOf("..", q) < 0) { - //System.out.println("*** LINK:" + line); - info.name = info.name.substring(0, q).trim(); - sitelist(ftpClient, path + info.name, queue); - } + // then find all directories and add them recursively if depth is over zero + if(depth > 0) { + for (final String line : list) { + //System.out.println("LIST:" + line); + info = parseListData(line); + if (info != null && !info.name.endsWith(".") && !info.name.startsWith(".")) { + if (info.type == filetype.directory) { + sitelist(ftpClient, path + info.name, queue, depth - 1); + } else if (info.type == filetype.link) { + final int q = info.name.indexOf("->",0); + if (q >= 0 && info.name.indexOf("..", q) < 0) { + //System.out.println("*** LINK:" + line); + info.name = 
info.name.substring(0, q).trim(); + sitelist(ftpClient, path + info.name, queue, depth - 1); + } - } - } + } + } + } } } @@ -2788,103 +2825,94 @@ public class FTPClient { } private static void printHelp() { - System.out.println("ftp help"); + System.out.println("FTPClient help"); System.out.println("----------"); System.out.println(); System.out.println("The following commands are supported"); - System.out.println("java ftp -- (without arguments) starts the shell. Thy 'help' then for shell commands."); - System.out.println("java ftp [':'] -- starts shell and connects to specified host"); - System.out.println("java ftp -h -- prints this help"); - System.out.println("java ftp -dir [':'] [ ]"); - System.out.println("java ftp -get [':'] [ ]"); - System.out.println("java ftp -put [':'] "); + System.out.println("java net.yacy.cora.protocol.ftp.FTPClient -h -- prints this help"); + System.out.println("java net.yacy.cora.protocol.ftp.FTPClient -dir [':'] [ ]"); + System.out.println("java net.yacy.cora.protocol.ftp.FTPClient -htmldir "); + System.out.println("java net.yacy.cora.protocol.ftp.FTPClient -get [':'] [ ]"); + System.out.println("java net.yacy.cora.protocol.ftp.FTPClient -put [':'] "); + System.out.println("java net.yacy.cora.protocol.ftp.FTPClient -sitelist "); System.out.println(); } public static void main(final String[] args) { - System.out.println("WELCOME TO THE ANOMIC FTP CLIENT v" + vDATE); - System.out.println("Visit http://www.anomic.de and support shareware!"); - System.out.println("try -h for command line options"); - System.out.println(); - if (args.length == 1) { - if (args[0].equals("-h")) { - printHelp(); - } - if (args[0].equals("-test")) { - // test for file URL: ftp://192.168.1.90/Movie/ATest%20Ordner/Unterordner/test%20file.txt - final FTPClient ftpClient = new FTPClient(); - try { - ftpClient.open("192.168.1.90", 21); - ftpClient.login(ANONYMOUS, "anomic@"); - final byte[] b = ftpClient.get("/Movie/ATest Ordner/Unterordner/test file.txt"); - 
System.out.println(UTF8.String(b)); - } catch (final IOException e) { - e.printStackTrace(); - } - } - } else if (args.length == 2) { - printHelp(); - } else if (args.length == 3) { - if (args[0].equals("-dir")) { - dir(args[1], args[2], ANONYMOUS, "anomic@"); - } else if (args[0].equals("-htmldir")) { - try { - final StringBuilder page = dirhtml(args[1], 21, args[2], ANONYMOUS, "anomic@"); - final File file = new File("dirindex.html"); - FileOutputStream fos; - fos = new FileOutputStream(file); - fos.write(UTF8.getBytes(page.toString())); - fos.close(); - } catch (final FileNotFoundException e) { - log.warn(e); - } catch (final IOException e) { - log.warn(e); - } - } else if (args[0].equals("-sitelist")) { - try { - final BlockingQueue q = sitelist(args[1], Integer.parseInt(args[2]), ANONYMOUS, "anomic"); - entryInfo entry; - while ((entry = q.take()) != FTPClient.POISON_entryInfo) { - System.out.println(entry.toString()); - } - } catch (final FileNotFoundException e) { - log.warn(e); - } catch (final IOException e) { - log.warn(e); - } catch (final InterruptedException e) { - log.warn(e); - } - } else { - printHelp(); - } - } else if (args.length == 4) { - if (args[0].equals("-get")) { - getAnonymous(args[1], args[2], new File(args[3])); - } else { - printHelp(); - } - } else if (args.length == 5) { - if (args[0].equals("-dir")) { - dir(args[1], args[2], args[3], args[4]); - } else { - printHelp(); - } - } else if (args.length == 6) { - if (args[0].equals("-get")) { - get(args[1], args[2], new File(args[3]), args[4], args[5]); - } else if (args[0].equals("-put")) { - try { - put(args[1], new File(args[2]), args[3], "", args[4], args[5]); - } catch (final IOException e) { - // TODO Auto-generated catch block - log.warn(e.getMessage(), e); - } - } else { - printHelp(); - } - } else { - printHelp(); - } + try { + System.out.println("WELCOME TO THE ANOMIC FTP CLIENT v" + vDATE); + System.out.println("Visit http://www.anomic.de and support shareware!"); + 
System.out.println("try -h for command line options"); + System.out.println(); + if (args.length == 1) { + if (args[0].equals("-h")) { + printHelp(); + } + } else if (args.length == 2) { + printHelp(); + } else if (args.length == 3) { + if (args[0].equals("-dir")) { + dir(args[1], args[2], ANONYMOUS, "anomic@"); + } else if (args[0].equals("-htmldir")) { + try { + final StringBuilder page = dirhtml(args[1], 21, args[2], ANONYMOUS, "anomic@"); + final File file = new File("dirindex.html"); + FileOutputStream fos; + fos = new FileOutputStream(file); + fos.write(UTF8.getBytes(page.toString())); + fos.close(); + } catch (final FileNotFoundException e) { + log.warn(e); + } catch (final IOException e) { + log.warn(e); + } + } else { + printHelp(); + } + } else if (args.length == 4) { + if (args[0].equals("-get")) { + getAnonymous(args[1], args[2], new File(args[3])); + } else if (args[0].equals("-sitelist")) { + try { + final BlockingQueue q = sitelist(args[1], Integer.parseInt(args[2]), ANONYMOUS, "anomic", "/", Integer.parseInt(args[3])); + entryInfo entry; + while ((entry = q.take()) != FTPClient.POISON_entryInfo) { + System.out.println(entry.toString()); + } + } catch (final FileNotFoundException e) { + log.warn(e); + } catch (final IOException e) { + log.warn(e); + } catch (final InterruptedException e) { + log.warn(e); + } + } else { + printHelp(); + } + } else if (args.length == 5) { + if (args[0].equals("-dir")) { + dir(args[1], args[2], args[3], args[4]); + } else { + printHelp(); + } + } else if (args.length == 6) { + if (args[0].equals("-get")) { + get(args[1], args[2], new File(args[3]), args[4], args[5]); + } else if (args[0].equals("-put")) { + try { + put(args[1], new File(args[2]), args[3], "", args[4], args[5]); + } catch (final IOException e) { + log.warn(e.getMessage(), e); + } + } else { + printHelp(); + } + } else { + printHelp(); + } + } finally { + ConcurrentLog.shutdown(); + } } } diff --git a/source/net/yacy/crawler/CrawlStacker.java 
b/source/net/yacy/crawler/CrawlStacker.java index 622f41ac0..080e5e1c0 100644 --- a/source/net/yacy/crawler/CrawlStacker.java +++ b/source/net/yacy/crawler/CrawlStacker.java @@ -227,12 +227,9 @@ public final class CrawlStacker { } if (url.getProtocol().equals("ftp")) { - // put the whole ftp site on the crawl stack - String userInfo = url.getUserInfo(); - int p = userInfo == null ? -1 : userInfo.indexOf(':'); - String user = userInfo == null ? FTPClient.ANONYMOUS : userInfo.substring(0, p); - String pw = userInfo == null || p == -1 ? "anomic" : userInfo.substring(p + 1); - enqueueEntriesFTP(initiator, profileHandle, url.getHost(), url.getPort(), user, pw, replace, timezoneOffset); + /* put ftp site entries on the crawl stack, + * using the crawl profile depth to control how many children folders of the url are stacked */ + enqueueEntriesFTP(initiator, profile, url, replace, timezoneOffset); } else { // put entry on crawl stack enqueueEntry(new Request( @@ -248,24 +245,35 @@ public final class CrawlStacker { } } } - + + /** + * Asynchronously enqueue crawl start entries for a ftp url. + * @param initiator Hash of the peer initiating the crawl + * @param profile the active crawl profile + * @param ftpURL crawl start point URL : protocol must be ftp + * @param replace Specify whether old indexed entries should be replaced + * @param timezoneOffset local time-zone offset + */ public void enqueueEntriesFTP( final byte[] initiator, - final String profileHandle, - final String host, - final int port, - final String user, - final String pw, + final CrawlProfile profile, + final DigestURL ftpURL, final boolean replace, final int timezoneOffset) { final CrawlQueues cq = this.nextQueue; + final String userInfo = ftpURL.getUserInfo(); + final int p = userInfo == null ? -1 : userInfo.indexOf(':'); + final String user = userInfo == null ? FTPClient.ANONYMOUS : userInfo.substring(0, p); + final String pw = userInfo == null || p == -1 ? 
"anomic" : userInfo.substring(p + 1); + final String host = ftpURL.getHost(); + final int port = ftpURL.getPort(); new Thread() { @Override public void run() { Thread.currentThread().setName("enqueueEntriesFTP"); BlockingQueue queue; try { - queue = FTPClient.sitelist(host, port, user, pw); + queue = FTPClient.sitelist(host, port, user, pw, ftpURL.getPath(), profile.depth()); FTPClient.entryInfo entry; while ((entry = queue.take()) != FTPClient.POISON_entryInfo) { @@ -289,7 +297,7 @@ public final class CrawlStacker { null, MultiProtocolURL.unescape(entry.name), entry.date, - profileHandle, + profile.handle(), 0, timezoneOffset)); } diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index f13091249..fb9de3661 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -116,7 +116,6 @@ import net.yacy.cora.protocol.ConnectionInfo; import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.TimeoutRequest; -import net.yacy.cora.protocol.ftp.FTPClient; import net.yacy.cora.protocol.http.HTTPClient; import net.yacy.cora.protocol.http.ProxySettings; import net.yacy.cora.util.CommonPattern; @@ -3223,17 +3222,12 @@ public final class Switchboard extends serverSwitch { if (url.isFTP()) { try { this.crawler.putActive(handle, profile); - String userInfo = url.getUserInfo(); - int p = userInfo == null ? -1 : userInfo.indexOf(':'); - String user = userInfo == null ? FTPClient.ANONYMOUS : userInfo.substring(0, p); - String pw = userInfo == null || p == -1 ? "anomic" : userInfo.substring(p + 1); + /* put ftp site entries on the crawl stack, + * using the crawl profile depth to control how many children folders of the url are stacked */ this.crawlStacker.enqueueEntriesFTP( this.peers.mySeed().hash.getBytes(), - profile.handle(), - url.getHost(), - url.getPort(), - user, - pw, + profile, + url, false, profile.timezoneOffset()); return null;