From c36da90261cde51c00e76d7e31ce8b3481e5491b Mon Sep 17 00:00:00 2001
From: orbiter
Date: Thu, 9 Dec 2010 17:17:25 +0000
Subject: [PATCH] added a very fast FTP file list generator to the site crawler:

- when a site crawl is started for an FTP site, a special directory-tree
  harvester now retrieves the complete directory structure of the FTP server
  at once
- the harvester runs concurrently and feeds into the normal crawl queue

also in this change:

- fixed the 'start from file' crawl function
- added a link detector to the html parser; the html parser can now also
  extract links that are not embedded in tags
- as a result, a crawl can now also be started from plain-text link files

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7367 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 htroot/Crawler_p.java                          |  78 +++++++----
 htroot/js/IndexCreate.js                       |   1 +
 source/de/anomic/crawler/CrawlStacker.java     |  90 +++++++++++--
 .../net/yacy/cora/protocol/ftp/FTPClient.java  | 123 ++++++++++--------
 .../document/parser/html/ContentScraper.java   |  22 ++++
 5 files changed, 224 insertions(+), 90 deletions(-)

diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java
index d114a3e15..e24d3de29 100644
--- a/htroot/Crawler_p.java
+++ b/htroot/Crawler_p.java
@@ -127,7 +127,10 @@ public class Crawler_p {
         String crawlingStart = post.get("crawlingURL","").trim(); // the crawljob start url
         // add the prefix http:// if necessary
         int pos = crawlingStart.indexOf("://");
-        if (pos == -1) crawlingStart = "http://" + crawlingStart;
+        if (pos == -1) {
+            if (crawlingStart.startsWith("www")) crawlingStart = "http://" + crawlingStart;
+            if (crawlingStart.startsWith("ftp")) crawlingStart = "ftp://" + crawlingStart;
+        }
         // normalize URL
         DigestURI crawlingStartURL = null;
@@ -148,6 +151,8 @@ public class Crawler_p {
                 newcrawlingMustMatch = "file://" + crawlingStartURL.getPath() + ".*";
             } else if (crawlingStartURL.isSMB()) {
                 newcrawlingMustMatch = "smb://.*" + crawlingStartURL.getHost() + ".*" + crawlingStartURL.getPath() + ".*";
+            } else if (crawlingStartURL.isFTP()) {
+                newcrawlingMustMatch = "ftp://.*" + crawlingStartURL.getHost() + ".*" + crawlingStartURL.getPath() + ".*";
             } else {
                 newcrawlingMustMatch = ".*" + crawlingStartURL.getHost() + ".*";
             }
@@ -189,10 +194,10 @@ public class Crawler_p {
         // store this call as api call
         if (repeat_time > 0) {
             // store as scheduled api call
-            sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + crawlingStart, repeat_time, repeat_unit.substring(3));
+            sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + ((crawlingStart == null) ? post.get("crawlingFile", "") : crawlingStart), repeat_time, repeat_unit.substring(3));
         } else {
             // store just a protocol
-            sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + crawlingStart);
+            sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + ((crawlingStart == null) ? post.get("crawlingFile", "") : crawlingStart));
         }
         final boolean crawlingDomMaxCheck = "on".equals(post.get("crawlingDomMaxCheck", "off"));
@@ -225,7 +230,44 @@ public class Crawler_p {
"true" : "false"); final String crawlingMode = post.get("crawlingMode","url"); - if ("url".equals(crawlingMode)) { + if (crawlingStart != null && crawlingStart.startsWith("ftp")) { + try { + // check if the crawl filter works correctly + Pattern.compile(newcrawlingMustMatch); + final CrawlProfile profile = new CrawlProfile( + crawlingStart, + crawlingStartURL, + newcrawlingMustMatch, + CrawlProfile.MATCH_NEVER, + newcrawlingdepth, + crawlingIfOlder, + crawlingDomMaxPages, + crawlingQ, + indexText, + indexMedia, + storeHTCache, + crawlOrder, + xsstopw, + xdstopw, + xpstopw, + cachePolicy); + sb.crawler.profilesActiveCrawls.put(profile.handle().getBytes(), profile); + sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); + final DigestURI url = crawlingStartURL; + sb.crawlStacker.queueEntries(sb.peers.mySeed().hash.getBytes(), profile.handle(), "ftp", url.getHost(), url.getPort(), false); + } catch (final PatternSyntaxException e) { + prop.put("info", "4"); // crawlfilter does not match url + prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch); + prop.putHTML("info_error", e.getMessage()); + } catch (final Exception e) { + // mist + prop.put("info", "7"); // Error with file + prop.putHTML("info_crawlingStart", crawlingStart); + prop.putHTML("info_error", e.getMessage()); + Log.logException(e); + } + sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); + } else if ("url".equals(crawlingMode)) { // check if pattern matches if ((crawlingStart == null || crawlingStartURL == null) /* || (!(crawlingStart.matches(newcrawlingfilter))) */) { @@ -334,12 +376,12 @@ public class Crawler_p { reasonString); } } catch (final PatternSyntaxException e) { - prop.put("info", "4"); //crawlfilter does not match url + prop.put("info", "4"); // crawlfilter does not match url prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch); prop.putHTML("info_error", e.getMessage()); } catch (final Exception e) { // mist - prop.put("info", "6");//Error with url + prop.put("info", "6"); // Error with url prop.putHTML("info_crawlingStart", crawlingStart); prop.putHTML("info_error", e.getMessage()); Log.logException(e); @@ -378,32 +420,14 @@ public class Crawler_p { cachePolicy); sb.crawler.profilesActiveCrawls.put(profile.handle().getBytes(), profile); sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); - final Iterator> linkiterator = hyperlinks.entrySet().iterator(); - DigestURI nexturl; - while (linkiterator.hasNext()) { - final Map.Entry e = linkiterator.next(); - if (e.getKey() == null) continue; - nexturl = new DigestURI(e.getKey()); - sb.crawlStacker.enqueueEntry(new Request( - sb.peers.mySeed().hash.getBytes(), - nexturl, - null, - e.getValue(), - new Date(), - profile.handle(), - 0, - 0, - 0 - )); - } - + sb.crawlStacker.queueEntries(sb.peers.mySeed().hash.getBytes(), profile.handle(), hyperlinks, true); } catch (final PatternSyntaxException e) { - prop.put("info", "4"); //crawlfilter does not match url + prop.put("info", "4"); // crawlfilter does not match url prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch); prop.putHTML("info_error", e.getMessage()); } catch (final Exception e) { // mist - prop.put("info", "7");//Error with file + prop.put("info", "7"); // Error with file prop.putHTML("info_crawlingStart", fileName); prop.putHTML("info_error", e.getMessage()); Log.logException(e); diff --git a/htroot/js/IndexCreate.js b/htroot/js/IndexCreate.js index a33cf505e..41f7f5378 100644 --- a/htroot/js/IndexCreate.js +++ b/htroot/js/IndexCreate.js @@ -74,5 +74,6 @@ 
 function loadInfos() {
 	document.getElementById("ajax").setAttribute("src",AJAX_ON);
 	url=document.getElementById("crawlingURL").value;
+	if (url.indexOf("ftp") == 0 || url.indexOf("smb") == 0) document.getElementById("crawlingQ").disabled=true; else document.getElementById("crawlingQ").disabled=false;
 	sndReq('/api/util/getpageinfo_p.xml?actions=title,robots&url='+url);
 }
diff --git a/source/de/anomic/crawler/CrawlStacker.java b/source/de/anomic/crawler/CrawlStacker.java
index d52d1c34d..50821a7f0 100644
--- a/source/de/anomic/crawler/CrawlStacker.java
+++ b/source/de/anomic/crawler/CrawlStacker.java
@@ -28,12 +28,17 @@
 package de.anomic.crawler;
+import java.io.IOException;
 import java.net.InetAddress;
+import java.net.MalformedURLException;
 import java.net.UnknownHostException;
 import java.util.Date;
 import java.util.Map;
+import java.util.concurrent.BlockingQueue;
+import net.yacy.cora.document.MultiProtocolURI;
 import net.yacy.cora.protocol.Domains;
+import net.yacy.cora.protocol.ftp.FTPClient;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.data.meta.URIMetadataRow;
 import net.yacy.kelondro.logging.Log;
@@ -131,15 +136,7 @@ public final class CrawlStacker {
         // we just don't know anything about that host
         return false;
     }
-
-    /*
-    public boolean job() {
-        if (this.fastQueue.queueSize() > 0 && job(this.fastQueue)) return true;
-        if (this.slowQueue.queueSize() == 0) return false;
-        return job(this.slowQueue);
-    }
-    */
-
+
     public Request job(Request entry) {
         // this is the method that is called by the busy thread from outside
         if (entry == null) return null;
@@ -180,6 +177,81 @@ public final class CrawlStacker {
         }
     }
+    public void queueEntries(byte[] initiator, String profileHandle, Map hyperlinks, boolean replace) {
+        for (Map.Entry e: hyperlinks.entrySet()) {
+            if (e.getKey() == null) continue;
+
+            // delete old entry, if exists to force a re-load of the url (thats wanted here)
+            final DigestURI url = new DigestURI(e.getKey());
+            final byte[] urlhash = url.hash();
+            if (replace) {
+                indexSegment.urlMetadata().remove(urlhash);
+                this.nextQueue.noticeURL.removeByURLHash(urlhash);
+                this.nextQueue.errorURL.remove(urlhash);
+            }
+
+            // put entry on crawl stack
+            enqueueEntry(new Request(
+                    initiator,
+                    url,
+                    null,
+                    e.getValue(),
+                    new Date(),
+                    profileHandle,
+                    0,
+                    0,
+                    0
+                    ));
+        }
+    }
+
+    public void queueEntries(final byte[] initiator, final String profileHandle, final String protocol, final String host, final int port, final boolean replace) {
+        final CrawlQueues cq = this.nextQueue;
+        new Thread() {
+            public void run() {
+                BlockingQueue queue;
+                try {
+                    queue = FTPClient.sitelist(host, port);
+                    FTPClient.entryInfo entry;
+                    while ((entry = queue.take()) != FTPClient.POISON_entryInfo) {
+
+                        // delete old entry, if exists to force a re-load of the url (thats wanted here)
+                        DigestURI url = null;
+                        try {
+                            if (protocol.equals("ftp")) url = new DigestURI("ftp://" + host + (port == 21 ? "" : ":" + port) + entry.name);
+                            else if (protocol.equals("smb")) url = new DigestURI("smb://" + host + entry.name);
+                            else if (protocol.equals("http")) url = new DigestURI("http://" + host + (port == 80 ? "" : ":" + port) + entry.name);
+                            else if (protocol.equals("https")) url = new DigestURI("https://" + host + (port == 443 ? "" : ":" + port) + entry.name);
"" : ":" + port) + entry.name); + } catch (MalformedURLException e) { + continue; + } + final byte[] urlhash = url.hash(); + if (replace) { + indexSegment.urlMetadata().remove(urlhash); + cq.noticeURL.removeByURLHash(urlhash); + cq.errorURL.remove(urlhash); + } + + // put entry on crawl stack + enqueueEntry(new Request( + initiator, + url, + null, + entry.name, + entry.date, + profileHandle, + 0, + 0, + 0 + )); + } + } catch (IOException e1) { + } catch (InterruptedException e) { + } + } + }.start(); + } + public String stackCrawl(final Request entry) { // stacks a crawl item. The position can also be remote // returns null if successful, a reason string if not successful diff --git a/source/net/yacy/cora/protocol/ftp/FTPClient.java b/source/net/yacy/cora/protocol/ftp/FTPClient.java index 8fdf4478d..9bb8e2623 100644 --- a/source/net/yacy/cora/protocol/ftp/FTPClient.java +++ b/source/net/yacy/cora/protocol/ftp/FTPClient.java @@ -617,7 +617,7 @@ public class FTPClient { // /// try to parse LIST output (1 command) final entryInfo info = fileInfo(path); if (info != null) { - return info.isDir; + return info.type == filetype.directory; } // /// try to change to folder (4 commands) @@ -1045,7 +1045,9 @@ public class FTPClient { // groups: 1: rights, 2: size, 3: month, 4: day, 5: time or year, 6: name final Matcher tokens = lsStyle.matcher(line); if (tokens.matches()) { - final boolean isDir = tokens.group(1).startsWith("d"); + filetype type = filetype.file; + if (tokens.group(1).startsWith("d")) type = filetype.directory; + if (tokens.group(1).startsWith("l")) type = filetype.link; int size = -1; try { size = Integer.parseInt(tokens.group(2)); @@ -1076,7 +1078,7 @@ public class FTPClient { log.warn("---- Error: not ls date-format '" + dateString, e); date = new Date(); } - return new entryInfo(isDir, size, date, tokens.group(6)); + return new entryInfo(type, size, date, tokens.group(6)); } return null; } @@ -1084,6 +1086,10 @@ public class FTPClient { public static final entryInfo POISON_entryInfo = new entryInfo(); + public static enum filetype { + file, link, directory; + } + /** * parameter class * @@ -1092,9 +1098,9 @@ public class FTPClient { */ public static class entryInfo { /** - * is this a directory? 
+         * file type
          */
-        public final boolean isDir;
+        public final filetype type;
         /**
          * size in bytes
          */
@@ -1109,7 +1115,7 @@ public class FTPClient {
         public String name;
         public entryInfo() {
-            this.isDir = false;
+            this.type = filetype.file;
             this.size = -1;
             this.date = null;
             this.name = null;
@@ -1124,8 +1130,8 @@ public class FTPClient {
          * @param date
          * @param name
          */
-        public entryInfo(final boolean isDir, final int size, final Date date, final String name) {
-            this.isDir = isDir;
+        public entryInfo(final filetype type, final int size, final Date date, final String name) {
+            this.type = type;
             this.size = size;
             this.date = date;
             this.name = name;
@@ -1139,8 +1145,8 @@ public class FTPClient {
         public String toString() {
             final StringBuilder info = new StringBuilder(100);
             info.append(name);
-            info.append(" (isDir=");
-            info.append(isDir);
+            info.append(" (type=");
+            info.append(type.name());
             info.append(", size=");
             info.append(size);
             info.append(", ");
@@ -1349,28 +1355,32 @@ public class FTPClient {
         }
         // starting data transaction
-        final Socket data = getDataSocket();
-        final BufferedReader ClientStream = new BufferedReader(new InputStreamReader(data.getInputStream()));
+        final Socket dataSocket = getDataSocket();
+        final BufferedReader dataStream = new BufferedReader(new InputStreamReader(dataSocket.getInputStream()));
         // read file system data
         String line;
         final ArrayList files = new ArrayList();
         try {
-            while ((line = ClientStream.readLine()) != null) {
+            while ((line = dataStream.readLine()) != null) {
                 if (!line.startsWith("total ")) {
                     files.add(line);
                 }
             }
-            // after stream is empty we should get control completion echo
-            /*reply =*/ receive();
-
-            // boolean success = !isNotPositiveCompletion(reply);
-
-            // shutdown connection
-            ClientStream.close(); // Closing the returned InputStream will
+        } catch (IOException e1) {
+            e1.printStackTrace();
+        } finally {try {
+            // shutdown data connection
+            dataStream.close(); // Closing the returned InputStream will
             closeDataSocket(); // close the associated socket.
         } catch (IOException e) {
-        }
+            e.printStackTrace();
+        }}
+        // after stream is empty we should get control completion echo
+        reply = receive();
+        //System.out.println("reply of LIST: " + reply);
+        // boolean success = !isNotPositiveCompletion(reply);
+        files.trimToSize();
         return files;
     }
@@ -1562,23 +1572,11 @@ public class FTPClient {
      */
     private void closeConnection() throws IOException {
         // cleanup
-        if (ControlSocket != null) {
-            clientOutput.close();
-            clientInput.close();
-            ControlSocket.close();
-            ControlSocket = null;
-        }
-
-        if (DataSocketActive != null) {
-            DataSocketActive.close();
-            DataSocketActive = null;
-        }
-        if (DataSocketPassive != null) {
-            DataSocketPassive.close();
-            DataSocketPassive = null; // "Once a socket has been closed, it is
-                                      // not available for further networking
-                                      // use"
-        }
+        if (clientOutput != null) clientOutput.close();
+        if (clientInput != null) clientInput.close();
+        if (ControlSocket != null) ControlSocket.close();
+        if (DataSocketActive != null) DataSocketActive.close();
+        if (DataSocketPassive != null) DataSocketPassive.close();
     }
     public boolean PROMPT() {
@@ -2516,15 +2514,15 @@ public class FTPClient {
      * @throws IOException
      */
     public static BlockingQueue sitelist(final String host, final int port) throws IOException {
-        final FTPClient c = new FTPClient();
-        c.open(host, port);
-        c.login("anonymous", "anomic@");
+        final FTPClient ftpClient = new FTPClient();
+        ftpClient.open(host, port);
+        ftpClient.login("anonymous", "anomic@");
         final LinkedBlockingQueue queue = new LinkedBlockingQueue();
         new Thread() {
             public void run() {
                 try {
-                    sitelist(c, "/", queue);
-                    c.quit();
+                    sitelist(ftpClient, "/", queue);
+                    ftpClient.quit();
                 } catch (Exception e) {} finally {
                     queue.add(POISON_entryInfo);
                 }
@@ -2532,24 +2530,27 @@ public class FTPClient {
         }.start();
         return queue;
     }
-    private static void sitelist(final FTPClient c, String path, LinkedBlockingQueue queue) {
+    private static void sitelist(final FTPClient ftpClient, String path, LinkedBlockingQueue queue) {
         List list;
         try {
-            list = c.list(path, true);
+            list = ftpClient.list(path, true);
         } catch (IOException e) {
+            e.printStackTrace();
             return;
         }
         if (!path.endsWith("/")) path += "/";
         entryInfo info;
         for (final String line : list) {
             info = parseListData(line);
-            if (info != null) {
-                if (info.isDir) {
-                    sitelist(c, path + info.name, queue);
-                } else {
-                    if (!info.name.startsWith("/")) info.name = path + info.name;
-                    queue.add(info);
-                }
+            if (info != null && info.type == filetype.file) {
+                if (!info.name.startsWith("/")) info.name = path + info.name;
+                queue.add(info);
+            }
+        }
+        for (final String line : list) {
+            info = parseListData(line);
+            if (info != null && info.type == filetype.directory) {
+                sitelist(ftpClient, path + info.name, queue);
             }
         }
     }
@@ -2617,7 +2618,7 @@ public class FTPClient {
             // with link
             nameStart = line.indexOf(info.name);
             page.append(line.substring(0, nameStart));
-            page.append("" + info.name + "");
+            page.append("" + info.name + "");
             nameEnd = nameStart + info.name.length();
             if (line.length() > nameEnd) {
                 page.append(line.substring(nameEnd));
@@ -2782,6 +2783,20 @@ public class FTPClient {
             } catch (final IOException e) {
                 log.error(e);
             }
+        } else if (args[0].equals("-sitelist")) {
+            try {
+                final BlockingQueue q = sitelist(args[1], Integer.parseInt(args[2]));
+                entryInfo entry;
+                while ((entry = q.take()) != FTPClient.POISON_entryInfo) {
+                    System.out.println(entry.toString());
+                }
+            } catch (final FileNotFoundException e) {
+                log.error(e);
+            } catch (final IOException e) {
+                log.error(e);
+            } catch (InterruptedException e) {
+                log.error(e);
+            }
         } else {
             printHelp();
         }
@@ -2814,5 +2829,5 @@ public class FTPClient {
             printHelp();
         }
     }
-    
+
 }
diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java
index a3dfe8a16..e79d416bb 100644
--- a/source/net/yacy/document/parser/html/ContentScraper.java
+++ b/source/net/yacy/document/parser/html/ContentScraper.java
@@ -132,9 +132,31 @@ public class ContentScraper extends AbstractScraper implements Scraper {
             if ((b.length() != 0) && (!(punctuation(b.charAt(b.length() - 1))))) b = b + '.';
             //System.out.println("*** Appended dot: " + b.toString());
         }
+        // find http links inside text
+        int p, q, s = 0;
+        String u;
+        MultiProtocolURI url;
+        while (s < b.length()) {
+            p = Math.min(find(b, "smb://", s), Math.min(find(b, "ftp://", s), Math.min(find(b, "http://", s), find(b, "https://", s))));
+            if (p == Integer.MAX_VALUE) break;
+            q = b.indexOf(" ", p + 1);
+            u = b.substring(p, q < 0 ? b.length() : q);
+            s = p + 1;
+            try {
+                url = new MultiProtocolURI(u);
+                anchors.put(url, u);
+                continue;
+            } catch (MalformedURLException e) {}
+        }
+
         // append string to content
         if (b.length() != 0) content.append(b).append(32);
     }
+    private static final int find(final String s, final String m, int start) {
+        int p = s.indexOf(m, start);
+        return (p < 0) ? Integer.MAX_VALUE : p;
+    }
+
     private MultiProtocolURI absolutePath(final String relativePath) {
         try {
             return MultiProtocolURI.newURL(root, relativePath);
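
Note on the link detector added to ContentScraper above: the idea is to scan visible text for known protocol prefixes and treat everything up to the next space as a candidate URL. Below is a minimal, standalone sketch of that scanning approach, not YaCy code: it assumes plain java.net.URI in place of YaCy's MultiProtocolURI, and the class name TextLinkDetector is hypothetical.

// Standalone sketch of the plain-text link detection idea (assumptions noted above).
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;

public class TextLinkDetector {

    private static final String[] PREFIXES = {"http://", "https://", "ftp://", "smb://"};

    // returns Integer.MAX_VALUE instead of -1 so Math.min can pick the earliest match
    private static int find(final String s, final String m, final int start) {
        final int p = s.indexOf(m, start);
        return (p < 0) ? Integer.MAX_VALUE : p;
    }

    public static List<URI> detect(final String text) {
        final List<URI> links = new ArrayList<URI>();
        int s = 0;
        while (s < text.length()) {
            // earliest occurrence of any known protocol prefix
            int p = Integer.MAX_VALUE;
            for (final String prefix : PREFIXES) {
                p = Math.min(p, find(text, prefix, s));
            }
            if (p == Integer.MAX_VALUE) break;
            // take everything up to the next space as the candidate URL
            final int q = text.indexOf(' ', p + 1);
            final String u = text.substring(p, q < 0 ? text.length() : q);
            s = p + 1; // continue scanning after the match, even if parsing fails
            try {
                links.add(new URI(u));
            } catch (final URISyntaxException e) {
                // not a parseable URL, ignore and keep scanning
            }
        }
        return links;
    }

    public static void main(final String[] args) {
        final String sample = "see http://example.org/a and ftp://ftp.example.org/pub/file.txt for details";
        for (final URI u : detect(sample)) {
            System.out.println(u);
        }
    }
}

The find() helper maps "not found" to Integer.MAX_VALUE rather than -1 so that a plain Math.min over all prefixes yields the earliest match, mirroring the helper introduced in the ContentScraper hunk; this is what lets plain-text link lists serve directly as crawl start files.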