added a very fast FTP file list generator to the site crawler:

- when a site crawl is started for an FTP site, a special directory-tree harvester now fetches the complete directory structure of the FTP server at once
- the harvester runs concurrently and feeds into the normal crawl queue (a sketch of this producer/consumer pattern follows below)

also in this commit:
- fixed the 'start from file' crawl function
- added a link detector to the html parser; the parser can now also extract links that are not enclosed in <a> tags
- as a consequence, a crawl can now also be started from clear-text link files

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7367 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 15 years ago
parent 4565b2f2c0
commit c36da90261
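
A note on the pattern: the harvester implemented below is a producer/consumer setup. FTPClient.sitelist() walks the remote directory tree on its own thread and puts one entryInfo per file into a BlockingQueue, terminating the stream with the POISON_entryInfo sentinel; CrawlStacker drains that queue and stacks each entry as a crawl request. A minimal self-contained sketch of the same poison-pill pattern, with all names illustrative rather than taken from the patch:

    import java.util.concurrent.BlockingQueue;
    import java.util.concurrent.LinkedBlockingQueue;

    // Sketch of the poison-pill producer/consumer pattern used by the
    // directory-tree harvester; every name here is illustrative.
    public class HarvesterSketch {
        static final String POISON = new String("POISON"); // sentinel, compared by identity

        public static void main(String[] args) throws InterruptedException {
            final BlockingQueue<String> queue = new LinkedBlockingQueue<String>();

            // producer: walks the remote tree concurrently and feeds the queue
            new Thread() {
                public void run() {
                    try {
                        for (final String path : new String[] { "/a.txt", "/sub/b.txt" }) {
                            queue.put(path); // one entry per remote file
                        }
                    } catch (final InterruptedException e) {
                    } finally {
                        queue.add(POISON); // always signal end-of-listing
                    }
                }
            }.start();

            // consumer: drains entries until the sentinel arrives, so the crawl
            // queue can fill while the listing is still running
            String entry;
            while ((entry = queue.take()) != POISON) {
                System.out.println("stack crawl request for " + entry);
            }
        }
    }

The sentinel lets the consumer terminate without polling or timeouts, which is what allows the crawl queue to be fed while the FTP listing is still in progress.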

@@ -127,7 +127,10 @@ public class Crawler_p {
         String crawlingStart = post.get("crawlingURL","").trim(); // the crawljob start url
         // add the prefix http:// if necessary
         int pos = crawlingStart.indexOf("://");
-        if (pos == -1) crawlingStart = "http://" + crawlingStart;
+        if (pos == -1) {
+            if (crawlingStart.startsWith("www")) crawlingStart = "http://" + crawlingStart;
+            if (crawlingStart.startsWith("ftp")) crawlingStart = "ftp://" + crawlingStart;
+        }
 
         // normalize URL
         DigestURI crawlingStartURL = null;
@@ -148,6 +151,8 @@ public class Crawler_p {
             newcrawlingMustMatch = "file://" + crawlingStartURL.getPath() + ".*";
         } else if (crawlingStartURL.isSMB()) {
             newcrawlingMustMatch = "smb://.*" + crawlingStartURL.getHost() + ".*" + crawlingStartURL.getPath() + ".*";
+        } else if (crawlingStartURL.isFTP()) {
+            newcrawlingMustMatch = "ftp://.*" + crawlingStartURL.getHost() + ".*" + crawlingStartURL.getPath() + ".*";
         } else {
             newcrawlingMustMatch = ".*" + crawlingStartURL.getHost() + ".*";
         }
@@ -189,10 +194,10 @@ public class Crawler_p {
         // store this call as api call
         if (repeat_time > 0) {
             // store as scheduled api call
-            sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + crawlingStart, repeat_time, repeat_unit.substring(3));
+            sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + ((crawlingStart == null) ? post.get("crawlingFile", "") : crawlingStart), repeat_time, repeat_unit.substring(3));
         } else {
             // store just a protocol
-            sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + crawlingStart);
+            sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + ((crawlingStart == null) ? post.get("crawlingFile", "") : crawlingStart));
         }
 
         final boolean crawlingDomMaxCheck = "on".equals(post.get("crawlingDomMaxCheck", "off"));
@@ -225,7 +230,44 @@ public class Crawler_p {
         env.setConfig("xpstopw", (xpstopw) ? "true" : "false");
 
         final String crawlingMode = post.get("crawlingMode","url");
-        if ("url".equals(crawlingMode)) {
+        if (crawlingStart != null && crawlingStart.startsWith("ftp")) {
+            try {
+                // check if the crawl filter works correctly
+                Pattern.compile(newcrawlingMustMatch);
+                final CrawlProfile profile = new CrawlProfile(
+                        crawlingStart,
+                        crawlingStartURL,
+                        newcrawlingMustMatch,
+                        CrawlProfile.MATCH_NEVER,
+                        newcrawlingdepth,
+                        crawlingIfOlder,
+                        crawlingDomMaxPages,
+                        crawlingQ,
+                        indexText,
+                        indexMedia,
+                        storeHTCache,
+                        crawlOrder,
+                        xsstopw,
+                        xdstopw,
+                        xpstopw,
+                        cachePolicy);
+                sb.crawler.profilesActiveCrawls.put(profile.handle().getBytes(), profile);
+                sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
+                final DigestURI url = crawlingStartURL;
+                sb.crawlStacker.queueEntries(sb.peers.mySeed().hash.getBytes(), profile.handle(), "ftp", url.getHost(), url.getPort(), false);
+            } catch (final PatternSyntaxException e) {
+                prop.put("info", "4"); // crawlfilter does not match url
+                prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
+                prop.putHTML("info_error", e.getMessage());
+            } catch (final Exception e) {
+                // mist
+                prop.put("info", "7"); // Error with file
+                prop.putHTML("info_crawlingStart", crawlingStart);
+                prop.putHTML("info_error", e.getMessage());
+                Log.logException(e);
+            }
+            sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
+        } else if ("url".equals(crawlingMode)) {
             // check if pattern matches
             if ((crawlingStart == null || crawlingStartURL == null) /* || (!(crawlingStart.matches(newcrawlingfilter))) */) {
@@ -334,12 +376,12 @@ public class Crawler_p {
                             reasonString);
                 }
             } catch (final PatternSyntaxException e) {
-                prop.put("info", "4"); //crawlfilter does not match url
+                prop.put("info", "4"); // crawlfilter does not match url
                 prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
                 prop.putHTML("info_error", e.getMessage());
             } catch (final Exception e) {
                 // mist
-                prop.put("info", "6");//Error with url
+                prop.put("info", "6"); // Error with url
                 prop.putHTML("info_crawlingStart", crawlingStart);
                 prop.putHTML("info_error", e.getMessage());
                 Log.logException(e);
@@ -378,32 +420,14 @@ public class Crawler_p {
                         cachePolicy);
                 sb.crawler.profilesActiveCrawls.put(profile.handle().getBytes(), profile);
                 sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
-                final Iterator<Map.Entry<MultiProtocolURI, String>> linkiterator = hyperlinks.entrySet().iterator();
-                DigestURI nexturl;
-                while (linkiterator.hasNext()) {
-                    final Map.Entry<MultiProtocolURI, String> e = linkiterator.next();
-                    if (e.getKey() == null) continue;
-                    nexturl = new DigestURI(e.getKey());
-                    sb.crawlStacker.enqueueEntry(new Request(
-                            sb.peers.mySeed().hash.getBytes(),
-                            nexturl,
-                            null,
-                            e.getValue(),
-                            new Date(),
-                            profile.handle(),
-                            0,
-                            0,
-                            0
-                            ));
-                }
+                sb.crawlStacker.queueEntries(sb.peers.mySeed().hash.getBytes(), profile.handle(), hyperlinks, true);
             } catch (final PatternSyntaxException e) {
-                prop.put("info", "4"); //crawlfilter does not match url
+                prop.put("info", "4"); // crawlfilter does not match url
                 prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
                 prop.putHTML("info_error", e.getMessage());
             } catch (final Exception e) {
                 // mist
-                prop.put("info", "7");//Error with file
+                prop.put("info", "7"); // Error with file
                 prop.putHTML("info_crawlingStart", fileName);
                 prop.putHTML("info_error", e.getMessage());
                 Log.logException(e);

@@ -74,5 +74,6 @@ function loadInfos() {
 
     document.getElementById("ajax").setAttribute("src",AJAX_ON);
     url=document.getElementById("crawlingURL").value;
+    if (url.indexOf("ftp") == 0 || url.indexOf("smb") == 0) document.getElementById("crawlingQ").disabled=true; else document.getElementById("crawlingQ").disabled=false;
     sndReq('/api/util/getpageinfo_p.xml?actions=title,robots&url='+url);
 }

@@ -28,12 +28,17 @@
 package de.anomic.crawler;
 
+import java.io.IOException;
 import java.net.InetAddress;
+import java.net.MalformedURLException;
 import java.net.UnknownHostException;
 import java.util.Date;
 import java.util.Map;
+import java.util.concurrent.BlockingQueue;
 
+import net.yacy.cora.document.MultiProtocolURI;
 import net.yacy.cora.protocol.Domains;
+import net.yacy.cora.protocol.ftp.FTPClient;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.data.meta.URIMetadataRow;
 import net.yacy.kelondro.logging.Log;
@@ -131,15 +136,7 @@ public final class CrawlStacker {
         // we just don't know anything about that host
         return false;
     }
 
-    /*
-    public boolean job() {
-        if (this.fastQueue.queueSize() > 0 && job(this.fastQueue)) return true;
-        if (this.slowQueue.queueSize() == 0) return false;
-        return job(this.slowQueue);
-    }
-    */
-
     public Request job(Request entry) {
         // this is the method that is called by the busy thread from outside
         if (entry == null) return null;
@@ -180,6 +177,81 @@ public final class CrawlStacker {
         }
     }
 
+    public void queueEntries(byte[] initiator, String profileHandle, Map<MultiProtocolURI, String> hyperlinks, boolean replace) {
+        for (Map.Entry<MultiProtocolURI, String> e: hyperlinks.entrySet()) {
+            if (e.getKey() == null) continue;
+
+            // delete old entry, if exists to force a re-load of the url (thats wanted here)
+            final DigestURI url = new DigestURI(e.getKey());
+            final byte[] urlhash = url.hash();
+            if (replace) {
+                indexSegment.urlMetadata().remove(urlhash);
+                this.nextQueue.noticeURL.removeByURLHash(urlhash);
+                this.nextQueue.errorURL.remove(urlhash);
+            }
+
+            // put entry on crawl stack
+            enqueueEntry(new Request(
+                    initiator,
+                    url,
+                    null,
+                    e.getValue(),
+                    new Date(),
+                    profileHandle,
+                    0,
+                    0,
+                    0
+                    ));
+        }
+    }
+
+    public void queueEntries(final byte[] initiator, final String profileHandle, final String protocol, final String host, final int port, final boolean replace) {
+        final CrawlQueues cq = this.nextQueue;
+        new Thread() {
+            public void run() {
+                BlockingQueue<FTPClient.entryInfo> queue;
+                try {
+                    queue = FTPClient.sitelist(host, port);
+                    FTPClient.entryInfo entry;
+                    while ((entry = queue.take()) != FTPClient.POISON_entryInfo) {
+
+                        // delete old entry, if exists to force a re-load of the url (thats wanted here)
+                        DigestURI url = null;
+                        try {
+                            if (protocol.equals("ftp")) url = new DigestURI("ftp://" + host + (port == 21 ? "" : ":" + port) + entry.name);
+                            else if (protocol.equals("smb")) url = new DigestURI("smb://" + host + entry.name);
+                            else if (protocol.equals("http")) url = new DigestURI("http://" + host + (port == 80 ? "" : ":" + port) + entry.name);
+                            else if (protocol.equals("https")) url = new DigestURI("https://" + host + (port == 443 ? "" : ":" + port) + entry.name);
+                        } catch (MalformedURLException e) {
+                            continue;
+                        }
+                        final byte[] urlhash = url.hash();
+                        if (replace) {
+                            indexSegment.urlMetadata().remove(urlhash);
+                            cq.noticeURL.removeByURLHash(urlhash);
+                            cq.errorURL.remove(urlhash);
+                        }
+
+                        // put entry on crawl stack
+                        enqueueEntry(new Request(
+                                initiator,
+                                url,
+                                null,
+                                entry.name,
+                                entry.date,
+                                profileHandle,
+                                0,
+                                0,
+                                0
+                                ));
+                    }
+                } catch (IOException e1) {
+                } catch (InterruptedException e) {
+                }
+            }
+        }.start();
+    }
+
     public String stackCrawl(final Request entry) {
         // stacks a crawl item. The position can also be remote
         // returns null if successful, a reason string if not successful

@@ -617,7 +617,7 @@ public class FTPClient {
         // /// try to parse LIST output (1 command)
         final entryInfo info = fileInfo(path);
         if (info != null) {
-            return info.isDir;
+            return info.type == filetype.directory;
         }
 
         // /// try to change to folder (4 commands)
@@ -1045,7 +1045,9 @@ public class FTPClient {
         // groups: 1: rights, 2: size, 3: month, 4: day, 5: time or year, 6: name
         final Matcher tokens = lsStyle.matcher(line);
         if (tokens.matches()) {
-            final boolean isDir = tokens.group(1).startsWith("d");
+            filetype type = filetype.file;
+            if (tokens.group(1).startsWith("d")) type = filetype.directory;
+            if (tokens.group(1).startsWith("l")) type = filetype.link;
             int size = -1;
             try {
                 size = Integer.parseInt(tokens.group(2));
@@ -1076,7 +1078,7 @@ public class FTPClient {
                 log.warn("---- Error: not ls date-format '" + dateString, e);
                 date = new Date();
             }
-            return new entryInfo(isDir, size, date, tokens.group(6));
+            return new entryInfo(type, size, date, tokens.group(6));
         }
         return null;
     }
@@ -1084,6 +1086,10 @@ public class FTPClient {
 
     public static final entryInfo POISON_entryInfo = new entryInfo();
 
+    public static enum filetype {
+        file, link, directory;
+    }
+
     /**
      * parameter class
      *
@@ -1092,9 +1098,9 @@ public class FTPClient {
      */
     public static class entryInfo {
         /**
-         * is this a directory?
+         * file type
         */
-        public final boolean isDir;
+        public final filetype type;
        /**
         * size in bytes
         */
@@ -1109,7 +1115,7 @@ public class FTPClient {
         public String name;
 
         public entryInfo() {
-            this.isDir = false;
+            this.type = filetype.file;
             this.size = -1;
             this.date = null;
             this.name = null;
@@ -1124,8 +1130,8 @@ public class FTPClient {
          * @param date
          * @param name
          */
-        public entryInfo(final boolean isDir, final int size, final Date date, final String name) {
-            this.isDir = isDir;
+        public entryInfo(final filetype type, final int size, final Date date, final String name) {
+            this.type = type;
             this.size = size;
             this.date = date;
             this.name = name;
@@ -1139,8 +1145,8 @@ public class FTPClient {
         public String toString() {
             final StringBuilder info = new StringBuilder(100);
             info.append(name);
-            info.append(" (isDir=");
-            info.append(isDir);
+            info.append(" (type=");
+            info.append(type.name());
             info.append(", size=");
             info.append(size);
             info.append(", ");
@@ -1349,28 +1355,32 @@ public class FTPClient {
         }
 
         // starting data transaction
-        final Socket data = getDataSocket();
-        final BufferedReader ClientStream = new BufferedReader(new InputStreamReader(data.getInputStream()));
+        final Socket dataSocket = getDataSocket();
+        final BufferedReader dataStream = new BufferedReader(new InputStreamReader(dataSocket.getInputStream()));
 
         // read file system data
         String line;
         final ArrayList<String> files = new ArrayList<String>();
 
         try {
-            while ((line = ClientStream.readLine()) != null) {
+            while ((line = dataStream.readLine()) != null) {
                 if (!line.startsWith("total ")) {
                     files.add(line);
                 }
             }
-            // after stream is empty we should get control completion echo
-            /*reply =*/ receive();
-            // boolean success = !isNotPositiveCompletion(reply);
-
-            // shutdown connection
-            ClientStream.close(); // Closing the returned InputStream will
-            closeDataSocket(); // close the associated socket.
-        } catch (IOException e) {
-        }
+        } catch (IOException e1) {
+            e1.printStackTrace();
+        } finally {try {
+            // shutdown data connection
+            dataStream.close(); // Closing the returned InputStream will
+            closeDataSocket(); // close the associated socket.
+        } catch (IOException e) {
+            e.printStackTrace();
+        }}
+
+        // after stream is empty we should get control completion echo
+        reply = receive();
+        //System.out.println("reply of LIST: " + reply);
+        // boolean success = !isNotPositiveCompletion(reply);
 
         files.trimToSize();
         return files;
     }
@@ -1562,23 +1572,11 @@ public class FTPClient {
      */
     private void closeConnection() throws IOException {
         // cleanup
-        if (ControlSocket != null) {
-            clientOutput.close();
-            clientInput.close();
-            ControlSocket.close();
-            ControlSocket = null;
-        }
-
-        if (DataSocketActive != null) {
-            DataSocketActive.close();
-            DataSocketActive = null;
-        }
-        if (DataSocketPassive != null) {
-            DataSocketPassive.close();
-            DataSocketPassive = null; // "Once a socket has been closed, it is
-            // not available for further networking
-            // use"
-        }
+        if (clientOutput != null) clientOutput.close();
+        if (clientInput != null) clientInput.close();
+        if (ControlSocket != null) ControlSocket.close();
+        if (DataSocketActive != null) DataSocketActive.close();
+        if (DataSocketPassive != null) DataSocketPassive.close();
     }
 
     public boolean PROMPT() {
@@ -2516,15 +2514,15 @@ public class FTPClient {
      * @throws IOException
      */
     public static BlockingQueue<entryInfo> sitelist(final String host, final int port) throws IOException {
-        final FTPClient c = new FTPClient();
-        c.open(host, port);
-        c.login("anonymous", "anomic@");
+        final FTPClient ftpClient = new FTPClient();
+        ftpClient.open(host, port);
+        ftpClient.login("anonymous", "anomic@");
         final LinkedBlockingQueue<entryInfo> queue = new LinkedBlockingQueue<entryInfo>();
         new Thread() {
             public void run() {
                 try {
-                    sitelist(c, "/", queue);
-                    c.quit();
+                    sitelist(ftpClient, "/", queue);
+                    ftpClient.quit();
                 } catch (Exception e) {} finally {
                     queue.add(POISON_entryInfo);
                 }
@@ -2532,24 +2530,27 @@ public class FTPClient {
         }.start();
         return queue;
     }
 
-    private static void sitelist(final FTPClient c, String path, LinkedBlockingQueue<entryInfo> queue) {
+    private static void sitelist(final FTPClient ftpClient, String path, LinkedBlockingQueue<entryInfo> queue) {
         List<String> list;
         try {
-            list = c.list(path, true);
+            list = ftpClient.list(path, true);
         } catch (IOException e) {
+            e.printStackTrace();
             return;
         }
         if (!path.endsWith("/")) path += "/";
         entryInfo info;
         for (final String line : list) {
             info = parseListData(line);
-            if (info != null) {
-                if (info.isDir) {
-                    sitelist(c, path + info.name, queue);
-                } else {
-                    if (!info.name.startsWith("/")) info.name = path + info.name;
-                    queue.add(info);
-                }
+            if (info != null && info.type == filetype.file) {
+                if (!info.name.startsWith("/")) info.name = path + info.name;
+                queue.add(info);
+            }
+        }
+        for (final String line : list) {
+            info = parseListData(line);
+            if (info != null && info.type == filetype.directory) {
+                sitelist(ftpClient, path + info.name, queue);
             }
         }
     }
@@ -2617,7 +2618,7 @@ public class FTPClient {
                 // with link
                 nameStart = line.indexOf(info.name);
                 page.append(line.substring(0, nameStart));
-                page.append("<a href=\"" + base + info.name + ((info.isDir) ? "/" : "") + "\">" + info.name + "</a>");
+                page.append("<a href=\"" + base + info.name + ((info.type == filetype.directory) ? "/" : "") + "\">" + info.name + "</a>");
                 nameEnd = nameStart + info.name.length();
                 if (line.length() > nameEnd) {
                     page.append(line.substring(nameEnd));
@@ -2782,6 +2783,20 @@ public class FTPClient {
                 } catch (final IOException e) {
                     log.error(e);
                 }
+            } else if (args[0].equals("-sitelist")) {
+                try {
+                    final BlockingQueue<entryInfo> q = sitelist(args[1], Integer.parseInt(args[2]));
+                    entryInfo entry;
+                    while ((entry = q.take()) != FTPClient.POISON_entryInfo) {
+                        System.out.println(entry.toString());
+                    }
+                } catch (final FileNotFoundException e) {
+                    log.error(e);
+                } catch (final IOException e) {
+                    log.error(e);
+                } catch (InterruptedException e) {
+                    log.error(e);
+                }
             } else {
                 printHelp();
             }
@@ -2814,5 +2829,5 @@ public class FTPClient {
                 printHelp();
             }
         }
     }
 }
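
The new -sitelist branch above doubles as a reference consumer for the sitelist() API. A minimal standalone sketch of the same usage, assuming the patched FTPClient is on the classpath (ftp.example.org and port 21 are placeholder values, not part of the patch):

    import java.io.IOException;
    import java.util.concurrent.BlockingQueue;

    import net.yacy.cora.protocol.ftp.FTPClient;

    // Drains the harvester queue produced by FTPClient.sitelist();
    // the host and port below are placeholders.
    public class SitelistDemo {
        public static void main(final String[] args) throws IOException, InterruptedException {
            final BlockingQueue<FTPClient.entryInfo> queue = FTPClient.sitelist("ftp.example.org", 21);
            FTPClient.entryInfo entry;
            while ((entry = queue.take()) != FTPClient.POISON_entryInfo) {
                System.out.println(entry.toString()); // one line per harvested file
            }
        }
    }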

@@ -132,9 +132,31 @@ public class ContentScraper extends AbstractScraper implements Scraper {
             if ((b.length() != 0) && (!(punctuation(b.charAt(b.length() - 1))))) b = b + '.';
             //System.out.println("*** Appended dot: " + b.toString());
         }
+        // find http links inside text
+        int p, q, s = 0;
+        String u;
+        MultiProtocolURI url;
+        while (s < b.length()) {
+            p = Math.min(find(b, "smb://", s), Math.min(find(b, "ftp://", s), Math.min(find(b, "http://", s), find(b, "https://", s))));
+            if (p == Integer.MAX_VALUE) break;
+            q = b.indexOf(" ", p + 1);
+            u = b.substring(p, q < 0 ? b.length() : q);
+            s = p + 1;
+            try {
+                url = new MultiProtocolURI(u);
+                anchors.put(url, u);
+                continue;
+            } catch (MalformedURLException e) {}
+        }
+
+        // append string to content
         if (b.length() != 0) content.append(b).append(32);
     }
 
+    private static final int find(final String s, final String m, int start) {
+        int p = s.indexOf(m, start);
+        return (p < 0) ? Integer.MAX_VALUE : p;
+    }
+
     private MultiProtocolURI absolutePath(final String relativePath) {
         try {
             return MultiProtocolURI.newURL(root, relativePath);
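
To see what the new clear-text link scan extracts, here is a self-contained sketch of the same loop; java.net.URI stands in for MultiProtocolURI so the example runs without YaCy on the classpath, and the sample string is illustrative:

    import java.net.URI;
    import java.net.URISyntaxException;

    // Standalone sketch of the clear-text link scan added to ContentScraper.
    public class TextLinkScan {
        // earliest occurrence of m in s at or after start, or MAX_VALUE if absent
        static int find(final String s, final String m, final int start) {
            final int p = s.indexOf(m, start);
            return (p < 0) ? Integer.MAX_VALUE : p;
        }

        public static void main(final String[] args) {
            final String b = "mirror list: http://example.org/pub and ftp://ftp.example.org/files";
            int p, q, s = 0;
            while (s < b.length()) {
                // earliest hit of any supported protocol prefix
                p = Math.min(find(b, "smb://", s), Math.min(find(b, "ftp://", s),
                        Math.min(find(b, "http://", s), find(b, "https://", s))));
                if (p == Integer.MAX_VALUE) break;
                q = b.indexOf(" ", p + 1); // a link ends at the next space
                final String u = b.substring(p, q < 0 ? b.length() : q);
                s = p + 1; // continue scanning behind this hit
                try {
                    System.out.println("found link: " + new URI(u));
                } catch (final URISyntaxException e) {
                    // not parseable as a URL: skip
                }
            }
        }
    }

Run as-is this prints the http link and the ftp link. Because the scan cuts each candidate at the next space, URLs in running text must be whitespace-delimited to be detected.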
