added a very fast ftp file list generator to the site crawler:

- when a site crawl is now started for an ftp site, a special directory-tree harvester fetches the complete directory structure of the ftp server at once
- the harvester runs concurrently and feeds its results into the normal crawl queue (a minimal consumption sketch follows this list)
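
As an illustration only, a minimal consumption sketch: it assumes the FTPClient.sitelist() API that this commit introduces (see the FTPClient hunks below); the host name is a placeholder.

import java.io.IOException;
import java.util.concurrent.BlockingQueue;

import net.yacy.cora.protocol.ftp.FTPClient;

public class SitelistDemo {
    public static void main(final String[] args) throws IOException, InterruptedException {
        // sitelist() logs in anonymously, starts its own harvester thread and returns immediately;
        // the queue is filled while the directory tree is walked
        final BlockingQueue<FTPClient.entryInfo> queue = FTPClient.sitelist("ftp.example.org", 21); // placeholder host
        FTPClient.entryInfo entry;
        // drain the queue until the poison element signals that the listing is complete
        while ((entry = queue.take()) != FTPClient.POISON_entryInfo) {
            System.out.println(entry.toString()); // prints name, type, size and date
        }
    }
}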

also in this commit:
- fixed the 'start from file' crawl function
- added a link detector to the html parser: it can now also extract links that are not enclosed in <a> tags
- as a result, a crawl can now also be started from plain-text link files (a sketch of the detection idea follows this list)
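
As an illustration only, a self-contained sketch of the plain-text link detection idea (compare the ContentScraper hunk at the end of the diff); the class name is made up for this example, and plain Strings stand in for MultiProtocolURI to keep it short.

import java.util.LinkedHashMap;
import java.util.Map;

public class TextLinkDetector {

    // like String.indexOf, but returns Integer.MAX_VALUE instead of -1 so candidates can be combined with Math.min
    private static int find(final String s, final String m, final int start) {
        final int p = s.indexOf(m, start);
        return (p < 0) ? Integer.MAX_VALUE : p;
    }

    // collect http/https/ftp/smb links that appear in clear text, i.e. outside of <a> tags
    public static Map<String, String> detect(final String text) {
        final Map<String, String> anchors = new LinkedHashMap<String, String>();
        int s = 0;
        while (s < text.length()) {
            // position of the nearest protocol prefix, or Integer.MAX_VALUE if none is left
            final int p = Math.min(find(text, "smb://", s),
                          Math.min(find(text, "ftp://", s),
                          Math.min(find(text, "http://", s), find(text, "https://", s))));
            if (p == Integer.MAX_VALUE) break;
            // the link candidate ends at the next space, or at the end of the text
            final int q = text.indexOf(' ', p + 1);
            final String u = text.substring(p, q < 0 ? text.length() : q);
            anchors.put(u, u);
            s = p + 1;
        }
        return anchors;
    }

    public static void main(final String[] args) {
        System.out.println(detect("see http://example.com/doc and ftp://ftp.example.org/pub for details"));
    }
}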

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7367 6c8d7289-2bf4-0310-a012-ef5d649a1542

@@ -127,7 +127,10 @@ public class Crawler_p {
String crawlingStart = post.get("crawlingURL","").trim(); // the crawljob start url
// add the prefix http:// if necessary
int pos = crawlingStart.indexOf("://");
if (pos == -1) crawlingStart = "http://" + crawlingStart;
if (pos == -1) {
if (crawlingStart.startsWith("www")) crawlingStart = "http://" + crawlingStart;
if (crawlingStart.startsWith("ftp")) crawlingStart = "ftp://" + crawlingStart;
}
// normalize URL
DigestURI crawlingStartURL = null;
@@ -148,6 +151,8 @@ public class Crawler_p {
newcrawlingMustMatch = "file://" + crawlingStartURL.getPath() + ".*";
} else if (crawlingStartURL.isSMB()) {
newcrawlingMustMatch = "smb://.*" + crawlingStartURL.getHost() + ".*" + crawlingStartURL.getPath() + ".*";
} else if (crawlingStartURL.isFTP()) {
newcrawlingMustMatch = "ftp://.*" + crawlingStartURL.getHost() + ".*" + crawlingStartURL.getPath() + ".*";
} else {
newcrawlingMustMatch = ".*" + crawlingStartURL.getHost() + ".*";
}
@@ -189,10 +194,10 @@ public class Crawler_p {
// store this call as api call
if (repeat_time > 0) {
// store as scheduled api call
sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + crawlingStart, repeat_time, repeat_unit.substring(3));
sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + ((crawlingStart == null) ? post.get("crawlingFile", "") : crawlingStart), repeat_time, repeat_unit.substring(3));
} else {
// store just a protocol
sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + crawlingStart);
sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + ((crawlingStart == null) ? post.get("crawlingFile", "") : crawlingStart));
}
final boolean crawlingDomMaxCheck = "on".equals(post.get("crawlingDomMaxCheck", "off"));
@@ -225,7 +230,44 @@ public class Crawler_p {
env.setConfig("xpstopw", (xpstopw) ? "true" : "false");
final String crawlingMode = post.get("crawlingMode","url");
if ("url".equals(crawlingMode)) {
if (crawlingStart != null && crawlingStart.startsWith("ftp")) {
try {
// check if the crawl filter works correctly
Pattern.compile(newcrawlingMustMatch);
final CrawlProfile profile = new CrawlProfile(
crawlingStart,
crawlingStartURL,
newcrawlingMustMatch,
CrawlProfile.MATCH_NEVER,
newcrawlingdepth,
crawlingIfOlder,
crawlingDomMaxPages,
crawlingQ,
indexText,
indexMedia,
storeHTCache,
crawlOrder,
xsstopw,
xdstopw,
xpstopw,
cachePolicy);
sb.crawler.profilesActiveCrawls.put(profile.handle().getBytes(), profile);
sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
final DigestURI url = crawlingStartURL;
sb.crawlStacker.queueEntries(sb.peers.mySeed().hash.getBytes(), profile.handle(), "ftp", url.getHost(), url.getPort(), false);
} catch (final PatternSyntaxException e) {
prop.put("info", "4"); // crawlfilter does not match url
prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
prop.putHTML("info_error", e.getMessage());
} catch (final Exception e) {
// mist
prop.put("info", "7"); // Error with file
prop.putHTML("info_crawlingStart", crawlingStart);
prop.putHTML("info_error", e.getMessage());
Log.logException(e);
}
sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
} else if ("url".equals(crawlingMode)) {
// check if pattern matches
if ((crawlingStart == null || crawlingStartURL == null) /* || (!(crawlingStart.matches(newcrawlingfilter))) */) {
@@ -334,12 +376,12 @@ public class Crawler_p {
reasonString);
}
} catch (final PatternSyntaxException e) {
prop.put("info", "4"); //crawlfilter does not match url
prop.put("info", "4"); // crawlfilter does not match url
prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
prop.putHTML("info_error", e.getMessage());
} catch (final Exception e) {
// mist
prop.put("info", "6");//Error with url
prop.put("info", "6"); // Error with url
prop.putHTML("info_crawlingStart", crawlingStart);
prop.putHTML("info_error", e.getMessage());
Log.logException(e);
@@ -378,32 +420,14 @@ public class Crawler_p {
cachePolicy);
sb.crawler.profilesActiveCrawls.put(profile.handle().getBytes(), profile);
sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
final Iterator<Map.Entry<MultiProtocolURI, String>> linkiterator = hyperlinks.entrySet().iterator();
DigestURI nexturl;
while (linkiterator.hasNext()) {
final Map.Entry<MultiProtocolURI, String> e = linkiterator.next();
if (e.getKey() == null) continue;
nexturl = new DigestURI(e.getKey());
sb.crawlStacker.enqueueEntry(new Request(
sb.peers.mySeed().hash.getBytes(),
nexturl,
null,
e.getValue(),
new Date(),
profile.handle(),
0,
0,
0
));
}
sb.crawlStacker.queueEntries(sb.peers.mySeed().hash.getBytes(), profile.handle(), hyperlinks, true);
} catch (final PatternSyntaxException e) {
prop.put("info", "4"); //crawlfilter does not match url
prop.put("info", "4"); // crawlfilter does not match url
prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
prop.putHTML("info_error", e.getMessage());
} catch (final Exception e) {
// mist
prop.put("info", "7");//Error with file
prop.put("info", "7"); // Error with file
prop.putHTML("info_crawlingStart", fileName);
prop.putHTML("info_error", e.getMessage());
Log.logException(e);

@@ -74,5 +74,6 @@ function loadInfos() {
document.getElementById("ajax").setAttribute("src",AJAX_ON);
url=document.getElementById("crawlingURL").value;
if (url.indexOf("ftp") == 0 || url.indexOf("smb") == 0) document.getElementById("crawlingQ").disabled=true; else document.getElementById("crawlingQ").disabled=false;
sndReq('/api/util/getpageinfo_p.xml?actions=title,robots&url='+url);
}

@@ -28,12 +28,17 @@
package de.anomic.crawler;
import java.io.IOException;
import java.net.InetAddress;
import java.net.MalformedURLException;
import java.net.UnknownHostException;
import java.util.Date;
import java.util.Map;
import java.util.concurrent.BlockingQueue;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.ftp.FTPClient;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.logging.Log;
@@ -131,15 +136,7 @@ public final class CrawlStacker {
// we just don't know anything about that host
return false;
}
/*
public boolean job() {
if (this.fastQueue.queueSize() > 0 && job(this.fastQueue)) return true;
if (this.slowQueue.queueSize() == 0) return false;
return job(this.slowQueue);
}
*/
public Request job(Request entry) {
// this is the method that is called by the busy thread from outside
if (entry == null) return null;
@@ -180,6 +177,81 @@ public final class CrawlStacker {
}
}
public void queueEntries(byte[] initiator, String profileHandle, Map<MultiProtocolURI, String> hyperlinks, boolean replace) {
for (Map.Entry<MultiProtocolURI, String> e: hyperlinks.entrySet()) {
if (e.getKey() == null) continue;
// delete old entry, if exists to force a re-load of the url (thats wanted here)
final DigestURI url = new DigestURI(e.getKey());
final byte[] urlhash = url.hash();
if (replace) {
indexSegment.urlMetadata().remove(urlhash);
this.nextQueue.noticeURL.removeByURLHash(urlhash);
this.nextQueue.errorURL.remove(urlhash);
}
// put entry on crawl stack
enqueueEntry(new Request(
initiator,
url,
null,
e.getValue(),
new Date(),
profileHandle,
0,
0,
0
));
}
}
public void queueEntries(final byte[] initiator, final String profileHandle, final String protocol, final String host, final int port, final boolean replace) {
final CrawlQueues cq = this.nextQueue;
new Thread() {
public void run() {
BlockingQueue<FTPClient.entryInfo> queue;
try {
queue = FTPClient.sitelist(host, port);
FTPClient.entryInfo entry;
while ((entry = queue.take()) != FTPClient.POISON_entryInfo) {
// delete old entry, if exists to force a re-load of the url (thats wanted here)
DigestURI url = null;
try {
if (protocol.equals("ftp")) url = new DigestURI("ftp://" + host + (port == 21 ? "" : ":" + port) + entry.name);
else if (protocol.equals("smb")) url = new DigestURI("smb://" + host + entry.name);
else if (protocol.equals("http")) url = new DigestURI("http://" + host + (port == 80 ? "" : ":" + port) + entry.name);
else if (protocol.equals("https")) url = new DigestURI("https://" + host + (port == 443 ? "" : ":" + port) + entry.name);
} catch (MalformedURLException e) {
continue;
}
final byte[] urlhash = url.hash();
if (replace) {
indexSegment.urlMetadata().remove(urlhash);
cq.noticeURL.removeByURLHash(urlhash);
cq.errorURL.remove(urlhash);
}
// put entry on crawl stack
enqueueEntry(new Request(
initiator,
url,
null,
entry.name,
entry.date,
profileHandle,
0,
0,
0
));
}
} catch (IOException e1) {
} catch (InterruptedException e) {
}
}
}.start();
}
public String stackCrawl(final Request entry) {
// stacks a crawl item. The position can also be remote
// returns null if successful, a reason string if not successful

@@ -617,7 +617,7 @@ public class FTPClient {
// /// try to parse LIST output (1 command)
final entryInfo info = fileInfo(path);
if (info != null) {
return info.isDir;
return info.type == filetype.directory;
}
// /// try to change to folder (4 commands)
@@ -1045,7 +1045,9 @@ public class FTPClient {
// groups: 1: rights, 2: size, 3: month, 4: day, 5: time or year, 6: name
final Matcher tokens = lsStyle.matcher(line);
if (tokens.matches()) {
final boolean isDir = tokens.group(1).startsWith("d");
filetype type = filetype.file;
if (tokens.group(1).startsWith("d")) type = filetype.directory;
if (tokens.group(1).startsWith("l")) type = filetype.link;
int size = -1;
try {
size = Integer.parseInt(tokens.group(2));
@@ -1076,7 +1078,7 @@ public class FTPClient {
log.warn("---- Error: not ls date-format '" + dateString, e);
date = new Date();
}
return new entryInfo(isDir, size, date, tokens.group(6));
return new entryInfo(type, size, date, tokens.group(6));
}
return null;
}
@@ -1084,6 +1086,10 @@ public class FTPClient {
public static final entryInfo POISON_entryInfo = new entryInfo();
public static enum filetype {
file, link, directory;
}
/**
* parameter class
*
@@ -1092,9 +1098,9 @@ public class FTPClient {
*/
public static class entryInfo {
/**
* is this a directory?
* file type
*/
public final boolean isDir;
public final filetype type;
/**
* size in bytes
*/
@@ -1109,7 +1115,7 @@ public class FTPClient {
public String name;
public entryInfo() {
this.isDir = false;
this.type = filetype.file;
this.size = -1;
this.date = null;
this.name = null;
@@ -1124,8 +1130,8 @@ public class FTPClient {
* @param date
* @param name
*/
public entryInfo(final boolean isDir, final int size, final Date date, final String name) {
this.isDir = isDir;
public entryInfo(final filetype type, final int size, final Date date, final String name) {
this.type = type;
this.size = size;
this.date = date;
this.name = name;
@@ -1139,8 +1145,8 @@ public class FTPClient {
public String toString() {
final StringBuilder info = new StringBuilder(100);
info.append(name);
info.append(" (isDir=");
info.append(isDir);
info.append(" (type=");
info.append(type.name());
info.append(", size=");
info.append(size);
info.append(", ");
@@ -1349,28 +1355,32 @@ public class FTPClient {
}
// starting data transaction
final Socket data = getDataSocket();
final BufferedReader ClientStream = new BufferedReader(new InputStreamReader(data.getInputStream()));
final Socket dataSocket = getDataSocket();
final BufferedReader dataStream = new BufferedReader(new InputStreamReader(dataSocket.getInputStream()));
// read file system data
String line;
final ArrayList<String> files = new ArrayList<String>();
try {
while ((line = ClientStream.readLine()) != null) {
while ((line = dataStream.readLine()) != null) {
if (!line.startsWith("total ")) {
files.add(line);
}
}
// after stream is empty we should get control completion echo
/*reply =*/ receive();
// boolean success = !isNotPositiveCompletion(reply);
// shutdown connection
ClientStream.close(); // Closing the returned InputStream will
} catch (IOException e1) {
e1.printStackTrace();
} finally {try {
// shutdown data connection
dataStream.close(); // Closing the returned InputStream will
closeDataSocket(); // close the associated socket.
} catch (IOException e) {
}
e.printStackTrace();
}}
// after stream is empty we should get control completion echo
reply = receive();
//System.out.println("reply of LIST: " + reply);
// boolean success = !isNotPositiveCompletion(reply);
files.trimToSize();
return files;
}
@@ -1562,23 +1572,11 @@ public class FTPClient {
*/
private void closeConnection() throws IOException {
// cleanup
if (ControlSocket != null) {
clientOutput.close();
clientInput.close();
ControlSocket.close();
ControlSocket = null;
}
if (DataSocketActive != null) {
DataSocketActive.close();
DataSocketActive = null;
}
if (DataSocketPassive != null) {
DataSocketPassive.close();
DataSocketPassive = null; // "Once a socket has been closed, it is
// not available for further networking
// use"
}
if (clientOutput != null) clientOutput.close();
if (clientInput != null) clientInput.close();
if (ControlSocket != null) ControlSocket.close();
if (DataSocketActive != null) DataSocketActive.close();
if (DataSocketPassive != null) DataSocketPassive.close();
}
public boolean PROMPT() {
@@ -2516,15 +2514,15 @@ public class FTPClient {
* @throws IOException
*/
public static BlockingQueue<entryInfo> sitelist(final String host, final int port) throws IOException {
final FTPClient c = new FTPClient();
c.open(host, port);
c.login("anonymous", "anomic@");
final FTPClient ftpClient = new FTPClient();
ftpClient.open(host, port);
ftpClient.login("anonymous", "anomic@");
final LinkedBlockingQueue<entryInfo> queue = new LinkedBlockingQueue<entryInfo>();
new Thread() {
public void run() {
try {
sitelist(c, "/", queue);
c.quit();
sitelist(ftpClient, "/", queue);
ftpClient.quit();
} catch (Exception e) {} finally {
queue.add(POISON_entryInfo);
}
@@ -2532,24 +2530,27 @@ public class FTPClient {
}.start();
return queue;
}
private static void sitelist(final FTPClient c, String path, LinkedBlockingQueue<entryInfo> queue) {
private static void sitelist(final FTPClient ftpClient, String path, LinkedBlockingQueue<entryInfo> queue) {
List<String> list;
try {
list = c.list(path, true);
list = ftpClient.list(path, true);
} catch (IOException e) {
e.printStackTrace();
return;
}
if (!path.endsWith("/")) path += "/";
entryInfo info;
for (final String line : list) {
info = parseListData(line);
if (info != null) {
if (info.isDir) {
sitelist(c, path + info.name, queue);
} else {
if (!info.name.startsWith("/")) info.name = path + info.name;
queue.add(info);
}
if (info != null && info.type == filetype.file) {
if (!info.name.startsWith("/")) info.name = path + info.name;
queue.add(info);
}
}
for (final String line : list) {
info = parseListData(line);
if (info != null && info.type == filetype.directory) {
sitelist(ftpClient, path + info.name, queue);
}
}
}
@@ -2617,7 +2618,7 @@ public class FTPClient {
// with link
nameStart = line.indexOf(info.name);
page.append(line.substring(0, nameStart));
page.append("<a href=\"" + base + info.name + ((info.isDir) ? "/" : "") + "\">" + info.name + "</a>");
page.append("<a href=\"" + base + info.name + ((info.type == filetype.directory) ? "/" : "") + "\">" + info.name + "</a>");
nameEnd = nameStart + info.name.length();
if (line.length() > nameEnd) {
page.append(line.substring(nameEnd));
@@ -2782,6 +2783,20 @@ public class FTPClient {
} catch (final IOException e) {
log.error(e);
}
} else if (args[0].equals("-sitelist")) {
try {
final BlockingQueue<entryInfo> q = sitelist(args[1], Integer.parseInt(args[2]));
entryInfo entry;
while ((entry = q.take()) != FTPClient.POISON_entryInfo) {
System.out.println(entry.toString());
}
} catch (final FileNotFoundException e) {
log.error(e);
} catch (final IOException e) {
log.error(e);
} catch (InterruptedException e) {
log.error(e);
}
} else {
printHelp();
}
@@ -2814,5 +2829,5 @@ public class FTPClient {
printHelp();
}
}
}

@@ -132,9 +132,31 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if ((b.length() != 0) && (!(punctuation(b.charAt(b.length() - 1))))) b = b + '.';
//System.out.println("*** Appended dot: " + b.toString());
}
// find http links inside text
int p, q, s = 0;
String u;
MultiProtocolURI url;
while (s < b.length()) {
p = Math.min(find(b, "smb://", s), Math.min(find(b, "ftp://", s), Math.min(find(b, "http://", s), find(b, "https://", s))));
if (p == Integer.MAX_VALUE) break;
q = b.indexOf(" ", p + 1);
u = b.substring(p, q < 0 ? b.length() : q);
s = p + 1;
try {
url = new MultiProtocolURI(u);
anchors.put(url, u);
continue;
} catch (MalformedURLException e) {}
}
// append string to content
if (b.length() != 0) content.append(b).append(32);
}
private static final int find(final String s, final String m, int start) {
int p = s.indexOf(m, start);
return (p < 0) ? Integer.MAX_VALUE : p;
}
private MultiProtocolURI absolutePath(final String relativePath) {
try {
return MultiProtocolURI.newURL(root, relativePath);
