- (almost) fixed FTP crawler

- integrated/fixed SMB crawler

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6742 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 15 years ago
parent 35d0057cb0
commit 3300930fc5
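
In short, this change threads an acceptOnlyParseable flag from LoaderDispatcher into FTPLoader.load()/getFile() and gives FTPLoader and SMBLoader the same fallback: instead of pushing unparseable or oversized resources to errorURL and throwing, both loaders now return a "200" text/plain response that carries only the normalized URL, so the resource is at least indexed by its metadata; FTP directory listings are built inline via ftpClient.dirhtml() instead of the removed generateDirlist()/createFTPClient() helpers. Below is a condensed sketch of that shared fallback, pieced together from the diff that follows rather than copied from the source; remoteSize() and fetchBytes() are hypothetical placeholders for the protocol-specific I/O, and the method is assumed to sit in a loader class that has the sb and maxFileSize fields used here.

// Sketch only, assembled from the diff below: the metadata-only fallback
// now used by both FTPLoader.getFile() and SMBLoader.load().
private Response loadOrMetadataOnly(final Request request, final boolean acceptOnlyParseable) throws Exception {
    final DigestURI url = request.url();
    final String mime = TextParser.mimeOf(url);
    final RequestHeader requestHeader = new RequestHeader();
    final ResponseHeader responseHeader = new ResponseHeader();
    responseHeader.put(HeaderFramework.CONTENT_TYPE, mime);
    final long size = remoteSize(url);          // hypothetical helper: ftpClient.fileSize(path) or url.length()
    String parserError = null;
    if ((acceptOnlyParseable && (parserError = TextParser.supports(url, mime)) != null) ||
        (size > maxFileSize && maxFileSize >= 0)) {
        // pre-commit behaviour was errorURL.push(...) plus an exception;
        // now only the URL itself is returned as a text/plain body
        responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
        return new Response(request, requestHeader, responseHeader, "200",
                sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()),
                url.toNormalform(true, true).getBytes());
    }
    final byte[] content = fetchBytes(url);     // hypothetical helper: the actual FTP/SMB download
    return new Response(request, requestHeader, responseHeader, "200",
            sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()),
            content);
}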

@@ -56,10 +56,11 @@ public class BlacklistTest_p {
if(post != null && post.containsKey("testList")) {
prop.put("testlist", "1");
String urlstring = post.get("testurl", "");
if(!urlstring.startsWith("http://") &&
!urlstring.startsWith("https://")&&
!urlstring.startsWith("ftp://")
) urlstring = "http://"+urlstring;
if (!urlstring.startsWith("http://") &&
!urlstring.startsWith("https://") &&
!urlstring.startsWith("ftp://") &&
!urlstring.startsWith("smb://") &&
!urlstring.startsWith("file://")) urlstring = "http://" + urlstring;
DigestURI testurl = null;
try {
testurl = new DigestURI(urlstring, null);

@@ -84,7 +84,11 @@ public class Blacklist_p {
if(post.containsKey("testList")) {
prop.put("testlist", "1");
String urlstring = post.get("testurl", "");
if(!urlstring.startsWith("http://")) urlstring = "http://"+urlstring;
if(!urlstring.startsWith("http://") &&
!urlstring.startsWith("https://") &&
!urlstring.startsWith("ftp://") &&
!urlstring.startsWith("smb://") &&
!urlstring.startsWith("file://")) urlstring = "http://"+urlstring;
DigestURI testurl = null;
try {
testurl = new DigestURI(urlstring, null);

@@ -130,7 +130,10 @@ public class IndexControlURLs_p {
String urlhash = post.get("urlhash", "").trim();
if (!urlstring.startsWith("http://") &&
!urlstring.startsWith("https://")) { urlstring = "http://" + urlstring; }
!urlstring.startsWith("https://") &&
!urlstring.startsWith("ftp://") &&
!urlstring.startsWith("smb://") &&
!urlstring.startsWith("file://")) { urlstring = "http://" + urlstring; }
prop.putHTML("urlstring", urlstring);
prop.putHTML("urlhash", urlhash);
@@ -180,14 +183,15 @@ public class IndexControlURLs_p {
prop.put("urlhash", urlhash);
final URIMetadataRow entry = segment.urlMetadata().load(urlhash, null, 0);
if (entry == null) {
prop.putHTML("urlstring", "unknown url: " + urlstring);
prop.putHTML("result", "No Entry for URL " + url.toNormalform(true, true));
prop.putHTML("urlstring", urlstring);
prop.put("urlhash", "");
} else {
prop.putAll(genUrlProfile(segment, entry, urlhash));
prop.put("statistics", 0);
}
} catch (final MalformedURLException e) {
prop.putHTML("urlstring", "bad url: " + urlstring);
prop.putHTML("result", "bad url: " + urlstring);
prop.put("urlhash", "");
}
prop.put("lurlexport", 0);

@@ -110,7 +110,7 @@ public class ViewFile {
prop.putHTML("error_words", "");
}
final String viewMode = post.get("viewMode","sentences");
final String viewMode = post.get("viewMode","parsed");
prop.put("error_vMode-" + viewMode, "1");
DigestURI url = null;

@@ -38,7 +38,11 @@ public class getpageinfo_p {
prop.put("robots-allowed", "1");
prop.putXML("title", "FTP: "+url);
return prop;
} else if (!(url.toLowerCase().startsWith("http://") || url.toLowerCase().startsWith("https://"))) {
} else if (!url.startsWith("http://") &&
!url.startsWith("https://") &&
!url.startsWith("ftp://") &&
!url.startsWith("smb://") &&
!url.startsWith("file://")) {
url = "http://" + url;
}
if (actions.indexOf("title")>=0) {

@@ -63,7 +63,7 @@ public class FTPLoader {
* @param request
* @return
*/
public Response load(final Request request) throws IOException {
public Response load(final Request request, boolean acceptOnlyParseable) throws IOException {
long start = System.currentTimeMillis();
final DigestURI entryUrl = request.url();
@@ -91,55 +91,58 @@ public class FTPLoader {
// stream for ftp-client errors
final ByteArrayOutputStream berr = new ByteArrayOutputStream();
final ftpc ftpClient = createFTPClient(berr);
// create new ftp client
final PrintStream err = new PrintStream(berr);
final ftpc ftpClient = new ftpc(System.in, null, err);
ftpClient.setDataTimeoutByMaxFilesize(maxFileSize);
// get a connection
if (openConnection(ftpClient, entryUrl)) {
// ftp stuff
//try {
// testing if the specified file is a directory
if (file.length() > 0) {
ftpClient.exec("cd \"" + path + "\"", false);
final boolean isFolder = ftpClient.isFolder(file);
if (isFolder) {
path = fullPath + "/";
file = "";
}
// test if the specified file is a directory
if (file.length() > 0) {
ftpClient.exec("cd \"" + path + "\"", false);
final boolean isFolder = ftpClient.isFolder(file);
if (isFolder) {
path = fullPath + "/";
file = "";
}
}
if (file.length() == 0) {
// directory -> get list of files
RequestHeader requestHeader = new RequestHeader();
if (request.referrerhash() != null) {
DigestURI u = sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash());
if (u != null) requestHeader.put(RequestHeader.REFERER, u.toNormalform(true, false));
}
byte[] dirList = generateDirlist(ftpClient, request, path);
if (file.length() == 0) {
// directory -> get list of files
RequestHeader requestHeader = new RequestHeader();
if (request.referrerhash() != null) {
DigestURI u = sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash());
if (u != null) requestHeader.put(RequestHeader.REFERER, u.toNormalform(true, false));
}
StringBuilder dirList = ftpClient.dirhtml(path);
if (dirList == null) {
response = null;
} else {
ResponseHeader responseHeader = new ResponseHeader();
responseHeader.put(HeaderFramework.LAST_MODIFIED, DateFormatter.formatRFC1123(new Date()));
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
response = new Response(
request,
requestHeader,
responseHeader,
"OK",
sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()),
dirList);
}
if (dirList == null) {
response = null;
} else {
// file -> download
try {
response = getFile(ftpClient, request);
} catch (final Exception e) {
// add message to errorLog
(new PrintStream(berr)).print(e.getMessage());
}
ResponseHeader responseHeader = new ResponseHeader();
responseHeader.put(HeaderFramework.LAST_MODIFIED, DateFormatter.formatRFC1123(new Date()));
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
response = new Response(
request,
requestHeader,
responseHeader,
"200",
sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()),
dirList.toString().getBytes());
}
} else {
// file -> download
try {
response = getFile(ftpClient, request, acceptOnlyParseable);
} catch (final Exception e) {
// add message to errorLog
(new PrintStream(berr)).print(e.getMessage());
}
}
closeConnection(ftpClient);
}
@@ -166,11 +169,6 @@ public class FTPLoader {
/**
* establish a connection to the ftp server (open, login, set transfer mode)
*
* @param ftpClient
* @param hostname
* @param port
* @return success
*/
private boolean openConnection(final ftpc ftpClient, final DigestURI entryUrl) {
// get username and password
@@ -209,61 +207,62 @@ public class FTPLoader {
return true;
}
/**
* @param ftpClient
* @param request
* @param htCache
* @param cacheFile
* @return
* @throws Exception
*/
private Response getFile(final ftpc ftpClient, final Request request) throws Exception {
private Response getFile(final ftpc ftpClient, final Request request, boolean acceptOnlyParseable) throws Exception {
// determine the mimetype of the resource
final DigestURI entryUrl = request.url();
final String mimeType = TextParser.mimeOf(entryUrl);
final String path = getPath(entryUrl);
final DigestURI url = request.url();
final String mime = TextParser.mimeOf(url);
final String path = getPath(url);
// if the mimetype and file extension is supported we start to download
// the file
Response response = null;
String supportError = TextParser.supports(entryUrl, mimeType);
if (supportError != null) {
// reject file
log.logInfo("PARSER REJECTED URL " + request.url().toString() + ": " + supportError);
sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash, new Date(), 1, supportError);
throw new Exception(supportError);
} else {
// abort the download if content is too long
final int size = ftpClient.fileSize(path);
if (size <= maxFileSize || maxFileSize == -1) {
// timeout for download
ftpClient.setDataTimeoutByMaxFilesize(size);
// determine the file date
final Date fileDate = ftpClient.entryDate(path);
// download the remote file
byte[] b = ftpClient.get(path);
// create a cache entry
RequestHeader requestHeader = new RequestHeader();
if (request.referrerhash() != null) requestHeader.put(RequestHeader.REFERER, sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash()).toNormalform(true, false));
ResponseHeader responseHeader = new ResponseHeader();
responseHeader.put(HeaderFramework.LAST_MODIFIED, DateFormatter.formatRFC1123(fileDate));
responseHeader.put(HeaderFramework.CONTENT_TYPE, mimeType);
response = new Response(
request,
requestHeader,
responseHeader,
"OK",
sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()),
b);
// determine the file date
final Date fileDate = ftpClient.entryDate(path);
// create response header
RequestHeader requestHeader = new RequestHeader();
if (request.referrerhash() != null) requestHeader.put(RequestHeader.REFERER, sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash()).toNormalform(true, false));
ResponseHeader responseHeader = new ResponseHeader();
responseHeader.put(HeaderFramework.LAST_MODIFIED, DateFormatter.formatRFC1123(fileDate));
responseHeader.put(HeaderFramework.CONTENT_TYPE, mime);
// if the mimetype and file extension is supported we start to download the file
final int size = ftpClient.fileSize(path);
String parserError = null;
if ((acceptOnlyParseable && (parserError = TextParser.supports(url, mime)) != null) ||
(size > maxFileSize && maxFileSize >= 0)) {
// we know that we cannot process that file before loading
// only the metadata is returned
if (parserError != null) {
log.logInfo("No parser available in FTP crawler: '" + parserError + "' for URL " + request.url().toString() + ": parsing only metadata");
} else {
log.logInfo("REJECTED TOO BIG FILE with size " + size + " Bytes for URL " + request.url().toString());
sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash, new Date(), 1, "file size limit exceeded");
throw new Exception("file size exceeds limit");
log.logInfo("Too big file in FTP crawler with size = " + size + " Bytes for URL " + request.url().toString() + ": parsing only metadata");
}
// create response with metadata only
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
Response response = new Response(
request,
requestHeader,
responseHeader,
"200",
sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()),
url.toNormalform(true, true).getBytes());
return response;
}
// timeout for download
ftpClient.setDataTimeoutByMaxFilesize(size);
// download the remote file
byte[] b = ftpClient.get(path);
// create a response
Response response = new Response(
request,
requestHeader,
responseHeader,
"200",
sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()),
b);
return response;
}
@@ -277,45 +276,4 @@ public class FTPLoader {
return DigestURI.unescape(entryUrl.getPath()).replace("\"", "\"\"");
}
/**
* @param ftpClient
* @param entry
* @param cacheFile
* @return
*/
private byte[] generateDirlist(final ftpc ftpClient, final Request entry, final String path) {
// getting the dirlist
final DigestURI entryUrl = entry.url();
// generate the dirlist
final StringBuilder dirList = ftpClient.dirhtml(path);
if (dirList != null && dirList.length() > 0) {
try {
return dirList.toString().getBytes();
} catch (final Exception e) {
log.logInfo("Unable to write dirlist for URL " + entryUrl.toString());
}
}
return null;
}
/**
* create a new ftp client
*
* @param berr
* @return
*/
private ftpc createFTPClient(final ByteArrayOutputStream berr) {
// error
final PrintStream err = new PrintStream(berr);
final ftpc ftpClient = new ftpc(System.in, null, err);
// set timeout
ftpClient.setDataTimeoutByMaxFilesize(maxFileSize);
return ftpClient;
}
}

@@ -1,27 +1,26 @@
//HTTPLoader.java
//------------------------
//part of YaCy
//(C) by Michael Peter Christen; mc@yacy.net
//first published on http://www.anomic.de
//Frankfurt, Germany, 2006
// HTTPLoader.java
// ---------------
// (C) by Michael Peter Christen; mc@yacy.net
// first published on http://yacy.net
// Frankfurt, Germany, 2006
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
//This program is free software; you can redistribute it and/or modify
//it under the terms of the GNU General Public License as published by
//the Free Software Foundation; either version 2 of the License, or
//(at your option) any later version.
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
//This program is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//GNU General Public License for more details.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
//You should have received a copy of the GNU General Public License
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.crawler.retrieval;

@@ -336,7 +336,7 @@ public class Response {
// check status code
if (!validResponseStatus()) {
return "bad_status_" + this.responseStatus.substring(0, 3);
return "bad_status_" + this.responseStatus;
}
if (requestHeader != null) {
@@ -737,7 +737,9 @@ public class Response {
public DigestURI referrerURL() {
if (requestHeader == null) return null;
try {
return new DigestURI(requestHeader.get(RequestHeader.REFERER, ""), null);
String r = requestHeader.get(RequestHeader.REFERER, null);
if (r == null) return null;
return new DigestURI(r, null);
} catch (final Exception e) {
return null;
}

@@ -70,7 +70,7 @@ public class SMBLoader {
if (ur != null) requestHeader.put(RequestHeader.REFERER, ur.toNormalform(true, false));
}
// process directories: transform them to html with meta robots=noindex (using the ftpc lib)
if (url.isDirectory()) {
List<String> list = new ArrayList<String>();
String u = url.toNormalform(true, true);
@@ -92,29 +92,44 @@ public class SMBLoader {
request,
requestHeader,
responseHeader,
"OK",
"200",
sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()),
content.toString().getBytes());
return response;
}
// check mime type and availability of parsers
// create response header
String mime = MimeTable.ext2mime(url.getFileExtension());
String parserError = null;
if (acceptOnlyParseable && (parserError = TextParser.supports(url, mime)) != null) {
// we know that we cannot process that file before loading
log.logInfo("no parser available (" + parserError + ") for url = " + request.url().toString());
sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash, new Date(), 1, "no parser available (" + parserError + ") for url = " + request.url().toString());
throw new IOException("no parser available (" + parserError + ") for url = " + request.url().toString());
}
ResponseHeader responseHeader = new ResponseHeader();
responseHeader.put(HeaderFramework.LAST_MODIFIED, DateFormatter.formatRFC1123(new Date(url.lastModified())));
responseHeader.put(HeaderFramework.CONTENT_TYPE, mime);
// check resource size
// check mime type and availability of parsers
// and also check resource size and limitation of the size
long size = url.length();
if (size > maxFileSize && maxFileSize >= 0) {
log.logInfo("REJECTED TOO BIG FILE with size " + size + " Bytes for URL " + request.url().toString());
sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash, new Date(), 1, "file size limit exceeded");
throw new IOException("file size = " + size + " exceeds limit");
String parserError = null;
if ((acceptOnlyParseable && (parserError = TextParser.supports(url, mime)) != null) ||
(size > maxFileSize && maxFileSize >= 0)) {
// we know that we cannot process that file before loading
// only the metadata is returned
if (parserError != null) {
log.logInfo("No parser available in SMB crawler: '" + parserError + "' for URL " + request.url().toString() + ": parsing only metadata");
} else {
log.logInfo("Too big file in SMB crawler with size = " + size + " Bytes for URL " + request.url().toString() + ": parsing only metadata");
}
// create response with metadata only
responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain");
Response response = new Response(
request,
requestHeader,
responseHeader,
"200",
sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()),
url.toNormalform(true, true).getBytes());
return response;
}
// load the resource
@@ -122,15 +137,12 @@ public class SMBLoader {
byte[] b = FileUtils.read(is);
is.close();
// create response object
ResponseHeader responseHeader = new ResponseHeader();
responseHeader.put(HeaderFramework.LAST_MODIFIED, DateFormatter.formatRFC1123(new Date(url.lastModified())));
responseHeader.put(HeaderFramework.CONTENT_TYPE, mime);
// create response with loaded content
Response response = new Response(
request,
requestHeader,
responseHeader,
"OK",
"200",
sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()),
b);
return response;

@@ -1037,7 +1037,7 @@ public class ftpc {
* analogous to the "list" command except that data shall be
* transferred over the control connection.
*/
send("STAT \"path\"");
send("STAT " + path);
final String reply = receive();
if (isNotPositiveCompletion(reply)) {

@@ -1727,13 +1727,13 @@ public final class Switchboard extends serverSwitch {
if (condenser == null || document.indexingDenied()) {
if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by rule in document, process case=" + processCase);
addURLtoErrorDB(queueEntry.url(), referrerURL.hash(), queueEntry.initiator(), dc_title, "unknown indexing process case" + processCase);
addURLtoErrorDB(queueEntry.url(), (referrerURL == null) ? "" : referrerURL.hash(), queueEntry.initiator(), dc_title, "unknown indexing process case" + processCase);
return;
}
if (!queueEntry.profile().indexText() && !queueEntry.profile().indexMedia()) {
if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by profile rule, process case=" + processCase);
addURLtoErrorDB(queueEntry.url(), referrerURL.hash(), queueEntry.initiator(), dc_title, "unknown indexing process case" + processCase);
addURLtoErrorDB(queueEntry.url(), (referrerURL == null) ? "" : referrerURL.hash(), queueEntry.initiator(), dc_title, "unknown indexing process case" + processCase);
return;
}
@@ -1753,7 +1753,7 @@ public final class Switchboard extends serverSwitch {
RSSFeed.channels((queueEntry.initiator().equals(peers.mySeed().hash)) ? RSSFeed.LOCALINDEXING : RSSFeed.REMOTEINDEXING).addMessage(new RSSMessage("Indexed web page", dc_title, queueEntry.url().toNormalform(true, false)));
} catch (final IOException e) {
if (this.log.isFine()) log.logFine("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': process case=" + processCase);
addURLtoErrorDB(queueEntry.url(), referrerURL.hash(), queueEntry.initiator(), dc_title, "error storing url: " + e.getMessage());
addURLtoErrorDB(queueEntry.url(), (referrerURL == null) ? "" : referrerURL.hash(), queueEntry.initiator(), dc_title, "error storing url: " + e.getMessage());
return;
}

@@ -256,7 +256,7 @@ public final class LoaderDispatcher {
// load resource from the internet
Response response = null;
if ((protocol.equals("http") || (protocol.equals("https")))) response = httpLoader.load(request, acceptOnlyParseable);
if (protocol.equals("ftp")) response = ftpLoader.load(request);
if (protocol.equals("ftp")) response = ftpLoader.load(request, true);
if (protocol.equals("smb")) response = smbLoader.load(request, true);
if (response != null) {
// we got something. Now check if we want to store that to the cache
