added a smb loader component for the YaCy crawler

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6737 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 15 years ago
parent c77fbd0390
commit 9623d9e6d2

@ -684,6 +684,9 @@ crawler.http.maxFileSize=1048576
# ftp crawler specific settings; size in bytes
crawler.ftp.maxFileSize=1048576
# smb crawler specific settings: maximum size
crawler.smb.maxFileSize=50000000
# maximum number of crawler threads
crawler.MaxActiveThreads = 200

@ -156,19 +156,26 @@
<td class="settingsValue">#[crawler.clientTimeout]#</td>
</tr>
<tr>
<td colspan="2"><strong>Http Crawler Settings:</strong></td>
<td colspan="2"><strong>http Crawler Settings:</strong></td>
</tr>
<tr>
<td>Maximum Filesize:</td>
<td class="settingsValue">#[crawler.http.maxFileSize]#</td>
</tr>
<tr>
<td colspan="2"><strong>Ftp Crawler Settings:</strong></td>
<td colspan="2"><strong>ftp Crawler Settings:</strong></td>
</tr>
<tr>
<td>Maximum Filesize:</td>
<td class="settingsValue">#[crawler.ftp.maxFileSize]#</td>
</tr>
<tr>
<td colspan="2"><strong>smb Crawler Settings:</strong></td>
</tr>
<tr>
<td>Maximum Filesize:</td>
<td class="settingsValue">#[crawler.smb.maxFileSize]#</td>
</tr>
</table>
::<!-- 29: Crawler settings timeout error -->
<p class="error">Invalid crawler timeout value: <tt>#[crawler.clientTimeout]#</tt></p>

@ -467,7 +467,7 @@ public class SettingsAck_p {
// get maximum http file size
String maxSizeStr = post.get("crawler.http.maxFileSize");
if (maxSizeStr==null||maxSizeStr.length()==0) timeoutStr = "-1";
if (maxSizeStr==null||maxSizeStr.length()==0) maxSizeStr = "-1";
long maxHttpSize;
try {
@ -484,7 +484,7 @@ public class SettingsAck_p {
// get maximum ftp file size
maxSizeStr = post.get("crawler.ftp.maxFileSize");
if (maxSizeStr==null||maxSizeStr.length()==0) timeoutStr = "-1";
if (maxSizeStr==null||maxSizeStr.length()==0) maxSizeStr = "-1";
long maxFtpSize;
try {
@ -496,10 +496,24 @@ public class SettingsAck_p {
return prop;
}
maxSizeStr = post.get("crawler.smb.maxFileSize");
if (maxSizeStr==null||maxSizeStr.length()==0) maxSizeStr = "-1";
long maxSmbSize;
try {
maxSmbSize = Integer.parseInt(maxSizeStr);
env.setConfig("crawler.smb.maxFileSize", Long.toString(maxSmbSize));
} catch (final NumberFormatException e) {
prop.put("info", "31");
prop.putHTML("info_crawler.smb.maxFileSize",post.get("crawler.smb.maxFileSize"));
return prop;
}
// everything is ok
prop.put("info_crawler.clientTimeout",(crawlerTimeout==0) ? "0" :DateFormatter.formatInterval(crawlerTimeout));
prop.put("info_crawler.http.maxFileSize",(maxHttpSize==-1)? "-1":Formatter.bytesToString(maxHttpSize));
prop.put("info_crawler.ftp.maxFileSize", (maxFtpSize==-1) ? "-1":Formatter.bytesToString(maxFtpSize));
prop.put("info_crawler.smb.maxFileSize", (maxSmbSize==-1) ? "-1":Formatter.bytesToString(maxSmbSize));
prop.put("info", "28");
return prop;
}

@ -201,6 +201,7 @@ public final class Settings_p {
prop.putHTML("crawler.clientTimeout",sb.getConfig("crawler.clientTimeout", "10000"));
prop.putHTML("crawler.http.maxFileSize",sb.getConfig("crawler.http.maxFileSize", "-1"));
prop.putHTML("crawler.ftp.maxFileSize",sb.getConfig("crawler.ftp.maxFileSize", "-1"));
prop.putHTML("crawler.smb.maxFileSize",sb.getConfig("crawler.smb.maxFileSize", "-1"));
// return rewrite properties
return prop;

@ -54,7 +54,7 @@ public class FTPLoader {
public FTPLoader(final Switchboard sb, final Log log) {
this.sb = sb;
this.log = log;
maxFileSize = (int) sb.getConfigLong("crawler.ftp.maxFileSize", -1l);
this.maxFileSize = (int) sb.getConfigLong("crawler.ftp.maxFileSize", -1l);
}
/**

@ -0,0 +1,139 @@
// SMBLoader.java
// (C) 2010 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 10.03.2010 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based search engine
//
// $LastChangedDate: 2010-03-07 00:41:51 +0100 (So, 07 Mrz 2010) $
// $LastChangedRevision: 6719 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.crawler.retrieval;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import de.anomic.http.server.HeaderFramework;
import de.anomic.http.server.RequestHeader;
import de.anomic.http.server.ResponseHeader;
import de.anomic.net.ftpc;
import de.anomic.search.Segments;
import de.anomic.search.Switchboard;
import de.anomic.data.MimeTable;
import net.yacy.document.TextParser;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.DateFormatter;
import net.yacy.kelondro.util.FileUtils;
/**
 * Loader component for smb:// URLs used by the YaCy crawler.
 * For directory URLs it synthesizes an HTML directory listing (via
 * {@link ftpc#dirhtml}); for file URLs it downloads the content, subject to
 * the configured maximum file size ({@code crawler.smb.maxFileSize}).
 */
public class SMBLoader {

    private final Switchboard sb;
    private final Log log;
    // maximum accepted file size in bytes; -1 means unlimited
    private final int maxFileSize;

    public SMBLoader(final Switchboard sb, final Log log) {
        this.sb = sb;
        this.log = log;
        // consistent with FTPLoader: read limit once at construction time
        this.maxFileSize = (int) sb.getConfigLong("crawler.smb.maxFileSize", -1l);
    }

    /**
     * Load a resource from an smb:// URL.
     *
     * @param request the crawl request; its URL must use the smb protocol
     * @param acceptOnlyParseable if true, reject resources for which no text parser is available
     * @return a Response object carrying the loaded content (or a generated directory listing)
     * @throws IOException if the protocol is wrong, the directory listing is unavailable,
     *         no parser is available (when required), the size limit is exceeded,
     *         or reading the resource fails
     */
    public Response load(final Request request, boolean acceptOnlyParseable) throws IOException {
        DigestURI url = request.url();
        if (!url.getProtocol().equals("smb")) throw new IOException("wrong loader for SMBLoader: " + url.getProtocol());

        RequestHeader requestHeader = new RequestHeader();
        if (request.referrerhash() != null) {
            DigestURI ur = sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash());
            if (ur != null) requestHeader.put(RequestHeader.REFERER, ur.toNormalform(true, false));
        }

        if (url.isDirectory()) {
            // for directories, generate an HTML listing page instead of loading content
            List<String> list = new ArrayList<String>();
            String u = url.toNormalform(true, true);
            String[] l = url.list();
            if (l == null) {
                // this can only happen if there is no connection or the directory does not exist
                log.logInfo("directory listing not available. URL = " + request.url().toString());
                sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash, new Date(), 1, "directory listing not available. URL = " + request.url().toString());
                throw new IOException("directory listing not available. URL = " + request.url().toString());
            }
            for (String s: l) list.add(u + s);

            StringBuilder content = ftpc.dirhtml(u, null, null, null, list, true);

            ResponseHeader responseHeader = new ResponseHeader();
            responseHeader.put(HeaderFramework.LAST_MODIFIED, DateFormatter.formatRFC1123(new Date()));
            responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
            Response response = new Response(
                    request,
                    requestHeader,
                    responseHeader,
                    "OK",
                    sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()),
                    // NOTE(review): getBytes() uses the platform charset — confirm this matches
                    // how the listing page is later decoded
                    content.toString().getBytes());

            return response;
        }

        // check mime type and availability of parsers
        String mime = MimeTable.ext2mime(url.getFileExtension());
        String parserError = null;
        if (acceptOnlyParseable && (parserError = TextParser.supports(url, mime)) != null) {
            // we know that we cannot process that file before loading
            log.logInfo("no parser available (" + parserError + ") for url = " + request.url().toString());
            sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash, new Date(), 1, "no parser available (" + parserError + ") for url = " + request.url().toString());
            throw new IOException("no parser available (" + parserError + ") for url = " + request.url().toString());
        }

        // check resource size
        long size = url.length();
        if (size > maxFileSize && maxFileSize >= 0) {
            log.logInfo("REJECTED TOO BIG FILE with size " + size + " Bytes for URL " + request.url().toString());
            sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash, new Date(), 1, "file size limit exceeded");
            throw new IOException("file size = " + size + " exceeds limit");
        }

        // load the resource; close the stream even if reading fails
        InputStream is = url.getInputStream();
        byte[] b;
        try {
            b = FileUtils.read(is);
        } finally {
            is.close();
        }

        // create response object
        ResponseHeader responseHeader = new ResponseHeader();
        responseHeader.put(HeaderFramework.LAST_MODIFIED, DateFormatter.formatRFC1123(new Date(url.lastModified())));
        responseHeader.put(HeaderFramework.CONTENT_TYPE, mime);
        Response response = new Response(
                request,
                requestHeader,
                responseHeader,
                "OK",
                sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()),
                b);

        return response;
    }
}

@ -1113,7 +1113,7 @@ public class ftpc {
* @param line
* @return null if not parseable
*/
private entryInfo parseListData(final String line) {
private static entryInfo parseListData(final String line) {
final Pattern lsStyle = Pattern
.compile("^([-\\w]{10}).\\s*\\d+\\s+[-\\w]+\\s+[-\\w]+\\s+(\\d+)\\s+(\\w{3})\\s+(\\d+)\\s+(\\d+:?\\d*)\\s+(.*)$");
// groups: 1: rights, 2: size, 3: month, 4: day, 5: time or year, 6:
@ -1125,7 +1125,7 @@ public class ftpc {
try {
size = Integer.parseInt(tokens.group(2));
} catch (final NumberFormatException e) {
errPrintln("Error: not a number in list-entry: " + e.getMessage());
Log.logWarning("FTPC", "Error: not a number in list-entry: " + e.getMessage());
return null;
}
String time;
@ -1148,7 +1148,7 @@ public class ftpc {
date = lsDateFormat.parse(dateString);
}
} catch (final ParseException e) {
errPrintln(logPrefix + "---- Error: not ls date-format '" + dateString + "': " + e.getMessage());
Log.logWarning("FTPC", "---- Error: not ls date-format '" + dateString + "': " + e.getMessage());
date = new Date();
}
return new entryInfo(isDir, size, date, tokens.group(6));
@ -2594,7 +2594,7 @@ public class ftpc {
public StringBuilder dirhtml(String remotePath) {
// returns a directory listing using an existing connection
try {
if(isFolder(remotePath) && '/' != remotePath.charAt(remotePath.length()-1)) {
if (isFolder(remotePath) && '/' != remotePath.charAt(remotePath.length()-1)) {
remotePath += '/';
}
final List<String> list = list(remotePath, true);
@ -2605,7 +2605,7 @@ public class ftpc {
+ host + ((port == 21) ? "" : (":" + port)) + ((remotePath.length() > 0 && remotePath.charAt(0) == '/') ? "" : pwd() + "/")
+ remotePath;
return dirhtml(base, remotemessage, remotegreeting, remotesystem, list);
return dirhtml(base, remotemessage, remotegreeting, remotesystem, list, true);
} catch (final java.security.AccessControlException e) {
return null;
} catch (final IOException e) {
@ -2613,7 +2613,8 @@ public class ftpc {
}
}
public static StringBuilder dirhtml(final String host, final int port, final String remotePath,
public static StringBuilder dirhtml(
final String host, final int port, final String remotePath,
final String account, final String password) {
// opens a new connection and returns a directory listing as html
try {
@ -2631,8 +2632,10 @@ public class ftpc {
}
}
public StringBuilder dirhtml(final String base, final String servermessage, final String greeting,
final String system, final List<String> list) {
public static StringBuilder dirhtml(
final String base, final String servermessage, final String greeting,
final String system, final List<String> list,
final boolean metaRobotNoindex) {
// this creates the html output from collected strings
final StringBuilder page = new StringBuilder(1024);
final String title = "Index of " + base;
@ -2641,14 +2644,19 @@ public class ftpc {
page.append("<html><head>\n");
page.append(" <title>" + title + "</title>\n");
page.append(" <meta name=\"generator\" content=\"YaCy ftpc dirlisting\">\n");
if (metaRobotNoindex) {
page.append(" <meta name=\"robots\" content=\"noindex\">\n");
}
page.append(" <base href=\"" + base + "\">\n");
page.append("</head><body>\n");
page.append(" <h1>" + title + "</h1>\n");
page.append(" <p><pre>Server \"" + servermessage + "\" responded:\n");
page.append(" \n");
page.append(greeting);
page.append("\n");
page.append(" </pre></p>\n");
if (servermessage != null && greeting != null) {
page.append(" <p><pre>Server \"" + servermessage + "\" responded:\n");
page.append(" \n");
page.append(greeting);
page.append("\n");
page.append(" </pre></p>\n");
}
page.append(" <hr>\n");
page.append(" <pre>\n");
int nameStart, nameEnd;
@ -2661,7 +2669,7 @@ public class ftpc {
page.append(line.substring(0, nameStart));
page.append("<a href=\"" + base + info.name + ((info.isDir) ? "/" : "") + "\">" + info.name + "</a>");
nameEnd = nameStart + info.name.length();
if(line.length() > nameEnd) {
if (line.length() > nameEnd) {
page.append(line.substring(nameEnd));
}
} else {
@ -2672,7 +2680,7 @@ public class ftpc {
}
page.append(" </pre>\n");
page.append(" <hr>\n");
page.append(" <pre>System info: \"" + system + "\"</pre>\n");
if (system != null) page.append(" <pre>System info: \"" + system + "\"</pre>\n");
page.append("</body></html>\n");
return page;

@ -52,6 +52,7 @@ import de.anomic.crawler.retrieval.FTPLoader;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.crawler.retrieval.Request;
import de.anomic.crawler.retrieval.Response;
import de.anomic.crawler.retrieval.SMBLoader;
import de.anomic.http.client.Cache;
import de.anomic.http.client.Client;
import de.anomic.http.server.HeaderFramework;
@ -70,16 +71,18 @@ public final class LoaderDispatcher {
private final HashSet<String> supportedProtocols;
private final HTTPLoader httpLoader;
private final FTPLoader ftpLoader;
private final SMBLoader smbLoader;
private final Log log;
public LoaderDispatcher(final Switchboard sb) {
this.sb = sb;
this.supportedProtocols = new HashSet<String>(Arrays.asList(new String[]{"http","https","ftp"}));
this.supportedProtocols = new HashSet<String>(Arrays.asList(new String[]{"http","https","ftp","smb"}));
// initiate loader objects
this.log = new Log("LOADER");
httpLoader = new HTTPLoader(sb, log);
ftpLoader = new FTPLoader(sb, log);
smbLoader = new SMBLoader(sb, log);
}
public boolean isSupportedProtocol(final String protocol) {
@ -254,6 +257,7 @@ public final class LoaderDispatcher {
Response response = null;
if ((protocol.equals("http") || (protocol.equals("https")))) response = httpLoader.load(request, acceptOnlyParseable);
if (protocol.equals("ftp")) response = ftpLoader.load(request);
if (protocol.equals("smb")) response = smbLoader.load(request, true);
if (response != null) {
// we got something. Now check if we want to store that to the cache
String storeError = response.shallStoreCacheForCrawler();

Loading…
Cancel
Save