From 9623d9e6d28aee8b169e3ce14075269e46833675 Mon Sep 17 00:00:00 2001
From: orbiter
Date: Wed, 10 Mar 2010 08:55:29 +0000
Subject: [PATCH] added a smb loader component for the YaCy crawler
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6737 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
defaults/yacy.init | 3 +
htroot/SettingsAck_p.html | 11 +-
htroot/SettingsAck_p.java | 18 ++-
htroot/Settings_p.java | 1 +
.../anomic/crawler/retrieval/FTPLoader.java | 2 +-
.../anomic/crawler/retrieval/SMBLoader.java | 139 ++++++++++++++++++
source/de/anomic/net/ftpc.java | 38 +++--
.../net/yacy/repository/LoaderDispatcher.java | 6 +-
8 files changed, 197 insertions(+), 21 deletions(-)
create mode 100644 source/de/anomic/crawler/retrieval/SMBLoader.java
diff --git a/defaults/yacy.init b/defaults/yacy.init
index b25073ab4..1ab3c95d4 100644
--- a/defaults/yacy.init
+++ b/defaults/yacy.init
@@ -684,6 +684,9 @@ crawler.http.maxFileSize=1048576
# ftp crawler specific settings; size in bytes
crawler.ftp.maxFileSize=1048576
+# smb crawler specific settings; size in bytes
+crawler.smb.maxFileSize=50000000
+
# maximum number of crawler threads
crawler.MaxActiveThreads = 200
diff --git a/htroot/SettingsAck_p.html b/htroot/SettingsAck_p.html
index db013c05a..4be5bcb58 100644
--- a/htroot/SettingsAck_p.html
+++ b/htroot/SettingsAck_p.html
@@ -156,19 +156,26 @@
#[crawler.clientTimeout]# |
- Http Crawler Settings: |
+ http Crawler Settings: |
Maximum Filesize: |
#[crawler.http.maxFileSize]# |
- Ftp Crawler Settings: |
+ ftp Crawler Settings: |
Maximum Filesize: |
#[crawler.ftp.maxFileSize]# |
+
+ smb Crawler Settings: |
+
+
+ Maximum Filesize: |
+ #[crawler.smb.maxFileSize]# |
+
::
Invalid crawler timeout value: #[crawler.clientTimeout]#
diff --git a/htroot/SettingsAck_p.java b/htroot/SettingsAck_p.java
index aa2a50764..dbd390221 100644
--- a/htroot/SettingsAck_p.java
+++ b/htroot/SettingsAck_p.java
@@ -467,7 +467,7 @@ public class SettingsAck_p {
// get maximum http file size
String maxSizeStr = post.get("crawler.http.maxFileSize");
- if (maxSizeStr==null||maxSizeStr.length()==0) timeoutStr = "-1";
+ if (maxSizeStr==null||maxSizeStr.length()==0) maxSizeStr = "-1";
long maxHttpSize;
try {
@@ -484,7 +484,7 @@ public class SettingsAck_p {
// get maximum ftp file size
maxSizeStr = post.get("crawler.ftp.maxFileSize");
- if (maxSizeStr==null||maxSizeStr.length()==0) timeoutStr = "-1";
+ if (maxSizeStr==null||maxSizeStr.length()==0) maxSizeStr = "-1";
long maxFtpSize;
try {
@@ -496,10 +496,24 @@ public class SettingsAck_p {
return prop;
}
+            // get maximum smb file size; a missing or empty value falls back to "-1"
+            maxSizeStr = post.get("crawler.smb.maxFileSize");
+            if (maxSizeStr==null||maxSizeStr.length()==0) maxSizeStr = "-1";
+            long maxSmbSize;
+            try {
+                maxSmbSize = Integer.parseInt(maxSizeStr);
+                env.setConfig("crawler.smb.maxFileSize", Long.toString(maxSmbSize)); // the smb value, not maxFtpSize
+            } catch (final NumberFormatException e) {
+                prop.put("info", "31");
+                prop.putHTML("info_crawler.smb.maxFileSize",post.get("crawler.smb.maxFileSize"));
+                return prop;
+            }
+
// everything is ok
prop.put("info_crawler.clientTimeout",(crawlerTimeout==0) ? "0" :DateFormatter.formatInterval(crawlerTimeout));
prop.put("info_crawler.http.maxFileSize",(maxHttpSize==-1)? "-1":Formatter.bytesToString(maxHttpSize));
prop.put("info_crawler.ftp.maxFileSize", (maxFtpSize==-1) ? "-1":Formatter.bytesToString(maxFtpSize));
+            prop.put("info_crawler.smb.maxFileSize", (maxSmbSize==-1) ? "-1":Formatter.bytesToString(maxSmbSize)); // test the smb limit, not maxFtpSize
prop.put("info", "28");
return prop;
}
diff --git a/htroot/Settings_p.java b/htroot/Settings_p.java
index 87c832040..1c2998d95 100644
--- a/htroot/Settings_p.java
+++ b/htroot/Settings_p.java
@@ -201,6 +201,7 @@ public final class Settings_p {
prop.putHTML("crawler.clientTimeout",sb.getConfig("crawler.clientTimeout", "10000"));
prop.putHTML("crawler.http.maxFileSize",sb.getConfig("crawler.http.maxFileSize", "-1"));
prop.putHTML("crawler.ftp.maxFileSize",sb.getConfig("crawler.ftp.maxFileSize", "-1"));
+ prop.putHTML("crawler.smb.maxFileSize",sb.getConfig("crawler.smb.maxFileSize", "-1"));
// return rewrite properties
return prop;
diff --git a/source/de/anomic/crawler/retrieval/FTPLoader.java b/source/de/anomic/crawler/retrieval/FTPLoader.java
index 1b7adc4ca..565ad5e0c 100644
--- a/source/de/anomic/crawler/retrieval/FTPLoader.java
+++ b/source/de/anomic/crawler/retrieval/FTPLoader.java
@@ -54,7 +54,7 @@ public class FTPLoader {
public FTPLoader(final Switchboard sb, final Log log) {
this.sb = sb;
this.log = log;
- maxFileSize = (int) sb.getConfigLong("crawler.ftp.maxFileSize", -1l);
+ this.maxFileSize = (int) sb.getConfigLong("crawler.ftp.maxFileSize", -1l);
}
/**
diff --git a/source/de/anomic/crawler/retrieval/SMBLoader.java b/source/de/anomic/crawler/retrieval/SMBLoader.java
new file mode 100644
index 000000000..a6d242636
--- /dev/null
+++ b/source/de/anomic/crawler/retrieval/SMBLoader.java
@@ -0,0 +1,139 @@
+// SMBLoader.java
+// (C) 2010 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
+// first published 10.03.2010 on http://yacy.net
+//
+// This is a part of YaCy, a peer-to-peer based search engine
+//
+// $LastChangedDate: 2010-03-07 00:41:51 +0100 (So, 07 Mrz 2010) $
+// $LastChangedRevision: 6719 $
+// $LastChangedBy: orbiter $
+//
+// LICENSE
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+package de.anomic.crawler.retrieval;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Date;
+import java.util.List;
+
+import de.anomic.http.server.HeaderFramework;
+import de.anomic.http.server.RequestHeader;
+import de.anomic.http.server.ResponseHeader;
+import de.anomic.net.ftpc;
+import de.anomic.search.Segments;
+import de.anomic.search.Switchboard;
+import de.anomic.data.MimeTable;
+
+import net.yacy.document.TextParser;
+import net.yacy.kelondro.data.meta.DigestURI;
+import net.yacy.kelondro.logging.Log;
+import net.yacy.kelondro.util.DateFormatter;
+import net.yacy.kelondro.util.FileUtils;
+
+public class SMBLoader {
+
+ private final Switchboard sb;
+ private final Log log;
+ private final int maxFileSize;
+
+ public SMBLoader(final Switchboard sb, final Log log) {
+ this.sb = sb;
+ this.log = log;
+ maxFileSize = (int) sb.getConfigLong("crawler.smb.maxFileSize", -1l);
+ }
+
+
+ public Response load(final Request request, boolean acceptOnlyParseable) throws IOException {
+ DigestURI url = request.url();
+ if (!url.getProtocol().equals("smb")) throw new IOException("wrong loader for SMBLoader: " + url.getProtocol());
+
+ RequestHeader requestHeader = new RequestHeader();
+ if (request.referrerhash() != null) {
+ DigestURI ur = sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash());
+ if (ur != null) requestHeader.put(RequestHeader.REFERER, ur.toNormalform(true, false));
+ }
+
+
+ if (url.isDirectory()) {
+ List list = new ArrayList();
+ String u = url.toNormalform(true, true);
+ String[] l = url.list();
+ if (l == null) {
+ // this can only happen if there is no connection or the directory does not exist
+ log.logInfo("directory listing not available. URL = " + request.url().toString());
+ sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash, new Date(), 1, "directory listing not available. URL = " + request.url().toString());
+ throw new IOException("directory listing not available. URL = " + request.url().toString());
+ }
+ for (String s: l) list.add(u + s);
+
+ StringBuilder content = ftpc.dirhtml(u, null, null, null, list, true);
+
+ ResponseHeader responseHeader = new ResponseHeader();
+ responseHeader.put(HeaderFramework.LAST_MODIFIED, DateFormatter.formatRFC1123(new Date()));
+ responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
+ Response response = new Response(
+ request,
+ requestHeader,
+ responseHeader,
+ "OK",
+ sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()),
+ content.toString().getBytes());
+
+ return response;
+ }
+
+ // check mime type and availability of parsers
+ String mime = MimeTable.ext2mime(url.getFileExtension());
+ String parserError = null;
+ if (acceptOnlyParseable && (parserError = TextParser.supports(url, mime)) != null) {
+ // we know that we cannot process that file before loading
+ log.logInfo("no parser available (" + parserError + ") for url = " + request.url().toString());
+ sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash, new Date(), 1, "no parser available (" + parserError + ") for url = " + request.url().toString());
+ throw new IOException("no parser available (" + parserError + ") for url = " + request.url().toString());
+ }
+
+ // check resource size
+ long size = url.length();
+ if (size > maxFileSize && maxFileSize >= 0) {
+ log.logInfo("REJECTED TOO BIG FILE with size " + size + " Bytes for URL " + request.url().toString());
+ sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash, new Date(), 1, "file size limit exceeded");
+ throw new IOException("file size = " + size + " exceeds limit");
+ }
+
+ // load the resource
+ InputStream is = url.getInputStream();
+ byte[] b = FileUtils.read(is);
+ is.close();
+
+ // create response object
+ ResponseHeader responseHeader = new ResponseHeader();
+ responseHeader.put(HeaderFramework.LAST_MODIFIED, DateFormatter.formatRFC1123(new Date(url.lastModified())));
+ responseHeader.put(HeaderFramework.CONTENT_TYPE, mime);
+ Response response = new Response(
+ request,
+ requestHeader,
+ responseHeader,
+ "OK",
+ sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()),
+ b);
+ return response;
+ }
+
+}
diff --git a/source/de/anomic/net/ftpc.java b/source/de/anomic/net/ftpc.java
index ba8eca043..48d710206 100644
--- a/source/de/anomic/net/ftpc.java
+++ b/source/de/anomic/net/ftpc.java
@@ -1113,7 +1113,7 @@ public class ftpc {
* @param line
* @return null if not parseable
*/
- private entryInfo parseListData(final String line) {
+ private static entryInfo parseListData(final String line) {
final Pattern lsStyle = Pattern
.compile("^([-\\w]{10}).\\s*\\d+\\s+[-\\w]+\\s+[-\\w]+\\s+(\\d+)\\s+(\\w{3})\\s+(\\d+)\\s+(\\d+:?\\d*)\\s+(.*)$");
// groups: 1: rights, 2: size, 3: month, 4: day, 5: time or year, 6:
@@ -1125,7 +1125,7 @@ public class ftpc {
try {
size = Integer.parseInt(tokens.group(2));
} catch (final NumberFormatException e) {
- errPrintln("Error: not a number in list-entry: " + e.getMessage());
+ Log.logWarning("FTPC", "Error: not a number in list-entry: " + e.getMessage());
return null;
}
String time;
@@ -1148,7 +1148,7 @@ public class ftpc {
date = lsDateFormat.parse(dateString);
}
} catch (final ParseException e) {
- errPrintln(logPrefix + "---- Error: not ls date-format '" + dateString + "': " + e.getMessage());
+ Log.logWarning("FTPC", "---- Error: not ls date-format '" + dateString + "': " + e.getMessage());
date = new Date();
}
return new entryInfo(isDir, size, date, tokens.group(6));
@@ -2594,7 +2594,7 @@ public class ftpc {
public StringBuilder dirhtml(String remotePath) {
// returns a directory listing using an existing connection
try {
- if(isFolder(remotePath) && '/' != remotePath.charAt(remotePath.length()-1)) {
+ if (isFolder(remotePath) && '/' != remotePath.charAt(remotePath.length()-1)) {
remotePath += '/';
}
final List list = list(remotePath, true);
@@ -2605,7 +2605,7 @@ public class ftpc {
+ host + ((port == 21) ? "" : (":" + port)) + ((remotePath.length() > 0 && remotePath.charAt(0) == '/') ? "" : pwd() + "/")
+ remotePath;
- return dirhtml(base, remotemessage, remotegreeting, remotesystem, list);
+ return dirhtml(base, remotemessage, remotegreeting, remotesystem, list, true);
} catch (final java.security.AccessControlException e) {
return null;
} catch (final IOException e) {
@@ -2613,7 +2613,8 @@ public class ftpc {
}
}
- public static StringBuilder dirhtml(final String host, final int port, final String remotePath,
+ public static StringBuilder dirhtml(
+ final String host, final int port, final String remotePath,
final String account, final String password) {
// opens a new connection and returns a directory listing as html
try {
@@ -2631,8 +2632,10 @@ public class ftpc {
}
}
- public StringBuilder dirhtml(final String base, final String servermessage, final String greeting,
- final String system, final List list) {
+ public static StringBuilder dirhtml(
+ final String base, final String servermessage, final String greeting,
+ final String system, final List list,
+ final boolean metaRobotNoindex) {
// this creates the html output from collected strings
final StringBuilder page = new StringBuilder(1024);
final String title = "Index of " + base;
@@ -2641,14 +2644,19 @@ public class ftpc {
page.append("\n");
page.append(" " + title + "\n");
page.append(" \n");
+ if (metaRobotNoindex) {
+ page.append(" \n");
+ }
page.append(" \n");
page.append("\n");
page.append(" " + title + "
\n");
- page.append(" Server \"" + servermessage + "\" responded:\n");
- page.append(" \n");
- page.append(greeting);
- page.append("\n");
- page.append("
\n");
+ if (servermessage != null && greeting != null) {
+ page.append(" Server \"" + servermessage + "\" responded:\n");
+ page.append(" \n");
+ page.append(greeting);
+ page.append("\n");
+ page.append("
\n");
+ }
page.append("
\n");
page.append(" \n");
int nameStart, nameEnd;
@@ -2661,7 +2669,7 @@ public class ftpc {
page.append(line.substring(0, nameStart));
page.append("" + info.name + "");
nameEnd = nameStart + info.name.length();
- if(line.length() > nameEnd) {
+ if (line.length() > nameEnd) {
page.append(line.substring(nameEnd));
}
} else {
@@ -2672,7 +2680,7 @@ public class ftpc {
}
page.append("
\n");
page.append("
\n");
- page.append(" System info: \"" + system + "\"
\n");
+ if (system != null) page.append(" System info: \"" + system + "\"
\n");
page.append("\n");
return page;
diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java
index cf4675a49..fdc3d550a 100644
--- a/source/net/yacy/repository/LoaderDispatcher.java
+++ b/source/net/yacy/repository/LoaderDispatcher.java
@@ -52,6 +52,7 @@ import de.anomic.crawler.retrieval.FTPLoader;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.crawler.retrieval.Request;
import de.anomic.crawler.retrieval.Response;
+import de.anomic.crawler.retrieval.SMBLoader;
import de.anomic.http.client.Cache;
import de.anomic.http.client.Client;
import de.anomic.http.server.HeaderFramework;
@@ -70,16 +71,18 @@ public final class LoaderDispatcher {
private final HashSet supportedProtocols;
private final HTTPLoader httpLoader;
private final FTPLoader ftpLoader;
+ private final SMBLoader smbLoader;
private final Log log;
public LoaderDispatcher(final Switchboard sb) {
this.sb = sb;
- this.supportedProtocols = new HashSet(Arrays.asList(new String[]{"http","https","ftp"}));
+ this.supportedProtocols = new HashSet(Arrays.asList(new String[]{"http","https","ftp","smb"}));
// initiate loader objects
this.log = new Log("LOADER");
httpLoader = new HTTPLoader(sb, log);
ftpLoader = new FTPLoader(sb, log);
+ smbLoader = new SMBLoader(sb, log);
}
public boolean isSupportedProtocol(final String protocol) {
@@ -254,6 +257,7 @@ public final class LoaderDispatcher {
Response response = null;
if ((protocol.equals("http") || (protocol.equals("https")))) response = httpLoader.load(request, acceptOnlyParseable);
if (protocol.equals("ftp")) response = ftpLoader.load(request);
+        if (protocol.equals("smb")) response = smbLoader.load(request, acceptOnlyParseable); // pass the caller's flag through, as the http branch does
if (response != null) {
// we got something. Now check if we want to store that to the cache
String storeError = response.shallStoreCacheForCrawler();