From 9623d9e6d28aee8b169e3ce14075269e46833675 Mon Sep 17 00:00:00 2001
From: orbiter
Date: Wed, 10 Mar 2010 08:55:29 +0000
Subject: [PATCH] added a smb loader component for the YaCy crawler

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6737 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 defaults/yacy.init                             |   3 +
 htroot/SettingsAck_p.html                      |  11 +-
 htroot/SettingsAck_p.java                      |  18 ++-
 htroot/Settings_p.java                         |   1 +
 .../anomic/crawler/retrieval/FTPLoader.java    |   2 +-
 .../anomic/crawler/retrieval/SMBLoader.java    | 139 ++++++++++++++++++
 source/de/anomic/net/ftpc.java                 |  38 +++--
 .../net/yacy/repository/LoaderDispatcher.java  |   6 +-
 8 files changed, 197 insertions(+), 21 deletions(-)
 create mode 100644 source/de/anomic/crawler/retrieval/SMBLoader.java

diff --git a/defaults/yacy.init b/defaults/yacy.init
index b25073ab4..1ab3c95d4 100644
--- a/defaults/yacy.init
+++ b/defaults/yacy.init
@@ -684,6 +684,9 @@ crawler.http.maxFileSize=1048576
 # ftp crawler specific settings; size in bytes
 crawler.ftp.maxFileSize=1048576
 
+# smb crawler specific settings: maximum size
+crawler.smb.maxFileSize=50000000
+
 # maximum number of crawler threads
 crawler.MaxActiveThreads = 200
 
diff --git a/htroot/SettingsAck_p.html b/htroot/SettingsAck_p.html
index db013c05a..4be5bcb58 100644
--- a/htroot/SettingsAck_p.html
+++ b/htroot/SettingsAck_p.html
@@ -156,19 +156,26 @@
       #[crawler.clientTimeout]#
-      Http Crawler Settings:
+      http Crawler Settings:
       Maximum Filesize:
       #[crawler.http.maxFileSize]#
-      Ftp Crawler Settings:
+      ftp Crawler Settings:
       Maximum Filesize:
       #[crawler.ftp.maxFileSize]#
+
+      smb Crawler Settings:
+
+
+      Maximum Filesize:
+      #[crawler.smb.maxFileSize]#
+
       ::

Invalid crawler timeout value: #[crawler.clientTimeout]#

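The new crawler.smb.maxFileSize default is 50000000 bytes (roughly 50 MB); as with the http and ftp limits, -1 disables the check. A minimal sketch of how the loaders below consume such a value (only the getConfigLong call and the comparison mirror the patch; the size variable is illustrative):

    // Sketch only (not part of the patch): how a crawler.*.maxFileSize value is used.
    // "sb" is the YaCy Switchboard that backs getConfigLong in the loaders below.
    long maxFileSize = sb.getConfigLong("crawler.smb.maxFileSize", -1L); // -1 switches the check off
    long size = 52000000L;                                               // hypothetical resource size in bytes
    if (size > maxFileSize && maxFileSize >= 0) {
        throw new IOException("file size = " + size + " exceeds limit"); // same wording as SMBLoader below
    }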
diff --git a/htroot/SettingsAck_p.java b/htroot/SettingsAck_p.java
index aa2a50764..dbd390221 100644
--- a/htroot/SettingsAck_p.java
+++ b/htroot/SettingsAck_p.java
@@ -467,7 +467,7 @@ public class SettingsAck_p {
 
         // get maximum http file size
         String maxSizeStr = post.get("crawler.http.maxFileSize");
-        if (maxSizeStr==null||maxSizeStr.length()==0) timeoutStr = "-1";
+        if (maxSizeStr==null||maxSizeStr.length()==0) maxSizeStr = "-1";
 
         long maxHttpSize;
         try {
@@ -484,7 +484,7 @@ public class SettingsAck_p {
 
         // get maximum ftp file size
         maxSizeStr = post.get("crawler.ftp.maxFileSize");
-        if (maxSizeStr==null||maxSizeStr.length()==0) timeoutStr = "-1";
+        if (maxSizeStr==null||maxSizeStr.length()==0) maxSizeStr = "-1";
 
         long maxFtpSize;
         try {
@@ -496,10 +496,24 @@ public class SettingsAck_p {
             return prop;
         }
 
+        maxSizeStr = post.get("crawler.smb.maxFileSize");
+        if (maxSizeStr==null||maxSizeStr.length()==0) maxSizeStr = "-1";
+
+        long maxSmbSize;
+        try {
+            maxSmbSize = Integer.parseInt(maxSizeStr);
+            env.setConfig("crawler.smb.maxFileSize", Long.toString(maxSmbSize));
+        } catch (final NumberFormatException e) {
+            prop.put("info", "31");
+            prop.putHTML("info_crawler.smb.maxFileSize",post.get("crawler.smb.maxFileSize"));
+            return prop;
+        }
+
         // everything is ok
         prop.put("info_crawler.clientTimeout",(crawlerTimeout==0) ? "0" :DateFormatter.formatInterval(crawlerTimeout));
         prop.put("info_crawler.http.maxFileSize",(maxHttpSize==-1)? "-1":Formatter.bytesToString(maxHttpSize));
         prop.put("info_crawler.ftp.maxFileSize", (maxFtpSize==-1) ? "-1":Formatter.bytesToString(maxFtpSize));
+        prop.put("info_crawler.smb.maxFileSize", (maxSmbSize==-1) ? "-1":Formatter.bytesToString(maxSmbSize));
         prop.put("info", "28");
         return prop;
     }
diff --git a/htroot/Settings_p.java b/htroot/Settings_p.java
index 87c832040..1c2998d95 100644
--- a/htroot/Settings_p.java
+++ b/htroot/Settings_p.java
@@ -201,6 +201,7 @@ public final class Settings_p {
         prop.putHTML("crawler.clientTimeout",sb.getConfig("crawler.clientTimeout", "10000"));
         prop.putHTML("crawler.http.maxFileSize",sb.getConfig("crawler.http.maxFileSize", "-1"));
         prop.putHTML("crawler.ftp.maxFileSize",sb.getConfig("crawler.ftp.maxFileSize", "-1"));
+        prop.putHTML("crawler.smb.maxFileSize",sb.getConfig("crawler.smb.maxFileSize", "-1"));
 
         // return rewrite properties
         return prop;
diff --git a/source/de/anomic/crawler/retrieval/FTPLoader.java b/source/de/anomic/crawler/retrieval/FTPLoader.java
index 1b7adc4ca..565ad5e0c 100644
--- a/source/de/anomic/crawler/retrieval/FTPLoader.java
+++ b/source/de/anomic/crawler/retrieval/FTPLoader.java
@@ -54,7 +54,7 @@ public class FTPLoader {
     public FTPLoader(final Switchboard sb, final Log log) {
         this.sb = sb;
         this.log = log;
-        maxFileSize = (int) sb.getConfigLong("crawler.ftp.maxFileSize", -1l);
+        this.maxFileSize = (int) sb.getConfigLong("crawler.ftp.maxFileSize", -1l);
     }
 
     /**
diff --git a/source/de/anomic/crawler/retrieval/SMBLoader.java b/source/de/anomic/crawler/retrieval/SMBLoader.java
new file mode 100644
index 000000000..a6d242636
--- /dev/null
+++ b/source/de/anomic/crawler/retrieval/SMBLoader.java
@@ -0,0 +1,139 @@
+// SMBLoader.java
+// (C) 2010 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
+// first published 10.03.2010 on http://yacy.net
+//
+// This is a part of YaCy, a peer-to-peer based search engine
+//
+// $LastChangedDate: 2010-03-07 00:41:51 +0100 (So, 07 Mrz 2010) $
+// $LastChangedRevision: 6719 $
+// $LastChangedBy: orbiter $
+//
+// LICENSE
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+package de.anomic.crawler.retrieval;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Date;
+import java.util.List;
+
+import de.anomic.http.server.HeaderFramework;
+import de.anomic.http.server.RequestHeader;
+import de.anomic.http.server.ResponseHeader;
+import de.anomic.net.ftpc;
+import de.anomic.search.Segments;
+import de.anomic.search.Switchboard;
+import de.anomic.data.MimeTable;
+
+import net.yacy.document.TextParser;
+import net.yacy.kelondro.data.meta.DigestURI;
+import net.yacy.kelondro.logging.Log;
+import net.yacy.kelondro.util.DateFormatter;
+import net.yacy.kelondro.util.FileUtils;
+
+public class SMBLoader {
+
+    private final Switchboard sb;
+    private final Log log;
+    private final int maxFileSize;
+
+    public SMBLoader(final Switchboard sb, final Log log) {
+        this.sb = sb;
+        this.log = log;
+        maxFileSize = (int) sb.getConfigLong("crawler.smb.maxFileSize", -1l);
+    }
+
+
+    public Response load(final Request request, boolean acceptOnlyParseable) throws IOException {
+        DigestURI url = request.url();
+        if (!url.getProtocol().equals("smb")) throw new IOException("wrong loader for SMBLoader: " + url.getProtocol());
+
+        RequestHeader requestHeader = new RequestHeader();
+        if (request.referrerhash() != null) {
+            DigestURI ur = sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash());
+            if (ur != null) requestHeader.put(RequestHeader.REFERER, ur.toNormalform(true, false));
+        }
+
+
+        if (url.isDirectory()) {
+            List<String> list = new ArrayList<String>();
+            String u = url.toNormalform(true, true);
+            String[] l = url.list();
+            if (l == null) {
+                // this can only happen if there is no connection or the directory does not exist
+                log.logInfo("directory listing not available. URL = " + request.url().toString());
+                sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash, new Date(), 1, "directory listing not available. URL = " + request.url().toString());
+                throw new IOException("directory listing not available. URL = " + request.url().toString());
+            }
+            for (String s: l) list.add(u + s);
+
+            StringBuilder content = ftpc.dirhtml(u, null, null, null, list, true);
+
+            ResponseHeader responseHeader = new ResponseHeader();
+            responseHeader.put(HeaderFramework.LAST_MODIFIED, DateFormatter.formatRFC1123(new Date()));
+            responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html");
+            Response response = new Response(
+                    request,
+                    requestHeader,
+                    responseHeader,
+                    "OK",
+                    sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()),
+                    content.toString().getBytes());
+
+            return response;
+        }
+
+        // check mime type and availability of parsers
+        String mime = MimeTable.ext2mime(url.getFileExtension());
+        String parserError = null;
+        if (acceptOnlyParseable && (parserError = TextParser.supports(url, mime)) != null) {
+            // we know that we cannot process that file before loading
+            log.logInfo("no parser available (" + parserError + ") for url = " + request.url().toString());
+            sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash, new Date(), 1, "no parser available (" + parserError + ") for url = " + request.url().toString());
+            throw new IOException("no parser available (" + parserError + ") for url = " + request.url().toString());
+        }
+
+        // check resource size
+        long size = url.length();
+        if (size > maxFileSize && maxFileSize >= 0) {
+            log.logInfo("REJECTED TOO BIG FILE with size " + size + " Bytes for URL " + request.url().toString());
+            sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash, new Date(), 1, "file size limit exceeded");
+            throw new IOException("file size = " + size + " exceeds limit");
+        }
+
+        // load the resource
+        InputStream is = url.getInputStream();
+        byte[] b = FileUtils.read(is);
+        is.close();
+
+        // create response object
+        ResponseHeader responseHeader = new ResponseHeader();
+        responseHeader.put(HeaderFramework.LAST_MODIFIED, DateFormatter.formatRFC1123(new Date(url.lastModified())));
+        responseHeader.put(HeaderFramework.CONTENT_TYPE, mime);
+        Response response = new Response(
+                request,
+                requestHeader,
+                responseHeader,
+                "OK",
+                sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()),
+                b);
+        return response;
+    }
+
+}
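For directory URLs the loader above does not fetch a file at all; it synthesizes an HTML index page through ftpc.dirhtml, whose new static variant is introduced in the ftpc.java changes that follow. A rough usage sketch (the share name and entries are made up; the null arguments mean there is no FTP server message, greeting or system info to print):

    // Sketch only: building a listing page the way SMBLoader.load does for smb directories.
    List<String> entries = new ArrayList<String>();
    entries.add("smb://fileserver/share/docs/report.pdf");   // hypothetical entries
    entries.add("smb://fileserver/share/docs/archive/");
    StringBuilder listing = ftpc.dirhtml(
            "smb://fileserver/share/docs/",  // base URL the listing is rendered for
            null, null, null,                // servermessage, greeting, system: not available via SMB
            entries,
            true);                           // true adds <meta name="robots" content="noindex"> to the page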
diff --git a/source/de/anomic/net/ftpc.java b/source/de/anomic/net/ftpc.java
index ba8eca043..48d710206 100644
--- a/source/de/anomic/net/ftpc.java
+++ b/source/de/anomic/net/ftpc.java
@@ -1113,7 +1113,7 @@ public class ftpc {
      * @param line
      * @return null if not parseable
      */
-    private entryInfo parseListData(final String line) {
+    private static entryInfo parseListData(final String line) {
         final Pattern lsStyle = Pattern
                 .compile("^([-\\w]{10}).\\s*\\d+\\s+[-\\w]+\\s+[-\\w]+\\s+(\\d+)\\s+(\\w{3})\\s+(\\d+)\\s+(\\d+:?\\d*)\\s+(.*)$");
         // groups: 1: rights, 2: size, 3: month, 4: day, 5: time or year, 6:
@@ -1125,7 +1125,7 @@
             try {
                 size = Integer.parseInt(tokens.group(2));
             } catch (final NumberFormatException e) {
-                errPrintln("Error: not a number in list-entry: " + e.getMessage());
+                Log.logWarning("FTPC", "Error: not a number in list-entry: " + e.getMessage());
                 return null;
             }
             String time;
@@ -1148,7 +1148,7 @@
                     date = lsDateFormat.parse(dateString);
                 }
             } catch (final ParseException e) {
-                errPrintln(logPrefix + "---- Error: not ls date-format '" + dateString + "': " + e.getMessage());
+                Log.logWarning("FTPC", "---- Error: not ls date-format '" + dateString + "': " + e.getMessage());
                 date = new Date();
             }
             return new entryInfo(isDir, size, date, tokens.group(6));
@@ -2594,7 +2594,7 @@ public class ftpc {
     public StringBuilder dirhtml(String remotePath) {
         // returns a directory listing using an existing connection
         try {
-            if(isFolder(remotePath) && '/' != remotePath.charAt(remotePath.length()-1)) {
+            if (isFolder(remotePath) && '/' != remotePath.charAt(remotePath.length()-1)) {
                 remotePath += '/';
             }
             final List<String> list = list(remotePath, true);
@@ -2605,7 +2605,7 @@ public class ftpc {
                 + host + ((port == 21) ? "" : (":" + port))
                 + ((remotePath.length() > 0 && remotePath.charAt(0) == '/') ? "" : pwd() + "/")
                 + remotePath;
-            return dirhtml(base, remotemessage, remotegreeting, remotesystem, list);
+            return dirhtml(base, remotemessage, remotegreeting, remotesystem, list, true);
         } catch (final java.security.AccessControlException e) {
             return null;
         } catch (final IOException e) {
@@ -2613,7 +2613,8 @@ public class ftpc {
         }
     }
 
-    public static StringBuilder dirhtml(final String host, final int port, final String remotePath,
+    public static StringBuilder dirhtml(
+            final String host, final int port, final String remotePath,
             final String account, final String password) {
         // opens a new connection and returns a directory listing as html
         try {
@@ -2631,8 +2632,10 @@ public class ftpc {
         }
     }
 
-    public StringBuilder dirhtml(final String base, final String servermessage, final String greeting,
-            final String system, final List<String> list) {
+    public static StringBuilder dirhtml(
+            final String base, final String servermessage, final String greeting,
+            final String system, final List<String> list,
+            final boolean metaRobotNoindex) {
         // this creates the html output from collected strings
         final StringBuilder page = new StringBuilder(1024);
         final String title = "Index of " + base;
         page.append("<html><head>\n");
         page.append("  <title>" + title + "</title>\n");
         page.append("  <meta name=\"generator\" content=\"YaCy directory listing\">\n");
+        if (metaRobotNoindex) {
+            page.append("  <meta name=\"robots\" content=\"noindex\">\n");
+        }
         page.append("  <base href=\"" + base + "\">\n");
         page.append("</head><body>\n");
         page.append("  <h1>" + title + "</h1>\n");
-        page.append("  <p><pre>Server \"" + servermessage + "\" responded:\n");
-        page.append("  \n");
-        page.append(greeting);
-        page.append("\n");
-        page.append("  </pre></p>\n");
+        if (servermessage != null && greeting != null) {
+            page.append("  <p><pre>Server \"" + servermessage + "\" responded:\n");
+            page.append("  \n");
+            page.append(greeting);
+            page.append("\n");
+            page.append("  </pre></p>\n");
+        }
         page.append("  <hr>\n");
         page.append("  <pre>\n");
         int nameStart, nameEnd;
@@ -2661,7 +2669,7 @@ public class ftpc {
                 page.append(line.substring(0, nameStart));
                 page.append("<a href=\"" + base + info.name + "\">" + info.name + "</a>");
                 nameEnd = nameStart + info.name.length();
-                if(line.length() > nameEnd) {
+                if (line.length() > nameEnd) {
                     page.append(line.substring(nameEnd));
                 }
             } else {
@@ -2672,7 +2680,7 @@ public class ftpc {
         }
         page.append("  </pre>\n");
         page.append("  <hr>\n");
-        page.append("  <p>System info: \"" + system + "\"</p>\n");
+        if (system != null) page.append("  <p>System info: \"" + system + "\"</p>\n");
         page.append("</body></html>\n");
 
         return page;
diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java
index cf4675a49..fdc3d550a 100644
--- a/source/net/yacy/repository/LoaderDispatcher.java
+++ b/source/net/yacy/repository/LoaderDispatcher.java
@@ -52,6 +52,7 @@ import de.anomic.crawler.retrieval.FTPLoader;
 import de.anomic.crawler.retrieval.HTTPLoader;
 import de.anomic.crawler.retrieval.Request;
 import de.anomic.crawler.retrieval.Response;
+import de.anomic.crawler.retrieval.SMBLoader;
 import de.anomic.http.client.Cache;
 import de.anomic.http.client.Client;
 import de.anomic.http.server.HeaderFramework;
@@ -70,16 +71,18 @@ public final class LoaderDispatcher {
     private final HashSet<String> supportedProtocols;
     private final HTTPLoader httpLoader;
     private final FTPLoader ftpLoader;
+    private final SMBLoader smbLoader;
     private final Log log;
 
     public LoaderDispatcher(final Switchboard sb) {
         this.sb = sb;
-        this.supportedProtocols = new HashSet<String>(Arrays.asList(new String[]{"http","https","ftp"}));
+        this.supportedProtocols = new HashSet<String>(Arrays.asList(new String[]{"http","https","ftp","smb"}));
 
         // initiate loader objects
        this.log = new Log("LOADER");
         httpLoader = new HTTPLoader(sb, log);
         ftpLoader = new FTPLoader(sb, log);
+        smbLoader = new SMBLoader(sb, log);
     }
 
     public boolean isSupportedProtocol(final String protocol) {
@@ -254,6 +257,7 @@ public final class LoaderDispatcher {
         Response response = null;
         if ((protocol.equals("http") || (protocol.equals("https")))) response = httpLoader.load(request, acceptOnlyParseable);
         if (protocol.equals("ftp")) response = ftpLoader.load(request);
+        if (protocol.equals("smb")) response = smbLoader.load(request, true);
         if (response != null) {
             // we got something. Now check if we want to store that to the cache
             String storeError = response.shallStoreCacheForCrawler();
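Taken together, supporting a new protocol touches the default in yacy.init, the two Settings servlets, the loader class itself, the ftpc.dirhtml helper it borrows, and the LoaderDispatcher wiring above; isSupportedProtocol("smb") now returns true because "smb" joined the supportedProtocols set. Roughly, the dispatch path an smb:// request takes after this patch (a sketch assembled from the hunks above; how the protocol string is derived is an assumption, it is not part of the visible context):

    // Sketch only: protocol routing inside LoaderDispatcher after this patch.
    final String protocol = request.url().getProtocol();   // assumed; yields "smb" for smb:// URLs
    Response response = null;
    if ((protocol.equals("http") || (protocol.equals("https")))) response = httpLoader.load(request, acceptOnlyParseable);
    if (protocol.equals("ftp")) response = ftpLoader.load(request);
    if (protocol.equals("smb")) response = smbLoader.load(request, true); // true: accept only parseable documents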