diff --git a/defaults/yacy.init b/defaults/yacy.init index b25073ab4..1ab3c95d4 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -684,6 +684,9 @@ crawler.http.maxFileSize=1048576 # ftp crawler specific settings; size in bytes crawler.ftp.maxFileSize=1048576 +# smb crawler specific settings; maximum file size in bytes +crawler.smb.maxFileSize=50000000 + # maximum number of crawler threads crawler.MaxActiveThreads = 200 diff --git a/htroot/SettingsAck_p.html b/htroot/SettingsAck_p.html index db013c05a..4be5bcb58 100644 --- a/htroot/SettingsAck_p.html +++ b/htroot/SettingsAck_p.html @@ -156,19 +156,26 @@
Invalid crawler timeout value: #[crawler.clientTimeout]#
diff --git a/htroot/SettingsAck_p.java b/htroot/SettingsAck_p.java index aa2a50764..dbd390221 100644 --- a/htroot/SettingsAck_p.java +++ b/htroot/SettingsAck_p.java @@ -467,7 +467,7 @@ public class SettingsAck_p { // get maximum http file size String maxSizeStr = post.get("crawler.http.maxFileSize"); - if (maxSizeStr==null||maxSizeStr.length()==0) timeoutStr = "-1"; + if (maxSizeStr==null||maxSizeStr.length()==0) maxSizeStr = "-1"; long maxHttpSize; try { @@ -484,7 +484,7 @@ public class SettingsAck_p { // get maximum ftp file size maxSizeStr = post.get("crawler.ftp.maxFileSize"); - if (maxSizeStr==null||maxSizeStr.length()==0) timeoutStr = "-1"; + if (maxSizeStr==null||maxSizeStr.length()==0) maxSizeStr = "-1"; long maxFtpSize; try { @@ -496,10 +496,25 @@ public class SettingsAck_p { return prop; } + // get maximum smb file size + maxSizeStr = post.get("crawler.smb.maxFileSize"); + if (maxSizeStr==null||maxSizeStr.length()==0) maxSizeStr = "-1"; + + long maxSmbSize; + try { + maxSmbSize = Integer.parseInt(maxSizeStr); + env.setConfig("crawler.smb.maxFileSize", Long.toString(maxSmbSize)); + } catch (final NumberFormatException e) { + prop.put("info", "31"); + prop.putHTML("info_crawler.smb.maxFileSize",post.get("crawler.smb.maxFileSize")); + return prop; + } + // everything is ok prop.put("info_crawler.clientTimeout",(crawlerTimeout==0) ? "0" :DateFormatter.formatInterval(crawlerTimeout)); prop.put("info_crawler.http.maxFileSize",(maxHttpSize==-1)? "-1":Formatter.bytesToString(maxHttpSize)); prop.put("info_crawler.ftp.maxFileSize", (maxFtpSize==-1) ? "-1":Formatter.bytesToString(maxFtpSize)); + prop.put("info_crawler.smb.maxFileSize", (maxSmbSize==-1) ?
"-1":Formatter.bytesToString(maxSmbSize)); prop.put("info", "28"); return prop; } diff --git a/htroot/Settings_p.java b/htroot/Settings_p.java index 87c832040..1c2998d95 100644 --- a/htroot/Settings_p.java +++ b/htroot/Settings_p.java @@ -201,6 +201,7 @@ public final class Settings_p { prop.putHTML("crawler.clientTimeout",sb.getConfig("crawler.clientTimeout", "10000")); prop.putHTML("crawler.http.maxFileSize",sb.getConfig("crawler.http.maxFileSize", "-1")); prop.putHTML("crawler.ftp.maxFileSize",sb.getConfig("crawler.ftp.maxFileSize", "-1")); + prop.putHTML("crawler.smb.maxFileSize",sb.getConfig("crawler.smb.maxFileSize", "-1")); // return rewrite properties return prop; diff --git a/source/de/anomic/crawler/retrieval/FTPLoader.java b/source/de/anomic/crawler/retrieval/FTPLoader.java index 1b7adc4ca..565ad5e0c 100644 --- a/source/de/anomic/crawler/retrieval/FTPLoader.java +++ b/source/de/anomic/crawler/retrieval/FTPLoader.java @@ -54,7 +54,7 @@ public class FTPLoader { public FTPLoader(final Switchboard sb, final Log log) { this.sb = sb; this.log = log; - maxFileSize = (int) sb.getConfigLong("crawler.ftp.maxFileSize", -1l); + this.maxFileSize = (int) sb.getConfigLong("crawler.ftp.maxFileSize", -1l); } /** diff --git a/source/de/anomic/crawler/retrieval/SMBLoader.java b/source/de/anomic/crawler/retrieval/SMBLoader.java new file mode 100644 index 000000000..a6d242636 --- /dev/null +++ b/source/de/anomic/crawler/retrieval/SMBLoader.java @@ -0,0 +1,139 @@ +// SMBLoader.java +// (C) 2010 by Michael Peter Christen; mc@yacy.net, Frankfurt a. 
M., Germany +// first published 10.03.2010 on http://yacy.net +// +// This is a part of YaCy, a peer-to-peer based search engine +// +// $LastChangedDate: 2010-03-07 00:41:51 +0100 (So, 07 Mrz 2010) $ +// $LastChangedRevision: 6719 $ +// $LastChangedBy: orbiter $ +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +package de.anomic.crawler.retrieval; + +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Date; +import java.util.List; + +import de.anomic.http.server.HeaderFramework; +import de.anomic.http.server.RequestHeader; +import de.anomic.http.server.ResponseHeader; +import de.anomic.net.ftpc; +import de.anomic.search.Segments; +import de.anomic.search.Switchboard; +import de.anomic.data.MimeTable; + +import net.yacy.document.TextParser; +import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.kelondro.logging.Log; +import net.yacy.kelondro.util.DateFormatter; +import net.yacy.kelondro.util.FileUtils; + +public class SMBLoader { + + private final Switchboard sb; + private final Log log; + private final int maxFileSize; + + public SMBLoader(final Switchboard sb, final Log log) { + this.sb = sb; + this.log = log; + this.maxFileSize = (int) sb.getConfigLong("crawler.smb.maxFileSize", -1l); // config value is read as a long and narrowed to int + } + + + public Response load(final 
Request request, boolean acceptOnlyParseable) throws IOException { + DigestURI url = request.url(); + if (!url.getProtocol().equals("smb")) throw new IOException("wrong loader for SMBLoader: " + url.getProtocol()); + + RequestHeader requestHeader = new RequestHeader(); + if (request.referrerhash() != null) { + DigestURI ur = sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash()); + if (ur != null) requestHeader.put(RequestHeader.REFERER, ur.toNormalform(true, false)); + } + + + if (url.isDirectory()) { + ListServer \"" + servermessage + "\" responded:\n"); - page.append(" \n"); - page.append(greeting); - page.append("\n"); - page.append("\n"); + if (servermessage != null && greeting != null) { + page.append("
Server \"" + servermessage + "\" responded:\n"); + page.append(" \n"); + page.append(greeting); + page.append("\n"); + page.append("\n"); + } page.append("
\n"); int nameStart, nameEnd; @@ -2661,7 +2669,7 @@ public class ftpc { page.append(line.substring(0, nameStart)); page.append("" + info.name + ""); nameEnd = nameStart + info.name.length(); - if(line.length() > nameEnd) { + if (line.length() > nameEnd) { page.append(line.substring(nameEnd)); } } else { @@ -2672,7 +2680,7 @@ public class ftpc { } page.append("\n"); page.append("
System info: \"" + system + "\"\n"); + if (system != null) page.append("
System info: \"" + system + "\"\n"); page.append("\n"); return page; diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java index cf4675a49..fdc3d550a 100644 --- a/source/net/yacy/repository/LoaderDispatcher.java +++ b/source/net/yacy/repository/LoaderDispatcher.java @@ -52,6 +52,7 @@ import de.anomic.crawler.retrieval.FTPLoader; import de.anomic.crawler.retrieval.HTTPLoader; import de.anomic.crawler.retrieval.Request; import de.anomic.crawler.retrieval.Response; +import de.anomic.crawler.retrieval.SMBLoader; import de.anomic.http.client.Cache; import de.anomic.http.client.Client; import de.anomic.http.server.HeaderFramework; @@ -70,16 +71,18 @@ public final class LoaderDispatcher { private final HashSet