*) Adding a settings page for the crawler that allows specifying a file size limit and the timeout to use.

*) Adding a first version of the maximum file size check for the crawler.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2534 6c8d7289-2bf4-0310-a012-ef5d649a1542
theli 19 years ago
parent ecbce206b7
commit 63893003be

@@ -182,6 +182,31 @@ This values seems not to be a valid port configuration.
<b>Error in pattern nr #[nr]#</b> "<tt>#[pattern]#</tt>": #[error]#</tt>
</font>
</p>
::<!-- 28 : Crawler settings OK -->
<p><b>Your crawler settings have been changed.</b></p>
<table>
<tr><td colspan="2"><p><b>Generic Settings:</b></p></td></tr>
<tr>
<td>Crawler timeout:</td>
<td class="settingsValue">#[crawler.clientTimeout]#</td>
</tr>
<tr><td colspan="2"><p><b>Http Crawler Settings:</b></p></td></tr>
<tr>
<td>Maximum Filesize:</td>
<td class="settingsValue">#[crawler.http.maxFileSize]#</td>
</tr>
<tr><td colspan="2"><p><b>Ftp Crawler Settings:</b></p></td></tr>
<tr>
<td>Maximum Filesize:</td>
<td class="settingsValue">#[crawler.ftp.maxFileSize]#</td>
</tr>
</table>
::<!-- 29: Crawler settings timeout error -->
<p><font color="red"><b>Invalid crawler timeout value:</b> <tt>#[crawler.clientTimeout]#</tt></font></p>
::<!-- 30: Crawler settings, maxHttpFileSize error -->
<p><font color="red"><b>Invalid maximum file size for http crawler:</b> <tt>#[crawler.http.maxFileSize]#</tt></font></p>
::<!-- 31: Cralwer settings, maxFtpFileSize error -->
<p><font color="red"><b>Invalid maximum file size for ftp crawler:</b> <tt>#[crawler.http.maxFileSize]#</tt></font></p>
#(/info)#
</p>
<p>You can now go back to the <a href="Settings_p.html">Settings</a> page if you want to make more changes.</p>

@@ -683,6 +683,56 @@ public class SettingsAck_p {
// return prop;
}
// Crawler settings
if (post.containsKey("crawlerSettings")) {
// getting Crawler Timeout
String timeoutStr = (String) post.get("crawler.clientTimeout");
if (timeoutStr==null||timeoutStr.length()==0) timeoutStr = "10000";
try {
int timeout = Integer.parseInt(timeoutStr);
env.setConfig("crawler.clientTimeout", Integer.toString(timeout));
} catch (NumberFormatException e) {
prop.put("info", 29);
prop.put("info_crawler.clientTimeout",post.get("crawler.clientTimeout"));
return prop;
}
// getting maximum http file size
String maxSizeStr = (String) post.get("crawler.http.maxFileSize");
if (maxSizeStr==null||maxSizeStr.length()==0) maxSizeStr = "-1";
try {
long maxSize = Long.parseLong(maxSizeStr);
env.setConfig("crawler.http.maxFileSize", Long.toString(maxSize));
} catch (NumberFormatException e) {
prop.put("info", 30);
prop.put("info_crawler.http.maxFileSize",post.get("crawler.http.maxFileSize"));
return prop;
}
// getting maximum ftp file size
maxSizeStr = (String) post.get("crawler.ftp.maxFileSize");
if (maxSizeStr==null||maxSizeStr.length()==0) maxSizeStr = "-1";
try {
long maxSize = Long.parseLong(maxSizeStr);
env.setConfig("crawler.ftp.maxFileSize", Long.toString(maxSize));
} catch (NumberFormatException e) {
prop.put("info", 31);
prop.put("info_crawler.ftp.maxFileSize",post.get("crawler.ftp.maxFileSize"));
return prop;
}
// everything is ok
prop.put("info_crawler.clientTimeout",post.get("crawler.clientTimeout"));
prop.put("info_crawler.http.maxFileSize",post.get("crawler.http.maxFileSize"));
prop.put("info_crawler.ftp.maxFileSize",post.get("crawler.ftp.maxFileSize"));
prop.put("info", 28);
return prop;
}
// nothing submitted
prop.put("info", 1);//no information submitted

@@ -0,0 +1,37 @@
<p><form action="SettingsAck_p.html" method="post" enctype="multipart/form-data">
<fieldset><legend id="admin">Crawler Settings</legend>
<table border="0" cellspacing="5" width="100%">
<tr><td colspan="3"><p><b>Generic Crawler Settings:</b></p></td></tr>
<tr valign="top">
<td>Timeout:</td>
<td><input name="crawler.clientTimeout" type="text" size="16" maxlength="16" value="#[crawler.clientTimeout]#"></td>
<td width="100%"><i>Connection timeout in ms that should be used. <code>0</code> means unlimited.</i></td>
</tr>
<tr><td colspan="3"><hr></td></tr>
<tr><td colspan="3"><p><b>http Crawler Settings:</b></p></td></tr>
<tr valign="top">
<td>Maximum&nbsp;Filesize:</td>
<td><input name="crawler.http.maxFileSize" type="text" size="16" maxlength="16" value="#[crawler.http.maxFileSize]#"></td>
<td><i>Maximum allowed size, in bytes, of files to download. Larger files will be skipped. <code>-1</code> means unlimited.<br>
Please note that if the crawler uses content compression, this limit is applied to the compressed content size.</i></td>
</tr>
<tr><td colspan="3"><hr></td></tr>
<tr><td colspan="3"><p><b>ftp Crawler Settings:</b></p></td></tr>
<tr valign="top">
<td>Maximum&nbsp;Filesize:</td>
<td><input name="crawler.ftp.maxFileSize" type="text" size="16" maxlength="16" value="#[crawler.ftp.maxFileSize]#"></td>
<td><i>Maximum allowed size, in bytes, of files to download. Larger files will be skipped. <code>-1</code> means unlimited.</i></td>
</tr>
<tr><td colspan="3"><hr></td></tr>
<tr valign="top">
<td>&nbsp;</td>
<td><input type="submit" name="crawlerSettings" value="submit"></td>
<td><i>Changes will take effect immediately.</i></td>
</tr>
</table>
</fieldset>
</form><br>
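
A minimal standalone sketch (plain java.net, not YaCy's own httpc class) of the semantics this form describes: a client timeout in milliseconds where 0 disables the timeout, and a maximum Content-Length in bytes where -1 disables the check.

import java.net.HttpURLConnection;
import java.net.URL;

public class SizeLimitSketch {
    public static void main(String[] args) throws Exception {
        int timeout = 10000;          // ms; 0 means no timeout, as in the form above
        long maxFileSize = 10485760L; // 10 MB; -1 would disable the check

        HttpURLConnection con = (HttpURLConnection) new URL("http://example.org/").openConnection();
        con.setConnectTimeout(timeout);
        con.setReadTimeout(timeout);

        long contentLength = con.getContentLengthLong(); // -1 if the header is missing
        if (maxFileSize > -1 && contentLength > maxFileSize) {
            con.disconnect();
            System.out.println("skipped: Content-Length exceeds the maximum file size");
        }
    }
}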

@@ -32,7 +32,7 @@ delete the file 'DATA/SETTINGS/httpProxy.conf' in the YaCy application root fold
<table>
<tr>
<td><img src="env/grafics/down.gif" alt="down"><a href="?page=admin">Administration Account Settings</a></td>
<td rowspan="5" width="30">&nbsp;</td>
<td rowspan="6" width="30">&nbsp;</td>
<td><img src="env/grafics/down.gif" alt="down"><a href="?page=ServerAccess">Server Access Settings</a></td>
</tr>
<tr>
@@ -53,7 +53,7 @@ delete the file 'DATA/SETTINGS/httpProxy.conf' in the YaCy application root fold
</tr>
<tr>
<td><img src="env/grafics/down.gif" alt="down"><a href="?page=portForwarding">Port Forwarding (optional)</a></td>
<td>&nbsp;</td>
<td><img src="env/grafics/down.gif" alt="down"><a href="?page=crawler">Crawler Settings</a></td>
</tr>
</table>

@@ -101,6 +101,9 @@ public final class Settings_p {
else if (page.equals("parser")) {
prop.put("settingsTables", "Settings_Parser.inc");
}
else if (page.equals("crawler")) {
prop.put("settingsTables", "Settings_Crawler.inc");
}
else {
prop.put("settingsTables", "Settings_Admin.inc");
}
@@ -328,6 +331,11 @@ public final class Settings_p {
prop.put("parser", parserIdx);
prop.put("parser.colspan", Integer.toString(configArray.length+3));
// Crawler settings
prop.put("crawler.clientTimeout",sb.getConfig("crawler.clientTimeout", "10000"));
prop.put("crawler.http.maxFileSize",sb.getConfig("crawler.http.maxFileSize", "-1"));
prop.put("crawler.ftp.maxFileSize",sb.getConfig("crawler.ftp.maxFileSize", "-1"));
// return rewrite properties
return prop;
}

@@ -1669,6 +1669,7 @@ do upload
public int statusCode = 503;
public String statusText = "internal error";
private boolean gzip; // for gunzipping on-the-fly
private long gzippedLength = -1; // content length reported in the header when gzip content-encoding is used
/**
* Constructor for this class. Reads in the content for the given outer
@@ -1759,10 +1760,21 @@ do upload
this.gzip = ((zipped) && (this.responseHeader.gzip()));
if (this.gzip) {
if (this.responseHeader.containsKey(httpHeader.CONTENT_LENGTH)) {
this.gzippedLength = this.responseHeader.contentLength();
}
this.responseHeader.remove(httpHeader.CONTENT_ENCODING); // pretend there is no content-encoding: the stream we deliver is already gunzipped, and we do not know what the original encoding was
this.responseHeader.remove(httpHeader.CONTENT_LENGTH); // the header length refers to the compressed content and no longer matches what we deliver after gunzipping
}
}
public long getGzippedLength() {
return this.gzippedLength;
}
public boolean isGzipped() {
return this.gzip;
}
/**
* Converts an instance of this class into a readable string.
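
The point of gzippedLength: with gzip transfer encoding the Content-Length header counts compressed bytes, so it has to be captured before the header is removed. A self-contained sketch of that idea (java.util.zip, independent of YaCy's httpc):

import java.io.IOException;
import java.io.InputStream;
import java.util.zip.GZIPInputStream;

public class GzippedBody {
    private final long gzippedLength; // compressed size from Content-Length, or -1 if unknown
    private final InputStream body;   // decompresses on the fly

    public GzippedBody(InputStream raw, long contentLength) throws IOException {
        this.gzippedLength = contentLength;   // remember the compressed size...
        this.body = new GZIPInputStream(raw); // ...because the decompressed size is unknown up front
    }

    public long getGzippedLength() { return this.gzippedLength; }
    public InputStream getBody() { return this.body; }
}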

@@ -81,6 +81,11 @@ public final class CrawlWorker extends AbstractCrawlWorker {
*/
private int socketTimeout;
/**
* The maximum allowed file size
*/
private long maxFileSize = -1;
/**
* The remote http proxy that should be used
*/
@@ -118,6 +123,9 @@ public final class CrawlWorker extends AbstractCrawlWorker {
this.socketTimeout = this.theMsg.timeout;
}
// maximum allowed file size
this.maxFileSize = this.sb.getConfigLong("crawler.http.maxFileSize", -1);
// some http header values
this.acceptEncoding = this.sb.getConfig("crawler.http.acceptEncoding", "gzip,deflate");
this.acceptLanguage = this.sb.getConfig("crawler.http.acceptLanguage","en-us,en;q=0.5");
@@ -198,6 +206,22 @@ public final class CrawlWorker extends AbstractCrawlWorker {
if (res.status.startsWith("200") || res.status.startsWith("203")) {
// the transfer is ok
// check the maximum allowed file size
if (this.maxFileSize > -1) {
long contentLength = (res.isGzipped()) ? res.getGzippedLength() : res.responseHeader.contentLength();
if (contentLength == -1) {
remote.close();
this.log.logInfo("REJECTED URL " + this.url + " because of unknown file size. Max filesize limit can not be checked.");
addURLtoErrorDB(plasmaCrawlEURL.DENIED_FILESIZE_UNKNOWN);
return null;
} else if (contentLength > this.maxFileSize) {
remote.close();
this.log.logInfo("REJECTED URL " + this.url + " because file size '" + contentLength + "' exceeds max filesize limit.");
addURLtoErrorDB(plasmaCrawlEURL.DENIED_FILESIZE_LIMIT_EXCEEDED);
return null;
}
}
// create a new cache entry
htCache = createCacheEntry(this.url,requestDate, requestHeader, res);
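
The decision logic above can be restated as a small pure function; this is an illustrative paraphrase (the enum and method names are not YaCy API): unknown sizes are rejected because the limit cannot be verified, oversized files are rejected outright, and a limit of -1 disables the check.

// Illustrative restatement of the size check above; not part of the YaCy code base.
enum SizeCheck { ACCEPT, REJECT_SIZE_UNKNOWN, REJECT_TOO_LARGE }

static SizeCheck checkFileSize(long contentLength, long maxFileSize) {
    if (maxFileSize <= -1) return SizeCheck.ACCEPT;                // limit disabled
    if (contentLength == -1) return SizeCheck.REJECT_SIZE_UNKNOWN; // limit cannot be verified
    if (contentLength > maxFileSize) return SizeCheck.REJECT_TOO_LARGE;
    return SizeCheck.ACCEPT;
}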

@@ -91,6 +91,8 @@ public class plasmaCrawlEURL extends indexURL {
public static final String DENIED_REDIRECTION_COUNTER_EXCEEDED = "denied_(redirection_counter_exceeded)";
public static final String DENIED_WRONG_HTTP_STATUSCODE = "denied_(wrong_http_status_code_";
public static final String DENIED_CONTENT_DECODING_ERROR = "denied_(content_decoding_error)";
public static final String DENIED_FILESIZE_LIMIT_EXCEEDED = "denied_(filesize_limit_exceeded)";
public static final String DENIED_FILESIZE_UNKNOWN = "denied_(filesize_unknown)";
// network errors
public static final String DENIED_UNKNOWN_HOST = "denied_(unknown_host)";

@@ -634,14 +634,17 @@ onlineCautionDelay=30000
crawler.clientTimeout=9000
# http crawler specific settings
crawler.http.acceptEncoding=gzip,deflate
crawler.http.acceptEncoding=gzip
crawler.http.acceptLanguage=en-us,en;q=0.5
crawler.http.acceptCharset=ISO-8859-1,utf-8;q=0.7,*;q=0.7
crawler.http.maxFileSize=-1
# ftp crawler specific settings
crawler.ftp.maxFileSize=-1
# maximum number of crawler threads
crawler.MaxActiveThreads = 10
crawler.MaxIdleThreads = 7
crawler.MinIdleThreads = 5
# maximum number of crawl-stacker threads
stacker.MaxActiveThreads = 50
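
For example, an operator who wants a 10 MB limit for HTTP downloads and a 5 MB limit for FTP downloads would set (values in bytes; this is an illustrative override, not a shipped default):

# example override: 10 MB for http, 5 MB for ftp
crawler.http.maxFileSize=10485760
crawler.ftp.maxFileSize=5242880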
