*) better handling of maximum file size limit in crawler

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2543 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
theli 18 years ago
parent 416b4e5c6b
commit fded1f4a5d

@ -67,6 +67,8 @@ import de.anomic.plasma.plasmaParser;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverCodings;
import de.anomic.server.serverCore;
import de.anomic.server.serverDate;
import de.anomic.server.serverMemory;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.server.serverThread;
@ -690,9 +692,11 @@ public class SettingsAck_p {
String timeoutStr = (String) post.get("crawler.clientTimeout");
if (timeoutStr==null||timeoutStr.length()==0) timeoutStr = "10000";
int crawlerTimeout;
try {
int timeout = Integer.valueOf(timeoutStr).intValue();
env.setConfig("crawler.clientTimeout", Integer.toString(timeout));
crawlerTimeout = Integer.valueOf(timeoutStr).intValue();
if (crawlerTimeout < 0) crawlerTimeout = 0;
env.setConfig("crawler.clientTimeout", Integer.toString(crawlerTimeout));
} catch (NumberFormatException e) {
prop.put("info", 29);
prop.put("info_crawler.clientTimeout",post.get("crawler.clientTimeout"));
@ -703,9 +707,10 @@ public class SettingsAck_p {
String maxSizeStr = (String) post.get("crawler.http.maxFileSize");
if (maxSizeStr==null||maxSizeStr.length()==0) timeoutStr = "-1";
long maxHttpSize;
try {
long maxSize = Integer.valueOf(maxSizeStr).intValue();
env.setConfig("crawler.http.maxFileSize", Long.toString(maxSize));
maxHttpSize = Integer.valueOf(maxSizeStr).intValue();
env.setConfig("crawler.http.maxFileSize", Long.toString(maxHttpSize));
} catch (NumberFormatException e) {
prop.put("info", 30);
prop.put("info_crawler.http.maxFileSize",post.get("crawler.http.maxFileSize"));
@ -716,9 +721,10 @@ public class SettingsAck_p {
maxSizeStr = (String) post.get("crawler.ftp.maxFileSize");
if (maxSizeStr==null||maxSizeStr.length()==0) timeoutStr = "-1";
long maxFtpSize;
try {
long maxSize = Integer.valueOf(maxSizeStr).intValue();
env.setConfig("crawler.ftp.maxFileSize", Long.toString(maxSize));
maxFtpSize = Integer.valueOf(maxSizeStr).intValue();
env.setConfig("crawler.ftp.maxFileSize", Long.toString(maxFtpSize));
} catch (NumberFormatException e) {
prop.put("info", 31);
prop.put("info_crawler.ftp.maxFileSize",post.get("crawler.ftp.maxFileSize"));
@ -726,9 +732,9 @@ public class SettingsAck_p {
}
// everything is ok
prop.put("info_crawler.clientTimeout",post.get("crawler.clientTimeout"));
prop.put("info_crawler.http.maxFileSize",post.get("crawler.http.maxFileSize"));
prop.put("info_crawler.ftp.maxFileSize",post.get("crawler.ftp.maxFileSize"));
prop.put("info_crawler.clientTimeout",(crawlerTimeout==0) ?"0" :serverDate.intervalToString(crawlerTimeout));
prop.put("info_crawler.http.maxFileSize",(maxHttpSize==-1)?"-1":serverMemory.bytesToString(maxHttpSize));
prop.put("info_crawler.ftp.maxFileSize", (maxFtpSize==-1) ?"-1":serverMemory.bytesToString(maxFtpSize));
prop.put("info", 28);
return prop;
}

@ -0,0 +1,97 @@
//httpBoundedSizeOutputStream.java
//-----------------------
//(C) by Michael Peter Christen; mc@anomic.de
//first published on http://www.anomic.de
//Frankfurt, Germany, 2004
//
// This file is contributed by Martin Thelian
// last major change: $LastChangedDate: 2006-08-16 21:49:31 +0200 (Mi, 16 Aug 2006) $ by $LastChangedBy: orbiter $
// Revision: $LastChangedRevision: 2414 $
//
//This program is free software; you can redistribute it and/or modify
//it under the terms of the GNU General Public License as published by
//the Free Software Foundation; either version 2 of the License, or
//(at your option) any later version.
//
//This program is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//GNU General Public License for more details.
//
//You should have received a copy of the GNU General Public License
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
//Using this software in any meaning (reading, learning, copying, compiling,
//running) means that you agree that the Author(s) is (are) not responsible
//for cost, loss of data or any harm that may be caused directly or indirectly
//by usage of this softare or this documentation. The usage of this software
//is on your own risk. The installation and usage (starting/running) of this
//software may allow other people or application to access your computer and
//any attached devices and is highly dependent on the configuration of the
//software which must be done by the user of the software; the author(s) is
//(are) also not responsible for proper configuration and usage of the
//software, even if provoked by documentation provided together with
//the software.
//
//Any changes to this file according to the GPL as documented in the file
//gpl.txt aside this file in the shipment you received can be done to the
//lines that follows this copyright notice here, but changes must not be
//done inside the copyright notive above. A re-distribution must contain
//the intact and unchanged copyright notice.
//Contributions and changes to the program code must be marked as such.
package de.anomic.http;
import java.io.IOException;
import java.io.OutputStream;
/**
 * A byte-counting output stream that enforces an upper bound on the
 * number of bytes accepted.
 * <p>
 * Before every write the inherited <code>byteCount</code> is compared with
 * the configured limit. If the write would push the count past the limit,
 * the array-based write methods first pass on the part of the data that
 * still fits and then throw a {@link httpdLimitExceededException}; the
 * single-byte write throws immediately.
 * <p>
 * NOTE(review): this class relies on the superclass write methods to
 * advance <code>byteCount</code>; that code is not part of this file -
 * confirm against httpdByteCountOutputStream.
 */
public class httpdBoundedSizeOutputStream extends httpdByteCountOutputStream {

    /** The limit that <code>byteCount</code> must not exceed. */
    protected long maxSize = 0;

    /**
     * Creates a bounded stream whose byte count starts at zero.
     * @param outputStream the {@link OutputStream} to write to
     * @param sizeLimit the limit the byte count must not exceed
     */
    public httpdBoundedSizeOutputStream(OutputStream outputStream, long sizeLimit) {
        this(outputStream, 0, sizeLimit);
    }

    /**
     * Creates a bounded stream with a preset byte count.
     * @param outputStream the {@link OutputStream} to write to
     * @param initByteCount value the byte count is initialized with
     * @param sizeLimit the limit the byte count must not exceed
     */
    public httpdBoundedSizeOutputStream(OutputStream outputStream, long initByteCount, long sizeLimit) {
        super(outputStream, initByteCount);
        this.maxSize = sizeLimit;
    }

    /**
     * Writes the whole array, or only the part that still fits below the limit.
     * @param b the data to write
     * @throws httpdLimitExceededException if the array does not fit completely
     * @throws IOException if the underlying write fails
     */
    public void write(byte[] b) throws IOException {
        if (this.byteCount + b.length > this.maxSize) {
            // write out the rest until we have reached the limit;
            // rest < b.length here, so the cast to int cannot overflow
            long rest = this.maxSize - this.byteCount;
            if (rest > 0) this.write(b, 0, (int) rest);

            // signal that the remaining bytes were dropped
            throw new httpdLimitExceededException("Limit exceeded", this.maxSize);
        }
        super.write(b);
    }

    /**
     * Writes <code>len</code> bytes starting at <code>off</code>, or only
     * the leading part of that range that still fits below the limit.
     * @param b the buffer holding the data
     * @param off index of the first byte to write
     * @param len number of bytes to write
     * @throws httpdLimitExceededException if the range does not fit completely
     * @throws IOException if the underlying write fails
     */
    public void write(byte[] b, int off, int len) throws IOException {
        if (this.byteCount + len > this.maxSize) {
            // write out the rest until we reach the limit;
            // rest < len here, so the cast to int cannot overflow
            long rest = this.maxSize - this.byteCount;
            // the partial write must start at the caller's offset, not at
            // index 0, otherwise bytes outside the requested range are written
            if (rest > 0) this.write(b, off, (int) rest);

            // signal that the remaining bytes were dropped
            throw new httpdLimitExceededException("Limit exceeded", this.maxSize);
        }
        super.write(b, off, len);
    }

    /**
     * Writes a single byte unless that would exceed the limit.
     * @param b the byte to write
     * @throws httpdLimitExceededException if the limit is already reached
     * @throws IOException if the underlying write fails
     */
    public void write(int b) throws IOException {
        if (this.byteCount + 1 > this.maxSize) {
            throw new httpdLimitExceededException("Limit exceeded", this.maxSize);
        }
        super.write(b);
    }

    /**
     * @return the limit this stream was configured with
     */
    public long getSizeLimit() {
        return this.maxSize;
    }
}

@ -1,3 +1,47 @@
//httpByteCountinputStream.java
//-----------------------
//(C) by Michael Peter Christen; mc@anomic.de
//first published on http://www.anomic.de
//Frankfurt, Germany, 2004
//
// This file is contributed by Martin Thelian
// last major change: $LastChangedDate$ by $LastChangedBy$
// Revision: $LastChangedRevision$
//
//This program is free software; you can redistribute it and/or modify
//it under the terms of the GNU General Public License as published by
//the Free Software Foundation; either version 2 of the License, or
//(at your option) any later version.
//
//This program is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//GNU General Public License for more details.
//
//You should have received a copy of the GNU General Public License
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
//Using this software in any meaning (reading, learning, copying, compiling,
//running) means that you agree that the Author(s) is (are) not responsible
//for cost, loss of data or any harm that may be caused directly or indirectly
//by usage of this softare or this documentation. The usage of this software
//is on your own risk. The installation and usage (starting/running) of this
//software may allow other people or application to access your computer and
//any attached devices and is highly dependent on the configuration of the
//software which must be done by the user of the software; the author(s) is
//(are) also not responsible for proper configuration and usage of the
//software, even if provoked by documentation provided together with
//the software.
//
//Any changes to this file according to the GPL as documented in the file
//gpl.txt aside this file in the shipment you received can be done to the
//lines that follows this copyright notice here, but changes must not be
//done inside the copyright notive above. A re-distribution must contain
//the intact and unchanged copyright notice.
//Contributions and changes to the program code must be marked as such.
package de.anomic.http;
import java.io.FilterInputStream;

@ -1,23 +1,67 @@
//httpByteCountOutputStream.java
//-----------------------
//(C) by Michael Peter Christen; mc@anomic.de
//first published on http://www.anomic.de
//Frankfurt, Germany, 2004
//
// This file is contributed by Martin Thelian
// last major change: $LastChangedDate$ by $LastChangedBy$
// Revision: $LastChangedRevision$
//
//This program is free software; you can redistribute it and/or modify
//it under the terms of the GNU General Public License as published by
//the Free Software Foundation; either version 2 of the License, or
//(at your option) any later version.
//
//This program is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//GNU General Public License for more details.
//
//You should have received a copy of the GNU General Public License
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
//Using this software in any meaning (reading, learning, copying, compiling,
//running) means that you agree that the Author(s) is (are) not responsible
//for cost, loss of data or any harm that may be caused directly or indirectly
//by usage of this softare or this documentation. The usage of this software
//is on your own risk. The installation and usage (starting/running) of this
//software may allow other people or application to access your computer and
//any attached devices and is highly dependent on the configuration of the
//software which must be done by the user of the software; the author(s) is
//(are) also not responsible for proper configuration and usage of the
//software, even if provoked by documentation provided together with
//the software.
//
//Any changes to this file according to the GPL as documented in the file
//gpl.txt aside this file in the shipment you received can be done to the
//lines that follows this copyright notice here, but changes must not be
//done inside the copyright notive above. A re-distribution must contain
//the intact and unchanged copyright notice.
//Contributions and changes to the program code must be marked as such.
package de.anomic.http;
import java.io.BufferedOutputStream;
import java.io.IOException;
import java.io.OutputStream;
public final class httpdByteCountOutputStream extends BufferedOutputStream {
public class httpdByteCountOutputStream extends BufferedOutputStream {
private static final Object syncObject = new Object();
private static long globalByteCount = 0;
private boolean finished = false;
private long byteCount;
protected long byteCount;
/**
* Constructor of this class
* @param outputStream the {@link OutputStream} to write to
*/
public httpdByteCountOutputStream(OutputStream outputStream) {
super(outputStream);
this(outputStream,0);
}
/**
@ -25,7 +69,7 @@ public final class httpdByteCountOutputStream extends BufferedOutputStream {
* @param outputStream the {@link OutputStream} to write to
* @param initByteCount to initialize the bytecount with a given value
*/
public httpdByteCountOutputStream(OutputStream outputStream, int initByteCount) {
public httpdByteCountOutputStream(OutputStream outputStream, long initByteCount) {
super(outputStream);
this.byteCount = initByteCount;
}

@ -0,0 +1,60 @@
//httpdLimitExceededException.java
//-----------------------
//(C) by Michael Peter Christen; mc@anomic.de
//first published on http://www.anomic.de
//Frankfurt, Germany, 2004
//
// This file is contributed by Martin Thelian
// last major change: $LastChangedDate: 2006-08-16 21:49:31 +0200 (Mi, 16 Aug 2006) $ by $LastChangedBy: orbiter $
// Revision: $LastChangedRevision: 2414 $
//
//This program is free software; you can redistribute it and/or modify
//it under the terms of the GNU General Public License as published by
//the Free Software Foundation; either version 2 of the License, or
//(at your option) any later version.
//
//This program is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//GNU General Public License for more details.
//
//You should have received a copy of the GNU General Public License
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
//Using this software in any meaning (reading, learning, copying, compiling,
//running) means that you agree that the Author(s) is (are) not responsible
//for cost, loss of data or any harm that may be caused directly or indirectly
//by usage of this softare or this documentation. The usage of this software
//is on your own risk. The installation and usage (starting/running) of this
//software may allow other people or application to access your computer and
//any attached devices and is highly dependent on the configuration of the
//software which must be done by the user of the software; the author(s) is
//(are) also not responsible for proper configuration and usage of the
//software, even if provoked by documentation provided together with
//the software.
//
//Any changes to this file according to the GPL as documented in the file
//gpl.txt aside this file in the shipment you received can be done to the
//lines that follows this copyright notice here, but changes must not be
//done inside the copyright notive above. A re-distribution must contain
//the intact and unchanged copyright notice.
//Contributions and changes to the program code must be marked as such.
package de.anomic.http;
import java.io.IOException;
/**
 * An {@link IOException} reporting that a limit was exceeded.
 * It carries the limit value supplied by the code that threw it, so a
 * handler can read it back via {@link #getLimit()}.
 */
public class httpdLimitExceededException extends IOException {

    /** The limit handed to the constructor; never changed afterwards. */
    private final long limit;

    /**
     * @param errorMsg the detail message passed on to {@link IOException}
     * @param limit the limit that was exceeded
     */
    public httpdLimitExceededException(String errorMsg, long limit) {
        super(errorMsg);
        this.limit = limit;
    }

    /**
     * @return the limit value this exception was created with
     */
    public long getLimit() {
        return this.limit;
    }
}

@ -47,6 +47,7 @@ package de.anomic.plasma.crawler.http;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.net.MalformedURLException;
import java.net.NoRouteToHostException;
import java.net.SocketException;
@ -56,6 +57,8 @@ import java.util.Date;
import de.anomic.http.httpHeader;
import de.anomic.http.httpRemoteProxyConfig;
import de.anomic.http.httpc;
import de.anomic.http.httpdBoundedSizeOutputStream;
import de.anomic.http.httpdLimitExceededException;
import de.anomic.http.httpdProxyHandler;
import de.anomic.index.indexURL;
import de.anomic.net.URL;
@ -206,22 +209,6 @@ public final class CrawlWorker extends AbstractCrawlWorker {
if (res.status.startsWith("200") || res.status.startsWith("203")) {
// the transfer is ok
// check the maximum allowed file size
if (this.maxFileSize > -1) {
long contentLength = (res.isGzipped()) ? res.getGzippedLength() : res.responseHeader.contentLength();
if (contentLength == -1) {
remote.close();
this.log.logInfo("REJECTED URL " + this.url + " because of unknown file size. Max filesize limit can not be checked.");
addURLtoErrorDB(plasmaCrawlEURL.DENIED_FILESIZE_UNKNOWN);
return null;
} else if (contentLength > this.maxFileSize) {
remote.close();
this.log.logInfo("REJECTED URL " + this.url + " because file size '" + contentLength + "' exceeds max filesize limit.");
addURLtoErrorDB(plasmaCrawlEURL.DENIED_FILESIZE_LIMIT_EXCEEDED);
return null;
}
}
// create a new cache entry
htCache = createCacheEntry(this.url,requestDate, requestHeader, res);
@ -248,18 +235,36 @@ public final class CrawlWorker extends AbstractCrawlWorker {
File cacheFile = this.cacheManager.getCachePath(this.url);
try {
if ((this.acceptAllContent) || (plasmaParser.supportedContent(plasmaParser.PARSER_MODE_CRAWLER,this.url,res.responseHeader.mime()))) {
// delete old content
if (cacheFile.isFile()) {
this.cacheManager.deleteFile(this.url);
}
// we write the new cache entry to file system directly
// create parent directories
cacheFile.getParentFile().mkdirs();
FileOutputStream fos = null;
OutputStream fos = null;
try {
fos = new FileOutputStream(cacheFile);
res.writeContent(fos); // superfluous write to array
// creating an output stream
fos = new FileOutputStream(cacheFile);
// check the maximum allowed file size
if (this.maxFileSize > -1) {
long contentLength = (res.isGzipped()) ? res.getGzippedLength() : res.responseHeader.contentLength();
if (contentLength == -1) {
fos = new httpdBoundedSizeOutputStream(fos,this.maxFileSize);
} else if (contentLength > this.maxFileSize) {
remote.close();
this.log.logInfo("REJECTED URL " + this.url + " because file size '" + contentLength + "' exceeds max filesize limit.");
addURLtoErrorDB(plasmaCrawlEURL.DENIED_FILESIZE_LIMIT_EXCEEDED);
return null;
}
}
// we write the new cache entry to file system directly
res.writeContent(fos);
htCache.setCacheArray(null);
this.cacheManager.writeFileAnnouncement(cacheFile);
//htCache.cacheArray = res.writeContent(fos); // writes in cacheArray and cache file
} finally {
if (fos!=null)try{fos.close();}catch(Exception e){/* ignore this */}
}
@ -372,6 +377,9 @@ public final class CrawlWorker extends AbstractCrawlWorker {
) {
this.log.logInfo("CRAWLER Interruption detected because of server shutdown.");
failreason = plasmaCrawlEURL.DENIED_SERVER_SHUTDOWN;
} else if (e instanceof httpdLimitExceededException) {
this.log.logWarning("CRAWLER Max file size limit '" + this.maxFileSize + "' exceeded while downloading URL " + this.url);
failreason = plasmaCrawlEURL.DENIED_FILESIZE_LIMIT_EXCEEDED;
} else if (e instanceof MalformedURLException) {
this.log.logWarning("CRAWLER Malformed URL '" + this.url.toString() + "' detected. ");
failreason = plasmaCrawlEURL.DENIED_MALFORMED_URL;
@ -380,58 +388,58 @@ public final class CrawlWorker extends AbstractCrawlWorker {
failreason = plasmaCrawlEURL.DENIED_NO_ROUTE_TO_HOST;
} else if ((e instanceof UnknownHostException) ||
((errorMsg != null) && (errorMsg.indexOf("unknown host") >= 0))) {
log.logWarning("CRAWLER Unknown host in URL '" + url.toString() + "'. " +
"Referer URL: " + ((refererURLString == null) ?"Unknown":refererURLString));
this.log.logWarning("CRAWLER Unknown host in URL '" + this.url.toString() + "'. " +
"Referer URL: " + ((this.refererURLString == null) ?"Unknown":this.refererURLString));
failreason = plasmaCrawlEURL.DENIED_UNKNOWN_HOST;
} else if (e instanceof java.net.BindException) {
log.logWarning("CRAWLER BindException detected while trying to download content from '" + url.toString() +
this.log.logWarning("CRAWLER BindException detected while trying to download content from '" + this.url.toString() +
"'. Retrying request.");
failreason = plasmaCrawlEURL.DENIED_CONNECTION_BIND_EXCEPTION;
retryCrawling = true;
} else if ((errorMsg != null) && (errorMsg.indexOf("Corrupt GZIP trailer") >= 0)) {
log.logWarning("CRAWLER Problems detected while receiving gzip encoded content from '" + url.toString() +
this.log.logWarning("CRAWLER Problems detected while receiving gzip encoded content from '" + this.url.toString() +
"'. Retrying request without using gzip content encoding.");
failreason = plasmaCrawlEURL.DENIED_CONTENT_DECODING_ERROR;
retryCrawling = true;
} else if ((errorMsg != null) && (errorMsg.indexOf("Read timed out") >= 0)) {
log.logWarning("CRAWLER Read timeout while receiving content from '" + url.toString() +
this.log.logWarning("CRAWLER Read timeout while receiving content from '" + this.url.toString() +
"'. Retrying request.");
failreason = plasmaCrawlEURL.DENIED_CONNECTION_TIMEOUT;
retryCrawling = true;
} else if ((errorMsg != null) && (errorMsg.indexOf("connect timed out") >= 0)) {
log.logWarning("CRAWLER Timeout while trying to connect to '" + url.toString() +
this.log.logWarning("CRAWLER Timeout while trying to connect to '" + this.url.toString() +
"'. Retrying request.");
failreason = plasmaCrawlEURL.DENIED_CONNECTION_TIMEOUT;
retryCrawling = true;
} else if ((errorMsg != null) && (errorMsg.indexOf("Connection timed out") >= 0)) {
log.logWarning("CRAWLER Connection timeout while receiving content from '" + url.toString() +
this.log.logWarning("CRAWLER Connection timeout while receiving content from '" + this.url.toString() +
"'. Retrying request.");
failreason = plasmaCrawlEURL.DENIED_CONNECTION_TIMEOUT;
retryCrawling = true;
} else if ((errorMsg != null) && (errorMsg.indexOf("Connection refused") >= 0)) {
log.logWarning("CRAWLER Connection refused while trying to connect to '" + url.toString() + "'.");
this.log.logWarning("CRAWLER Connection refused while trying to connect to '" + this.url.toString() + "'.");
failreason = plasmaCrawlEURL.DENIED_CONNECTION_REFUSED;
} else if ((errorMsg != null) && (errorMsg.indexOf("There is not enough space on the disk") >= 0)) {
log.logSevere("CRAWLER Not enough space on the disk detected while crawling '" + url.toString() + "'. " +
this.log.logSevere("CRAWLER Not enough space on the disk detected while crawling '" + this.url.toString() + "'. " +
"Pausing crawlers. ");
plasmaCrawlLoader.switchboard.pauseCrawlJob(plasmaSwitchboard.CRAWLJOB_LOCAL_CRAWL);
plasmaCrawlLoader.switchboard.pauseCrawlJob(plasmaSwitchboard.CRAWLJOB_REMOTE_TRIGGERED_CRAWL);
failreason = plasmaCrawlEURL.DENIED_OUT_OF_DISK_SPACE;
} else if ((errorMsg != null) && (errorMsg.indexOf("Network is unreachable") >=0)) {
log.logSevere("CRAWLER Network is unreachable while trying to crawl URL '" + url.toString() + "'. ");
this.log.logSevere("CRAWLER Network is unreachable while trying to crawl URL '" + this.url.toString() + "'. ");
failreason = plasmaCrawlEURL.DENIED_NETWORK_IS_UNREACHABLE;
} else if ((errorMsg != null) && (errorMsg.indexOf("No trusted certificate found")>= 0)) {
log.logSevere("CRAWLER No trusted certificate found for URL '" + url.toString() + "'. ");
this.log.logSevere("CRAWLER No trusted certificate found for URL '" + this.url.toString() + "'. ");
failreason = plasmaCrawlEURL.DENIED_SSL_UNTRUSTED_CERT;
} else {
log.logSevere("CRAWLER Unexpected Error with URL '" + url.toString() + "': " + e.toString(),e);
this.log.logSevere("CRAWLER Unexpected Error with URL '" + this.url.toString() + "': " + e.toString(),e);
failreason = plasmaCrawlEURL.DENIED_CONNECTION_ERROR;
}
if (retryCrawling) {
// if we are already doing a shutdown we don't need to retry crawling
if (Thread.currentThread().isInterrupted()) {
log.logSevere("CRAWLER Retry of URL=" + url.toString() + " aborted because of server shutdown.");
this.log.logSevere("CRAWLER Retry of URL=" + this.url.toString() + " aborted because of server shutdown.");
return null;
}

Loading…
Cancel
Save