- do not run merge jobs while short on memory (see the sketch after the IODispatcher hunk below)

- use the crawler max-filesize configuration values also for snippet fetching and for loading files into the index (a sketch of the per-protocol lookup follows the commit metadata)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7893 6c8d7289-2bf4-0310-a012-ef5d649a1542
sixcooler 14 years ago
parent 965fabfb87
commit 5f8a5ca32d
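
As a rough illustration of the second change, here is a minimal, self-contained sketch of a per-protocol file-size limit looked up from configuration values. The class name, the Properties-backed config and the numeric defaults are hypothetical stand-ins; in the commit itself this logic is the new LoaderDispatcher.protocolMaxFileSize(DigestURI) shown further down, which reads crawler.http.maxFileSize, crawler.ftp.maxFileSize and crawler.smb.maxFileSize via sb.getConfigInt.

    // Hypothetical stand-alone sketch: pick a maximum file size per protocol from config.
    // Names and default values here are illustrative only, not YaCy's actual constants.
    import java.net.URI;
    import java.util.Properties;

    public class MaxFileSizeSketch {

        // stand-in for the switchboard configuration that sb.getConfigInt(...) reads
        private static final Properties CONFIG = new Properties();
        static {
            CONFIG.setProperty("crawler.http.maxFileSize", "10485760"); // 10 MB, example value
            CONFIG.setProperty("crawler.ftp.maxFileSize", "10485760");
            CONFIG.setProperty("crawler.smb.maxFileSize", "52428800");
        }

        /** Choose the configured size limit for the protocol of the given URL. */
        static int protocolMaxFileSize(final URI url) {
            final String scheme = url.getScheme() == null ? "" : url.getScheme().toLowerCase();
            if (scheme.equals("http") || scheme.equals("https"))
                return getConfigInt("crawler.http.maxFileSize", 10 * 1024 * 1024);
            if (scheme.equals("ftp"))
                return getConfigInt("crawler.ftp.maxFileSize", 10 * 1024 * 1024);
            if (scheme.equals("smb"))
                return getConfigInt("crawler.smb.maxFileSize", 50 * 1024 * 1024);
            return Integer.MAX_VALUE; // no limit for other protocols
        }

        static int getConfigInt(final String key, final int dflt) {
            try {
                return Integer.parseInt(CONFIG.getProperty(key, Integer.toString(dflt)));
            } catch (final NumberFormatException e) {
                return dflt;
            }
        }

        public static void main(final String[] args) {
            System.out.println(protocolMaxFileSize(URI.create("https://example.org/doc.pdf")));
            System.out.println(protocolMaxFileSize(URI.create("ftp://example.org/file.bin")));
            System.out.println(protocolMaxFileSize(URI.create("mailto:someone@example.org")));
        }
    }

Callers then pass only the request, cache strategy and blacklist flag, and the dispatcher derives the limit itself; that is what the new two-argument load(...) overload in the LoaderDispatcher hunk below does.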

@@ -48,7 +48,6 @@ import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.workflow.WorkflowJob;
import de.anomic.crawler.NoticedURL.StackType;
import de.anomic.crawler.ZURL.FailCategory;
-import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.crawler.retrieval.Request;
import de.anomic.crawler.retrieval.Response;
import de.anomic.search.Segments;
@@ -595,9 +594,8 @@ public class CrawlQueues {
// returns null if everything went fine, a fail reason string if a problem occurred
try {
this.request.setStatus("loading", WorkflowJob.STATUS_RUNNING);
-final int maxFileSize = CrawlQueues.this.sb.getConfigInt("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
final CrawlProfile e = CrawlQueues.this.sb.crawler.getActive(UTF8.getBytes(this.request.profileHandle()));
-final Response response = CrawlQueues.this.sb.loader.load(this.request, e == null ? CacheStrategy.IFEXIST : e.cacheStrategy(), maxFileSize, true);
+final Response response = CrawlQueues.this.sb.loader.load(this.request, e == null ? CacheStrategy.IFEXIST : e.cacheStrategy(), true);
if (response == null) {
this.request.setStatus("error", WorkflowJob.STATUS_FINISHED);
if (CrawlQueues.this.log.isFine()) CrawlQueues.this.log.logFine("problem loading " + this.request.url().toString() + ": no content (possibly caused by cache policy)");

@@ -2216,7 +2216,7 @@ public final class Switchboard extends serverSwitch {
@Override
public void run() {
try {
-final Response response = Switchboard.this.loader.load(request, CacheStrategy.IFFRESH, Integer.MAX_VALUE, true);
+final Response response = Switchboard.this.loader.load(request, CacheStrategy.IFFRESH, true);
if (response == null) {
throw new IOException("response == null");
}
@@ -2610,7 +2610,7 @@ public final class Switchboard extends serverSwitch {
// if we have an url then try to load the rss
RSSReader rss = null;
try {
-final Response response = sb.loader.load(sb.loader.request(url, true, false), CacheStrategy.NOCACHE, Integer.MAX_VALUE, true);
+final Response response = sb.loader.load(sb.loader.request(url, true, false), CacheStrategy.NOCACHE, true);
final byte[] resource = (response == null) ? null : response.getContent();
//System.out.println("BLEKKO: " + UTF8.String(resource));
rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource);

@@ -266,7 +266,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnippet> {
return null;
} else {
// try to load the resource from the cache
-response = loader == null ? null : loader.load(loader.request(url, true, reindexing), noCacheUsage ? CacheStrategy.NOCACHE : cacheStrategy, Integer.MAX_VALUE, true);
+response = loader == null ? null : loader.load(loader.request(url, true, reindexing), noCacheUsage ? CacheStrategy.NOCACHE : cacheStrategy, true);
if (response == null) {
// in case that we did not get any result we can still return a success when we are not allowed to go online
if (cacheStrategy == null || cacheStrategy.mustBeOffline()) {

@@ -170,7 +170,7 @@ public class IODispatcher extends Thread {
}
// otherwise do a merge operation
-if (!this.mergeQueue.isEmpty()) {
+if (!this.mergeQueue.isEmpty() && !MemoryControl.shortStatus()) {
File f = null, f1 = null, f2 = null;
try {
mergeJob = this.mergeQueue.take();
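
The guard above is the whole of the first change: a merge job is only taken from the queue when MemoryControl.shortStatus() does not report memory pressure. Below is a minimal, self-contained sketch of that pattern; the class name and the isMemoryShort() stand-in (a plain JVM heap check) are hypothetical and only approximate what YaCy's MemoryControl helper does.

    import java.util.concurrent.LinkedBlockingQueue;

    public class MergeGuardSketch {

        private static final LinkedBlockingQueue<Runnable> mergeQueue = new LinkedBlockingQueue<>();

        // hypothetical stand-in for YaCy's MemoryControl.shortStatus()
        static boolean isMemoryShort() {
            final Runtime rt = Runtime.getRuntime();
            final long used = rt.totalMemory() - rt.freeMemory();
            return used > rt.maxMemory() * 0.9; // treat >90% of the maximum heap as "short"
        }

        public static void main(final String[] args) throws InterruptedException {
            mergeQueue.put(() -> System.out.println("merging segment files"));
            // mirrors: if (!this.mergeQueue.isEmpty() && !MemoryControl.shortStatus()) { ... }
            if (!mergeQueue.isEmpty() && !isMemoryShort()) {
                mergeQueue.take().run();
            } else {
                System.out.println("merge postponed: queue empty or memory is short");
            }
        }
    }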

@@ -147,6 +147,10 @@ public final class LoaderDispatcher {
FileUtils.copy(b, tmp);
tmp.renameTo(targetFile);
}
+public Response load(final Request request, final CacheStrategy cacheStrategy, final boolean checkBlacklist) throws IOException {
+return load(request, cacheStrategy, protocolMaxFileSize(request.url()), checkBlacklist);
+}
public Response load(final Request request, final CacheStrategy cacheStrategy, final int maxFileSize, final boolean checkBlacklist) throws IOException {
final String url = request.url().toNormalform(true, false);
@@ -290,6 +294,16 @@ public final class LoaderDispatcher {
throw new IOException("Unsupported protocol '" + protocol + "' in url " + url);
}
+private int protocolMaxFileSize(final DigestURI url) {
+if (url.isHTTP() || url.isHTTPS())
+return this.sb.getConfigInt("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
+if (url.isFTP())
+return this.sb.getConfigInt("crawler.ftp.maxFileSize", (int) FTPLoader.DEFAULT_MAXFILESIZE);
+if (url.isSMB())
+return this.sb.getConfigInt("crawler.smb.maxFileSize", (int) SMBLoader.DEFAULT_MAXFILESIZE);
+return Integer.MAX_VALUE;
+}
/**
* load the url as byte[] content from the web or the cache
@@ -301,8 +315,7 @@
*/
public byte[] loadContent(final Request request, final CacheStrategy cacheStrategy) throws IOException {
// try to download the resource using the loader
-final int maxFileSize = this.sb.getConfigInt("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
-final Response entry = load(request, cacheStrategy, maxFileSize, false);
+final Response entry = load(request, cacheStrategy, false);
if (entry == null) return null; // not found in web
// read resource body (if it is there)
@@ -325,8 +338,7 @@
public ContentScraper parseResource(final DigestURI location, final CacheStrategy cachePolicy) throws IOException {
// load page
-final int maxFileSize = this.sb.getConfigInt("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
-final Response r = this.load(request(location, true, false), cachePolicy, maxFileSize, false);
+final Response r = this.load(request(location, true, false), cachePolicy, false);
final byte[] page = (r == null) ? null : r.getContent();
if (page == null) throw new IOException("no response from url " + location.toString());
