diff --git a/htroot/Bookmarks.java b/htroot/Bookmarks.java index e4e611c05..bee81ba6e 100644 --- a/htroot/Bookmarks.java +++ b/htroot/Bookmarks.java @@ -198,7 +198,7 @@ public class Bookmarks { final URIMetadataRow urlentry = sb.indexSegments.urlMetadata(Segments.Process.PUBLIC).load(ASCII.getBytes(urlHash)); if (urlentry != null) try { final URIMetadataRow.Components metadata = urlentry.metadata(); - final Document document = Document.mergeDocuments(metadata.url(), null, sb.loader.loadDocuments(sb.loader.request(metadata.url(), true, false), CacheStrategy.IFEXIST, 5000, Long.MAX_VALUE)); + final Document document = Document.mergeDocuments(metadata.url(), null, sb.loader.loadDocuments(sb.loader.request(metadata.url(), true, false), CacheStrategy.IFEXIST, 5000, Integer.MAX_VALUE)); prop.put("mode_edit", "0"); // create mode prop.put("mode_url", metadata.url().toNormalform(false, true)); prop.putHTML("mode_title", metadata.dc_title()); diff --git a/htroot/DictionaryLoader_p.java b/htroot/DictionaryLoader_p.java index b9f21da20..c830bafdd 100644 --- a/htroot/DictionaryLoader_p.java +++ b/htroot/DictionaryLoader_p.java @@ -63,7 +63,7 @@ public class DictionaryLoader_p { if (post.containsKey("geon0Load")) { // load from the net try { - final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON0.url), false, true), CacheStrategy.NOCACHE, Long.MAX_VALUE, false); + final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, false); final byte[] b = response.getContent(); FileUtils.copy(b, LibraryProvider.Dictionary.GEON0.file()); LibraryProvider.geoLoc.addLocalization(LibraryProvider.Dictionary.GEON0.nickname, new GeonamesLocalization(LibraryProvider.Dictionary.GEON0.file())); @@ -103,7 +103,7 @@ public class DictionaryLoader_p { if (post.containsKey("geo1Load")) { // load from the net try { - final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEODB1.url), false, true), CacheStrategy.NOCACHE, Long.MAX_VALUE, false); + final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEODB1.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, false); final byte[] b = response.getContent(); FileUtils.copy(b, LibraryProvider.Dictionary.GEODB1.file()); LibraryProvider.geoLoc.removeLocalization(LibraryProvider.Dictionary.GEODB0.nickname); diff --git a/htroot/Load_RSS_p.java b/htroot/Load_RSS_p.java index 935e43c7b..2d6afcc60 100644 --- a/htroot/Load_RSS_p.java +++ b/htroot/Load_RSS_p.java @@ -255,7 +255,7 @@ public class Load_RSS_p { RSSReader rss = null; if (url != null) try { prop.put("url", url.toNormalform(true, false)); - final Response response = sb.loader.load(sb.loader.request(url, true, false), CacheStrategy.NOCACHE, Long.MAX_VALUE, true); + final Response response = sb.loader.load(sb.loader.request(url, true, false), CacheStrategy.NOCACHE, Integer.MAX_VALUE, true); final byte[] resource = response == null ? null : response.getContent(); rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource); } catch (final IOException e) { diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java index af090dbeb..2113854a7 100644 --- a/htroot/ViewFile.java +++ b/htroot/ViewFile.java @@ -172,7 +172,7 @@ public class ViewFile { Response response = null; try { - response = sb.loader.load(sb.loader.request(url, true, false), authorized ? CacheStrategy.IFEXIST : CacheStrategy.CACHEONLY, Long.MAX_VALUE, true); + response = sb.loader.load(sb.loader.request(url, true, false), authorized ? CacheStrategy.IFEXIST : CacheStrategy.CACHEONLY, Integer.MAX_VALUE, true); } catch (final IOException e) { prop.put("error", "4"); prop.put("error_errorText", "error loading resource: " + e.getMessage()); diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index 5d6e8d239..c25266962 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -529,7 +529,7 @@ public class yacysearch { final URIMetadataRow.Components metadata = urlentry.metadata(); Document[] documents = null; try { - documents = sb.loader.loadDocuments(sb.loader.request(metadata.url(), true, false), CacheStrategy.IFEXIST, 5000, Long.MAX_VALUE); + documents = sb.loader.loadDocuments(sb.loader.request(metadata.url(), true, false), CacheStrategy.IFEXIST, 5000, Integer.MAX_VALUE); } catch (final IOException e) { } catch (final Parser.Failure e) { } diff --git a/source/de/anomic/crawler/CrawlQueues.java b/source/de/anomic/crawler/CrawlQueues.java index ddfa2b494..1ddd132e3 100644 --- a/source/de/anomic/crawler/CrawlQueues.java +++ b/source/de/anomic/crawler/CrawlQueues.java @@ -595,7 +595,7 @@ public class CrawlQueues { // returns null if everything went fine, a fail reason string if a problem occurred try { this.request.setStatus("loading", WorkflowJob.STATUS_RUNNING); - final long maxFileSize = CrawlQueues.this.sb.getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE); + final int maxFileSize = CrawlQueues.this.sb.getConfigInt("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE); final CrawlProfile e = CrawlQueues.this.sb.crawler.getActive(UTF8.getBytes(this.request.profileHandle())); final Response response = CrawlQueues.this.sb.loader.load(this.request, e == null ? CacheStrategy.IFEXIST : e.cacheStrategy(), maxFileSize, true); if (response == null) { diff --git a/source/de/anomic/crawler/RSSLoader.java b/source/de/anomic/crawler/RSSLoader.java index 8a6b22d4a..70dde7331 100644 --- a/source/de/anomic/crawler/RSSLoader.java +++ b/source/de/anomic/crawler/RSSLoader.java @@ -62,7 +62,7 @@ public class RSSLoader extends Thread { public void run() { RSSReader rss = null; try { - final Response response = this.sb.loader.load(this.sb.loader.request(this.urlf, true, false), CacheStrategy.NOCACHE, Long.MAX_VALUE, true); + final Response response = this.sb.loader.load(this.sb.loader.request(this.urlf, true, false), CacheStrategy.NOCACHE, Integer.MAX_VALUE, true); final byte[] resource = response == null ? null : response.getContent(); rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource); } catch (final MalformedURLException e) { diff --git a/source/de/anomic/crawler/retrieval/HTTPLoader.java b/source/de/anomic/crawler/retrieval/HTTPLoader.java index 90825d7fa..5892ca3fc 100644 --- a/source/de/anomic/crawler/retrieval/HTTPLoader.java +++ b/source/de/anomic/crawler/retrieval/HTTPLoader.java @@ -51,7 +51,7 @@ public final class HTTPLoader { private static final String DEFAULT_ENCODING = "gzip,deflate"; private static final String DEFAULT_LANGUAGE = "en-us,en;q=0.5"; private static final String DEFAULT_CHARSET = "ISO-8859-1,utf-8;q=0.7,*;q=0.7"; - public static final long DEFAULT_MAXFILESIZE = 1024 * 1024 * 10; + public static final int DEFAULT_MAXFILESIZE = 1024 * 1024 * 10; public static final int DEFAULT_CRAWLING_RETRY_COUNT = 5; /** @@ -69,14 +69,14 @@ public final class HTTPLoader { this.socketTimeout = (int) sb.getConfigLong("crawler.clientTimeout", 10000); } - public Response load(final Request entry, long maxFileSize, boolean checkBlacklist) throws IOException { + public Response load(final Request entry, final int maxFileSize, final boolean checkBlacklist) throws IOException { long start = System.currentTimeMillis(); Response doc = load(entry, DEFAULT_CRAWLING_RETRY_COUNT, maxFileSize, checkBlacklist); Latency.update(entry.url(), System.currentTimeMillis() - start); return doc; } - private Response load(final Request request, final int retryCount, final long maxFileSize, final boolean checkBlacklist) throws IOException { + private Response load(final Request request, final int retryCount, final int maxFileSize, final boolean checkBlacklist) throws IOException { if (retryCount < 0) { sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "redirection counter exceeded", -1); @@ -243,7 +243,7 @@ public final class HTTPLoader { final HTTPClient client = new HTTPClient(); client.setTimout(20000); client.setHeader(requestHeader.entrySet()); - final byte[] responseBody = client.GETbytes(request.url(), Long.MAX_VALUE); + final byte[] responseBody = client.GETbytes(request.url()); final ResponseHeader header = new ResponseHeader(client.getHttpResponse().getAllHeaders()); final int code = client.getHttpResponse().getStatusLine().getStatusCode(); // FIXME: 30*-handling (bottom) is never reached diff --git a/source/de/anomic/data/ymark/YMarkAutoTagger.java b/source/de/anomic/data/ymark/YMarkAutoTagger.java index 90cbf4fcf..3feef7d9b 100644 --- a/source/de/anomic/data/ymark/YMarkAutoTagger.java +++ b/source/de/anomic/data/ymark/YMarkAutoTagger.java @@ -62,7 +62,7 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle return null; } try { - response = this.loader.load(this.loader.request(uri, true, false), CacheStrategy.IFEXIST, Long.MAX_VALUE, true); + response = this.loader.load(this.loader.request(uri, true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, true); } catch (final IOException e) { Log.logWarning(YMarkTables.BOOKMARKS_LOG, "loadDocument failed due to IOException for url: "+url); try { diff --git a/source/de/anomic/data/ymark/YMarkMetadata.java b/source/de/anomic/data/ymark/YMarkMetadata.java index 032681deb..96c3371db 100644 --- a/source/de/anomic/data/ymark/YMarkMetadata.java +++ b/source/de/anomic/data/ymark/YMarkMetadata.java @@ -96,7 +96,7 @@ public class YMarkMetadata { public void loadDocument(final LoaderDispatcher loader) throws IOException, Failure { if(this.document == null) { Response response = null; - response = loader.load(loader.request(this.uri, true, false), CacheStrategy.IFEXIST, Long.MAX_VALUE, true); + response = loader.load(loader.request(this.uri, true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, true); this.document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse()); } } diff --git a/source/de/anomic/search/MediaSnippet.java b/source/de/anomic/search/MediaSnippet.java index c59ebca87..0b71c62f4 100644 --- a/source/de/anomic/search/MediaSnippet.java +++ b/source/de/anomic/search/MediaSnippet.java @@ -128,7 +128,7 @@ public class MediaSnippet implements Comparable, Comparator(); diff --git a/source/de/anomic/search/Segment.java b/source/de/anomic/search/Segment.java index d8a590d1f..c6d2445f8 100644 --- a/source/de/anomic/search/Segment.java +++ b/source/de/anomic/search/Segment.java @@ -417,7 +417,7 @@ public class Segment { try { // parse the resource - final Document document = Document.mergeDocuments(metadata.url(), null, loader.loadDocuments(loader.request(metadata.url(), true, false), cacheStrategy, 10000, Long.MAX_VALUE)); + final Document document = Document.mergeDocuments(metadata.url(), null, loader.loadDocuments(loader.request(metadata.url(), true, false), cacheStrategy, 10000, Integer.MAX_VALUE)); if (document == null) { // delete just the url entry urlMetadata().remove(urlhash); diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java index 3d21f00a0..529062e3c 100644 --- a/source/de/anomic/search/Switchboard.java +++ b/source/de/anomic/search/Switchboard.java @@ -1897,21 +1897,19 @@ public final class Switchboard extends serverSwitch { // PARSE CONTENT final long parsingStartTime = System.currentTimeMillis(); - // fetch the document from the response - byte[] b = response.getContent(); - if (b == null) { + if (response.getContent() == null) { // fetch the document from cache - b = Cache.getContent(response.url().hash()); - if (b == null) { + response.setContent(Cache.getContent(response.url().hash())); + if (response.getContent() == null) { this.log.logWarning("the resource '" + response.url() + "' is missing in the cache."); addURLtoErrorDB(response.url(), response.referrerHash(), response.initiator(), response.name(), FailCategory.FINAL_LOAD_CONTEXT, "missing in cache"); return null; } } - assert b != null; + assert response.getContent() != null; try { // parse the document - documents = TextParser.parseSource(response.url(), response.getMimeType(), response.getCharacterEncoding(), b); + documents = TextParser.parseSource(response.url(), response.getMimeType(), response.getCharacterEncoding(), response.getContent()); if (documents == null) { throw new Parser.Failure("Parser returned null.", response.url()); } @@ -2218,7 +2216,7 @@ public final class Switchboard extends serverSwitch { @Override public void run() { try { - final Response response = Switchboard.this.loader.load(request, CacheStrategy.IFFRESH, Long.MAX_VALUE, true); + final Response response = Switchboard.this.loader.load(request, CacheStrategy.IFFRESH, Integer.MAX_VALUE, true); if (response == null) { throw new IOException("response == null"); } @@ -2612,7 +2610,7 @@ public final class Switchboard extends serverSwitch { // if we have an url then try to load the rss RSSReader rss = null; try { - final Response response = sb.loader.load(sb.loader.request(url, true, false), CacheStrategy.NOCACHE, Long.MAX_VALUE, true); + final Response response = sb.loader.load(sb.loader.request(url, true, false), CacheStrategy.NOCACHE, Integer.MAX_VALUE, true); final byte[] resource = (response == null) ? null : response.getContent(); //System.out.println("BLEKKO: " + UTF8.String(resource)); rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource); diff --git a/source/de/anomic/yacy/graphics/OSMTile.java b/source/de/anomic/yacy/graphics/OSMTile.java index 2b06c06d5..ec33241d9 100644 --- a/source/de/anomic/yacy/graphics/OSMTile.java +++ b/source/de/anomic/yacy/graphics/OSMTile.java @@ -111,7 +111,7 @@ public class OSMTile { // download resource using the crawler and keep resource in memory if possible Response entry = null; try { - entry = Switchboard.getSwitchboard().loader.load(Switchboard.getSwitchboard().loader.request(tileURL, false, false), CacheStrategy.IFEXIST, Long.MAX_VALUE, true); + entry = Switchboard.getSwitchboard().loader.load(Switchboard.getSwitchboard().loader.request(tileURL, false, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, true); } catch (final IOException e) { Log.logWarning("OSMTile", "cannot load: " + e.getMessage()); return null; diff --git a/source/net/yacy/cora/protocol/http/HTTPClient.java b/source/net/yacy/cora/protocol/http/HTTPClient.java index 8ca6572ac..81bf360ee 100644 --- a/source/net/yacy/cora/protocol/http/HTTPClient.java +++ b/source/net/yacy/cora/protocol/http/HTTPClient.java @@ -87,6 +87,7 @@ import org.apache.http.params.HttpProtocolParams; import org.apache.http.protocol.BasicHttpContext; import org.apache.http.protocol.HTTP; import org.apache.http.protocol.HttpContext; +import org.apache.http.util.ByteArrayBuffer; import org.apache.http.util.EntityUtils; @@ -288,7 +289,7 @@ public class HTTPClient { * @throws IOException */ public byte[] GETbytes(final String uri) throws IOException { - return GETbytes(uri, Long.MAX_VALUE); + return GETbytes(uri, Integer.MAX_VALUE); } /** @@ -299,7 +300,7 @@ public class HTTPClient { * @throws IOException */ public byte[] GETbytes(final MultiProtocolURI url) throws IOException { - return GETbytes(url, Long.MAX_VALUE); + return GETbytes(url, Integer.MAX_VALUE); } /** @@ -310,7 +311,7 @@ public class HTTPClient { * @return content bytes * @throws IOException */ - public byte[] GETbytes(final String uri, final long maxBytes) throws IOException { + public byte[] GETbytes(final String uri, final int maxBytes) throws IOException { return GETbytes(new MultiProtocolURI(uri), maxBytes); } @@ -323,7 +324,7 @@ public class HTTPClient { * @return content bytes * @throws IOException */ - public byte[] GETbytes(final MultiProtocolURI url, final long maxBytes) throws IOException { + public byte[] GETbytes(final MultiProtocolURI url, final int maxBytes) throws IOException { final boolean localhost = url.getHost().equals("localhost"); final String urix = url.toNormalform(true, false, !localhost, false); final HttpGet httpGet = new HttpGet(urix); @@ -431,7 +432,7 @@ public class HTTPClient { httpPost.setEntity(multipartEntity); } - return getContentBytes(httpPost, Long.MAX_VALUE); + return getContentBytes(httpPost, Integer.MAX_VALUE); } /** @@ -454,7 +455,7 @@ public class HTTPClient { // statistics this.upbytes = length; httpPost.setEntity(inputStreamEntity); - return getContentBytes(httpPost, Long.MAX_VALUE); + return getContentBytes(httpPost, Integer.MAX_VALUE); } /** @@ -543,8 +544,7 @@ public class HTTPClient { } } - private byte[] getContentBytes(final HttpUriRequest httpUriRequest, final long maxBytes) throws IOException { - byte[] content = null; + private byte[] getContentBytes(final HttpUriRequest httpUriRequest, final int maxBytes) throws IOException { try { execute(httpUriRequest); if (this.httpResponse == null) return null; @@ -552,22 +552,18 @@ public class HTTPClient { final HttpEntity httpEntity = this.httpResponse.getEntity(); if (httpEntity != null) { if (getStatusCode() == 200 && httpEntity.getContentLength() < maxBytes) { - try { - content = EntityUtils.toByteArray(httpEntity); - } catch (final OutOfMemoryError e) { - throw new IOException(e.toString()); - } + return getByteArray(httpEntity, maxBytes); } // Ensures that the entity content is fully consumed and the content stream, if exists, is closed. EntityUtils.consume(httpEntity); } + return null; } catch (final IOException e) { - ConnectionInfo.removeConnection(httpUriRequest.hashCode()); httpUriRequest.abort(); throw e; + } finally { + ConnectionInfo.removeConnection(httpUriRequest.hashCode()); } - ConnectionInfo.removeConnection(httpUriRequest.hashCode()); - return content; } private void execute(final HttpUriRequest httpUriRequest) throws IOException { @@ -598,6 +594,32 @@ public class HTTPClient { } } + private byte[] getByteArray(final HttpEntity entity, final int maxBytes) throws IOException { + final InputStream instream = entity.getContent(); + if (instream == null) { + return null; + } + try { + int i = Math.min(maxBytes, (int)entity.getContentLength()); + if (i < 0) { + i = 4096; + } + final ByteArrayBuffer buffer = new ByteArrayBuffer(i); + byte[] tmp = new byte[4096]; + int l, sum = 0; + while((l = instream.read(tmp)) != -1) { + sum += l; + if (sum > maxBytes) throw new IOException("Download exceeded maximum value of " + maxBytes + " bytes"); + buffer.append(tmp, 0, l); + } + return buffer.toByteArray(); + } catch (final OutOfMemoryError e) { + throw new IOException(e.toString()); + } finally { + instream.close(); + } + } + private void setHeaders(final HttpUriRequest httpUriRequest) { if (this.headers != null) { for (final Entry entry : this.headers) { diff --git a/source/net/yacy/document/importer/OAIListFriendsLoader.java b/source/net/yacy/document/importer/OAIListFriendsLoader.java index ca663f07e..9af9011af 100644 --- a/source/net/yacy/document/importer/OAIListFriendsLoader.java +++ b/source/net/yacy/document/importer/OAIListFriendsLoader.java @@ -59,7 +59,7 @@ public class OAIListFriendsLoader { public static void init(final LoaderDispatcher loader, final Map moreFriends) { listFriends.putAll(moreFriends); if (loader != null) for (final Map.Entry oaiFriend: listFriends.entrySet()) { - loader.loadIfNotExistBackground(oaiFriend.getKey(), oaiFriend.getValue(), Long.MAX_VALUE); + loader.loadIfNotExistBackground(oaiFriend.getKey(), oaiFriend.getValue(), Integer.MAX_VALUE); } } @@ -82,7 +82,7 @@ public class OAIListFriendsLoader { Map m; for (final Map.Entry oaiFriend: listFriends.entrySet()) try { if (!oaiFriend.getValue().exists()) { - final Response response = loader == null ? null : loader.load(loader.request(new DigestURI(oaiFriend.getKey()), false, true), CacheStrategy.NOCACHE, Long.MAX_VALUE, true); + final Response response = loader == null ? null : loader.load(loader.request(new DigestURI(oaiFriend.getKey()), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, true); if (response != null) FileUtils.copy(response.getContent(), oaiFriend.getValue()); } diff --git a/source/net/yacy/document/importer/OAIPMHLoader.java b/source/net/yacy/document/importer/OAIPMHLoader.java index 889423568..b1069cf14 100644 --- a/source/net/yacy/document/importer/OAIPMHLoader.java +++ b/source/net/yacy/document/importer/OAIPMHLoader.java @@ -47,7 +47,7 @@ public class OAIPMHLoader { this.source = source; // load the file from the net - final Response response = loader.load(loader.request(source, false, true), CacheStrategy.NOCACHE, Long.MAX_VALUE, true); + final Response response = loader.load(loader.request(source, false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, true); final byte[] b = response.getContent(); this.resumptionToken = new ResumptionToken(source, b); //System.out.println("*** ResumptionToken = " + this.resumptionToken.toString()); diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java index d8e37eed6..2e0ed9808 100644 --- a/source/net/yacy/repository/LoaderDispatcher.java +++ b/source/net/yacy/repository/LoaderDispatcher.java @@ -136,7 +136,7 @@ public final class LoaderDispatcher { 0); } - public void load(final DigestURI url, final CacheStrategy cacheStratgy, final long maxFileSize, final File targetFile) throws IOException { + public void load(final DigestURI url, final CacheStrategy cacheStratgy, final int maxFileSize, final File targetFile) throws IOException { final byte[] b = load(request(url, false, true), cacheStratgy, maxFileSize, false).getContent(); if (b == null) throw new IOException("load == null"); @@ -149,7 +149,7 @@ public final class LoaderDispatcher { tmp.renameTo(targetFile); } - public Response load(final Request request, final CacheStrategy cacheStrategy, final long maxFileSize, final boolean checkBlacklist) throws IOException { + public Response load(final Request request, final CacheStrategy cacheStrategy, final int maxFileSize, final boolean checkBlacklist) throws IOException { final String url = request.url().toNormalform(true, false); Semaphore check = this.loaderSteering.get(url); if (check != null) { @@ -181,7 +181,7 @@ public final class LoaderDispatcher { * @return the loaded entity in a Response object * @throws IOException */ - private Response loadInternal(final Request request, CacheStrategy cacheStrategy, final long maxFileSize, final boolean checkBlacklist) throws IOException { + private Response loadInternal(final Request request, CacheStrategy cacheStrategy, final int maxFileSize, final boolean checkBlacklist) throws IOException { // get the protocol of the next URL final DigestURI url = request.url(); if (url.isFile() || url.isSMB()) cacheStrategy = CacheStrategy.NOCACHE; // load just from the file system @@ -302,7 +302,7 @@ public final class LoaderDispatcher { */ public byte[] loadContent(final Request request, final CacheStrategy cacheStrategy) throws IOException { // try to download the resource using the loader - final long maxFileSize = this.sb.getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE); + final int maxFileSize = this.sb.getConfigInt("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE); final Response entry = load(request, cacheStrategy, maxFileSize, false); if (entry == null) return null; // not found in web @@ -310,7 +310,7 @@ public final class LoaderDispatcher { return entry.getContent(); } - public Document[] loadDocuments(final Request request, final CacheStrategy cacheStrategy, final int timeout, final long maxFileSize) throws IOException, Parser.Failure { + public Document[] loadDocuments(final Request request, final CacheStrategy cacheStrategy, final int timeout, final int maxFileSize) throws IOException, Parser.Failure { // load resource final Response response = load(request, cacheStrategy, maxFileSize, false); @@ -326,7 +326,7 @@ public final class LoaderDispatcher { public ContentScraper parseResource(final DigestURI location, final CacheStrategy cachePolicy) throws IOException { // load page - final long maxFileSize = this.sb.getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE); + final int maxFileSize = this.sb.getConfigInt("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE); final Response r = this.load(request(location, true, false), cachePolicy, maxFileSize, false); final byte[] page = (r == null) ? null : r.getContent(); if (page == null) throw new IOException("no response from url " + location.toString()); @@ -346,23 +346,20 @@ public final class LoaderDispatcher { * @throws IOException */ public final Map loadLinks(final DigestURI url, final CacheStrategy cacheStrategy) throws IOException { - final Response response = load(request(url, true, false), cacheStrategy, Long.MAX_VALUE, false); + final Response response = load(request(url, true, false), cacheStrategy, Integer.MAX_VALUE, false); if (response == null) throw new IOException("response == null"); final ResponseHeader responseHeader = response.getResponseHeader(); - byte[] resource = response.getContent(); - if (resource == null) throw new IOException("resource == null"); + if (response.getContent() == null) throw new IOException("resource == null"); if (responseHeader == null) throw new IOException("responseHeader == null"); Document[] documents = null; final String supportError = TextParser.supports(url, responseHeader.mime()); if (supportError != null) throw new IOException("no parser support: " + supportError); try { - documents = TextParser.parseSource(url, responseHeader.mime(), responseHeader.getCharacterEncoding(), resource.length, new ByteArrayInputStream(resource)); + documents = TextParser.parseSource(url, responseHeader.mime(), responseHeader.getCharacterEncoding(), response.getContent()); if (documents == null) throw new IOException("document == null"); } catch (final Exception e) { throw new IOException("parser error: " + e.getMessage()); - } finally { - resource = null; } return Document.getHyperlinks(documents); @@ -378,11 +375,11 @@ public final class LoaderDispatcher { } } - public void loadIfNotExistBackground(final String url, final File cache, final long maxFileSize) { + public void loadIfNotExistBackground(final String url, final File cache, final int maxFileSize) { new Loader(url, cache, maxFileSize, CacheStrategy.IFEXIST).start(); } - public void loadIfNotExistBackground(final String url, final long maxFileSize) { + public void loadIfNotExistBackground(final String url, final int maxFileSize) { new Loader(url, null, maxFileSize, CacheStrategy.IFEXIST).start(); } @@ -390,10 +387,10 @@ public final class LoaderDispatcher { private final String url; private final File cache; - private final long maxFileSize; + private final int maxFileSize; private final CacheStrategy cacheStrategy; - public Loader(final String url, final File cache, final long maxFileSize, final CacheStrategy cacheStrategy) { + public Loader(final String url, final File cache, final int maxFileSize, final CacheStrategy cacheStrategy) { this.url = url; this.cache = cache; this.maxFileSize = maxFileSize;