From 2126c03a62cc01a9e9bab55ab10335367a66b5a9 Mon Sep 17 00:00:00 2001
From: orbiter
Date: Fri, 14 May 2010 18:30:11 +0000
Subject: [PATCH] - removed the download limit, which is meant for the crawler,
 from non-crawler download tasks. This was necessary because the same
 procedure was used for other downloads, such as dictionary files, where a
 limit is not useful. The limit still applies to the indexer.
 - migrated the opengeodb downloader to a new version of the opengeodb-dump

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6873 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 htroot/Bookmarks.java                         |  2 +-
 htroot/Crawler_p.java                         | 10 ++--
 htroot/DictionaryLoader_p.html                | 37 ++++++-------
 htroot/DictionaryLoader_p.java                | 48 +++++++++--------
 htroot/QuickCrawlLink_p.java                  |  2 +-
 htroot/RSSLoader_p.java                       |  2 +-
 htroot/ViewFile.java                          |  6 +--
 htroot/api/util/getpageinfo_p.java            |  4 +-
 htroot/yacysearch.java                        |  2 +-
 source/de/anomic/crawler/Balancer.java        |  4 +-
 source/de/anomic/crawler/CrawlProfile.java    | 35 ++++++++----
 source/de/anomic/crawler/CrawlQueues.java     |  4 +-
 .../de/anomic/crawler/CrawlSwitchboard.java   | 14 ++---
 .../anomic/crawler/retrieval/HTTPLoader.java  | 13 +++--
 source/de/anomic/data/LibraryProvider.java    | 29 +++++-----
 source/de/anomic/data/SitemapParser.java      |  2 +-
 source/de/anomic/data/bookmarksDB.java        |  6 +--
 .../anomic/http/client/ClientGetMethod.java   | 18 ++++---
 source/de/anomic/search/MediaSnippet.java     |  2 +-
 source/de/anomic/search/TextSnippet.java      |  2 +-
 source/de/anomic/yacy/graphics/OSMTile.java   |  2 +-
 source/de/anomic/yacy/yacyRelease.java        |  2 +-
 .../document/geolocalization/OpenGeoDB.java   | 12 ++++-
 .../importer/OAIListFriendsLoader.java        |  4 +-
 .../yacy/document/importer/OAIPMHLoader.java  |  2 +-
 .../net/yacy/repository/LoaderDispatcher.java | 54 ++++++++++---------
 26 files changed, 175 insertions(+), 143 deletions(-)

diff --git a/htroot/Bookmarks.java b/htroot/Bookmarks.java
index 6216f5afa..e3a4a16f2 100644
--- a/htroot/Bookmarks.java
+++ b/htroot/Bookmarks.java
@@ -189,7 +189,7 @@ public class Bookmarks {
         Document document = null;
         if (urlentry != null) {
             final URIMetadataRow.Components metadata = urlentry.metadata();
-            document = LoaderDispatcher.retrieveDocument(metadata.url(), true, 5000, true, false);
+            document = LoaderDispatcher.retrieveDocument(metadata.url(), true, 5000, true, false, Long.MAX_VALUE);
             prop.put("mode_edit", "0"); // create mode
             prop.put("mode_url", metadata.url().toNormalform(false, true));
             prop.putHTML("mode_title", metadata.dc_title());
diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java
index 8dcd1b5a9..1355e701c 100644
--- a/htroot/Crawler_p.java
+++ b/htroot/Crawler_p.java
@@ -194,11 +194,11 @@ public class Crawler_p {
            env.setConfig("storeHTCache", (storeHTCache) ? "true" : "false");

            final String cachePolicyString = post.get("cachePolicy", "iffresh");
-           int cachePolicy = CrawlProfile.CACHE_STRATEGY_IFFRESH;
-           if (cachePolicyString.equals("nocache")) cachePolicy = CrawlProfile.CACHE_STRATEGY_NOCACHE;
-           if (cachePolicyString.equals("iffresh")) cachePolicy = CrawlProfile.CACHE_STRATEGY_IFFRESH;
-           if (cachePolicyString.equals("ifexist")) cachePolicy = CrawlProfile.CACHE_STRATEGY_IFEXIST;
-           if (cachePolicyString.equals("cacheonly")) cachePolicy = CrawlProfile.CACHE_STRATEGY_CACHEONLY;
+           CrawlProfile.CacheStrategy cachePolicy = CrawlProfile.CacheStrategy.IFFRESH;
+           if (cachePolicyString.equals("nocache")) cachePolicy = CrawlProfile.CacheStrategy.NOCACHE;
+           if (cachePolicyString.equals("iffresh")) cachePolicy = CrawlProfile.CacheStrategy.IFFRESH;
+           if (cachePolicyString.equals("ifexist")) cachePolicy = CrawlProfile.CacheStrategy.IFEXIST;
+           if (cachePolicyString.equals("cacheonly")) cachePolicy = CrawlProfile.CacheStrategy.CACHEONLY;

            final boolean xsstopw = post.get("xsstopw", "off").equals("on");
            env.setConfig("xsstopw", (xsstopw) ? "true" : "false");
diff --git a/htroot/DictionaryLoader_p.html b/htroot/DictionaryLoader_p.html
index 8437a5b7c..6cc04f465 100644
--- a/htroot/DictionaryLoader_p.html
+++ b/htroot/DictionaryLoader_p.html
@@ -20,37 +20,38 @@
    Geolocalization
    The geolocalization file will enable YaCy to present locations from OpenStreetMap according to given search words. With this file it is possible to find locations using the location (city) name, a zip code, a car sign or a telephone pre-dial number.
+
-
#[geo0URL]#
+
#[geo1URL]#
-
#[geo0Storage]#
+
#[geo1Storage]#
-
#(geo0Status)#
not loaded
::
loaded
::de-activated#(/geo0Status)#
+
#(geo1Status)#
not loaded
::
loaded
::de-activated#(/geo1Status)#
-
#(geo0Status)# - :: - - :: - - - #(/geo0Status)#
- #(geo0ActionLoaded)#:: +
#(geo1Status)# + :: + + :: + + + #(/geo1Status)#
+ #(geo1ActionLoaded)#::
loaded and activated dictionary file
::
loading of dictionary file failed: #[error]#
- #(/geo0ActionLoaded)# - #(geo0ActionRemoved)#:: + #(/geo1ActionLoaded)# + #(geo1ActionRemoved)#::
de-activated and removed dictionary file
::
cannot remove dictionary file: #[error]#
- #(/geo0ActionRemoved)# - #(geo0ActionDeactivated)#:: + #(/geo1ActionRemoved)# + #(geo1ActionDeactivated)#::
de-activated dictionary file
::
cannot de-activate dictionary file: #[error]#
- #(/geo0ActionDeactivated)# - #(geo0ActionActivated)#:: + #(/geo1ActionDeactivated)# + #(geo1ActionActivated)#::
activated dictionary file
::
cannot activate dictionary file: #[error]#
- #(/geo0ActionActivated)# + #(/geo1ActionActivated)#
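A condensed sketch of the geo1 load path that DictionaryLoader_p.java implements below, using only calls that appear in this diff: the new opengeodb dump is fetched without a size limit and without the HTCache, copied to its dictionary file, and registered with the new lonlat flag set to false because its coordinate columns are ordered lat,lon. This is an illustration of the changed code, not an additional hunk of the patch.

    // condensed from the geo1Load case in htroot/DictionaryLoader_p.java (this patch)
    Response response = sb.loader.load(
            new DigestURI(LibraryProvider.Dictionary.GEO1.url),
            false, true,                          // forText = false, global = true
            CrawlProfile.CacheStrategy.NOCACHE,   // always fetch a fresh copy
            Long.MAX_VALUE);                      // no download limit for dictionary files
    FileUtils.copy(response.getContent(), LibraryProvider.Dictionary.GEO1.file());
    // lonlat == false: the new dump stores lat before lon in geodb_coordinates
    LibraryProvider.geoDB = new OpenGeoDB(LibraryProvider.Dictionary.GEO1.file(), false);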
diff --git a/htroot/DictionaryLoader_p.java b/htroot/DictionaryLoader_p.java index 2e801e153..c104c2b26 100644 --- a/htroot/DictionaryLoader_p.java +++ b/htroot/DictionaryLoader_p.java @@ -58,45 +58,47 @@ public class DictionaryLoader_p { if (post == null) return prop; - if (post.containsKey("geo0Load")) { + // GEO1 + if (post.containsKey("geo1Load")) { // load from the net try { - Response response = sb.loader.load(new DigestURI(LibraryProvider.Dictionary.GEO0.url), false, true, CrawlProfile.CACHE_STRATEGY_NOCACHE); + Response response = sb.loader.load(new DigestURI(LibraryProvider.Dictionary.GEO1.url), false, true, CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE); byte[] b = response.getContent(); - FileUtils.copy(b, LibraryProvider.Dictionary.GEO0.file()); - LibraryProvider.geoDB = new OpenGeoDB(LibraryProvider.Dictionary.GEO0.file()); - prop.put("geo0Status", LibraryProvider.Dictionary.GEO0.file().exists() ? 1 : 0); - prop.put("geo0ActionLoaded", 1); + FileUtils.copy(b, LibraryProvider.Dictionary.GEO1.file()); + LibraryProvider.geoDB = new OpenGeoDB(LibraryProvider.Dictionary.GEO1.file(), false); + prop.put("geo1Status", LibraryProvider.Dictionary.GEO1.file().exists() ? 1 : 0); + prop.put("geo1ActionLoaded", 1); } catch (MalformedURLException e) { Log.logException(e); - prop.put("geo0ActionLoaded", 2); - prop.put("geo0ActionLoaded_error", e.getMessage()); + prop.put("geo1ActionLoaded", 2); + prop.put("geo1ActionLoaded_error", e.getMessage()); } catch (IOException e) { Log.logException(e); - prop.put("geo0ActionLoaded", 2); - prop.put("geo0ActionLoaded_error", e.getMessage()); + prop.put("geo1ActionLoaded", 2); + prop.put("geo1ActionLoaded_error", e.getMessage()); } } - if (post.containsKey("geo0Remove")) { - FileUtils.deletedelete(LibraryProvider.Dictionary.GEO0.file()); - FileUtils.deletedelete(LibraryProvider.Dictionary.GEO0.fileDisabled()); - LibraryProvider.geoDB = new OpenGeoDB(null); - prop.put("geo0ActionRemoved", 1); + if (post.containsKey("geo1Remove")) { + FileUtils.deletedelete(LibraryProvider.Dictionary.GEO1.file()); + FileUtils.deletedelete(LibraryProvider.Dictionary.GEO1.fileDisabled()); + LibraryProvider.geoDB = new OpenGeoDB(null, true); + prop.put("geo1ActionRemoved", 1); } - if (post.containsKey("geo0Deactivate")) { - LibraryProvider.Dictionary.GEO0.file().renameTo(LibraryProvider.Dictionary.GEO0.fileDisabled()); - LibraryProvider.geoDB = new OpenGeoDB(null); - prop.put("geo0ActionDeactivated", 1); + if (post.containsKey("geo1Deactivate")) { + LibraryProvider.Dictionary.GEO1.file().renameTo(LibraryProvider.Dictionary.GEO1.fileDisabled()); + LibraryProvider.geoDB = new OpenGeoDB(null, true); + prop.put("geo1ActionDeactivated", 1); } - if (post.containsKey("geo0Activate")) { - LibraryProvider.Dictionary.GEO0.fileDisabled().renameTo(LibraryProvider.Dictionary.GEO0.file()); - LibraryProvider.geoDB = new OpenGeoDB(LibraryProvider.Dictionary.GEO0.file()); - prop.put("geo0ActionActivated", 1); + if (post.containsKey("geo1Activate")) { + LibraryProvider.Dictionary.GEO1.fileDisabled().renameTo(LibraryProvider.Dictionary.GEO1.file()); + LibraryProvider.geoDB = new OpenGeoDB(LibraryProvider.Dictionary.GEO1.file(), false); + prop.put("geo1ActionActivated", 1); } + // check status again for (LibraryProvider.Dictionary dictionary: LibraryProvider.Dictionary.values()) { prop.put(dictionary.nickname + "Status", dictionary.file().exists() ? 1 : dictionary.fileDisabled().exists() ? 
2 : 0); diff --git a/htroot/QuickCrawlLink_p.java b/htroot/QuickCrawlLink_p.java index 2459ba783..bf8de8c21 100644 --- a/htroot/QuickCrawlLink_p.java +++ b/htroot/QuickCrawlLink_p.java @@ -163,7 +163,7 @@ public class QuickCrawlLink_p { xsstopw, xdstopw, xpstopw, - CrawlProfile.CACHE_STRATEGY_IFFRESH + CrawlProfile.CacheStrategy.IFFRESH ); } catch (final Exception e) { // mist diff --git a/htroot/RSSLoader_p.java b/htroot/RSSLoader_p.java index f6c697817..159300f69 100644 --- a/htroot/RSSLoader_p.java +++ b/htroot/RSSLoader_p.java @@ -63,7 +63,7 @@ public class RSSLoader_p { // if the resource body was not cached we try to load it from web Response entry = null; try { - entry = sb.loader.load(url, true, false); + entry = sb.loader.load(url, true, false, Long.MAX_VALUE); } catch (final Exception e) { return prop; } diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java index 6be2bb7f3..285a13dfc 100644 --- a/htroot/ViewFile.java +++ b/htroot/ViewFile.java @@ -182,7 +182,7 @@ public class ViewFile { // load resource from net Response response = null; try { - response = sb.loader.load(url, true, false); + response = sb.loader.load(url, true, false, Long.MAX_VALUE); } catch (IOException e) { Log.logException(e); } @@ -198,7 +198,7 @@ public class ViewFile { if (resource == null) { Response entry = null; try { - entry = sb.loader.load(url, true, false); + entry = sb.loader.load(url, true, false, Long.MAX_VALUE); } catch (final Exception e) { prop.put("error", "4"); prop.putHTML("error_errorText", e.getMessage()); @@ -238,7 +238,7 @@ public class ViewFile { } try { - Response response = sb.loader.load(url, true, false); + Response response = sb.loader.load(url, true, false, Long.MAX_VALUE); responseHeader = response.getResponseHeader(); resource = response.getContent(); } catch (IOException e) { diff --git a/htroot/api/util/getpageinfo_p.java b/htroot/api/util/getpageinfo_p.java index eb9f076b7..580c52cd3 100755 --- a/htroot/api/util/getpageinfo_p.java +++ b/htroot/api/util/getpageinfo_p.java @@ -54,11 +54,11 @@ public class getpageinfo_p { } ContentScraper scraper = null; if (u != null) try { - scraper = LoaderDispatcher.parseResource(sb.loader, u, CrawlProfile.CACHE_STRATEGY_IFFRESH); + scraper = LoaderDispatcher.parseResource(sb.loader, u, CrawlProfile.CacheStrategy.IFFRESH); } catch (final IOException e) { // try again, try harder try { - scraper = LoaderDispatcher.parseResource(sb.loader, u, CrawlProfile.CACHE_STRATEGY_IFEXIST); + scraper = LoaderDispatcher.parseResource(sb.loader, u, CrawlProfile.CacheStrategy.IFEXIST); } catch (final IOException ee) { // now thats a fail, do nothing } diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index ff4de462e..32db77d73 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -430,7 +430,7 @@ public class yacysearch { if (urlentry != null) { final URIMetadataRow.Components metadata = urlentry.metadata(); Document document; - document = LoaderDispatcher.retrieveDocument(metadata.url(), true, 5000, true, false); + document = LoaderDispatcher.retrieveDocument(metadata.url(), true, 5000, true, false, Long.MAX_VALUE); if (document != null) { // create a news message final HashMap map = new HashMap(); diff --git a/source/de/anomic/crawler/Balancer.java b/source/de/anomic/crawler/Balancer.java index a373c6375..6bf011408 100644 --- a/source/de/anomic/crawler/Balancer.java +++ b/source/de/anomic/crawler/Balancer.java @@ -385,8 +385,8 @@ public class Balancer { } // depending on the caching policy we need sleep time to avoid DoS-like 
situations sleeptime = ( - profileEntry.cacheStrategy() == CrawlProfile.CACHE_STRATEGY_CACHEONLY || - (profileEntry.cacheStrategy() == CrawlProfile.CACHE_STRATEGY_IFEXIST && Cache.has(crawlEntry.url())) + profileEntry.cacheStrategy() == CrawlProfile.CacheStrategy.CACHEONLY || + (profileEntry.cacheStrategy() == CrawlProfile.CacheStrategy.IFEXIST && Cache.has(crawlEntry.url())) ) ? 0 : Latency.waitingRemaining(crawlEntry.url(), minimumLocalDelta, minimumGlobalDelta); // this uses the robots.txt database and may cause a loading of robots.txt from the server assert Base64Order.enhancedCoder.equal(nexthash, rowEntry.getPrimaryKeyBytes()) : "result = " + new String(nexthash) + ", rowEntry.getPrimaryKeyBytes() = " + new String(rowEntry.getPrimaryKeyBytes()); diff --git a/source/de/anomic/crawler/CrawlProfile.java b/source/de/anomic/crawler/CrawlProfile.java index 954acb2df..2a581dddf 100644 --- a/source/de/anomic/crawler/CrawlProfile.java +++ b/source/de/anomic/crawler/CrawlProfile.java @@ -170,7 +170,7 @@ public class CrawlProfile { final boolean storeHTCache, final boolean storeTXCache, final boolean remoteIndexing, final boolean xsstopw, final boolean xdstopw, final boolean xpstopw, - final int cacheStrategy) { + final CacheStrategy cacheStrategy) { final entry ne = new entry( name, startURL, @@ -246,10 +246,23 @@ public class CrawlProfile { } - public final static int CACHE_STRATEGY_NOCACHE = 0; // never use the cache, all content from fresh internet source - public final static int CACHE_STRATEGY_IFFRESH = 1; // use the cache if the cache exists and is fresh using the proxy-fresh rules - public final static int CACHE_STRATEGY_IFEXIST = 2; // use the cache if the cache exist. Do no check freshness. Otherwise use online source. - public final static int CACHE_STRATEGY_CACHEONLY = 3; // never go online, use all content from cache. If no cache exist, treat content as unavailable + public static enum CacheStrategy { + NOCACHE(0), // never use the cache, all content from fresh internet source + IFFRESH(1), // use the cache if the cache exists and is fresh using the proxy-fresh rules + IFEXIST(2), // use the cache if the cache exist. Do no check freshness. Otherwise use online source. + CACHEONLY(3); // never go online, use all content from cache. If no cache exist, treat content as unavailable + public int code; + private CacheStrategy(int code) { + this.code = code; + } + public String toString() { + return Integer.toString(this.code); + } + public static CacheStrategy decode(int code) { + for (CacheStrategy strategy: CacheStrategy.values()) if (strategy.code == code) return strategy; + return NOCACHE; + } + } public static class entry { // this is a simple record structure that hold all properties of a single crawl start @@ -290,7 +303,7 @@ public class CrawlProfile { final boolean storeHTCache, final boolean storeTXCache, final boolean remoteIndexing, final boolean xsstopw, final boolean xdstopw, final boolean xpstopw, - final int cacheStrategy) { + final CacheStrategy cacheStrategy) { if (name == null || name.length() == 0) throw new NullPointerException("name must not be null"); final String handle = (startURL == null) ? 
Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(Long.toString(System.currentTimeMillis()))).substring(0, Word.commonHashLength) : new String(startURL.hash()); mem = new HashMap(40); @@ -312,7 +325,7 @@ public class CrawlProfile { mem.put(XSSTOPW, Boolean.toString(xsstopw)); // exclude static stop-words mem.put(XDSTOPW, Boolean.toString(xdstopw)); // exclude dynamic stop-word mem.put(XPSTOPW, Boolean.toString(xpstopw)); // exclude parent stop-words - mem.put(CACHE_STRAGEGY, Integer.toString(cacheStrategy)); + mem.put(CACHE_STRAGEGY, cacheStrategy.toString()); doms = new ConcurrentHashMap(); } @@ -376,14 +389,14 @@ public class CrawlProfile { return 0; } } - public int cacheStrategy() { + public CacheStrategy cacheStrategy() { final String r = mem.get(CACHE_STRAGEGY); - if (r == null) return CACHE_STRATEGY_IFFRESH; + if (r == null) return CacheStrategy.IFFRESH; try { - return Integer.parseInt(r); + return CacheStrategy.decode(Integer.parseInt(r)); } catch (final NumberFormatException e) { Log.logException(e); - return CACHE_STRATEGY_IFFRESH; + return CacheStrategy.IFFRESH; } } public long recrawlIfOlder() { diff --git a/source/de/anomic/crawler/CrawlQueues.java b/source/de/anomic/crawler/CrawlQueues.java index 7a78283e5..386b921d9 100644 --- a/source/de/anomic/crawler/CrawlQueues.java +++ b/source/de/anomic/crawler/CrawlQueues.java @@ -45,6 +45,7 @@ import net.yacy.kelondro.util.DateFormatter; import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.workflow.WorkflowJob; +import de.anomic.crawler.retrieval.HTTPLoader; import de.anomic.crawler.retrieval.Request; import de.anomic.crawler.retrieval.Response; import de.anomic.http.client.Client; @@ -561,7 +562,8 @@ public class CrawlQueues { // returns null if everything went fine, a fail reason string if a problem occurred try { request.setStatus("loading", WorkflowJob.STATUS_RUNNING); - Response response = sb.loader.load(request, true); + final long maxFileSize = sb.getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE); + Response response = sb.loader.load(request, true, maxFileSize); if (response == null) { request.setStatus("error", WorkflowJob.STATUS_FINISHED); if (log.isFine()) log.logFine("problem loading " + request.url().toString() + ": no content (possibly caused by cache policy)"); diff --git a/source/de/anomic/crawler/CrawlSwitchboard.java b/source/de/anomic/crawler/CrawlSwitchboard.java index 01b85460f..7db84194f 100644 --- a/source/de/anomic/crawler/CrawlSwitchboard.java +++ b/source/de/anomic/crawler/CrawlSwitchboard.java @@ -168,37 +168,37 @@ public final class CrawlSwitchboard { true /*getConfigBool(PROXY_INDEXING_LOCAL_MEDIA, true)*/, true, true, false /*getConfigBool(PROXY_INDEXING_REMOTE, false)*/, true, true, true, - CrawlProfile.CACHE_STRATEGY_IFFRESH); + CrawlProfile.CacheStrategy.IFFRESH); } if (this.defaultRemoteProfile == null) { // generate new default entry for remote crawling defaultRemoteProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_BAD_URL, 0, - -1, -1, -1, true, true, true, false, true, false, true, true, false, CrawlProfile.CACHE_STRATEGY_IFFRESH); + -1, -1, -1, true, true, true, false, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH); } if (this.defaultTextSnippetLocalProfile == null) { // generate new default entry for snippet fetch and optional crawling defaultTextSnippetLocalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL, 
CrawlProfile.MATCH_BAD_URL, 0, - this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, -1, true, false, false, true, true, false, true, true, false, CrawlProfile.CACHE_STRATEGY_IFFRESH); + this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, -1, true, false, false, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH); } if (this.defaultTextSnippetGlobalProfile == null) { // generate new default entry for snippet fetch and optional crawling defaultTextSnippetGlobalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_BAD_URL, 0, - this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, -1, true, true, true, true, true, false, true, true, false, CrawlProfile.CACHE_STRATEGY_CACHEONLY); + this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, -1, true, true, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.CACHEONLY); } if (this.defaultMediaSnippetLocalProfile == null) { // generate new default entry for snippet fetch and optional crawling defaultMediaSnippetLocalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_BAD_URL, 0, - this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, false, false, false, false, true, true, false, CrawlProfile.CACHE_STRATEGY_IFEXIST); + this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, false, false, false, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST); } if (this.defaultMediaSnippetGlobalProfile == null) { // generate new default entry for snippet fetch and optional crawling defaultMediaSnippetGlobalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_BAD_URL, 0, - this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, true, true, true, false, true, true, false, CrawlProfile.CACHE_STRATEGY_IFEXIST); + this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST); } if (this.defaultSurrogateProfile == null) { // generate new default entry for surrogate parsing defaultSurrogateProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_BAD_URL, 0, - this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, -1, true, true, false, false, false, false, true, true, false, CrawlProfile.CACHE_STRATEGY_NOCACHE); + this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, -1, true, true, false, false, false, false, true, true, false, CrawlProfile.CacheStrategy.NOCACHE); } } diff --git a/source/de/anomic/crawler/retrieval/HTTPLoader.java b/source/de/anomic/crawler/retrieval/HTTPLoader.java index ac2685b31..e0a8e23a0 100644 --- a/source/de/anomic/crawler/retrieval/HTTPLoader.java +++ b/source/de/anomic/crawler/retrieval/HTTPLoader.java @@ -45,7 +45,7 @@ public final class HTTPLoader { private static final String DEFAULT_ENCODING = "gzip,deflate"; private static final String DEFAULT_LANGUAGE = "en-us,en;q=0.5"; private 
static final String DEFAULT_CHARSET = "ISO-8859-1,utf-8;q=0.7,*;q=0.7"; - private static final long DEFAULT_MAXFILESIZE = 1024 * 1024 * 10; + public static final long DEFAULT_MAXFILESIZE = 1024 * 1024 * 10; public static final int DEFAULT_CRAWLING_RETRY_COUNT = 5; public static final String crawlerUserAgent = "yacybot (" + Client.getSystemOST() +") http://yacy.net/bot.html"; public static final String yacyUserAgent = "yacy (" + Client.getSystemOST() +") yacy.net"; @@ -74,14 +74,14 @@ public final class HTTPLoader { this.socketTimeout = (int) sb.getConfigLong("crawler.clientTimeout", 10000); } - public Response load(final Request entry, final boolean acceptOnlyParseable) throws IOException { + public Response load(final Request entry, final boolean acceptOnlyParseable, long maxFileSize) throws IOException { long start = System.currentTimeMillis(); - Response doc = load(entry, acceptOnlyParseable, DEFAULT_CRAWLING_RETRY_COUNT); + Response doc = load(entry, acceptOnlyParseable, DEFAULT_CRAWLING_RETRY_COUNT, maxFileSize); Latency.update(new String(entry.url().hash()).substring(6), entry.url().getHost(), System.currentTimeMillis() - start); return doc; } - private Response load(final Request request, boolean acceptOnlyParseable, final int retryCount) throws IOException { + private Response load(final Request request, boolean acceptOnlyParseable, final int retryCount, final long maxFileSize) throws IOException { if (retryCount < 0) { sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "redirection counter exceeded"); @@ -113,8 +113,7 @@ public final class HTTPLoader { // take a file from the net Response response = null; - final long maxFileSize = sb.getConfigLong("crawler.http.maxFileSize", DEFAULT_MAXFILESIZE); - + // create a request header final RequestHeader requestHeader = new RequestHeader(); requestHeader.put(HeaderFramework.USER_AGENT, crawlerUserAgent); @@ -202,7 +201,7 @@ public final class HTTPLoader { // retry crawling with new url request.redirectURL(redirectionUrl); - return load(request, acceptOnlyParseable, retryCount - 1); + return load(request, acceptOnlyParseable, retryCount - 1, maxFileSize); } } else { // if the response has not the right response type then reject file diff --git a/source/de/anomic/data/LibraryProvider.java b/source/de/anomic/data/LibraryProvider.java index b45a534c3..e2de6183f 100644 --- a/source/de/anomic/data/LibraryProvider.java +++ b/source/de/anomic/data/LibraryProvider.java @@ -50,14 +50,17 @@ public class LibraryProvider { public static final String disabledExtension = ".disabled"; public static DidYouMeanLibrary dymLib = new DidYouMeanLibrary(null); - public static OpenGeoDB geoDB = new OpenGeoDB(null); + public static OpenGeoDB geoDB = new OpenGeoDB(null, true); private static File dictSource = null; private static File dictRoot = null; public static enum Dictionary { GEO0("geo0", "http://downloads.sourceforge.net/project/opengeodb/Data/0.2.5a/opengeodb-0.2.5a-UTF8-sql.gz", - "opengeodb-0.2.5a-UTF8-sql.gz"); + "opengeodb-0.2.5a-UTF8-sql.gz"), + GEO1("geo1", + "http://fa-technik.adfc.de/code/opengeodb/dump/opengeodb-02621_2010-03-16.sql.gz", + "opengeodb-02621_2010-03-16.sql.gz"); public String nickname, url, filename; private Dictionary(String nickname, String url, String filename) { @@ -95,20 +98,16 @@ public class LibraryProvider { } public static void integrateOpenGeoDB() { - File ogdb = new File(dictSource, "opengeodb-0.2.5a-UTF8-sql.gz"); - if (ogdb.exists()) { - geoDB = new OpenGeoDB(ogdb); - return; + 
File geo1 = Dictionary.GEO1.file(); + File geo0 = Dictionary.GEO0.file(); + if (geo1.exists()) { + if (geo0.exists()) geo0.renameTo(Dictionary.GEO0.fileDisabled()); + geoDB = new OpenGeoDB(geo1, false); + return; } - ogdb = new File(dictSource, "opengeodb-02513_2007-10-02.sql.gz"); - if (ogdb.exists()) { - geoDB = new OpenGeoDB(ogdb); - return; - } - ogdb = new File(dictSource, "opengeodb-02513_2007-10-02.sql"); - if (ogdb.exists()) { - geoDB = new OpenGeoDB(ogdb); - return; + if (geo0.exists()) { + geoDB = new OpenGeoDB(geo0, true); + return; } } diff --git a/source/de/anomic/data/SitemapParser.java b/source/de/anomic/data/SitemapParser.java index 52bea8324..eb331a688 100644 --- a/source/de/anomic/data/SitemapParser.java +++ b/source/de/anomic/data/SitemapParser.java @@ -334,6 +334,6 @@ public class SitemapParser extends DefaultHandler { false, // exclude stop-words true, true, true, - CrawlProfile.CACHE_STRATEGY_IFFRESH); + CrawlProfile.CacheStrategy.IFFRESH); } } diff --git a/source/de/anomic/data/bookmarksDB.java b/source/de/anomic/data/bookmarksDB.java index 330dd3b1d..94f75cef6 100644 --- a/source/de/anomic/data/bookmarksDB.java +++ b/source/de/anomic/data/bookmarksDB.java @@ -161,7 +161,7 @@ public class bookmarksDB { Integer.parseInt(parser[5]), Integer.parseInt(parser[6]), Boolean.parseBoolean(parser[7]), Boolean.parseBoolean(parser[8]), Boolean.parseBoolean(parser[9]), Boolean.parseBoolean(parser[10]), Boolean.parseBoolean(parser[11]), - Boolean.parseBoolean(parser[12]), CrawlProfile.CACHE_STRATEGY_IFFRESH + Boolean.parseBoolean(parser[12]), CrawlProfile.CacheStrategy.IFFRESH ); } if (parser.length == 14) { @@ -169,7 +169,7 @@ public class bookmarksDB { Integer.parseInt(parser[5]), Integer.parseInt(parser[6]), Boolean.parseBoolean(parser[7]), Boolean.parseBoolean(parser[8]), Boolean.parseBoolean(parser[9]), Boolean.parseBoolean(parser[10]), Boolean.parseBoolean(parser[11]), - Boolean.parseBoolean(parser[12]), Integer.parseInt(parser[13]) + Boolean.parseBoolean(parser[12]), CrawlProfile.CacheStrategy.decode(Integer.parseInt(parser[13])) ); } } @@ -206,7 +206,7 @@ public class bookmarksDB { public void folderReCrawl(long schedule, String folder, String crawlingfilter, int newcrawlingdepth, long crawlingIfOlder, int crawlingDomFilterDepth, int crawlingDomMaxPages, boolean crawlingQ, boolean indexText, boolean indexMedia, - boolean crawlOrder, boolean xsstopw, boolean storeHTCache, int cacheStrategy) { + boolean crawlOrder, boolean xsstopw, boolean storeHTCache, CrawlProfile.CacheStrategy cacheStrategy) { final Switchboard sb = Switchboard.getSwitchboard(); final Iterator bit = getBookmarksIterator(folder, true); diff --git a/source/de/anomic/http/client/ClientGetMethod.java b/source/de/anomic/http/client/ClientGetMethod.java index 8d4810026..f9cc3df8e 100644 --- a/source/de/anomic/http/client/ClientGetMethod.java +++ b/source/de/anomic/http/client/ClientGetMethod.java @@ -52,14 +52,16 @@ public class ClientGetMethod extends GetMethod { protected void readResponseHeaders(HttpState state, HttpConnection conn) throws IOException, HttpException { super.readResponseHeaders(state, conn); - // already processing the header to be able to throw an exception - Header contentlengthHeader = getResponseHeader("content-length"); - long contentlength = 0; - if (contentlengthHeader != null) { - try { contentlength = Long.parseLong(contentlengthHeader.getValue()); } catch (NumberFormatException e) { } - } - if (contentlength > maxfilesize) { - throw new IOException("Content-Length " + contentlength 
+ " larger than maxfilesize " + maxfilesize); + if (this.maxfilesize < Long.MAX_VALUE) { + // already processing the header to be able to throw an exception + Header contentlengthHeader = getResponseHeader("content-length"); + long contentlength = 0; + if (contentlengthHeader != null) { + try { contentlength = Long.parseLong(contentlengthHeader.getValue()); } catch (NumberFormatException e) { } + } + if (contentlength > maxfilesize) { + throw new IOException("Content-Length " + contentlength + " larger than maxfilesize " + maxfilesize); + } } } } diff --git a/source/de/anomic/search/MediaSnippet.java b/source/de/anomic/search/MediaSnippet.java index 9387291d7..bdd719916 100644 --- a/source/de/anomic/search/MediaSnippet.java +++ b/source/de/anomic/search/MediaSnippet.java @@ -116,7 +116,7 @@ public class MediaSnippet implements Comparable, Comparator(); } - final Document document = LoaderDispatcher.retrieveDocument(url, fetchOnline, timeout, false, reindexing); + final Document document = LoaderDispatcher.retrieveDocument(url, fetchOnline, timeout, false, reindexing, Long.MAX_VALUE); final ArrayList a = new ArrayList(); if (document != null) { if ((mediatype == ContentDomain.ALL) || (mediatype == ContentDomain.AUDIO)) a.addAll(computeMediaSnippets(document, queryhashes, ContentDomain.AUDIO)); diff --git a/source/de/anomic/search/TextSnippet.java b/source/de/anomic/search/TextSnippet.java index 38875fd51..1f3ab032e 100644 --- a/source/de/anomic/search/TextSnippet.java +++ b/source/de/anomic/search/TextSnippet.java @@ -359,7 +359,7 @@ public class TextSnippet implements Comparable, Comparator> predial2ids; private final HashMap zip2id; - public OpenGeoDB(final File file) { + public OpenGeoDB(final File file, boolean lonlat) { this.locTypeHash2locType = new HashMap(); this.id2loc = new HashMap(); @@ -98,6 +98,7 @@ public class OpenGeoDB { String[] v; Integer id; String h; + double lon, lat; while ((line = reader.readLine()) != null) { line = line.trim(); if (!line.startsWith("INSERT INTO ")) continue; @@ -107,7 +108,14 @@ public class OpenGeoDB { if (line.startsWith("geodb_coordinates ")) { line = line.substring(18 + 7);v = line.split(","); v = line.split(","); - id2loc.put(Integer.parseInt(v[0]), new Location(Double.parseDouble(v[2]), Double.parseDouble(v[3]))); + if (lonlat) { + lon = Double.parseDouble(v[2]); + lat = Double.parseDouble(v[3]); + } else { + lat = Double.parseDouble(v[2]); + lon = Double.parseDouble(v[3]); + } + id2loc.put(Integer.parseInt(v[0]), new Location(lon, lat)); } if (line.startsWith("geodb_textdata ")) { line = line.substring(15 + 7); diff --git a/source/net/yacy/document/importer/OAIListFriendsLoader.java b/source/net/yacy/document/importer/OAIListFriendsLoader.java index 07a2d8872..99c559c48 100644 --- a/source/net/yacy/document/importer/OAIListFriendsLoader.java +++ b/source/net/yacy/document/importer/OAIListFriendsLoader.java @@ -58,7 +58,7 @@ public class OAIListFriendsLoader { public static void init(LoaderDispatcher loader, Map moreFriends) { listFriends.putAll(moreFriends); if (loader != null) for (Map.Entry oaiFriend: listFriends.entrySet()) { - loader.loadIfNotExistBackground(oaiFriend.getKey(), oaiFriend.getValue()); + loader.loadIfNotExistBackground(oaiFriend.getKey(), oaiFriend.getValue(), Long.MAX_VALUE); } } @@ -81,7 +81,7 @@ public class OAIListFriendsLoader { Map m; for (Map.Entry oaiFriend: listFriends.entrySet()) try { if (!oaiFriend.getValue().exists()) { - Response response = loader == null ? 
null : loader.load(new DigestURI(oaiFriend.getKey(), null), false, true, CrawlProfile.CACHE_STRATEGY_NOCACHE); + Response response = loader == null ? null : loader.load(new DigestURI(oaiFriend.getKey(), null), false, true, CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE); if (response != null) FileUtils.copy(response.getContent(), oaiFriend.getValue()); } diff --git a/source/net/yacy/document/importer/OAIPMHLoader.java b/source/net/yacy/document/importer/OAIPMHLoader.java index c37841b57..4a8c5b700 100644 --- a/source/net/yacy/document/importer/OAIPMHLoader.java +++ b/source/net/yacy/document/importer/OAIPMHLoader.java @@ -48,7 +48,7 @@ public class OAIPMHLoader { this.source = source; // load the file from the net - Response response = loader.load(source, false, true, CrawlProfile.CACHE_STRATEGY_NOCACHE); + Response response = loader.load(source, false, true, CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE); byte[] b = response.getContent(); this.resumptionToken = new ResumptionToken(source, b); //System.out.println("*** ResumptionToken = " + this.resumptionToken.toString()); diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java index 59cfc67fb..39ac98ec7 100644 --- a/source/net/yacy/repository/LoaderDispatcher.java +++ b/source/net/yacy/repository/LoaderDispatcher.java @@ -99,8 +99,9 @@ public final class LoaderDispatcher { public Response load( final DigestURI url, final boolean forText, - final boolean global) throws IOException { - return load(request(url, forText, global), forText); + final boolean global, + final long maxFileSize) throws IOException { + return load(request(url, forText, global), forText, maxFileSize); } /** @@ -116,13 +117,14 @@ public final class LoaderDispatcher { final DigestURI url, final boolean forText, final boolean global, - int cacheStratgy) throws IOException { - return load(request(url, forText, global), forText, cacheStratgy); + CrawlProfile.CacheStrategy cacheStratgy, + long maxFileSize) throws IOException { + return load(request(url, forText, global), forText, cacheStratgy, maxFileSize); } - public void load(final DigestURI url, int cacheStratgy, File targetFile) throws IOException { + public void load(final DigestURI url, CrawlProfile.CacheStrategy cacheStratgy, long maxFileSize, File targetFile) throws IOException { - byte[] b = load(request(url, false, true), false, cacheStratgy).getContent(); + byte[] b = load(request(url, false, true), false, cacheStratgy, maxFileSize).getContent(); if (b == null) throw new IOException("load == null"); File tmp = new File(targetFile.getAbsolutePath() + ".tmp"); @@ -164,14 +166,14 @@ public final class LoaderDispatcher { 0); } - public Response load(final Request request, final boolean acceptOnlyParseable) throws IOException { + public Response load(final Request request, final boolean acceptOnlyParseable, long maxFileSize) throws IOException { CrawlProfile.entry crawlProfile = sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()); - int cacheStrategy = CrawlProfile.CACHE_STRATEGY_IFEXIST; + CrawlProfile.CacheStrategy cacheStrategy = CrawlProfile.CacheStrategy.IFEXIST; if (crawlProfile != null) cacheStrategy = crawlProfile.cacheStrategy(); - return load(request, acceptOnlyParseable, cacheStrategy); + return load(request, acceptOnlyParseable, cacheStrategy, maxFileSize); } - public Response load(final Request request, final boolean acceptOnlyParseable, int cacheStrategy) throws IOException { + public Response load(final Request request, 
final boolean acceptOnlyParseable, CrawlProfile.CacheStrategy cacheStrategy, long maxFileSize) throws IOException { // get the protocol of the next URL final String protocol = request.url().getProtocol(); final String host = request.url().getHost(); @@ -183,7 +185,7 @@ public final class LoaderDispatcher { // check if we have the page in the cache CrawlProfile.entry crawlProfile = sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()); - if (crawlProfile != null && cacheStrategy != CrawlProfile.CACHE_STRATEGY_NOCACHE) { + if (crawlProfile != null && cacheStrategy != CrawlProfile.CacheStrategy.NOCACHE) { // we have passed a first test if caching is allowed // now see if there is a cache entry @@ -214,14 +216,14 @@ public final class LoaderDispatcher { content); // check which caching strategy shall be used - if (cacheStrategy == CrawlProfile.CACHE_STRATEGY_IFEXIST || cacheStrategy == CrawlProfile.CACHE_STRATEGY_CACHEONLY) { + if (cacheStrategy == CrawlProfile.CacheStrategy.IFEXIST || cacheStrategy == CrawlProfile.CacheStrategy.CACHEONLY) { // well, just take the cache and don't care about freshness of the content log.logInfo("cache hit/useall for: " + request.url().toNormalform(true, false)); return response; } // now the cacheStrategy must be CACHE_STRATEGY_IFFRESH, that means we should do a proxy freshness test - assert cacheStrategy == CrawlProfile.CACHE_STRATEGY_IFFRESH : "cacheStrategy = " + cacheStrategy; + assert cacheStrategy == CrawlProfile.CacheStrategy.IFFRESH : "cacheStrategy = " + cacheStrategy; if (response.isFreshForProxy()) { log.logInfo("cache hit/fresh for: " + request.url().toNormalform(true, false)); return response; @@ -232,7 +234,7 @@ public final class LoaderDispatcher { } // check case where we want results from the cache exclusively, and never from the internet (offline mode) - if (cacheStrategy == CrawlProfile.CACHE_STRATEGY_CACHEONLY) { + if (cacheStrategy == CrawlProfile.CacheStrategy.CACHEONLY) { // we had a chance to get the content from the cache .. its over. We don't have it. return null; } @@ -259,7 +261,7 @@ public final class LoaderDispatcher { // load resource from the internet Response response = null; - if ((protocol.equals("http") || (protocol.equals("https")))) response = httpLoader.load(request, acceptOnlyParseable); + if ((protocol.equals("http") || (protocol.equals("https")))) response = httpLoader.load(request, acceptOnlyParseable, maxFileSize); if (protocol.equals("ftp")) response = ftpLoader.load(request, true); if (protocol.equals("smb")) response = smbLoader.load(request, true); if (response != null) { @@ -302,7 +304,8 @@ public final class LoaderDispatcher { if (!fetchOnline) return null; // try to download the resource using the loader - final Response entry = load(url, forText, reindexing); + final long maxFileSize = sb.getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE); + final Response entry = load(url, forText, reindexing, maxFileSize); if (entry == null) return null; // not found in web // read resource body (if it is there) @@ -321,7 +324,7 @@ public final class LoaderDispatcher { * @param global the domain of the search. 
If global == true then the content is re-indexed * @return the parsed document as {@link Document} */ - public static Document retrieveDocument(final DigestURI url, final boolean fetchOnline, final int timeout, final boolean forText, final boolean global) { + public static Document retrieveDocument(final DigestURI url, final boolean fetchOnline, final int timeout, final boolean forText, final boolean global, long maxFileSize) { // load resource byte[] resContent = null; @@ -336,7 +339,7 @@ public final class LoaderDispatcher { // if not found try to download it // download resource using the crawler and keep resource in memory if possible - final Response entry = Switchboard.getSwitchboard().loader.load(url, forText, global); + final Response entry = Switchboard.getSwitchboard().loader.load(url, forText, global, maxFileSize); // getting resource metadata (e.g. the http headers for http resources) if (entry != null) { @@ -431,9 +434,10 @@ public final class LoaderDispatcher { } } - public static ContentScraper parseResource(final LoaderDispatcher loader, final DigestURI location, int cachePolicy) throws IOException { + public static ContentScraper parseResource(final LoaderDispatcher loader, final DigestURI location, CrawlProfile.CacheStrategy cachePolicy) throws IOException { // load page - Response r = loader.load(location, true, false, cachePolicy); + final long maxFileSize = loader.sb.getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE); + Response r = loader.load(location, true, false, cachePolicy, maxFileSize); byte[] page = (r == null) ? null : r.getContent(); if (page == null) throw new IOException("no response from url " + location.toString()); @@ -455,25 +459,27 @@ public final class LoaderDispatcher { } } - public void loadIfNotExistBackground(String url, File cache) { - new Loader(url, cache).start(); + public void loadIfNotExistBackground(String url, File cache, long maxFileSize) { + new Loader(url, cache, maxFileSize).start(); } private class Loader extends Thread { private String url; private File cache; + private long maxFileSize; - public Loader(String url, File cache) { + public Loader(String url, File cache, long maxFileSize) { this.url = url; this.cache = cache; + this.maxFileSize = maxFileSize; } public void run() { if (this.cache.exists()) return; try { // load from the net - Response response = load(new DigestURI(this.url), false, true, CrawlProfile.CACHE_STRATEGY_NOCACHE); + Response response = load(new DigestURI(this.url), false, true, CrawlProfile.CacheStrategy.NOCACHE, this.maxFileSize); byte[] b = response.getContent(); FileUtils.copy(b, this.cache); } catch (MalformedURLException e) {} catch (IOException e) {}
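Taken together, the API change threaded through this patch is: every LoaderDispatcher.load and HTTPLoader.load variant now takes an explicit maxFileSize, and the cache policy is the CrawlProfile.CacheStrategy enum instead of the former int constants. A minimal sketch of the two caller patterns visible above (assembled from the diff for illustration, not an excerpt of any single file):

    // crawler download (see CrawlQueues.java above): keep the configured limit,
    // defaulting to HTTPLoader.DEFAULT_MAXFILESIZE (10 MB)
    final long maxFileSize = sb.getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
    Response response = sb.loader.load(request, true, maxFileSize);

    // non-crawler download (dictionaries, OAI lists, snippets, RSS/ViewFile):
    // pass Long.MAX_VALUE, in which case ClientGetMethod skips its Content-Length check
    Response entry = sb.loader.load(url, true, false, Long.MAX_VALUE);

    // cache strategies are now an enum; profiles stored with the old integer
    // codes remain readable through CacheStrategy.decode(...)
    CrawlProfile.CacheStrategy policy = CrawlProfile.CacheStrategy.decode(1);  // 1 == IFFRESH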