diff --git a/htroot/Bookmarks.java b/htroot/Bookmarks.java
index 6216f5afa..e3a4a16f2 100644
--- a/htroot/Bookmarks.java
+++ b/htroot/Bookmarks.java
@@ -189,7 +189,7 @@ public class Bookmarks {
Document document = null;
if (urlentry != null) {
final URIMetadataRow.Components metadata = urlentry.metadata();
- document = LoaderDispatcher.retrieveDocument(metadata.url(), true, 5000, true, false);
+ document = LoaderDispatcher.retrieveDocument(metadata.url(), true, 5000, true, false, Long.MAX_VALUE);
prop.put("mode_edit", "0"); // create mode
prop.put("mode_url", metadata.url().toNormalform(false, true));
prop.putHTML("mode_title", metadata.dc_title());
diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java
index 8dcd1b5a9..1355e701c 100644
--- a/htroot/Crawler_p.java
+++ b/htroot/Crawler_p.java
@@ -194,11 +194,11 @@ public class Crawler_p {
env.setConfig("storeHTCache", (storeHTCache) ? "true" : "false");
final String cachePolicyString = post.get("cachePolicy", "iffresh");
- int cachePolicy = CrawlProfile.CACHE_STRATEGY_IFFRESH;
- if (cachePolicyString.equals("nocache")) cachePolicy = CrawlProfile.CACHE_STRATEGY_NOCACHE;
- if (cachePolicyString.equals("iffresh")) cachePolicy = CrawlProfile.CACHE_STRATEGY_IFFRESH;
- if (cachePolicyString.equals("ifexist")) cachePolicy = CrawlProfile.CACHE_STRATEGY_IFEXIST;
- if (cachePolicyString.equals("cacheonly")) cachePolicy = CrawlProfile.CACHE_STRATEGY_CACHEONLY;
+ CrawlProfile.CacheStrategy cachePolicy = CrawlProfile.CacheStrategy.IFFRESH;
+ if (cachePolicyString.equals("nocache")) cachePolicy = CrawlProfile.CacheStrategy.NOCACHE;
+ if (cachePolicyString.equals("iffresh")) cachePolicy = CrawlProfile.CacheStrategy.IFFRESH;
+ if (cachePolicyString.equals("ifexist")) cachePolicy = CrawlProfile.CacheStrategy.IFEXIST;
+ if (cachePolicyString.equals("cacheonly")) cachePolicy = CrawlProfile.CacheStrategy.CACHEONLY;
final boolean xsstopw = post.get("xsstopw", "off").equals("on");
env.setConfig("xsstopw", (xsstopw) ? "true" : "false");
diff --git a/htroot/DictionaryLoader_p.html b/htroot/DictionaryLoader_p.html
index 8437a5b7c..6cc04f465 100644
--- a/htroot/DictionaryLoader_p.html
+++ b/htroot/DictionaryLoader_p.html
@@ -20,37 +20,38 @@
The geolocalization file enables YaCy to present locations from OpenStreetMap that match given search words.
With this file it is possible to find locations by city name, zip code, vehicle license-plate code, or telephone area code.
+
- - #[geo0URL]#
+ - #[geo1URL]#
- - #[geo0Storage]#
+ - #[geo1Storage]#
- - #(geo0Status)#
not loaded
::loaded
::de-activated#(/geo0Status)#
+ - #(geo1Status)#
not loaded
::loaded
::de-activated#(/geo1Status)#
- - #(geo0Status)#
- ::
-
- ::
-
-
- #(/geo0Status)#
- #(geo0ActionLoaded)#::
+ - #(geo1Status)#
+ ::
+
+ ::
+
+
+ #(/geo1Status)#
+ #(geo1ActionLoaded)#::
loaded and activated dictionary file
::
loading of dictionary file failed: #[error]#
- #(/geo0ActionLoaded)#
- #(geo0ActionRemoved)#::
+ #(/geo1ActionLoaded)#
+ #(geo1ActionRemoved)#::
de-activated and removed dictionary file
::
cannot remove dictionary file: #[error]#
- #(/geo0ActionRemoved)#
- #(geo0ActionDeactivated)#::
+ #(/geo1ActionRemoved)#
+ #(geo1ActionDeactivated)#::
de-activated dictionary file
::
cannot de-activate dictionary file: #[error]#
- #(/geo0ActionDeactivated)#
- #(geo0ActionActivated)#::
+ #(/geo1ActionDeactivated)#
+ #(geo1ActionActivated)#::
activated dictionary file
::
cannot activate dictionary file: #[error]#
- #(/geo0ActionActivated)#
+ #(/geo1ActionActivated)#
diff --git a/htroot/DictionaryLoader_p.java b/htroot/DictionaryLoader_p.java
index 2e801e153..c104c2b26 100644
--- a/htroot/DictionaryLoader_p.java
+++ b/htroot/DictionaryLoader_p.java
@@ -58,45 +58,47 @@ public class DictionaryLoader_p {
if (post == null) return prop;
- if (post.containsKey("geo0Load")) {
+ // GEO1
+ if (post.containsKey("geo1Load")) {
// load from the net
try {
- Response response = sb.loader.load(new DigestURI(LibraryProvider.Dictionary.GEO0.url), false, true, CrawlProfile.CACHE_STRATEGY_NOCACHE);
+ Response response = sb.loader.load(new DigestURI(LibraryProvider.Dictionary.GEO1.url), false, true, CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
byte[] b = response.getContent();
- FileUtils.copy(b, LibraryProvider.Dictionary.GEO0.file());
- LibraryProvider.geoDB = new OpenGeoDB(LibraryProvider.Dictionary.GEO0.file());
- prop.put("geo0Status", LibraryProvider.Dictionary.GEO0.file().exists() ? 1 : 0);
- prop.put("geo0ActionLoaded", 1);
+ FileUtils.copy(b, LibraryProvider.Dictionary.GEO1.file());
+ LibraryProvider.geoDB = new OpenGeoDB(LibraryProvider.Dictionary.GEO1.file(), false);
+ prop.put("geo1Status", LibraryProvider.Dictionary.GEO1.file().exists() ? 1 : 0);
+ prop.put("geo1ActionLoaded", 1);
} catch (MalformedURLException e) {
Log.logException(e);
- prop.put("geo0ActionLoaded", 2);
- prop.put("geo0ActionLoaded_error", e.getMessage());
+ prop.put("geo1ActionLoaded", 2);
+ prop.put("geo1ActionLoaded_error", e.getMessage());
} catch (IOException e) {
Log.logException(e);
- prop.put("geo0ActionLoaded", 2);
- prop.put("geo0ActionLoaded_error", e.getMessage());
+ prop.put("geo1ActionLoaded", 2);
+ prop.put("geo1ActionLoaded_error", e.getMessage());
}
}
- if (post.containsKey("geo0Remove")) {
- FileUtils.deletedelete(LibraryProvider.Dictionary.GEO0.file());
- FileUtils.deletedelete(LibraryProvider.Dictionary.GEO0.fileDisabled());
- LibraryProvider.geoDB = new OpenGeoDB(null);
- prop.put("geo0ActionRemoved", 1);
+ if (post.containsKey("geo1Remove")) {
+ FileUtils.deletedelete(LibraryProvider.Dictionary.GEO1.file());
+ FileUtils.deletedelete(LibraryProvider.Dictionary.GEO1.fileDisabled());
+ LibraryProvider.geoDB = new OpenGeoDB(null, true);
+ prop.put("geo1ActionRemoved", 1);
}
- if (post.containsKey("geo0Deactivate")) {
- LibraryProvider.Dictionary.GEO0.file().renameTo(LibraryProvider.Dictionary.GEO0.fileDisabled());
- LibraryProvider.geoDB = new OpenGeoDB(null);
- prop.put("geo0ActionDeactivated", 1);
+ if (post.containsKey("geo1Deactivate")) {
+ LibraryProvider.Dictionary.GEO1.file().renameTo(LibraryProvider.Dictionary.GEO1.fileDisabled());
+ LibraryProvider.geoDB = new OpenGeoDB(null, true);
+ prop.put("geo1ActionDeactivated", 1);
}
- if (post.containsKey("geo0Activate")) {
- LibraryProvider.Dictionary.GEO0.fileDisabled().renameTo(LibraryProvider.Dictionary.GEO0.file());
- LibraryProvider.geoDB = new OpenGeoDB(LibraryProvider.Dictionary.GEO0.file());
- prop.put("geo0ActionActivated", 1);
+ if (post.containsKey("geo1Activate")) {
+ LibraryProvider.Dictionary.GEO1.fileDisabled().renameTo(LibraryProvider.Dictionary.GEO1.file());
+ LibraryProvider.geoDB = new OpenGeoDB(LibraryProvider.Dictionary.GEO1.file(), false);
+ prop.put("geo1ActionActivated", 1);
}
+
// check status again
for (LibraryProvider.Dictionary dictionary: LibraryProvider.Dictionary.values()) {
prop.put(dictionary.nickname + "Status", dictionary.file().exists() ? 1 : dictionary.fileDisabled().exists() ? 2 : 0);
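
The status loop above writes one of three codes into the servlet template for each dictionary. A self-contained sketch of that mapping (the file names in main are arbitrary examples):

    import java.io.File;

    // Illustrative only: the three-way status used by the #(geoNStatus)# template blocks.
    // 1 = loaded (active file exists), 2 = de-activated (only the *.disabled file exists), 0 = not loaded.
    public class DictionaryStatus {
        static int status(final File active, final File disabled) {
            if (active.exists()) return 1;
            if (disabled.exists()) return 2;
            return 0;
        }

        public static void main(String[] args) {
            final File f = new File("opengeodb-02621_2010-03-16.sql.gz");
            System.out.println(status(f, new File(f.getPath() + ".disabled")));
        }
    }
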
diff --git a/htroot/QuickCrawlLink_p.java b/htroot/QuickCrawlLink_p.java
index 2459ba783..bf8de8c21 100644
--- a/htroot/QuickCrawlLink_p.java
+++ b/htroot/QuickCrawlLink_p.java
@@ -163,7 +163,7 @@ public class QuickCrawlLink_p {
xsstopw,
xdstopw,
xpstopw,
- CrawlProfile.CACHE_STRATEGY_IFFRESH
+ CrawlProfile.CacheStrategy.IFFRESH
);
} catch (final Exception e) {
// mist
diff --git a/htroot/RSSLoader_p.java b/htroot/RSSLoader_p.java
index f6c697817..159300f69 100644
--- a/htroot/RSSLoader_p.java
+++ b/htroot/RSSLoader_p.java
@@ -63,7 +63,7 @@ public class RSSLoader_p {
// if the resource body was not cached we try to load it from web
Response entry = null;
try {
- entry = sb.loader.load(url, true, false);
+ entry = sb.loader.load(url, true, false, Long.MAX_VALUE);
} catch (final Exception e) {
return prop;
}
diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java
index 6be2bb7f3..285a13dfc 100644
--- a/htroot/ViewFile.java
+++ b/htroot/ViewFile.java
@@ -182,7 +182,7 @@ public class ViewFile {
// load resource from net
Response response = null;
try {
- response = sb.loader.load(url, true, false);
+ response = sb.loader.load(url, true, false, Long.MAX_VALUE);
} catch (IOException e) {
Log.logException(e);
}
@@ -198,7 +198,7 @@ public class ViewFile {
if (resource == null) {
Response entry = null;
try {
- entry = sb.loader.load(url, true, false);
+ entry = sb.loader.load(url, true, false, Long.MAX_VALUE);
} catch (final Exception e) {
prop.put("error", "4");
prop.putHTML("error_errorText", e.getMessage());
@@ -238,7 +238,7 @@ public class ViewFile {
}
try {
- Response response = sb.loader.load(url, true, false);
+ Response response = sb.loader.load(url, true, false, Long.MAX_VALUE);
responseHeader = response.getResponseHeader();
resource = response.getContent();
} catch (IOException e) {
diff --git a/htroot/api/util/getpageinfo_p.java b/htroot/api/util/getpageinfo_p.java
index eb9f076b7..580c52cd3 100755
--- a/htroot/api/util/getpageinfo_p.java
+++ b/htroot/api/util/getpageinfo_p.java
@@ -54,11 +54,11 @@ public class getpageinfo_p {
}
ContentScraper scraper = null;
if (u != null) try {
- scraper = LoaderDispatcher.parseResource(sb.loader, u, CrawlProfile.CACHE_STRATEGY_IFFRESH);
+ scraper = LoaderDispatcher.parseResource(sb.loader, u, CrawlProfile.CacheStrategy.IFFRESH);
} catch (final IOException e) {
// try again, try harder
try {
- scraper = LoaderDispatcher.parseResource(sb.loader, u, CrawlProfile.CACHE_STRATEGY_IFEXIST);
+ scraper = LoaderDispatcher.parseResource(sb.loader, u, CrawlProfile.CacheStrategy.IFEXIST);
} catch (final IOException ee) {
// now thats a fail, do nothing
}
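
The pattern above is a two-step fallback: parse with the IFFRESH strategy first and, if that fails with an IOException, retry with the more permissive IFEXIST strategy. A stand-alone sketch of the same control flow, with a hypothetical Fetcher interface standing in for LoaderDispatcher.parseResource:

    import java.io.IOException;

    // Illustrative only: prefer fresh (or proxy-fresh cached) content, fall back to any cached copy.
    public class FallbackFetch {
        interface Fetcher { String fetch(String strategy) throws IOException; }

        static String fetchWithFallback(final Fetcher fetcher) {
            try {
                return fetcher.fetch("IFFRESH");     // first attempt: fresh or proxy-fresh content
            } catch (IOException e) {
                try {
                    return fetcher.fetch("IFEXIST"); // try again, try harder: accept any cached copy
                } catch (IOException ee) {
                    return null;                     // both attempts failed, give up silently
                }
            }
        }

        public static void main(String[] args) {
            final Fetcher onlyCached = new Fetcher() {
                public String fetch(final String strategy) throws IOException {
                    if ("IFEXIST".equals(strategy)) return "cached copy";
                    throw new IOException("nothing fresh available");
                }
            };
            System.out.println(fetchWithFallback(onlyCached)); // prints: cached copy
        }
    }
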
diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java
index ff4de462e..32db77d73 100644
--- a/htroot/yacysearch.java
+++ b/htroot/yacysearch.java
@@ -430,7 +430,7 @@ public class yacysearch {
if (urlentry != null) {
final URIMetadataRow.Components metadata = urlentry.metadata();
Document document;
- document = LoaderDispatcher.retrieveDocument(metadata.url(), true, 5000, true, false);
+ document = LoaderDispatcher.retrieveDocument(metadata.url(), true, 5000, true, false, Long.MAX_VALUE);
if (document != null) {
// create a news message
final HashMap map = new HashMap();
diff --git a/source/de/anomic/crawler/Balancer.java b/source/de/anomic/crawler/Balancer.java
index a373c6375..6bf011408 100644
--- a/source/de/anomic/crawler/Balancer.java
+++ b/source/de/anomic/crawler/Balancer.java
@@ -385,8 +385,8 @@ public class Balancer {
}
// depending on the caching policy we need sleep time to avoid DoS-like situations
sleeptime = (
- profileEntry.cacheStrategy() == CrawlProfile.CACHE_STRATEGY_CACHEONLY ||
- (profileEntry.cacheStrategy() == CrawlProfile.CACHE_STRATEGY_IFEXIST && Cache.has(crawlEntry.url()))
+ profileEntry.cacheStrategy() == CrawlProfile.CacheStrategy.CACHEONLY ||
+ (profileEntry.cacheStrategy() == CrawlProfile.CacheStrategy.IFEXIST && Cache.has(crawlEntry.url()))
) ? 0 : Latency.waitingRemaining(crawlEntry.url(), minimumLocalDelta, minimumGlobalDelta); // this uses the robots.txt database and may cause a loading of robots.txt from the server
assert Base64Order.enhancedCoder.equal(nexthash, rowEntry.getPrimaryKeyBytes()) : "result = " + new String(nexthash) + ", rowEntry.getPrimaryKeyBytes() = " + new String(rowEntry.getPrimaryKeyBytes());
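
The sleep-time decision above skips the politeness delay whenever the profile's cache strategy guarantees that the content will be served from the local cache. A compact sketch of just that decision, with illustrative names rather than YaCy's Balancer/Latency API:

    // Illustrative only: no crawl delay is needed if the content comes from the cache anyway.
    public class CrawlDelayDecision {
        enum CacheStrategy { NOCACHE, IFFRESH, IFEXIST, CACHEONLY }

        static long sleeptime(final CacheStrategy strategy, final boolean urlInCache, final long waitingRemaining) {
            final boolean servedFromCache =
                    strategy == CacheStrategy.CACHEONLY || (strategy == CacheStrategy.IFEXIST && urlInCache);
            return servedFromCache ? 0 : waitingRemaining; // waitingRemaining = host-specific politeness delay
        }
    }
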
diff --git a/source/de/anomic/crawler/CrawlProfile.java b/source/de/anomic/crawler/CrawlProfile.java
index 954acb2df..2a581dddf 100644
--- a/source/de/anomic/crawler/CrawlProfile.java
+++ b/source/de/anomic/crawler/CrawlProfile.java
@@ -170,7 +170,7 @@ public class CrawlProfile {
final boolean storeHTCache, final boolean storeTXCache,
final boolean remoteIndexing,
final boolean xsstopw, final boolean xdstopw, final boolean xpstopw,
- final int cacheStrategy) {
+ final CacheStrategy cacheStrategy) {
final entry ne = new entry(
name, startURL,
@@ -246,10 +246,23 @@ public class CrawlProfile {
}
- public final static int CACHE_STRATEGY_NOCACHE = 0; // never use the cache, all content from fresh internet source
- public final static int CACHE_STRATEGY_IFFRESH = 1; // use the cache if the cache exists and is fresh using the proxy-fresh rules
- public final static int CACHE_STRATEGY_IFEXIST = 2; // use the cache if the cache exist. Do no check freshness. Otherwise use online source.
- public final static int CACHE_STRATEGY_CACHEONLY = 3; // never go online, use all content from cache. If no cache exist, treat content as unavailable
+ public static enum CacheStrategy {
+ NOCACHE(0), // never use the cache, take all content from a fresh internet source
+ IFFRESH(1), // use the cache if a cache entry exists and is fresh according to the proxy-fresh rules
+ IFEXIST(2), // use the cache if a cache entry exists; do not check freshness, otherwise use the online source
+ CACHEONLY(3); // never go online, use all content from the cache; if no cache entry exists, treat the content as unavailable
+ public int code;
+ private CacheStrategy(int code) {
+ this.code = code;
+ }
+ public String toString() {
+ return Integer.toString(this.code);
+ }
+ public static CacheStrategy decode(int code) {
+ for (CacheStrategy strategy: CacheStrategy.values()) if (strategy.code == code) return strategy;
+ return NOCACHE;
+ }
+ }
public static class entry {
// this is a simple record structure that hold all properties of a single crawl start
@@ -290,7 +303,7 @@ public class CrawlProfile {
final boolean storeHTCache, final boolean storeTXCache,
final boolean remoteIndexing,
final boolean xsstopw, final boolean xdstopw, final boolean xpstopw,
- final int cacheStrategy) {
+ final CacheStrategy cacheStrategy) {
if (name == null || name.length() == 0) throw new NullPointerException("name must not be null");
final String handle = (startURL == null) ? Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(Long.toString(System.currentTimeMillis()))).substring(0, Word.commonHashLength) : new String(startURL.hash());
mem = new HashMap(40);
@@ -312,7 +325,7 @@ public class CrawlProfile {
mem.put(XSSTOPW, Boolean.toString(xsstopw)); // exclude static stop-words
mem.put(XDSTOPW, Boolean.toString(xdstopw)); // exclude dynamic stop-word
mem.put(XPSTOPW, Boolean.toString(xpstopw)); // exclude parent stop-words
- mem.put(CACHE_STRAGEGY, Integer.toString(cacheStrategy));
+ mem.put(CACHE_STRAGEGY, cacheStrategy.toString());
doms = new ConcurrentHashMap();
}
@@ -376,14 +389,14 @@ public class CrawlProfile {
return 0;
}
}
- public int cacheStrategy() {
+ public CacheStrategy cacheStrategy() {
final String r = mem.get(CACHE_STRAGEGY);
- if (r == null) return CACHE_STRATEGY_IFFRESH;
+ if (r == null) return CacheStrategy.IFFRESH;
try {
- return Integer.parseInt(r);
+ return CacheStrategy.decode(Integer.parseInt(r));
} catch (final NumberFormatException e) {
Log.logException(e);
- return CACHE_STRATEGY_IFFRESH;
+ return CacheStrategy.IFFRESH;
}
}
public long recrawlIfOlder() {
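
The enum introduced above keeps the old integer codes so that profiles persisted with the previous constants stay readable; toString() emits the code and decode() maps a stored integer back to the enum, falling back to NOCACHE for unknown values, while cacheStrategy() above still defaults to IFFRESH when the key is missing or unparsable. A self-contained restatement, with the code field made final and @Override added for clarity:

    // Restatement of the CacheStrategy enum above; the numeric code is the persisted form.
    public enum CacheStrategy {
        NOCACHE(0), IFFRESH(1), IFEXIST(2), CACHEONLY(3);

        public final int code;

        CacheStrategy(final int code) { this.code = code; }

        @Override
        public String toString() { return Integer.toString(this.code); } // what gets written to the profile map

        public static CacheStrategy decode(final int code) {
            for (final CacheStrategy s : values()) if (s.code == code) return s;
            return NOCACHE; // unknown legacy codes degrade to the strictest strategy
        }
    }

Reading a persisted value back is then CacheStrategy.decode(Integer.parseInt(r)), as in cacheStrategy() above.
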
diff --git a/source/de/anomic/crawler/CrawlQueues.java b/source/de/anomic/crawler/CrawlQueues.java
index 7a78283e5..386b921d9 100644
--- a/source/de/anomic/crawler/CrawlQueues.java
+++ b/source/de/anomic/crawler/CrawlQueues.java
@@ -45,6 +45,7 @@ import net.yacy.kelondro.util.DateFormatter;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.workflow.WorkflowJob;
+import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.crawler.retrieval.Request;
import de.anomic.crawler.retrieval.Response;
import de.anomic.http.client.Client;
@@ -561,7 +562,8 @@ public class CrawlQueues {
// returns null if everything went fine, a fail reason string if a problem occurred
try {
request.setStatus("loading", WorkflowJob.STATUS_RUNNING);
- Response response = sb.loader.load(request, true);
+ final long maxFileSize = sb.getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
+ Response response = sb.loader.load(request, true, maxFileSize);
if (response == null) {
request.setStatus("error", WorkflowJob.STATUS_FINISHED);
if (log.isFine()) log.logFine("problem loading " + request.url().toString() + ": no content (possibly caused by cache policy)");
diff --git a/source/de/anomic/crawler/CrawlSwitchboard.java b/source/de/anomic/crawler/CrawlSwitchboard.java
index 01b85460f..7db84194f 100644
--- a/source/de/anomic/crawler/CrawlSwitchboard.java
+++ b/source/de/anomic/crawler/CrawlSwitchboard.java
@@ -168,37 +168,37 @@ public final class CrawlSwitchboard {
true /*getConfigBool(PROXY_INDEXING_LOCAL_MEDIA, true)*/,
true, true,
false /*getConfigBool(PROXY_INDEXING_REMOTE, false)*/, true, true, true,
- CrawlProfile.CACHE_STRATEGY_IFFRESH);
+ CrawlProfile.CacheStrategy.IFFRESH);
}
if (this.defaultRemoteProfile == null) {
// generate new default entry for remote crawling
defaultRemoteProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_BAD_URL, 0,
- -1, -1, -1, true, true, true, false, true, false, true, true, false, CrawlProfile.CACHE_STRATEGY_IFFRESH);
+ -1, -1, -1, true, true, true, false, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH);
}
if (this.defaultTextSnippetLocalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
defaultTextSnippetLocalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_BAD_URL, 0,
- this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, -1, true, false, false, true, true, false, true, true, false, CrawlProfile.CACHE_STRATEGY_IFFRESH);
+ this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, -1, true, false, false, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH);
}
if (this.defaultTextSnippetGlobalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
defaultTextSnippetGlobalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_BAD_URL, 0,
- this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, -1, true, true, true, true, true, false, true, true, false, CrawlProfile.CACHE_STRATEGY_CACHEONLY);
+ this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, -1, true, true, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.CACHEONLY);
}
if (this.defaultMediaSnippetLocalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
defaultMediaSnippetLocalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_BAD_URL, 0,
- this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, false, false, false, false, true, true, false, CrawlProfile.CACHE_STRATEGY_IFEXIST);
+ this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, false, false, false, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
}
if (this.defaultMediaSnippetGlobalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
defaultMediaSnippetGlobalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_BAD_URL, 0,
- this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, true, true, true, false, true, true, false, CrawlProfile.CACHE_STRATEGY_IFEXIST);
+ this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
}
if (this.defaultSurrogateProfile == null) {
// generate new default entry for surrogate parsing
defaultSurrogateProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_BAD_URL, 0,
- this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, -1, true, true, false, false, false, false, true, true, false, CrawlProfile.CACHE_STRATEGY_NOCACHE);
+ this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, -1, true, true, false, false, false, false, true, true, false, CrawlProfile.CacheStrategy.NOCACHE);
}
}
diff --git a/source/de/anomic/crawler/retrieval/HTTPLoader.java b/source/de/anomic/crawler/retrieval/HTTPLoader.java
index ac2685b31..e0a8e23a0 100644
--- a/source/de/anomic/crawler/retrieval/HTTPLoader.java
+++ b/source/de/anomic/crawler/retrieval/HTTPLoader.java
@@ -45,7 +45,7 @@ public final class HTTPLoader {
private static final String DEFAULT_ENCODING = "gzip,deflate";
private static final String DEFAULT_LANGUAGE = "en-us,en;q=0.5";
private static final String DEFAULT_CHARSET = "ISO-8859-1,utf-8;q=0.7,*;q=0.7";
- private static final long DEFAULT_MAXFILESIZE = 1024 * 1024 * 10;
+ public static final long DEFAULT_MAXFILESIZE = 1024 * 1024 * 10;
public static final int DEFAULT_CRAWLING_RETRY_COUNT = 5;
public static final String crawlerUserAgent = "yacybot (" + Client.getSystemOST() +") http://yacy.net/bot.html";
public static final String yacyUserAgent = "yacy (" + Client.getSystemOST() +") yacy.net";
@@ -74,14 +74,14 @@ public final class HTTPLoader {
this.socketTimeout = (int) sb.getConfigLong("crawler.clientTimeout", 10000);
}
- public Response load(final Request entry, final boolean acceptOnlyParseable) throws IOException {
+ public Response load(final Request entry, final boolean acceptOnlyParseable, long maxFileSize) throws IOException {
long start = System.currentTimeMillis();
- Response doc = load(entry, acceptOnlyParseable, DEFAULT_CRAWLING_RETRY_COUNT);
+ Response doc = load(entry, acceptOnlyParseable, DEFAULT_CRAWLING_RETRY_COUNT, maxFileSize);
Latency.update(new String(entry.url().hash()).substring(6), entry.url().getHost(), System.currentTimeMillis() - start);
return doc;
}
- private Response load(final Request request, boolean acceptOnlyParseable, final int retryCount) throws IOException {
+ private Response load(final Request request, boolean acceptOnlyParseable, final int retryCount, final long maxFileSize) throws IOException {
if (retryCount < 0) {
sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "redirection counter exceeded");
@@ -113,8 +113,7 @@ public final class HTTPLoader {
// take a file from the net
Response response = null;
- final long maxFileSize = sb.getConfigLong("crawler.http.maxFileSize", DEFAULT_MAXFILESIZE);
-
+
// create a request header
final RequestHeader requestHeader = new RequestHeader();
requestHeader.put(HeaderFramework.USER_AGENT, crawlerUserAgent);
@@ -202,7 +201,7 @@ public final class HTTPLoader {
// retry crawling with new url
request.redirectURL(redirectionUrl);
- return load(request, acceptOnlyParseable, retryCount - 1);
+ return load(request, acceptOnlyParseable, retryCount - 1, maxFileSize);
}
} else {
// if the response has not the right response type then reject file
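
HTTPLoader.load() now receives the size limit from its caller instead of reading crawler.http.maxFileSize itself. Independent of YaCy's HTTP client, enforcing such a limit while streaming a response body can be sketched like this (plain java.net, illustrative only):

    import java.io.ByteArrayOutputStream;
    import java.io.IOException;
    import java.io.InputStream;
    import java.net.URL;

    // Illustrative only: abort the download as soon as the accumulated body exceeds maxFileSize.
    public class BoundedDownload {
        static byte[] download(final String url, final long maxFileSize) throws IOException {
            final InputStream in = new URL(url).openStream();
            try {
                final ByteArrayOutputStream out = new ByteArrayOutputStream();
                final byte[] buffer = new byte[8192];
                int n;
                while ((n = in.read(buffer)) > 0) {
                    if (out.size() + n > maxFileSize) {
                        throw new IOException("response body larger than maxfilesize " + maxFileSize);
                    }
                    out.write(buffer, 0, n);
                }
                return out.toByteArray();
            } finally {
                in.close();
            }
        }
    }
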
diff --git a/source/de/anomic/data/LibraryProvider.java b/source/de/anomic/data/LibraryProvider.java
index b45a534c3..e2de6183f 100644
--- a/source/de/anomic/data/LibraryProvider.java
+++ b/source/de/anomic/data/LibraryProvider.java
@@ -50,14 +50,17 @@ public class LibraryProvider {
public static final String disabledExtension = ".disabled";
public static DidYouMeanLibrary dymLib = new DidYouMeanLibrary(null);
- public static OpenGeoDB geoDB = new OpenGeoDB(null);
+ public static OpenGeoDB geoDB = new OpenGeoDB(null, true);
private static File dictSource = null;
private static File dictRoot = null;
public static enum Dictionary {
GEO0("geo0",
"http://downloads.sourceforge.net/project/opengeodb/Data/0.2.5a/opengeodb-0.2.5a-UTF8-sql.gz",
- "opengeodb-0.2.5a-UTF8-sql.gz");
+ "opengeodb-0.2.5a-UTF8-sql.gz"),
+ GEO1("geo1",
+ "http://fa-technik.adfc.de/code/opengeodb/dump/opengeodb-02621_2010-03-16.sql.gz",
+ "opengeodb-02621_2010-03-16.sql.gz");
public String nickname, url, filename;
private Dictionary(String nickname, String url, String filename) {
@@ -95,20 +98,16 @@ public class LibraryProvider {
}
public static void integrateOpenGeoDB() {
- File ogdb = new File(dictSource, "opengeodb-0.2.5a-UTF8-sql.gz");
- if (ogdb.exists()) {
- geoDB = new OpenGeoDB(ogdb);
- return;
+ File geo1 = Dictionary.GEO1.file();
+ File geo0 = Dictionary.GEO0.file();
+ if (geo1.exists()) {
+ if (geo0.exists()) geo0.renameTo(Dictionary.GEO0.fileDisabled());
+ geoDB = new OpenGeoDB(geo1, false);
+ return;
}
- ogdb = new File(dictSource, "opengeodb-02513_2007-10-02.sql.gz");
- if (ogdb.exists()) {
- geoDB = new OpenGeoDB(ogdb);
- return;
- }
- ogdb = new File(dictSource, "opengeodb-02513_2007-10-02.sql");
- if (ogdb.exists()) {
- geoDB = new OpenGeoDB(ogdb);
- return;
+ if (geo0.exists()) {
+ geoDB = new OpenGeoDB(geo0, true);
+ return;
}
}
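
integrateOpenGeoDB() now prefers the newer GEO1 dump and de-activates GEO0 when both are present; the boolean passed to OpenGeoDB records that the two dumps store their coordinate columns in opposite order (GEO0: lon,lat; GEO1: lat,lon). A sketch of just the selection logic, with illustrative types rather than YaCy's LibraryProvider/OpenGeoDB API:

    import java.io.File;

    // Illustrative only: pick the newest available dictionary and remember its column order.
    public class GeoDictionarySelection {

        static final class Choice {
            final File file; final boolean lonlat;
            Choice(final File file, final boolean lonlat) { this.file = file; this.lonlat = lonlat; }
        }

        static Choice select(final File geo1, final File geo0, final File geo0Disabled) {
            if (geo1.exists()) {
                if (geo0.exists()) geo0.renameTo(geo0Disabled); // de-activate the superseded dump
                return new Choice(geo1, false);                 // GEO1 stores lat,lon
            }
            if (geo0.exists()) return new Choice(geo0, true);   // GEO0 stores lon,lat
            return null; // no dictionary available; caller keeps the empty geoDB
        }
    }
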
diff --git a/source/de/anomic/data/SitemapParser.java b/source/de/anomic/data/SitemapParser.java
index 52bea8324..eb331a688 100644
--- a/source/de/anomic/data/SitemapParser.java
+++ b/source/de/anomic/data/SitemapParser.java
@@ -334,6 +334,6 @@ public class SitemapParser extends DefaultHandler {
false,
// exclude stop-words
true, true, true,
- CrawlProfile.CACHE_STRATEGY_IFFRESH);
+ CrawlProfile.CacheStrategy.IFFRESH);
}
}
diff --git a/source/de/anomic/data/bookmarksDB.java b/source/de/anomic/data/bookmarksDB.java
index 330dd3b1d..94f75cef6 100644
--- a/source/de/anomic/data/bookmarksDB.java
+++ b/source/de/anomic/data/bookmarksDB.java
@@ -161,7 +161,7 @@ public class bookmarksDB {
Integer.parseInt(parser[5]), Integer.parseInt(parser[6]), Boolean.parseBoolean(parser[7]),
Boolean.parseBoolean(parser[8]), Boolean.parseBoolean(parser[9]),
Boolean.parseBoolean(parser[10]), Boolean.parseBoolean(parser[11]),
- Boolean.parseBoolean(parser[12]), CrawlProfile.CACHE_STRATEGY_IFFRESH
+ Boolean.parseBoolean(parser[12]), CrawlProfile.CacheStrategy.IFFRESH
);
}
if (parser.length == 14) {
@@ -169,7 +169,7 @@ public class bookmarksDB {
Integer.parseInt(parser[5]), Integer.parseInt(parser[6]), Boolean.parseBoolean(parser[7]),
Boolean.parseBoolean(parser[8]), Boolean.parseBoolean(parser[9]),
Boolean.parseBoolean(parser[10]), Boolean.parseBoolean(parser[11]),
- Boolean.parseBoolean(parser[12]), Integer.parseInt(parser[13])
+ Boolean.parseBoolean(parser[12]), CrawlProfile.CacheStrategy.decode(Integer.parseInt(parser[13]))
);
}
}
@@ -206,7 +206,7 @@ public class bookmarksDB {
public void folderReCrawl(long schedule, String folder, String crawlingfilter, int newcrawlingdepth, long crawlingIfOlder,
int crawlingDomFilterDepth, int crawlingDomMaxPages, boolean crawlingQ, boolean indexText, boolean indexMedia,
- boolean crawlOrder, boolean xsstopw, boolean storeHTCache, int cacheStrategy) {
+ boolean crawlOrder, boolean xsstopw, boolean storeHTCache, CrawlProfile.CacheStrategy cacheStrategy) {
final Switchboard sb = Switchboard.getSwitchboard();
final Iterator bit = getBookmarksIterator(folder, true);
diff --git a/source/de/anomic/http/client/ClientGetMethod.java b/source/de/anomic/http/client/ClientGetMethod.java
index 8d4810026..f9cc3df8e 100644
--- a/source/de/anomic/http/client/ClientGetMethod.java
+++ b/source/de/anomic/http/client/ClientGetMethod.java
@@ -52,14 +52,16 @@ public class ClientGetMethod extends GetMethod {
protected void readResponseHeaders(HttpState state, HttpConnection conn) throws IOException, HttpException {
super.readResponseHeaders(state, conn);
- // already processing the header to be able to throw an exception
- Header contentlengthHeader = getResponseHeader("content-length");
- long contentlength = 0;
- if (contentlengthHeader != null) {
- try { contentlength = Long.parseLong(contentlengthHeader.getValue()); } catch (NumberFormatException e) { }
- }
- if (contentlength > maxfilesize) {
- throw new IOException("Content-Length " + contentlength + " larger than maxfilesize " + maxfilesize);
+ if (this.maxfilesize < Long.MAX_VALUE) {
+ // already processing the header to be able to throw an exception
+ Header contentlengthHeader = getResponseHeader("content-length");
+ long contentlength = 0;
+ if (contentlengthHeader != null) {
+ try { contentlength = Long.parseLong(contentlengthHeader.getValue()); } catch (NumberFormatException e) { }
+ }
+ if (contentlength > maxfilesize) {
+ throw new IOException("Content-Length " + contentlength + " larger than maxfilesize " + maxfilesize);
+ }
}
}
}
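
The guard above only inspects the Content-Length header when a real limit is set (maxfilesize below Long.MAX_VALUE), so unlimited loads are never rejected early. The same idea with plain HttpURLConnection instead of YaCy's commons-httpclient subclass, as an illustrative sketch:

    import java.io.IOException;
    import java.net.HttpURLConnection;
    import java.net.URL;

    // Illustrative only: reject oversized responses from the declared Content-Length before reading the body.
    public class ContentLengthGuard {
        static void checkContentLength(final HttpURLConnection conn, final long maxfilesize) throws IOException {
            if (maxfilesize == Long.MAX_VALUE) return;  // unlimited: skip the check entirely
            final String header = conn.getHeaderField("Content-Length");
            long contentlength = 0;
            if (header != null) {
                try { contentlength = Long.parseLong(header); } catch (NumberFormatException e) { /* ignore bad header */ }
            }
            if (contentlength > maxfilesize) {
                throw new IOException("Content-Length " + contentlength + " larger than maxfilesize " + maxfilesize);
            }
        }

        public static void main(String[] args) throws IOException {
            final HttpURLConnection conn = (HttpURLConnection) new URL("http://yacy.net/").openConnection();
            checkContentLength(conn, 1024L * 1024L * 10L); // 10 MB, the HTTPLoader default
        }
    }
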
diff --git a/source/de/anomic/search/MediaSnippet.java b/source/de/anomic/search/MediaSnippet.java
index 9387291d7..bdd719916 100644
--- a/source/de/anomic/search/MediaSnippet.java
+++ b/source/de/anomic/search/MediaSnippet.java
@@ -116,7 +116,7 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaSnippet>
}
- final Document document = LoaderDispatcher.retrieveDocument(url, fetchOnline, timeout, false, reindexing);
+ final Document document = LoaderDispatcher.retrieveDocument(url, fetchOnline, timeout, false, reindexing, Long.MAX_VALUE);
final ArrayList a = new ArrayList();
if (document != null) {
if ((mediatype == ContentDomain.ALL) || (mediatype == ContentDomain.AUDIO)) a.addAll(computeMediaSnippets(document, queryhashes, ContentDomain.AUDIO));
diff --git a/source/de/anomic/search/TextSnippet.java b/source/de/anomic/search/TextSnippet.java
index 38875fd51..1f3ab032e 100644
--- a/source/de/anomic/search/TextSnippet.java
+++ b/source/de/anomic/search/TextSnippet.java
@@ -359,7 +359,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnippet>
    private final HashMap<String, List<Integer>> predial2ids;
    private final HashMap<String, Integer> zip2id;
- public OpenGeoDB(final File file) {
+ public OpenGeoDB(final File file, boolean lonlat) {
this.locTypeHash2locType = new HashMap();
this.id2loc = new HashMap();
@@ -98,6 +98,7 @@ public class OpenGeoDB {
String[] v;
Integer id;
String h;
+ double lon, lat;
while ((line = reader.readLine()) != null) {
line = line.trim();
if (!line.startsWith("INSERT INTO ")) continue;
@@ -107,7 +108,14 @@ public class OpenGeoDB {
if (line.startsWith("geodb_coordinates ")) {
line = line.substring(18 + 7);v = line.split(",");
v = line.split(",");
- id2loc.put(Integer.parseInt(v[0]), new Location(Double.parseDouble(v[2]), Double.parseDouble(v[3])));
+ if (lonlat) {
+ lon = Double.parseDouble(v[2]);
+ lat = Double.parseDouble(v[3]);
+ } else {
+ lat = Double.parseDouble(v[2]);
+ lon = Double.parseDouble(v[3]);
+ }
+ id2loc.put(Integer.parseInt(v[0]), new Location(lon, lat));
}
if (line.startsWith("geodb_textdata ")) {
line = line.substring(15 + 7);
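
The new lonlat flag decides how columns 2 and 3 of a geodb_coordinates row are interpreted: the old GEO0 dump stores (lon, lat), the newer GEO1 dump (lat, lon). A small stand-alone sketch of that parsing (the sample row in main is made up):

    // Illustrative only: interpret the coordinate columns according to the dump format.
    public class CoordinateRow {
        static double[] parseLonLat(final String csvRow, final boolean lonlat) {
            final String[] v = csvRow.split(",");
            final double lon = Double.parseDouble(lonlat ? v[2] : v[3]);
            final double lat = Double.parseDouble(lonlat ? v[3] : v[2]);
            return new double[] { lon, lat };
        }

        public static void main(String[] args) {
            final String row = "12345,12345,50.7374,7.0982"; // only columns 2 and 3 matter here
            final double[] geo0 = parseLonLat(row, true);    // GEO0 order: lon first
            final double[] geo1 = parseLonLat(row, false);   // GEO1 order: lat first
            System.out.println(geo0[0] + " / " + geo1[0]);
        }
    }
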
diff --git a/source/net/yacy/document/importer/OAIListFriendsLoader.java b/source/net/yacy/document/importer/OAIListFriendsLoader.java
index 07a2d8872..99c559c48 100644
--- a/source/net/yacy/document/importer/OAIListFriendsLoader.java
+++ b/source/net/yacy/document/importer/OAIListFriendsLoader.java
@@ -58,7 +58,7 @@ public class OAIListFriendsLoader {
public static void init(LoaderDispatcher loader, Map moreFriends) {
listFriends.putAll(moreFriends);
if (loader != null) for (Map.Entry oaiFriend: listFriends.entrySet()) {
- loader.loadIfNotExistBackground(oaiFriend.getKey(), oaiFriend.getValue());
+ loader.loadIfNotExistBackground(oaiFriend.getKey(), oaiFriend.getValue(), Long.MAX_VALUE);
}
}
@@ -81,7 +81,7 @@ public class OAIListFriendsLoader {
Map m;
for (Map.Entry oaiFriend: listFriends.entrySet()) try {
if (!oaiFriend.getValue().exists()) {
- Response response = loader == null ? null : loader.load(new DigestURI(oaiFriend.getKey(), null), false, true, CrawlProfile.CACHE_STRATEGY_NOCACHE);
+ Response response = loader == null ? null : loader.load(new DigestURI(oaiFriend.getKey(), null), false, true, CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
if (response != null) FileUtils.copy(response.getContent(), oaiFriend.getValue());
}
diff --git a/source/net/yacy/document/importer/OAIPMHLoader.java b/source/net/yacy/document/importer/OAIPMHLoader.java
index c37841b57..4a8c5b700 100644
--- a/source/net/yacy/document/importer/OAIPMHLoader.java
+++ b/source/net/yacy/document/importer/OAIPMHLoader.java
@@ -48,7 +48,7 @@ public class OAIPMHLoader {
this.source = source;
// load the file from the net
- Response response = loader.load(source, false, true, CrawlProfile.CACHE_STRATEGY_NOCACHE);
+ Response response = loader.load(source, false, true, CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
byte[] b = response.getContent();
this.resumptionToken = new ResumptionToken(source, b);
//System.out.println("*** ResumptionToken = " + this.resumptionToken.toString());
diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java
index 59cfc67fb..39ac98ec7 100644
--- a/source/net/yacy/repository/LoaderDispatcher.java
+++ b/source/net/yacy/repository/LoaderDispatcher.java
@@ -99,8 +99,9 @@ public final class LoaderDispatcher {
public Response load(
final DigestURI url,
final boolean forText,
- final boolean global) throws IOException {
- return load(request(url, forText, global), forText);
+ final boolean global,
+ final long maxFileSize) throws IOException {
+ return load(request(url, forText, global), forText, maxFileSize);
}
/**
@@ -116,13 +117,14 @@ public final class LoaderDispatcher {
final DigestURI url,
final boolean forText,
final boolean global,
- int cacheStratgy) throws IOException {
- return load(request(url, forText, global), forText, cacheStratgy);
+ CrawlProfile.CacheStrategy cacheStratgy,
+ long maxFileSize) throws IOException {
+ return load(request(url, forText, global), forText, cacheStratgy, maxFileSize);
}
- public void load(final DigestURI url, int cacheStratgy, File targetFile) throws IOException {
+ public void load(final DigestURI url, CrawlProfile.CacheStrategy cacheStratgy, long maxFileSize, File targetFile) throws IOException {
- byte[] b = load(request(url, false, true), false, cacheStratgy).getContent();
+ byte[] b = load(request(url, false, true), false, cacheStratgy, maxFileSize).getContent();
if (b == null) throw new IOException("load == null");
File tmp = new File(targetFile.getAbsolutePath() + ".tmp");
@@ -164,14 +166,14 @@ public final class LoaderDispatcher {
0);
}
- public Response load(final Request request, final boolean acceptOnlyParseable) throws IOException {
+ public Response load(final Request request, final boolean acceptOnlyParseable, long maxFileSize) throws IOException {
CrawlProfile.entry crawlProfile = sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle());
- int cacheStrategy = CrawlProfile.CACHE_STRATEGY_IFEXIST;
+ CrawlProfile.CacheStrategy cacheStrategy = CrawlProfile.CacheStrategy.IFEXIST;
if (crawlProfile != null) cacheStrategy = crawlProfile.cacheStrategy();
- return load(request, acceptOnlyParseable, cacheStrategy);
+ return load(request, acceptOnlyParseable, cacheStrategy, maxFileSize);
}
- public Response load(final Request request, final boolean acceptOnlyParseable, int cacheStrategy) throws IOException {
+ public Response load(final Request request, final boolean acceptOnlyParseable, CrawlProfile.CacheStrategy cacheStrategy, long maxFileSize) throws IOException {
// get the protocol of the next URL
final String protocol = request.url().getProtocol();
final String host = request.url().getHost();
@@ -183,7 +185,7 @@ public final class LoaderDispatcher {
// check if we have the page in the cache
CrawlProfile.entry crawlProfile = sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle());
- if (crawlProfile != null && cacheStrategy != CrawlProfile.CACHE_STRATEGY_NOCACHE) {
+ if (crawlProfile != null && cacheStrategy != CrawlProfile.CacheStrategy.NOCACHE) {
// we have passed a first test if caching is allowed
// now see if there is a cache entry
@@ -214,14 +216,14 @@ public final class LoaderDispatcher {
content);
// check which caching strategy shall be used
- if (cacheStrategy == CrawlProfile.CACHE_STRATEGY_IFEXIST || cacheStrategy == CrawlProfile.CACHE_STRATEGY_CACHEONLY) {
+ if (cacheStrategy == CrawlProfile.CacheStrategy.IFEXIST || cacheStrategy == CrawlProfile.CacheStrategy.CACHEONLY) {
// well, just take the cache and don't care about freshness of the content
log.logInfo("cache hit/useall for: " + request.url().toNormalform(true, false));
return response;
}
// now the cacheStrategy must be CACHE_STRATEGY_IFFRESH, that means we should do a proxy freshness test
- assert cacheStrategy == CrawlProfile.CACHE_STRATEGY_IFFRESH : "cacheStrategy = " + cacheStrategy;
+ assert cacheStrategy == CrawlProfile.CacheStrategy.IFFRESH : "cacheStrategy = " + cacheStrategy;
if (response.isFreshForProxy()) {
log.logInfo("cache hit/fresh for: " + request.url().toNormalform(true, false));
return response;
@@ -232,7 +234,7 @@ public final class LoaderDispatcher {
}
// check case where we want results from the cache exclusively, and never from the internet (offline mode)
- if (cacheStrategy == CrawlProfile.CACHE_STRATEGY_CACHEONLY) {
+ if (cacheStrategy == CrawlProfile.CacheStrategy.CACHEONLY) {
// we had a chance to get the content from the cache .. its over. We don't have it.
return null;
}
@@ -259,7 +261,7 @@ public final class LoaderDispatcher {
// load resource from the internet
Response response = null;
- if ((protocol.equals("http") || (protocol.equals("https")))) response = httpLoader.load(request, acceptOnlyParseable);
+ if ((protocol.equals("http") || (protocol.equals("https")))) response = httpLoader.load(request, acceptOnlyParseable, maxFileSize);
if (protocol.equals("ftp")) response = ftpLoader.load(request, true);
if (protocol.equals("smb")) response = smbLoader.load(request, true);
if (response != null) {
@@ -302,7 +304,8 @@ public final class LoaderDispatcher {
if (!fetchOnline) return null;
// try to download the resource using the loader
- final Response entry = load(url, forText, reindexing);
+ final long maxFileSize = sb.getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
+ final Response entry = load(url, forText, reindexing, maxFileSize);
if (entry == null) return null; // not found in web
// read resource body (if it is there)
@@ -321,7 +324,7 @@ public final class LoaderDispatcher {
* @param global the domain of the search. If global == true then the content is re-indexed
* @return the parsed document as {@link Document}
*/
- public static Document retrieveDocument(final DigestURI url, final boolean fetchOnline, final int timeout, final boolean forText, final boolean global) {
+ public static Document retrieveDocument(final DigestURI url, final boolean fetchOnline, final int timeout, final boolean forText, final boolean global, long maxFileSize) {
// load resource
byte[] resContent = null;
@@ -336,7 +339,7 @@ public final class LoaderDispatcher {
// if not found try to download it
// download resource using the crawler and keep resource in memory if possible
- final Response entry = Switchboard.getSwitchboard().loader.load(url, forText, global);
+ final Response entry = Switchboard.getSwitchboard().loader.load(url, forText, global, maxFileSize);
// getting resource metadata (e.g. the http headers for http resources)
if (entry != null) {
@@ -431,9 +434,10 @@ public final class LoaderDispatcher {
}
}
- public static ContentScraper parseResource(final LoaderDispatcher loader, final DigestURI location, int cachePolicy) throws IOException {
+ public static ContentScraper parseResource(final LoaderDispatcher loader, final DigestURI location, CrawlProfile.CacheStrategy cachePolicy) throws IOException {
// load page
- Response r = loader.load(location, true, false, cachePolicy);
+ final long maxFileSize = loader.sb.getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
+ Response r = loader.load(location, true, false, cachePolicy, maxFileSize);
byte[] page = (r == null) ? null : r.getContent();
if (page == null) throw new IOException("no response from url " + location.toString());
@@ -455,25 +459,27 @@ public final class LoaderDispatcher {
}
}
- public void loadIfNotExistBackground(String url, File cache) {
- new Loader(url, cache).start();
+ public void loadIfNotExistBackground(String url, File cache, long maxFileSize) {
+ new Loader(url, cache, maxFileSize).start();
}
private class Loader extends Thread {
private String url;
private File cache;
+ private long maxFileSize;
- public Loader(String url, File cache) {
+ public Loader(String url, File cache, long maxFileSize) {
this.url = url;
this.cache = cache;
+ this.maxFileSize = maxFileSize;
}
public void run() {
if (this.cache.exists()) return;
try {
// load from the net
- Response response = load(new DigestURI(this.url), false, true, CrawlProfile.CACHE_STRATEGY_NOCACHE);
+ Response response = load(new DigestURI(this.url), false, true, CrawlProfile.CacheStrategy.NOCACHE, this.maxFileSize);
byte[] b = response.getContent();
FileUtils.copy(b, this.cache);
} catch (MalformedURLException e) {} catch (IOException e) {}
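
The Loader thread now carries the maxFileSize limit through to the background download. A stand-alone sketch of such a download-if-missing worker using plain java.net, not the LoaderDispatcher.Loader class itself (which goes through YaCy's loader and cache-strategy handling):

    import java.io.File;
    import java.io.FileOutputStream;
    import java.io.IOException;
    import java.io.InputStream;
    import java.net.URL;

    // Illustrative only: fetch a URL into a cache file if it is missing, honouring a size limit.
    public class BackgroundCacheLoader extends Thread {
        private final String url;
        private final File cache;
        private final long maxFileSize;

        public BackgroundCacheLoader(final String url, final File cache, final long maxFileSize) {
            this.url = url;
            this.cache = cache;
            this.maxFileSize = maxFileSize;
        }

        @Override
        public void run() {
            if (this.cache.exists()) return; // already cached: nothing to do
            try {
                final InputStream in = new URL(this.url).openStream();
                try {
                    final FileOutputStream out = new FileOutputStream(this.cache);
                    try {
                        final byte[] buffer = new byte[8192];
                        long total = 0;
                        int n;
                        while ((n = in.read(buffer)) > 0) {
                            total += n;
                            if (total > this.maxFileSize) throw new IOException("body larger than maxFileSize " + this.maxFileSize);
                            out.write(buffer, 0, n);
                        }
                    } finally {
                        out.close();
                    }
                } finally {
                    in.close();
                }
            } catch (final IOException e) {
                this.cache.delete(); // drop partial downloads; fail silently like the original Loader
            }
        }
    }
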