- removed the download limit (which can be configured for the crawler) from non-crawler download tasks. This was necessary because the same loading procedure is also used for other downloads, such as dictionary files, where a size limit is not useful. The limit still applies to the indexer.
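  A minimal sketch of the resulting call pattern, assuming the load() signatures introduced in the hunks below (the config key crawler.http.maxFileSize and the HTTPLoader.DEFAULT_MAXFILESIZE default both appear in this diff; variable names here are illustrative only):

    // crawler path: the size limit is kept and read from the configuration
    final long maxFileSize = sb.getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
    Response crawled = sb.loader.load(request, true, maxFileSize);

    // non-crawler path (e.g. a dictionary download): pass Long.MAX_VALUE so no limit is enforced
    Response dictionary = sb.loader.load(new DigestURI(LibraryProvider.Dictionary.GEO1.url),
            false, true, CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);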

- migrated the OpenGeoDB downloader to a newer version of the OpenGeoDB dump.
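  A sketch of how the new dump is selected and why the OpenGeoDB constructor gained a boolean, mirroring the LibraryProvider and OpenGeoDB hunks below (the flag marks the coordinate column order in the SQL dump, as read from the lon/lat handling in the OpenGeoDB hunk):

    File geo1 = Dictionary.GEO1.file();   // new dump: opengeodb-02621_2010-03-16.sql.gz
    File geo0 = Dictionary.GEO0.file();   // old dump: opengeodb-0.2.5a-UTF8-sql.gz
    if (geo1.exists()) {
        geoDB = new OpenGeoDB(geo1, false);  // new dump stores latitude before longitude
    } else if (geo0.exists()) {
        geoDB = new OpenGeoDB(geo0, true);   // old dump stores longitude before latitude
    }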


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6873 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 15 years ago
parent 3661cb692c
commit 2126c03a62

@ -189,7 +189,7 @@ public class Bookmarks {
Document document = null;
if (urlentry != null) {
final URIMetadataRow.Components metadata = urlentry.metadata();
document = LoaderDispatcher.retrieveDocument(metadata.url(), true, 5000, true, false);
document = LoaderDispatcher.retrieveDocument(metadata.url(), true, 5000, true, false, Long.MAX_VALUE);
prop.put("mode_edit", "0"); // create mode
prop.put("mode_url", metadata.url().toNormalform(false, true));
prop.putHTML("mode_title", metadata.dc_title());

@ -194,11 +194,11 @@ public class Crawler_p {
env.setConfig("storeHTCache", (storeHTCache) ? "true" : "false");
final String cachePolicyString = post.get("cachePolicy", "iffresh");
int cachePolicy = CrawlProfile.CACHE_STRATEGY_IFFRESH;
if (cachePolicyString.equals("nocache")) cachePolicy = CrawlProfile.CACHE_STRATEGY_NOCACHE;
if (cachePolicyString.equals("iffresh")) cachePolicy = CrawlProfile.CACHE_STRATEGY_IFFRESH;
if (cachePolicyString.equals("ifexist")) cachePolicy = CrawlProfile.CACHE_STRATEGY_IFEXIST;
if (cachePolicyString.equals("cacheonly")) cachePolicy = CrawlProfile.CACHE_STRATEGY_CACHEONLY;
CrawlProfile.CacheStrategy cachePolicy = CrawlProfile.CacheStrategy.IFFRESH;
if (cachePolicyString.equals("nocache")) cachePolicy = CrawlProfile.CacheStrategy.NOCACHE;
if (cachePolicyString.equals("iffresh")) cachePolicy = CrawlProfile.CacheStrategy.IFFRESH;
if (cachePolicyString.equals("ifexist")) cachePolicy = CrawlProfile.CacheStrategy.IFEXIST;
if (cachePolicyString.equals("cacheonly")) cachePolicy = CrawlProfile.CacheStrategy.CACHEONLY;
final boolean xsstopw = post.get("xsstopw", "off").equals("on");
env.setConfig("xsstopw", (xsstopw) ? "true" : "false");

@ -20,37 +20,38 @@
<legend>Geolocalization</legend>
The geolocalization file will enable YaCy to present locations from OpenStreetMap according to given search words.
With this file it is possible to find locations using the location (city) name, a zip code, a car sign or a telephone pre-dial number.
<dl>
<dt><label>Download from</label></dt>
<dd>#[geo0URL]#</dd>
<dd>#[geo1URL]#</dd>
<dt><label>Storage location</label></dt>
<dd>#[geo0Storage]#</dd>
<dd>#[geo1Storage]#</dd>
<dt><label>Status</label></dt>
<dd>#(geo0Status)#<div class="info">not loaded</div>::<div class="commit">loaded</div>::de-activated#(/geo0Status)#</dd>
<dd>#(geo1Status)#<div class="info">not loaded</div>::<div class="commit">loaded</div>::de-activated#(/geo1Status)#</dd>
<dt></dt>
<dd>#(geo0Status)#
<input type="submit" name="geo0Load" value="Load" />::
<input type="submit" name="geo0Deactivate" value="de-Activate" />
<input type="submit" name="geo0Remove" value="Remove" />::
<input type="submit" name="geo0Activate" value="Activate" />
<input type="submit" name="geo0Remove" value="Remove" />
#(/geo0Status)#</dd>
#(geo0ActionLoaded)#::
<dd>#(geo1Status)#
<input type="submit" name="geo1Load" value="Load" />::
<input type="submit" name="geo1Deactivate" value="de-Activate" />
<input type="submit" name="geo1Remove" value="Remove" />::
<input type="submit" name="geo1Activate" value="Activate" />
<input type="submit" name="geo1Remove" value="Remove" />
#(/geo1Status)#</dd>
#(geo1ActionLoaded)#::
<dt></dt><dd><div class="commit">loaded and activated dictionary file</div></dd>::
<dt></dt><dd><div class="error">loading of dictionary file failed: #[error]#</div></dd>
#(/geo0ActionLoaded)#
#(geo0ActionRemoved)#::
#(/geo1ActionLoaded)#
#(geo1ActionRemoved)#::
<dt></dt><dd><div class="commit">de-activated and removed dictionary file</div></dd>::
<dt></dt><dd><div class="error">cannot remove dictionary file: #[error]#</div></dd>
#(/geo0ActionRemoved)#
#(geo0ActionDeactivated)#::
#(/geo1ActionRemoved)#
#(geo1ActionDeactivated)#::
<dt></dt><dd><div class="commit">de-activated dictionary file</div></dd>::
<dt></dt><dd><div class="error">cannot de-activate dictionary file: #[error]#</div></dd>
#(/geo0ActionDeactivated)#
#(geo0ActionActivated)#::
#(/geo1ActionDeactivated)#
#(geo1ActionActivated)#::
<dt></dt><dd><div class="commit">activated dictionary file</div></dd>::
<dt></dt><dd><div class="error">cannot activate dictionary file: #[error]#</div></dd>
#(/geo0ActionActivated)#
#(/geo1ActionActivated)#
</dl>
</fieldset>
</form>

@ -58,45 +58,47 @@ public class DictionaryLoader_p {
if (post == null) return prop;
if (post.containsKey("geo0Load")) {
// GEO1
if (post.containsKey("geo1Load")) {
// load from the net
try {
Response response = sb.loader.load(new DigestURI(LibraryProvider.Dictionary.GEO0.url), false, true, CrawlProfile.CACHE_STRATEGY_NOCACHE);
Response response = sb.loader.load(new DigestURI(LibraryProvider.Dictionary.GEO1.url), false, true, CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
byte[] b = response.getContent();
FileUtils.copy(b, LibraryProvider.Dictionary.GEO0.file());
LibraryProvider.geoDB = new OpenGeoDB(LibraryProvider.Dictionary.GEO0.file());
prop.put("geo0Status", LibraryProvider.Dictionary.GEO0.file().exists() ? 1 : 0);
prop.put("geo0ActionLoaded", 1);
FileUtils.copy(b, LibraryProvider.Dictionary.GEO1.file());
LibraryProvider.geoDB = new OpenGeoDB(LibraryProvider.Dictionary.GEO1.file(), false);
prop.put("geo1Status", LibraryProvider.Dictionary.GEO1.file().exists() ? 1 : 0);
prop.put("geo1ActionLoaded", 1);
} catch (MalformedURLException e) {
Log.logException(e);
prop.put("geo0ActionLoaded", 2);
prop.put("geo0ActionLoaded_error", e.getMessage());
prop.put("geo1ActionLoaded", 2);
prop.put("geo1ActionLoaded_error", e.getMessage());
} catch (IOException e) {
Log.logException(e);
prop.put("geo0ActionLoaded", 2);
prop.put("geo0ActionLoaded_error", e.getMessage());
prop.put("geo1ActionLoaded", 2);
prop.put("geo1ActionLoaded_error", e.getMessage());
}
}
if (post.containsKey("geo0Remove")) {
FileUtils.deletedelete(LibraryProvider.Dictionary.GEO0.file());
FileUtils.deletedelete(LibraryProvider.Dictionary.GEO0.fileDisabled());
LibraryProvider.geoDB = new OpenGeoDB(null);
prop.put("geo0ActionRemoved", 1);
if (post.containsKey("geo1Remove")) {
FileUtils.deletedelete(LibraryProvider.Dictionary.GEO1.file());
FileUtils.deletedelete(LibraryProvider.Dictionary.GEO1.fileDisabled());
LibraryProvider.geoDB = new OpenGeoDB(null, true);
prop.put("geo1ActionRemoved", 1);
}
if (post.containsKey("geo0Deactivate")) {
LibraryProvider.Dictionary.GEO0.file().renameTo(LibraryProvider.Dictionary.GEO0.fileDisabled());
LibraryProvider.geoDB = new OpenGeoDB(null);
prop.put("geo0ActionDeactivated", 1);
if (post.containsKey("geo1Deactivate")) {
LibraryProvider.Dictionary.GEO1.file().renameTo(LibraryProvider.Dictionary.GEO1.fileDisabled());
LibraryProvider.geoDB = new OpenGeoDB(null, true);
prop.put("geo1ActionDeactivated", 1);
}
if (post.containsKey("geo0Activate")) {
LibraryProvider.Dictionary.GEO0.fileDisabled().renameTo(LibraryProvider.Dictionary.GEO0.file());
LibraryProvider.geoDB = new OpenGeoDB(LibraryProvider.Dictionary.GEO0.file());
prop.put("geo0ActionActivated", 1);
if (post.containsKey("geo1Activate")) {
LibraryProvider.Dictionary.GEO1.fileDisabled().renameTo(LibraryProvider.Dictionary.GEO1.file());
LibraryProvider.geoDB = new OpenGeoDB(LibraryProvider.Dictionary.GEO1.file(), false);
prop.put("geo1ActionActivated", 1);
}
// check status again
for (LibraryProvider.Dictionary dictionary: LibraryProvider.Dictionary.values()) {
prop.put(dictionary.nickname + "Status", dictionary.file().exists() ? 1 : dictionary.fileDisabled().exists() ? 2 : 0);

@ -163,7 +163,7 @@ public class QuickCrawlLink_p {
xsstopw,
xdstopw,
xpstopw,
CrawlProfile.CACHE_STRATEGY_IFFRESH
CrawlProfile.CacheStrategy.IFFRESH
);
} catch (final Exception e) {
// mist

@ -63,7 +63,7 @@ public class RSSLoader_p {
// if the resource body was not cached we try to load it from web
Response entry = null;
try {
entry = sb.loader.load(url, true, false);
entry = sb.loader.load(url, true, false, Long.MAX_VALUE);
} catch (final Exception e) {
return prop;
}

@ -182,7 +182,7 @@ public class ViewFile {
// load resource from net
Response response = null;
try {
response = sb.loader.load(url, true, false);
response = sb.loader.load(url, true, false, Long.MAX_VALUE);
} catch (IOException e) {
Log.logException(e);
}
@ -198,7 +198,7 @@ public class ViewFile {
if (resource == null) {
Response entry = null;
try {
entry = sb.loader.load(url, true, false);
entry = sb.loader.load(url, true, false, Long.MAX_VALUE);
} catch (final Exception e) {
prop.put("error", "4");
prop.putHTML("error_errorText", e.getMessage());
@ -238,7 +238,7 @@ public class ViewFile {
}
try {
Response response = sb.loader.load(url, true, false);
Response response = sb.loader.load(url, true, false, Long.MAX_VALUE);
responseHeader = response.getResponseHeader();
resource = response.getContent();
} catch (IOException e) {

@ -54,11 +54,11 @@ public class getpageinfo_p {
}
ContentScraper scraper = null;
if (u != null) try {
scraper = LoaderDispatcher.parseResource(sb.loader, u, CrawlProfile.CACHE_STRATEGY_IFFRESH);
scraper = LoaderDispatcher.parseResource(sb.loader, u, CrawlProfile.CacheStrategy.IFFRESH);
} catch (final IOException e) {
// try again, try harder
try {
scraper = LoaderDispatcher.parseResource(sb.loader, u, CrawlProfile.CACHE_STRATEGY_IFEXIST);
scraper = LoaderDispatcher.parseResource(sb.loader, u, CrawlProfile.CacheStrategy.IFEXIST);
} catch (final IOException ee) {
// now thats a fail, do nothing
}

@ -430,7 +430,7 @@ public class yacysearch {
if (urlentry != null) {
final URIMetadataRow.Components metadata = urlentry.metadata();
Document document;
document = LoaderDispatcher.retrieveDocument(metadata.url(), true, 5000, true, false);
document = LoaderDispatcher.retrieveDocument(metadata.url(), true, 5000, true, false, Long.MAX_VALUE);
if (document != null) {
// create a news message
final HashMap<String, String> map = new HashMap<String, String>();

@ -385,8 +385,8 @@ public class Balancer {
}
// depending on the caching policy we need sleep time to avoid DoS-like situations
sleeptime = (
profileEntry.cacheStrategy() == CrawlProfile.CACHE_STRATEGY_CACHEONLY ||
(profileEntry.cacheStrategy() == CrawlProfile.CACHE_STRATEGY_IFEXIST && Cache.has(crawlEntry.url()))
profileEntry.cacheStrategy() == CrawlProfile.CacheStrategy.CACHEONLY ||
(profileEntry.cacheStrategy() == CrawlProfile.CacheStrategy.IFEXIST && Cache.has(crawlEntry.url()))
) ? 0 : Latency.waitingRemaining(crawlEntry.url(), minimumLocalDelta, minimumGlobalDelta); // this uses the robots.txt database and may cause a loading of robots.txt from the server
assert Base64Order.enhancedCoder.equal(nexthash, rowEntry.getPrimaryKeyBytes()) : "result = " + new String(nexthash) + ", rowEntry.getPrimaryKeyBytes() = " + new String(rowEntry.getPrimaryKeyBytes());

@ -170,7 +170,7 @@ public class CrawlProfile {
final boolean storeHTCache, final boolean storeTXCache,
final boolean remoteIndexing,
final boolean xsstopw, final boolean xdstopw, final boolean xpstopw,
final int cacheStrategy) {
final CacheStrategy cacheStrategy) {
final entry ne = new entry(
name, startURL,
@ -246,10 +246,23 @@ public class CrawlProfile {
}
public final static int CACHE_STRATEGY_NOCACHE = 0; // never use the cache, all content from fresh internet source
public final static int CACHE_STRATEGY_IFFRESH = 1; // use the cache if the cache exists and is fresh using the proxy-fresh rules
public final static int CACHE_STRATEGY_IFEXIST = 2; // use the cache if the cache exist. Do no check freshness. Otherwise use online source.
public final static int CACHE_STRATEGY_CACHEONLY = 3; // never go online, use all content from cache. If no cache exist, treat content as unavailable
public static enum CacheStrategy {
NOCACHE(0), // never use the cache, all content from fresh internet source
IFFRESH(1), // use the cache if the cache exists and is fresh using the proxy-fresh rules
IFEXIST(2), // use the cache if the cache exist. Do no check freshness. Otherwise use online source.
CACHEONLY(3); // never go online, use all content from cache. If no cache exist, treat content as unavailable
public int code;
private CacheStrategy(int code) {
this.code = code;
}
public String toString() {
return Integer.toString(this.code);
}
public static CacheStrategy decode(int code) {
for (CacheStrategy strategy: CacheStrategy.values()) if (strategy.code == code) return strategy;
return NOCACHE;
}
}
public static class entry {
// this is a simple record structure that hold all properties of a single crawl start
@ -290,7 +303,7 @@ public class CrawlProfile {
final boolean storeHTCache, final boolean storeTXCache,
final boolean remoteIndexing,
final boolean xsstopw, final boolean xdstopw, final boolean xpstopw,
final int cacheStrategy) {
final CacheStrategy cacheStrategy) {
if (name == null || name.length() == 0) throw new NullPointerException("name must not be null");
final String handle = (startURL == null) ? Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(Long.toString(System.currentTimeMillis()))).substring(0, Word.commonHashLength) : new String(startURL.hash());
mem = new HashMap<String, String>(40);
@ -312,7 +325,7 @@ public class CrawlProfile {
mem.put(XSSTOPW, Boolean.toString(xsstopw)); // exclude static stop-words
mem.put(XDSTOPW, Boolean.toString(xdstopw)); // exclude dynamic stop-word
mem.put(XPSTOPW, Boolean.toString(xpstopw)); // exclude parent stop-words
mem.put(CACHE_STRAGEGY, Integer.toString(cacheStrategy));
mem.put(CACHE_STRAGEGY, cacheStrategy.toString());
doms = new ConcurrentHashMap<String, DomProfile>();
}
@ -376,14 +389,14 @@ public class CrawlProfile {
return 0;
}
}
public int cacheStrategy() {
public CacheStrategy cacheStrategy() {
final String r = mem.get(CACHE_STRAGEGY);
if (r == null) return CACHE_STRATEGY_IFFRESH;
if (r == null) return CacheStrategy.IFFRESH;
try {
return Integer.parseInt(r);
return CacheStrategy.decode(Integer.parseInt(r));
} catch (final NumberFormatException e) {
Log.logException(e);
return CACHE_STRATEGY_IFFRESH;
return CacheStrategy.IFFRESH;
}
}
public long recrawlIfOlder() {

@ -45,6 +45,7 @@ import net.yacy.kelondro.util.DateFormatter;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.workflow.WorkflowJob;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.crawler.retrieval.Request;
import de.anomic.crawler.retrieval.Response;
import de.anomic.http.client.Client;
@ -561,7 +562,8 @@ public class CrawlQueues {
// returns null if everything went fine, a fail reason string if a problem occurred
try {
request.setStatus("loading", WorkflowJob.STATUS_RUNNING);
Response response = sb.loader.load(request, true);
final long maxFileSize = sb.getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
Response response = sb.loader.load(request, true, maxFileSize);
if (response == null) {
request.setStatus("error", WorkflowJob.STATUS_FINISHED);
if (log.isFine()) log.logFine("problem loading " + request.url().toString() + ": no content (possibly caused by cache policy)");

@ -168,37 +168,37 @@ public final class CrawlSwitchboard {
true /*getConfigBool(PROXY_INDEXING_LOCAL_MEDIA, true)*/,
true, true,
false /*getConfigBool(PROXY_INDEXING_REMOTE, false)*/, true, true, true,
CrawlProfile.CACHE_STRATEGY_IFFRESH);
CrawlProfile.CacheStrategy.IFFRESH);
}
if (this.defaultRemoteProfile == null) {
// generate new default entry for remote crawling
defaultRemoteProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_BAD_URL, 0,
-1, -1, -1, true, true, true, false, true, false, true, true, false, CrawlProfile.CACHE_STRATEGY_IFFRESH);
-1, -1, -1, true, true, true, false, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH);
}
if (this.defaultTextSnippetLocalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
defaultTextSnippetLocalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_BAD_URL, 0,
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, -1, true, false, false, true, true, false, true, true, false, CrawlProfile.CACHE_STRATEGY_IFFRESH);
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, -1, true, false, false, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH);
}
if (this.defaultTextSnippetGlobalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
defaultTextSnippetGlobalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_BAD_URL, 0,
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, -1, true, true, true, true, true, false, true, true, false, CrawlProfile.CACHE_STRATEGY_CACHEONLY);
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, -1, true, true, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.CACHEONLY);
}
if (this.defaultMediaSnippetLocalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
defaultMediaSnippetLocalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_BAD_URL, 0,
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, false, false, false, false, true, true, false, CrawlProfile.CACHE_STRATEGY_IFEXIST);
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, false, false, false, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
}
if (this.defaultMediaSnippetGlobalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
defaultMediaSnippetGlobalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_BAD_URL, 0,
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, true, true, true, false, true, true, false, CrawlProfile.CACHE_STRATEGY_IFEXIST);
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
}
if (this.defaultSurrogateProfile == null) {
// generate new default entry for surrogate parsing
defaultSurrogateProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_BAD_URL, 0,
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, -1, true, true, false, false, false, false, true, true, false, CrawlProfile.CACHE_STRATEGY_NOCACHE);
this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, -1, true, true, false, false, false, false, true, true, false, CrawlProfile.CacheStrategy.NOCACHE);
}
}

@ -45,7 +45,7 @@ public final class HTTPLoader {
private static final String DEFAULT_ENCODING = "gzip,deflate";
private static final String DEFAULT_LANGUAGE = "en-us,en;q=0.5";
private static final String DEFAULT_CHARSET = "ISO-8859-1,utf-8;q=0.7,*;q=0.7";
private static final long DEFAULT_MAXFILESIZE = 1024 * 1024 * 10;
public static final long DEFAULT_MAXFILESIZE = 1024 * 1024 * 10;
public static final int DEFAULT_CRAWLING_RETRY_COUNT = 5;
public static final String crawlerUserAgent = "yacybot (" + Client.getSystemOST() +") http://yacy.net/bot.html";
public static final String yacyUserAgent = "yacy (" + Client.getSystemOST() +") yacy.net";
@ -74,14 +74,14 @@ public final class HTTPLoader {
this.socketTimeout = (int) sb.getConfigLong("crawler.clientTimeout", 10000);
}
public Response load(final Request entry, final boolean acceptOnlyParseable) throws IOException {
public Response load(final Request entry, final boolean acceptOnlyParseable, long maxFileSize) throws IOException {
long start = System.currentTimeMillis();
Response doc = load(entry, acceptOnlyParseable, DEFAULT_CRAWLING_RETRY_COUNT);
Response doc = load(entry, acceptOnlyParseable, DEFAULT_CRAWLING_RETRY_COUNT, maxFileSize);
Latency.update(new String(entry.url().hash()).substring(6), entry.url().getHost(), System.currentTimeMillis() - start);
return doc;
}
private Response load(final Request request, boolean acceptOnlyParseable, final int retryCount) throws IOException {
private Response load(final Request request, boolean acceptOnlyParseable, final int retryCount, final long maxFileSize) throws IOException {
if (retryCount < 0) {
sb.crawlQueues.errorURL.push(request, sb.peers.mySeed().hash.getBytes(), new Date(), 1, "redirection counter exceeded");
@ -113,8 +113,7 @@ public final class HTTPLoader {
// take a file from the net
Response response = null;
final long maxFileSize = sb.getConfigLong("crawler.http.maxFileSize", DEFAULT_MAXFILESIZE);
// create a request header
final RequestHeader requestHeader = new RequestHeader();
requestHeader.put(HeaderFramework.USER_AGENT, crawlerUserAgent);
@ -202,7 +201,7 @@ public final class HTTPLoader {
// retry crawling with new url
request.redirectURL(redirectionUrl);
return load(request, acceptOnlyParseable, retryCount - 1);
return load(request, acceptOnlyParseable, retryCount - 1, maxFileSize);
}
} else {
// if the response has not the right response type then reject file

@ -50,14 +50,17 @@ public class LibraryProvider {
public static final String disabledExtension = ".disabled";
public static DidYouMeanLibrary dymLib = new DidYouMeanLibrary(null);
public static OpenGeoDB geoDB = new OpenGeoDB(null);
public static OpenGeoDB geoDB = new OpenGeoDB(null, true);
private static File dictSource = null;
private static File dictRoot = null;
public static enum Dictionary {
GEO0("geo0",
"http://downloads.sourceforge.net/project/opengeodb/Data/0.2.5a/opengeodb-0.2.5a-UTF8-sql.gz",
"opengeodb-0.2.5a-UTF8-sql.gz");
"opengeodb-0.2.5a-UTF8-sql.gz"),
GEO1("geo1",
"http://fa-technik.adfc.de/code/opengeodb/dump/opengeodb-02621_2010-03-16.sql.gz",
"opengeodb-02621_2010-03-16.sql.gz");
public String nickname, url, filename;
private Dictionary(String nickname, String url, String filename) {
@ -95,20 +98,16 @@ public class LibraryProvider {
}
public static void integrateOpenGeoDB() {
File ogdb = new File(dictSource, "opengeodb-0.2.5a-UTF8-sql.gz");
if (ogdb.exists()) {
geoDB = new OpenGeoDB(ogdb);
return;
File geo1 = Dictionary.GEO1.file();
File geo0 = Dictionary.GEO0.file();
if (geo1.exists()) {
if (geo0.exists()) geo0.renameTo(Dictionary.GEO0.fileDisabled());
geoDB = new OpenGeoDB(geo1, false);
return;
}
ogdb = new File(dictSource, "opengeodb-02513_2007-10-02.sql.gz");
if (ogdb.exists()) {
geoDB = new OpenGeoDB(ogdb);
return;
}
ogdb = new File(dictSource, "opengeodb-02513_2007-10-02.sql");
if (ogdb.exists()) {
geoDB = new OpenGeoDB(ogdb);
return;
if (geo0.exists()) {
geoDB = new OpenGeoDB(geo0, true);
return;
}
}

@ -334,6 +334,6 @@ public class SitemapParser extends DefaultHandler {
false,
// exclude stop-words
true, true, true,
CrawlProfile.CACHE_STRATEGY_IFFRESH);
CrawlProfile.CacheStrategy.IFFRESH);
}
}

@ -161,7 +161,7 @@ public class bookmarksDB {
Integer.parseInt(parser[5]), Integer.parseInt(parser[6]), Boolean.parseBoolean(parser[7]),
Boolean.parseBoolean(parser[8]), Boolean.parseBoolean(parser[9]),
Boolean.parseBoolean(parser[10]), Boolean.parseBoolean(parser[11]),
Boolean.parseBoolean(parser[12]), CrawlProfile.CACHE_STRATEGY_IFFRESH
Boolean.parseBoolean(parser[12]), CrawlProfile.CacheStrategy.IFFRESH
);
}
if (parser.length == 14) {
@ -169,7 +169,7 @@ public class bookmarksDB {
Integer.parseInt(parser[5]), Integer.parseInt(parser[6]), Boolean.parseBoolean(parser[7]),
Boolean.parseBoolean(parser[8]), Boolean.parseBoolean(parser[9]),
Boolean.parseBoolean(parser[10]), Boolean.parseBoolean(parser[11]),
Boolean.parseBoolean(parser[12]), Integer.parseInt(parser[13])
Boolean.parseBoolean(parser[12]), CrawlProfile.CacheStrategy.decode(Integer.parseInt(parser[13]))
);
}
}
@ -206,7 +206,7 @@ public class bookmarksDB {
public void folderReCrawl(long schedule, String folder, String crawlingfilter, int newcrawlingdepth, long crawlingIfOlder,
int crawlingDomFilterDepth, int crawlingDomMaxPages, boolean crawlingQ, boolean indexText, boolean indexMedia,
boolean crawlOrder, boolean xsstopw, boolean storeHTCache, int cacheStrategy) {
boolean crawlOrder, boolean xsstopw, boolean storeHTCache, CrawlProfile.CacheStrategy cacheStrategy) {
final Switchboard sb = Switchboard.getSwitchboard();
final Iterator<String> bit = getBookmarksIterator(folder, true);

@ -52,14 +52,16 @@ public class ClientGetMethod extends GetMethod {
protected void readResponseHeaders(HttpState state, HttpConnection conn) throws IOException, HttpException {
super.readResponseHeaders(state, conn);
// already processing the header to be able to throw an exception
Header contentlengthHeader = getResponseHeader("content-length");
long contentlength = 0;
if (contentlengthHeader != null) {
try { contentlength = Long.parseLong(contentlengthHeader.getValue()); } catch (NumberFormatException e) { }
}
if (contentlength > maxfilesize) {
throw new IOException("Content-Length " + contentlength + " larger than maxfilesize " + maxfilesize);
if (this.maxfilesize < Long.MAX_VALUE) {
// already processing the header to be able to throw an exception
Header contentlengthHeader = getResponseHeader("content-length");
long contentlength = 0;
if (contentlengthHeader != null) {
try { contentlength = Long.parseLong(contentlengthHeader.getValue()); } catch (NumberFormatException e) { }
}
if (contentlength > maxfilesize) {
throw new IOException("Content-Length " + contentlength + " larger than maxfilesize " + maxfilesize);
}
}
}
}

@ -116,7 +116,7 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaS
return new ArrayList<MediaSnippet>();
}
final Document document = LoaderDispatcher.retrieveDocument(url, fetchOnline, timeout, false, reindexing);
final Document document = LoaderDispatcher.retrieveDocument(url, fetchOnline, timeout, false, reindexing, Long.MAX_VALUE);
final ArrayList<MediaSnippet> a = new ArrayList<MediaSnippet>();
if (document != null) {
if ((mediatype == ContentDomain.ALL) || (mediatype == ContentDomain.AUDIO)) a.addAll(computeMediaSnippets(document, queryhashes, ContentDomain.AUDIO));

@ -359,7 +359,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
// if not found try to download it
// download resource using the crawler and keep resource in memory if possible
final Response entry = loader.load(url, true, reindexing);
final Response entry = loader.load(url, true, reindexing, Long.MAX_VALUE);
// get resource metadata (e.g. the http headers for http resources)
if (entry != null) {

@ -90,7 +90,7 @@ public class OSMTile {
// download resource using the crawler and keep resource in memory if possible
Response entry = null;
try {
entry = Switchboard.getSwitchboard().loader.load(tileURL, false, false, CrawlProfile.CACHE_STRATEGY_IFEXIST);
entry = Switchboard.getSwitchboard().loader.load(tileURL, false, false, CrawlProfile.CacheStrategy.IFEXIST, Long.MAX_VALUE);
} catch (IOException e) {
Log.logWarning("yamyOSM", "cannot load: " + e.getMessage());
return null;

@ -235,7 +235,7 @@ public final class yacyRelease extends yacyVersion {
// returns the version info if successful, null otherwise
ContentScraper scraper;
try {
scraper = LoaderDispatcher.parseResource(Switchboard.getSwitchboard().loader, location.getLocationURL(), CrawlProfile.CACHE_STRATEGY_NOCACHE);
scraper = LoaderDispatcher.parseResource(Switchboard.getSwitchboard().loader, location.getLocationURL(), CrawlProfile.CacheStrategy.NOCACHE);
} catch (final IOException e) {
return null;
}

@ -76,7 +76,7 @@ public class OpenGeoDB {
private final HashMap<String, List<Integer>> predial2ids;
private final HashMap<String, Integer> zip2id;
public OpenGeoDB(final File file) {
public OpenGeoDB(final File file, boolean lonlat) {
this.locTypeHash2locType = new HashMap<Integer, String>();
this.id2loc = new HashMap<Integer, Location>();
@ -98,6 +98,7 @@ public class OpenGeoDB {
String[] v;
Integer id;
String h;
double lon, lat;
while ((line = reader.readLine()) != null) {
line = line.trim();
if (!line.startsWith("INSERT INTO ")) continue;
@ -107,7 +108,14 @@ public class OpenGeoDB {
if (line.startsWith("geodb_coordinates ")) {
line = line.substring(18 + 7);v = line.split(",");
v = line.split(",");
id2loc.put(Integer.parseInt(v[0]), new Location(Double.parseDouble(v[2]), Double.parseDouble(v[3])));
if (lonlat) {
lon = Double.parseDouble(v[2]);
lat = Double.parseDouble(v[3]);
} else {
lat = Double.parseDouble(v[2]);
lon = Double.parseDouble(v[3]);
}
id2loc.put(Integer.parseInt(v[0]), new Location(lon, lat));
}
if (line.startsWith("geodb_textdata ")) {
line = line.substring(15 + 7);

@ -58,7 +58,7 @@ public class OAIListFriendsLoader {
public static void init(LoaderDispatcher loader, Map<String, File> moreFriends) {
listFriends.putAll(moreFriends);
if (loader != null) for (Map.Entry<String, File> oaiFriend: listFriends.entrySet()) {
loader.loadIfNotExistBackground(oaiFriend.getKey(), oaiFriend.getValue());
loader.loadIfNotExistBackground(oaiFriend.getKey(), oaiFriend.getValue(), Long.MAX_VALUE);
}
}
@ -81,7 +81,7 @@ public class OAIListFriendsLoader {
Map<String, String> m;
for (Map.Entry<String, File> oaiFriend: listFriends.entrySet()) try {
if (!oaiFriend.getValue().exists()) {
Response response = loader == null ? null : loader.load(new DigestURI(oaiFriend.getKey(), null), false, true, CrawlProfile.CACHE_STRATEGY_NOCACHE);
Response response = loader == null ? null : loader.load(new DigestURI(oaiFriend.getKey(), null), false, true, CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
if (response != null) FileUtils.copy(response.getContent(), oaiFriend.getValue());
}

@ -48,7 +48,7 @@ public class OAIPMHLoader {
this.source = source;
// load the file from the net
Response response = loader.load(source, false, true, CrawlProfile.CACHE_STRATEGY_NOCACHE);
Response response = loader.load(source, false, true, CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
byte[] b = response.getContent();
this.resumptionToken = new ResumptionToken(source, b);
//System.out.println("*** ResumptionToken = " + this.resumptionToken.toString());

@ -99,8 +99,9 @@ public final class LoaderDispatcher {
public Response load(
final DigestURI url,
final boolean forText,
final boolean global) throws IOException {
return load(request(url, forText, global), forText);
final boolean global,
final long maxFileSize) throws IOException {
return load(request(url, forText, global), forText, maxFileSize);
}
/**
@ -116,13 +117,14 @@ public final class LoaderDispatcher {
final DigestURI url,
final boolean forText,
final boolean global,
int cacheStratgy) throws IOException {
return load(request(url, forText, global), forText, cacheStratgy);
CrawlProfile.CacheStrategy cacheStratgy,
long maxFileSize) throws IOException {
return load(request(url, forText, global), forText, cacheStratgy, maxFileSize);
}
public void load(final DigestURI url, int cacheStratgy, File targetFile) throws IOException {
public void load(final DigestURI url, CrawlProfile.CacheStrategy cacheStratgy, long maxFileSize, File targetFile) throws IOException {
byte[] b = load(request(url, false, true), false, cacheStratgy).getContent();
byte[] b = load(request(url, false, true), false, cacheStratgy, maxFileSize).getContent();
if (b == null) throw new IOException("load == null");
File tmp = new File(targetFile.getAbsolutePath() + ".tmp");
@ -164,14 +166,14 @@ public final class LoaderDispatcher {
0);
}
public Response load(final Request request, final boolean acceptOnlyParseable) throws IOException {
public Response load(final Request request, final boolean acceptOnlyParseable, long maxFileSize) throws IOException {
CrawlProfile.entry crawlProfile = sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle());
int cacheStrategy = CrawlProfile.CACHE_STRATEGY_IFEXIST;
CrawlProfile.CacheStrategy cacheStrategy = CrawlProfile.CacheStrategy.IFEXIST;
if (crawlProfile != null) cacheStrategy = crawlProfile.cacheStrategy();
return load(request, acceptOnlyParseable, cacheStrategy);
return load(request, acceptOnlyParseable, cacheStrategy, maxFileSize);
}
public Response load(final Request request, final boolean acceptOnlyParseable, int cacheStrategy) throws IOException {
public Response load(final Request request, final boolean acceptOnlyParseable, CrawlProfile.CacheStrategy cacheStrategy, long maxFileSize) throws IOException {
// get the protocol of the next URL
final String protocol = request.url().getProtocol();
final String host = request.url().getHost();
@ -183,7 +185,7 @@ public final class LoaderDispatcher {
// check if we have the page in the cache
CrawlProfile.entry crawlProfile = sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle());
if (crawlProfile != null && cacheStrategy != CrawlProfile.CACHE_STRATEGY_NOCACHE) {
if (crawlProfile != null && cacheStrategy != CrawlProfile.CacheStrategy.NOCACHE) {
// we have passed a first test if caching is allowed
// now see if there is a cache entry
@ -214,14 +216,14 @@ public final class LoaderDispatcher {
content);
// check which caching strategy shall be used
if (cacheStrategy == CrawlProfile.CACHE_STRATEGY_IFEXIST || cacheStrategy == CrawlProfile.CACHE_STRATEGY_CACHEONLY) {
if (cacheStrategy == CrawlProfile.CacheStrategy.IFEXIST || cacheStrategy == CrawlProfile.CacheStrategy.CACHEONLY) {
// well, just take the cache and don't care about freshness of the content
log.logInfo("cache hit/useall for: " + request.url().toNormalform(true, false));
return response;
}
// now the cacheStrategy must be CACHE_STRATEGY_IFFRESH, that means we should do a proxy freshness test
assert cacheStrategy == CrawlProfile.CACHE_STRATEGY_IFFRESH : "cacheStrategy = " + cacheStrategy;
assert cacheStrategy == CrawlProfile.CacheStrategy.IFFRESH : "cacheStrategy = " + cacheStrategy;
if (response.isFreshForProxy()) {
log.logInfo("cache hit/fresh for: " + request.url().toNormalform(true, false));
return response;
@ -232,7 +234,7 @@ public final class LoaderDispatcher {
}
// check case where we want results from the cache exclusively, and never from the internet (offline mode)
if (cacheStrategy == CrawlProfile.CACHE_STRATEGY_CACHEONLY) {
if (cacheStrategy == CrawlProfile.CacheStrategy.CACHEONLY) {
// we had a chance to get the content from the cache .. its over. We don't have it.
return null;
}
@ -259,7 +261,7 @@ public final class LoaderDispatcher {
// load resource from the internet
Response response = null;
if ((protocol.equals("http") || (protocol.equals("https")))) response = httpLoader.load(request, acceptOnlyParseable);
if ((protocol.equals("http") || (protocol.equals("https")))) response = httpLoader.load(request, acceptOnlyParseable, maxFileSize);
if (protocol.equals("ftp")) response = ftpLoader.load(request, true);
if (protocol.equals("smb")) response = smbLoader.load(request, true);
if (response != null) {
@ -302,7 +304,8 @@ public final class LoaderDispatcher {
if (!fetchOnline) return null;
// try to download the resource using the loader
final Response entry = load(url, forText, reindexing);
final long maxFileSize = sb.getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
final Response entry = load(url, forText, reindexing, maxFileSize);
if (entry == null) return null; // not found in web
// read resource body (if it is there)
@ -321,7 +324,7 @@ public final class LoaderDispatcher {
* @param global the domain of the search. If global == true then the content is re-indexed
* @return the parsed document as {@link Document}
*/
public static Document retrieveDocument(final DigestURI url, final boolean fetchOnline, final int timeout, final boolean forText, final boolean global) {
public static Document retrieveDocument(final DigestURI url, final boolean fetchOnline, final int timeout, final boolean forText, final boolean global, long maxFileSize) {
// load resource
byte[] resContent = null;
@ -336,7 +339,7 @@ public final class LoaderDispatcher {
// if not found try to download it
// download resource using the crawler and keep resource in memory if possible
final Response entry = Switchboard.getSwitchboard().loader.load(url, forText, global);
final Response entry = Switchboard.getSwitchboard().loader.load(url, forText, global, maxFileSize);
// getting resource metadata (e.g. the http headers for http resources)
if (entry != null) {
@ -431,9 +434,10 @@ public final class LoaderDispatcher {
}
}
public static ContentScraper parseResource(final LoaderDispatcher loader, final DigestURI location, int cachePolicy) throws IOException {
public static ContentScraper parseResource(final LoaderDispatcher loader, final DigestURI location, CrawlProfile.CacheStrategy cachePolicy) throws IOException {
// load page
Response r = loader.load(location, true, false, cachePolicy);
final long maxFileSize = loader.sb.getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
Response r = loader.load(location, true, false, cachePolicy, maxFileSize);
byte[] page = (r == null) ? null : r.getContent();
if (page == null) throw new IOException("no response from url " + location.toString());
@ -455,25 +459,27 @@ public final class LoaderDispatcher {
}
}
public void loadIfNotExistBackground(String url, File cache) {
new Loader(url, cache).start();
public void loadIfNotExistBackground(String url, File cache, long maxFileSize) {
new Loader(url, cache, maxFileSize).start();
}
private class Loader extends Thread {
private String url;
private File cache;
private long maxFileSize;
public Loader(String url, File cache) {
public Loader(String url, File cache, long maxFileSize) {
this.url = url;
this.cache = cache;
this.maxFileSize = maxFileSize;
}
public void run() {
if (this.cache.exists()) return;
try {
// load from the net
Response response = load(new DigestURI(this.url), false, true, CrawlProfile.CACHE_STRATEGY_NOCACHE);
Response response = load(new DigestURI(this.url), false, true, CrawlProfile.CacheStrategy.NOCACHE, this.maxFileSize);
byte[] b = response.getContent();
FileUtils.copy(b, this.cache);
} catch (MalformedURLException e) {} catch (IOException e) {}
