reduced locking in the crawler: shifted the synchronized block in the Balancer and reduced the time-out of the robots.txt load limit
pull/1/head
Michael Peter Christen 12 years ago
parent f93501e6e0
commit 8f2d3ce2f9
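
For orientation before the diff: a minimal, self-contained sketch of the pattern this commit introduces. All names here are illustrative stand-ins, not the actual YaCy classes; the real signatures appear in the hunks below. The loader call chain now takes the HTTP timeout as an explicit per-call parameter instead of always using ClientIdentification.DEFAULT_TIMEOUT, so robots.txt fetches can fail fast while ordinary fetches keep the default.

// Illustrative sketch only, not actual YaCy code.
public class LoaderTimeoutSketch {

    static final int DEFAULT_TIMEOUT = 10000; // stand-in for ClientIdentification.DEFAULT_TIMEOUT (ms)
    static final int ROBOTS_TIMEOUT = 3000;   // the reduced robots.txt limit introduced by this commit (ms)

    // before: the timeout was fixed inside the loader; after: every call site chooses it
    static byte[] load(String url, int timeoutMillis) {
        // ... open the connection with connect/read timeout = timeoutMillis ...
        return new byte[0];
    }

    public static void main(String[] args) {
        load("http://example.org/page.html", DEFAULT_TIMEOUT); // ordinary crawl fetch
        load("http://example.org/robots.txt", ROBOTS_TIMEOUT); // fast-fail robots.txt fetch
    }
}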

@@ -41,6 +41,7 @@ import java.util.Set;
 import net.yacy.cora.date.ISO8601Formatter;
 import net.yacy.cora.document.ASCII;
 import net.yacy.cora.federate.yacy.CacheStrategy;
+import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.HeaderFramework;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.data.BookmarkHelper;
@@ -201,7 +202,7 @@ public class Bookmarks {
 // try to get the bookmark from the LURL database
 final URIMetadataNode urlentry = sb.index.fulltext().getMetadata(ASCII.getBytes(urlHash));
 if (urlentry != null) try {
-final Document document = Document.mergeDocuments(urlentry.url(), null, sb.loader.loadDocuments(sb.loader.request(urlentry.url(), true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null, TextSnippet.snippetMinLoadDelay));
+final Document document = Document.mergeDocuments(urlentry.url(), null, sb.loader.loadDocuments(sb.loader.request(urlentry.url(), true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null, TextSnippet.snippetMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT));
 prop.put("mode_edit", "0"); // create mode
 prop.put("mode_url", urlentry.url().toNormalform(false));
 prop.putHTML("mode_title", urlentry.dc_title());

@@ -25,6 +25,7 @@ import java.util.Set;
 import java.util.regex.Pattern;
 import net.yacy.cora.federate.yacy.CacheStrategy;
+import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.crawler.data.CrawlQueues;
 import net.yacy.crawler.retrieval.Request;
@@ -99,7 +100,7 @@ public class CrawlCheck_p {
 // try to load the url
 if (robotsAllowed) try {
 Request request = sb.loader.request(u, true, false);
-final Response response = sb.loader.load(request, CacheStrategy.NOCACHE, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay);
+final Response response = sb.loader.load(request, CacheStrategy.NOCACHE, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
 if (response == null) {
 prop.put("table_list_" + row + "_access", "no response");
 } else {

@@ -37,6 +37,7 @@ import java.util.regex.PatternSyntaxException;
 import net.yacy.cora.document.ASCII;
 import net.yacy.cora.federate.yacy.CacheStrategy;
+import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.util.SpaceExceededException;
 import net.yacy.crawler.data.CrawlProfile;
@@ -280,7 +281,7 @@ public class Crawler_p {
 // download document
 Document scraper;
 try {
-scraper = sb.loader.loadDocument(sitelistURL, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay);
+scraper = sb.loader.loadDocument(sitelistURL, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
 // get links and generate filter
 for (DigestURI u: scraper.getAnchors().keySet()) {
 newRootURLs.add(u);

@@ -24,6 +24,7 @@ import java.net.MalformedURLException;
 import net.yacy.cora.federate.yacy.CacheStrategy;
 import net.yacy.cora.geo.GeonamesLocation;
 import net.yacy.cora.geo.OpenGeoDBLocation;
+import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.crawler.data.CrawlQueues;
 import net.yacy.crawler.retrieval.Response;
@@ -66,7 +67,7 @@ public class DictionaryLoader_p {
 if (post.containsKey("geon0Load")) {
 // load from the net
 try {
-final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay);
+final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
 final byte[] b = response.getContent();
 FileUtils.copy(b, LibraryProvider.Dictionary.GEON0.file());
 LibraryProvider.geoLoc.activateLocation(LibraryProvider.Dictionary.GEON0.nickname, new GeonamesLocation(LibraryProvider.Dictionary.GEON0.file(), null, -1));
@@ -108,7 +109,7 @@ public class DictionaryLoader_p {
 if (post.containsKey("geon1Load")) {
 // load from the net
 try {
-final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON1.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay);
+final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON1.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
 final byte[] b = response.getContent();
 FileUtils.copy(b, LibraryProvider.Dictionary.GEON1.file());
 LibraryProvider.geoLoc.activateLocation(LibraryProvider.Dictionary.GEON1.nickname, new GeonamesLocation(LibraryProvider.Dictionary.GEON1.file(), null, -1));
@@ -150,7 +151,7 @@ public class DictionaryLoader_p {
 if (post.containsKey("geon2Load")) {
 // load from the net
 try {
-final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON2.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay);
+final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON2.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
 final byte[] b = response.getContent();
 FileUtils.copy(b, LibraryProvider.Dictionary.GEON2.file());
 LibraryProvider.geoLoc.activateLocation(LibraryProvider.Dictionary.GEON2.nickname, new GeonamesLocation(LibraryProvider.Dictionary.GEON2.file(), null, 100000));
@@ -192,7 +193,7 @@ public class DictionaryLoader_p {
 if (post.containsKey("geo1Load")) {
 // load from the net
 try {
-final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEODB1.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay);
+final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEODB1.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
 final byte[] b = response.getContent();
 FileUtils.copy(b, LibraryProvider.Dictionary.GEODB1.file());
 LibraryProvider.geoLoc.deactivateLocalization(LibraryProvider.Dictionary.GEODB1.nickname);
@@ -235,7 +236,7 @@ public class DictionaryLoader_p {
 if (post.containsKey("drw0Load")) {
 // load from the net
 try {
-final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.DRW0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay);
+final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.DRW0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
 final byte[] b = response.getContent();
 FileUtils.copy(b, LibraryProvider.Dictionary.DRW0.file());
 LibraryProvider.activateDeReWo();
@@ -279,7 +280,7 @@ public class DictionaryLoader_p {
 if (post.containsKey("pnd0Load")) {
 // load from the net
 try {
-final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.PND0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay);
+final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.PND0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
 final byte[] b = response.getContent();
 FileUtils.copy(b, LibraryProvider.Dictionary.PND0.file());
 LibraryProvider.activatePND();

@@ -36,6 +36,7 @@ import net.yacy.cora.document.RSSMessage;
 import net.yacy.cora.document.RSSReader;
 import net.yacy.cora.document.UTF8;
 import net.yacy.cora.federate.yacy.CacheStrategy;
+import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.util.CommonPattern;
 import net.yacy.cora.util.SpaceExceededException;
@@ -266,7 +267,7 @@ public class Load_RSS_p {
 RSSReader rss = null;
 if (url != null) try {
 prop.put("url", url.toNormalform(true));
-final Response response = sb.loader.load(sb.loader.request(url, true, false), CacheStrategy.NOCACHE, Integer.MAX_VALUE, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay);
+final Response response = sb.loader.load(sb.loader.request(url, true, false), CacheStrategy.NOCACHE, Integer.MAX_VALUE, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
 final byte[] resource = response == null ? null : response.getContent();
 rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource);
 } catch (final IOException e) {

@@ -42,6 +42,7 @@ import net.yacy.cora.document.UTF8;
 import net.yacy.cora.federate.yacy.CacheStrategy;
 import net.yacy.cora.lod.JenaTripleStore;
 import net.yacy.cora.lod.vocabulary.YaCyMetadata;
+import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.crawler.data.Cache;
 import net.yacy.crawler.data.CrawlQueues;
@@ -164,7 +165,7 @@ public class ViewFile {
 Response response = null;
 try {
-response = sb.loader.load(sb.loader.request(url, true, false), authorized ? CacheStrategy.IFEXIST : CacheStrategy.CACHEONLY, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay);
+response = sb.loader.load(sb.loader.request(url, true, false), authorized ? CacheStrategy.IFEXIST : CacheStrategy.CACHEONLY, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
 } catch (final IOException e) {
 prop.put("error", "4");
 prop.put("error_errorText", "error loading resource: " + e.getMessage());

@@ -33,6 +33,7 @@ import java.net.MalformedURLException;
 import java.util.Map;
 import net.yacy.cora.federate.yacy.CacheStrategy;
+import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.Domains;
 import net.yacy.cora.protocol.HeaderFramework;
 import net.yacy.cora.protocol.RequestHeader;
@@ -104,7 +105,7 @@ public class ViewImage {
 if (image == null) {
 byte[] resourceb = null;
 if (url != null) try {
-resourceb = sb.loader.loadContent(sb.loader.request(url, false, true), CacheStrategy.IFEXIST, BlacklistType.SEARCH, CrawlQueues.queuedMinLoadDelay);
+resourceb = sb.loader.loadContent(sb.loader.request(url, false, true), CacheStrategy.IFEXIST, BlacklistType.SEARCH, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
 } catch (final IOException e) {
 Log.logFine("ViewImage", "cannot load: " + e.getMessage());
 }

@@ -34,6 +34,7 @@ import javax.xml.parsers.ParserConfigurationException;
 import net.yacy.cora.document.MultiProtocolURI;
 import net.yacy.cora.federate.yacy.CacheStrategy;
+import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.crawler.data.CrawlQueues;
 import net.yacy.crawler.robots.RobotsTxtEntry;
@@ -96,7 +97,7 @@ public class getpageinfo {
 }
 net.yacy.document.Document scraper = null;
 if (u != null) try {
-scraper = sb.loader.loadDocument(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay);
+scraper = sb.loader.loadDocument(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
 } catch (final IOException e) {
 Log.logException(e);
 // bad things are possible, i.e. that the Server responds with "403 Bad Behavior"

@@ -34,6 +34,7 @@ import javax.xml.parsers.ParserConfigurationException;
 import net.yacy.cora.document.MultiProtocolURI;
 import net.yacy.cora.federate.yacy.CacheStrategy;
+import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.crawler.data.CrawlQueues;
 import net.yacy.crawler.robots.RobotsTxtEntry;
@@ -96,7 +97,7 @@ public class getpageinfo_p {
 }
 net.yacy.document.Document scraper = null;
 if (u != null) try {
-scraper = sb.loader.loadDocument(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay);
+scraper = sb.loader.loadDocument(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
 } catch (final IOException e) {
 Log.logException(e);
 // bad things are possible, i.e. that the Server responds with "403 Bad Behavior"

@@ -32,6 +32,7 @@ import net.yacy.cora.date.GenericFormatter;
 import net.yacy.cora.document.ASCII;
 import net.yacy.cora.federate.yacy.CacheStrategy;
 import net.yacy.cora.order.Base64Order;
+import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.crawler.data.CrawlQueues;
 import net.yacy.kelondro.data.citation.CitationReference;
@@ -97,7 +98,7 @@ public class webstructure {
 prop.put("references", 1);
 net.yacy.document.Document scraper = null;
 if (url != null) try {
-scraper = sb.loader.loadDocument(url, CacheStrategy.IFEXIST, null, CrawlQueues.queuedMinLoadDelay);
+scraper = sb.loader.loadDocument(url, CacheStrategy.IFEXIST, null, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
 } catch (final IOException e) {
 Log.logException(e);
 }

@@ -49,6 +49,7 @@ import net.yacy.cora.federate.opensearch.OpenSearchConnector;
 import net.yacy.cora.federate.yacy.CacheStrategy;
 import net.yacy.cora.geo.GeoLocation;
 import net.yacy.cora.lod.vocabulary.Tagging;
+import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.Domains;
 import net.yacy.cora.protocol.HeaderFramework;
 import net.yacy.cora.protocol.RequestHeader;
@@ -569,7 +570,7 @@ public class yacysearch {
 sb.loader.loadDocuments(
 sb.loader.request(urlentry.url(), true, false),
 CacheStrategy.IFEXIST,
-Integer.MAX_VALUE, BlacklistType.SEARCH, TextSnippet.snippetMinLoadDelay);
+Integer.MAX_VALUE, BlacklistType.SEARCH, TextSnippet.snippetMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
 } catch ( final IOException e ) {
 } catch ( final Parser.Failure e ) {
 }

@@ -32,6 +32,7 @@ import net.yacy.cora.document.ASCII;
 import net.yacy.cora.document.RSSMessage;
 import net.yacy.cora.document.analysis.Classification;
 import net.yacy.cora.document.analysis.Classification.ContentDomain;
+import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.Domains;
 import net.yacy.cora.protocol.HeaderFramework;
 import net.yacy.cora.protocol.RequestHeader;
@@ -187,7 +188,7 @@ public class yacysearchitem {
 // END interaction
 prop.putHTML("content_target", target);
-if (faviconURL != null && fileType == FileType.HTML) sb.loader.loadIfNotExistBackground(faviconURL, 1024 * 1024 * 10, null, TextSnippet.snippetMinLoadDelay);
+if (faviconURL != null && fileType == FileType.HTML) sb.loader.loadIfNotExistBackground(faviconURL, 1024 * 1024 * 10, null, TextSnippet.snippetMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
 prop.putHTML("content_faviconCode", URLLicense.aquireLicense(faviconURL)); // acquire license for favicon url loading
 prop.put("content_urlhash", resulthashString);
 prop.put("content_ranking", result.ranking());
@@ -271,7 +272,7 @@ public class yacysearchitem {
 final String target = sb.getConfig(resultUrlstring.matches(target_special_pattern) ? SwitchboardConstants.SEARCH_TARGET_SPECIAL : SwitchboardConstants.SEARCH_TARGET_DEFAULT, "_self");
 final String license = URLLicense.aquireLicense(ms.url());
-sb.loader.loadIfNotExistBackground(ms.url(), 1024 * 1024 * 10, null, TextSnippet.snippetMinLoadDelay);
+sb.loader.loadIfNotExistBackground(ms.url(), 1024 * 1024 * 10, null, TextSnippet.snippetMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
 prop.putHTML("content_item_hrefCache", (auth) ? "/ViewImage.png?url=" + resultUrlstring : resultUrlstring);
 prop.putHTML("content_item_href", resultUrlstring);
 prop.putHTML("content_item_target", target);

@@ -421,40 +421,40 @@ public class Balancer {
 long sleeptime = 0;
 Request crawlEntry = null;
 CrawlProfile profileEntry = null;
-synchronized (this) {
-byte[] failhash = null;
-while (!this.urlFileIndex.isEmpty()) {
-byte[] nexthash = getbest(robots);
+byte[] failhash = null;
+while (!this.urlFileIndex.isEmpty()) {
+byte[] nexthash = getbest(robots);
+synchronized (this) {
 if (nexthash == null) return null;
 Row.Entry rowEntry = (nexthash == null) ? null : this.urlFileIndex.remove(nexthash);
 if (rowEntry == null) continue;
 crawlEntry = new Request(rowEntry);
 //Log.logInfo("Balancer", "fetched next url: " + crawlEntry.url().toNormalform(true, false));
 // check blacklist (again) because the user may have created blacklist entries after the queue has been filled
 if (Switchboard.urlBlacklist.isListed(BlacklistType.CRAWLER, crawlEntry.url())) {
 Log.logFine("CRAWLER", "URL '" + crawlEntry.url() + "' is in blacklist.");
 continue;
 }
 // at this point we must check if the crawlEntry has relevance because the crawl profile still exists
 // if not: return null. A calling method must handle the null value and try again
 profileEntry = cs.getActive(UTF8.getBytes(crawlEntry.profileHandle()));
 if (profileEntry == null) {
 Log.logWarning("Balancer", "no profile entry for handle " + crawlEntry.profileHandle());
 continue;
 }
 // depending on the caching policy we need sleep time to avoid DoS-like situations
 sleeptime = getDomainSleepTime(robots, profileEntry, crawlEntry.url());
 assert Base64Order.enhancedCoder.equal(nexthash, rowEntry.getPrimaryKeyBytes()) : "result = " + ASCII.String(nexthash) + ", rowEntry.getPrimaryKeyBytes() = " + ASCII.String(rowEntry.getPrimaryKeyBytes());
 assert Base64Order.enhancedCoder.equal(nexthash, crawlEntry.url().hash()) : "result = " + ASCII.String(nexthash) + ", crawlEntry.url().hash() = " + ASCII.String(crawlEntry.url().hash());
 if (failhash != null && Base64Order.enhancedCoder.equal(failhash, nexthash)) break; // prevent endless loops
 break;
 }
 }
 if (crawlEntry == null) return null;
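
The Balancer hunk above is the "shifted synchronized" change from the commit title: getbest(robots), which can take long because it evaluates per-host politeness delays, now runs before the lock is acquired, and only the removal from urlFileIndex happens inside synchronized (this). A minimal sketch of that lock-narrowing pattern, with illustrative names only (not the actual Balancer code):

import java.util.concurrent.ConcurrentLinkedQueue;

// Illustrative sketch of narrowing a lock's scope, not the actual Balancer code.
public class LockScopeSketch {
    private final ConcurrentLinkedQueue<String> queue = new ConcurrentLinkedQueue<>();

    // stands in for Balancer.getbest(robots): potentially slow selection work
    private String getbest() throws InterruptedException {
        Thread.sleep(10); // e.g. waiting out per-host crawl delays
        return queue.peek();
    }

    public String next() throws InterruptedException {
        while (!queue.isEmpty()) {
            String candidate = getbest();   // slow part: no lock held, other threads stay free
            synchronized (this) {           // lock only around the actual state change
                if (candidate == null) return null;
                if (queue.remove(candidate)) return candidate;
                // candidate was taken by another thread in between: loop and retry
            }
        }
        return null;
    }
}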

@@ -41,6 +41,7 @@ import net.yacy.cora.document.RSSFeed;
 import net.yacy.cora.document.UTF8;
 import net.yacy.cora.federate.yacy.CacheStrategy;
 import net.yacy.cora.order.Base64Order;
+import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.ConnectionInfo;
 import net.yacy.crawler.HarvestProcess;
 import net.yacy.crawler.data.NoticedURL.StackType;
@@ -653,7 +654,7 @@ public class CrawlQueues {
 try {
 this.request.setStatus("loading", WorkflowJob.STATUS_RUNNING);
 final CrawlProfile e = CrawlQueues.this.sb.crawler.getActive(UTF8.getBytes(this.request.profileHandle()));
-final Response response = CrawlQueues.this.sb.loader.load(this.request, e == null ? CacheStrategy.IFEXIST : e.cacheStrategy(), BlacklistType.CRAWLER, queuedMinLoadDelay);
+final Response response = CrawlQueues.this.sb.loader.load(this.request, e == null ? CacheStrategy.IFEXIST : e.cacheStrategy(), BlacklistType.CRAWLER, queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
 if (response == null) {
 this.request.setStatus("error", WorkflowJob.STATUS_FINISHED);
 if (CrawlQueues.this.log.isFine()) {

@@ -70,15 +70,15 @@ public final class HTTPLoader {
 this.socketTimeout = (int) sb.getConfigLong("crawler.clientTimeout", 30000);
 }
-public Response load(final Request entry, final int maxFileSize, final BlacklistType blacklistType) throws IOException {
+public Response load(final Request entry, final int maxFileSize, final BlacklistType blacklistType, int timeout) throws IOException {
 Latency.updateBeforeLoad(entry.url());
 final long start = System.currentTimeMillis();
-final Response doc = load(entry, DEFAULT_CRAWLING_RETRY_COUNT, maxFileSize, blacklistType);
+final Response doc = load(entry, DEFAULT_CRAWLING_RETRY_COUNT, maxFileSize, blacklistType, timeout);
 Latency.updateAfterLoad(entry.url(), System.currentTimeMillis() - start);
 return doc;
 }
-private Response load(final Request request, final int retryCount, final int maxFileSize, final BlacklistType blacklistType) throws IOException {
+private Response load(final Request request, final int retryCount, final int maxFileSize, final BlacklistType blacklistType, int timeout) throws IOException {
 byte[] myHash = ASCII.getBytes(this.sb.peers.mySeed().hash);
@@ -127,7 +127,7 @@ public final class HTTPLoader {
 requestHeader.put(HeaderFramework.ACCEPT_ENCODING, this.sb.getConfig("crawler.http.acceptEncoding", DEFAULT_ENCODING));
 // HTTP-Client
-final HTTPClient client = new HTTPClient(ClientIdentification.getUserAgent(), ClientIdentification.DEFAULT_TIMEOUT);
+final HTTPClient client = new HTTPClient(ClientIdentification.getUserAgent(), timeout);
 client.setRedirecting(false); // we want to handle redirection ourselves, so we don't index pages twice
 client.setTimout(this.socketTimeout);
 client.setHeader(requestHeader.entrySet());
@@ -180,7 +180,7 @@ public final class HTTPLoader {
 // retry crawling with new url
 request.redirectURL(redirectionUrl);
-return load(request, retryCount - 1, maxFileSize, blacklistType);
+return load(request, retryCount - 1, maxFileSize, blacklistType, timeout);
 }
 // we don't want to follow redirects
 this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode);

@@ -39,6 +39,7 @@ import net.yacy.cora.document.RSSReader;
 import net.yacy.cora.document.UTF8;
 import net.yacy.cora.federate.yacy.CacheStrategy;
 import net.yacy.cora.order.Base64Order;
+import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.storage.ARC;
 import net.yacy.cora.storage.ComparableARC;
 import net.yacy.cora.util.SpaceExceededException;
@@ -70,7 +71,7 @@ public class RSSLoader extends Thread {
 public void run() {
 RSSReader rss = null;
 try {
-final Response response = this.sb.loader.load(this.sb.loader.request(this.urlf, true, false), CacheStrategy.NOCACHE, Integer.MAX_VALUE, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay);
+final Response response = this.sb.loader.load(this.sb.loader.request(this.urlf, true, false), CacheStrategy.NOCACHE, Integer.MAX_VALUE, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
 final byte[] resource = response == null ? null : response.getContent();
 rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource);
 } catch (final MalformedURLException e) {

@@ -165,7 +165,7 @@ public class RobotsTxt {
 if (log.isDebugEnabled()) log.debug("Trying to download the robots.txt file from URL '" + robotsURL + "'.");
 Request request = new Request(robotsURL, null);
 try {
-response = this.loader.load(request, CacheStrategy.NOCACHE, null, 0);
+response = this.loader.load(request, CacheStrategy.NOCACHE, null, 0, 3000);
 } catch (Throwable e) {
 log.info("Trying to download the robots.txt file from URL '" + robotsURL + "' failed - " + e.getMessage());
 response = null;
@@ -221,7 +221,7 @@ public class RobotsTxt {
 if (log.isDebugEnabled()) log.debug("Trying to download the robots.txt file from URL '" + robotsURL + "'.");
 Request request = new Request(robotsURL, null);
 try {
-response = RobotsTxt.this.loader.load(request, CacheStrategy.NOCACHE, null, 0);
+response = RobotsTxt.this.loader.load(request, CacheStrategy.NOCACHE, null, 0, 3000);
 } catch (IOException e) {
 response = null;
 }
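
These two RobotsTxt hunks carry the "reduced time-out of robots.txt load limit" from the commit title: the fetch is now capped at 3000 ms instead of the crawler's default timeout, and any failure degrades to "no robots.txt", so an unresponsive host cannot stall queue processing. A self-contained sketch of the same fast-fail policy, using plain java.net purely for illustration (YaCy itself goes through its LoaderDispatcher, as in the hunks):

import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;

// Illustrative fast-fail robots.txt fetch; YaCy uses its own HTTP client, not java.net.
public class RobotsFetchSketch {
    static byte[] fetchRobots(String host) {
        try {
            HttpURLConnection c = (HttpURLConnection) new URL("http://" + host + "/robots.txt").openConnection();
            c.setConnectTimeout(3000); // hard 3 s cap, matching the hunks above
            c.setReadTimeout(3000);
            try (InputStream in = c.getInputStream()) {
                return in.readAllBytes();
            }
        } catch (Exception e) {
            return null; // timeout or error: crawl proceeds as if no robots.txt exists
        }
    }

    public static void main(String[] args) {
        byte[] robots = fetchRobots("example.org");
        System.out.println(robots == null ? "no robots.txt (or fetch failed fast)" : robots.length + " bytes");
    }
}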

@@ -11,6 +11,7 @@ import java.util.TreeSet;
 import java.util.concurrent.ArrayBlockingQueue;
 import net.yacy.cora.federate.yacy.CacheStrategy;
+import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.crawler.retrieval.Response;
 import net.yacy.document.Condenser;
 import net.yacy.document.Document;
@@ -67,7 +68,7 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
 Log.logWarning(YMarkTables.BOOKMARKS_LOG, "loadDocument failed due to malformed url: "+url);
 return null;
 }
-response = loader.load(loader.request(uri, true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null, TextSnippet.snippetMinLoadDelay);
+response = loader.load(loader.request(uri, true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null, TextSnippet.snippetMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
 try {
 return Document.mergeDocuments(response.url(), response.getMimeType(), response.parse());
 } catch (final Failure e) {

@@ -33,6 +33,7 @@ import java.util.EnumMap;
 import net.yacy.cora.date.ISO8601Formatter;
 import net.yacy.cora.document.UTF8;
 import net.yacy.cora.federate.yacy.CacheStrategy;
+import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.crawler.retrieval.Response;
 import net.yacy.document.Document;
 import net.yacy.document.Parser.Failure;
@@ -98,7 +99,7 @@ public class YMarkMetadata {
 public Document loadDocument(final LoaderDispatcher loader) throws IOException, Failure {
 if(this.document == null) {
 Response response = null;
-response = loader.load(loader.request(this.uri, true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null, TextSnippet.snippetMinLoadDelay);
+response = loader.load(loader.request(this.uri, true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null, TextSnippet.snippetMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
 this.document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse());
 }
 return this.document;

@@ -41,6 +41,7 @@ import javax.xml.parsers.SAXParserFactory;
 import net.yacy.cora.document.UTF8;
 import net.yacy.cora.federate.yacy.CacheStrategy;
+import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.crawler.retrieval.Response;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.logging.Log;
@@ -63,7 +64,7 @@ public class OAIListFriendsLoader implements Serializable {
 listFriends.putAll(moreFriends);
 if (loader != null) for (final Map.Entry<String, File> oaiFriend: listFriends.entrySet()) {
 try {
-loader.loadIfNotExistBackground(new DigestURI(oaiFriend.getKey()), oaiFriend.getValue(), Integer.MAX_VALUE, null, TextSnippet.snippetMinLoadDelay);
+loader.loadIfNotExistBackground(new DigestURI(oaiFriend.getKey()), oaiFriend.getValue(), Integer.MAX_VALUE, null, TextSnippet.snippetMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
 } catch (final MalformedURLException e) {
 }
 }
@@ -88,7 +89,7 @@ public class OAIListFriendsLoader implements Serializable {
 Map<String, String> m;
 for (final Map.Entry<String, File> oaiFriend: listFriends.entrySet()) try {
 if (!oaiFriend.getValue().exists()) {
-final Response response = loader == null ? null : loader.load(loader.request(new DigestURI(oaiFriend.getKey()), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, TextSnippet.snippetMinLoadDelay);
+final Response response = loader == null ? null : loader.load(loader.request(new DigestURI(oaiFriend.getKey()), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, TextSnippet.snippetMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
 if (response != null) FileUtils.copy(response.getContent(), oaiFriend.getValue());
 }

@@ -26,6 +26,7 @@ import java.io.File;
 import java.io.IOException;
 import net.yacy.cora.federate.yacy.CacheStrategy;
+import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.crawler.retrieval.Response;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.logging.Log;
@@ -55,7 +56,7 @@ public class OAIPMHLoader {
 for (int i = 0; i < 5; i++) {
 // make some retries if first attempt fails
 try {
-response = loader.load(loader.request(source, false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, TextSnippet.snippetMinLoadDelay);
+response = loader.load(loader.request(source, false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, TextSnippet.snippetMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
 break;
 } catch (IOException e) {
 Log.logWarning("OAIPMHLoader", "loading failed at attempt " + (i + 1) + ": " + source.toNormalform(true));

@@ -35,6 +35,7 @@ import java.util.List;
 import javax.imageio.ImageIO;
 import net.yacy.cora.federate.yacy.CacheStrategy;
+import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.crawler.data.Cache;
 import net.yacy.crawler.retrieval.Response;
 import net.yacy.kelondro.data.meta.DigestURI;
@@ -113,7 +114,7 @@ public class OSMTile {
 // download resource using the crawler and keep resource in memory if possible
 Response entry = null;
 try {
-entry = Switchboard.getSwitchboard().loader.load(Switchboard.getSwitchboard().loader.request(tileURL, false, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null, TextSnippet.snippetMinLoadDelay);
+entry = Switchboard.getSwitchboard().loader.load(Switchboard.getSwitchboard().loader.request(tileURL, false, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null, TextSnippet.snippetMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
 } catch (final IOException e) {
 Log.logWarning("OSMTile", "cannot load: " + e.getMessage());
 return null;

@@ -239,7 +239,7 @@ public final class yacyRelease extends yacyVersion {
 try {
 final DigestURI uri = location.getLocationURL();
 Thread.currentThread().setName("allReleaseFrom - host " + uri.getHost()); // makes it more easy to see which release blocks process in thread dump
-scraper = Switchboard.getSwitchboard().loader.loadDocument(uri, CacheStrategy.NOCACHE, null, CrawlQueues.queuedMinLoadDelay);
+scraper = Switchboard.getSwitchboard().loader.loadDocument(uri, CacheStrategy.NOCACHE, null, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
 } catch (final IOException e) {
 return null;
 }

@@ -132,9 +132,9 @@ public final class LoaderDispatcher {
 0);
 }
-public void load(final DigestURI url, final CacheStrategy cacheStratgy, final int maxFileSize, final File targetFile, BlacklistType blacklistType, final long minDelay) throws IOException {
-final byte[] b = load(request(url, false, true), cacheStratgy, maxFileSize, blacklistType, minDelay).getContent();
+public void load(final DigestURI url, final CacheStrategy cacheStratgy, final int maxFileSize, final File targetFile, BlacklistType blacklistType, final long minDelay, int timeout) throws IOException {
+final byte[] b = load(request(url, false, true), cacheStratgy, maxFileSize, blacklistType, minDelay, timeout).getContent();
 if (b == null) throw new IOException("load == null");
 final File tmp = new File(targetFile.getAbsolutePath() + ".tmp");
@@ -145,11 +145,11 @@ public final class LoaderDispatcher {
 tmp.renameTo(targetFile);
 }
-public Response load(final Request request, final CacheStrategy cacheStrategy, final BlacklistType blacklistType, final long minDelay) throws IOException {
-return load(request, cacheStrategy, protocolMaxFileSize(request.url()), blacklistType, minDelay);
+public Response load(final Request request, final CacheStrategy cacheStrategy, final BlacklistType blacklistType, final long minDelay, int timeout) throws IOException {
+return load(request, cacheStrategy, protocolMaxFileSize(request.url()), blacklistType, minDelay, timeout);
 }
-public Response load(final Request request, final CacheStrategy cacheStrategy, final int maxFileSize, final BlacklistType blacklistType, final long minDelay) throws IOException {
+public Response load(final Request request, final CacheStrategy cacheStrategy, final int maxFileSize, final BlacklistType blacklistType, final long minDelay, int timeout) throws IOException {
 Semaphore check = this.loaderSteering.get(request.url());
 if (check != null) {
 // a loading process may be going on for that url
@@ -160,7 +160,7 @@ public final class LoaderDispatcher {
 this.loaderSteering.put(request.url(), new Semaphore(0));
 try {
-final Response response = loadInternal(request, cacheStrategy, maxFileSize, blacklistType, minDelay);
+final Response response = loadInternal(request, cacheStrategy, maxFileSize, blacklistType, minDelay, timeout);
 check = this.loaderSteering.remove(request.url());
 if (check != null) check.release(1000);
 return response;
@@ -180,7 +180,7 @@ public final class LoaderDispatcher {
 * @return the loaded entity in a Response object
 * @throws IOException
 */
-private Response loadInternal(final Request request, CacheStrategy cacheStrategy, final int maxFileSize, final BlacklistType blacklistType, final long minDelay) throws IOException {
+private Response loadInternal(final Request request, CacheStrategy cacheStrategy, final int maxFileSize, final BlacklistType blacklistType, final long minDelay, int timeout) throws IOException {
 // get the protocol of the next URL
 final DigestURI url = request.url();
 if (url.isFile() || url.isSMB()) cacheStrategy = CacheStrategy.NOCACHE; // load just from the file system
@@ -280,7 +280,7 @@ public final class LoaderDispatcher {
 // load resource from the internet
 Response response = null;
 if (protocol.equals("http") || protocol.equals("https")) {
-response = this.httpLoader.load(request, maxFileSize, blacklistType);
+response = this.httpLoader.load(request, maxFileSize, blacklistType, timeout);
 } else if (protocol.equals("ftp")) {
 response = this.ftpLoader.load(request, true);
 } else if (protocol.equals("smb")) {
@@ -335,19 +335,19 @@ public final class LoaderDispatcher {
 * @return the content as {@link byte[]}
 * @throws IOException
 */
-public byte[] loadContent(final Request request, final CacheStrategy cacheStrategy, BlacklistType blacklistType, final long minDelay) throws IOException {
+public byte[] loadContent(final Request request, final CacheStrategy cacheStrategy, BlacklistType blacklistType, final long minDelay, int timeout) throws IOException {
 // try to download the resource using the loader
-final Response entry = load(request, cacheStrategy, blacklistType, minDelay);
+final Response entry = load(request, cacheStrategy, blacklistType, minDelay, timeout);
 if (entry == null) return null; // not found in web
 // read resource body (if it is there)
 return entry.getContent();
 }
-public Document[] loadDocuments(final Request request, final CacheStrategy cacheStrategy, final int maxFileSize, BlacklistType blacklistType, final long minDelay) throws IOException, Parser.Failure {
+public Document[] loadDocuments(final Request request, final CacheStrategy cacheStrategy, final int maxFileSize, BlacklistType blacklistType, final long minDelay, int timeout) throws IOException, Parser.Failure {
 // load resource
-final Response response = load(request, cacheStrategy, maxFileSize, blacklistType, minDelay);
+final Response response = load(request, cacheStrategy, maxFileSize, blacklistType, minDelay, timeout);
 final DigestURI url = request.url();
 if (response == null) throw new IOException("no Response for url " + url);
@@ -358,10 +358,10 @@ public final class LoaderDispatcher {
 return response.parse();
 }
-public Document loadDocument(final DigestURI location, final CacheStrategy cachePolicy, BlacklistType blacklistType, final long minDelay) throws IOException {
+public Document loadDocument(final DigestURI location, final CacheStrategy cachePolicy, BlacklistType blacklistType, final long minDelay, int timeout) throws IOException {
 // load resource
 Request request = request(location, true, false);
-final Response response = this.load(request, cachePolicy, blacklistType, minDelay);
+final Response response = this.load(request, cachePolicy, blacklistType, minDelay, timeout);
 final DigestURI url = request.url();
 if (response == null) throw new IOException("no Response for url " + url);
@@ -384,8 +384,8 @@ public final class LoaderDispatcher {
 * @return a map from URLs to the anchor texts of the urls
 * @throws IOException
 */
-public final Map<DigestURI, String> loadLinks(final DigestURI url, final CacheStrategy cacheStrategy, BlacklistType blacklistType, final long minDelay) throws IOException {
-final Response response = load(request(url, true, false), cacheStrategy, Integer.MAX_VALUE, blacklistType, minDelay);
+public final Map<DigestURI, String> loadLinks(final DigestURI url, final CacheStrategy cacheStrategy, BlacklistType blacklistType, final long minDelay, int timeout) throws IOException {
+final Response response = load(request(url, true, false), cacheStrategy, Integer.MAX_VALUE, blacklistType, minDelay, timeout);
 if (response == null) throw new IOException("response == null");
 final ResponseHeader responseHeader = response.getResponseHeader();
 if (response.getContent() == null) throw new IOException("resource == null");
@@ -414,12 +414,12 @@ public final class LoaderDispatcher {
 }
 }
-public void loadIfNotExistBackground(final DigestURI url, final File cache, final int maxFileSize, BlacklistType blacklistType, final long minDelay) {
-new Loader(url, cache, maxFileSize, CacheStrategy.IFEXIST, blacklistType, minDelay).start();
+public void loadIfNotExistBackground(final DigestURI url, final File cache, final int maxFileSize, BlacklistType blacklistType, final long minDelay, final int timeout) {
+new Loader(url, cache, maxFileSize, CacheStrategy.IFEXIST, blacklistType, minDelay, timeout).start();
 }
-public void loadIfNotExistBackground(final DigestURI url, final int maxFileSize, BlacklistType blacklistType, final long minDelay) {
-new Loader(url, null, maxFileSize, CacheStrategy.IFEXIST, blacklistType, minDelay).start();
+public void loadIfNotExistBackground(final DigestURI url, final int maxFileSize, BlacklistType blacklistType, final long minDelay, int timeout) {
+new Loader(url, null, maxFileSize, CacheStrategy.IFEXIST, blacklistType, minDelay, timeout).start();
 }
 private class Loader extends Thread {
@@ -430,14 +430,16 @@ public final class LoaderDispatcher {
 private final CacheStrategy cacheStrategy;
 private final BlacklistType blacklistType;
 private final long minDelay;
+private final int timeout;
-public Loader(final DigestURI url, final File cache, final int maxFileSize, final CacheStrategy cacheStrategy, BlacklistType blacklistType, final long minDelay) {
+public Loader(final DigestURI url, final File cache, final int maxFileSize, final CacheStrategy cacheStrategy, BlacklistType blacklistType, final long minDelay, final int timeout) {
 this.url = url;
 this.cache = cache;
 this.maxFileSize = maxFileSize;
 this.cacheStrategy = cacheStrategy;
 this.blacklistType = blacklistType;
 this.minDelay = minDelay;
+this.timeout = timeout;
 }
 @Override
@@ -445,7 +447,7 @@ public final class LoaderDispatcher {
 if (this.cache != null && this.cache.exists()) return;
 try {
 // load from the net
-final Response response = load(request(this.url, false, true), this.cacheStrategy, this.maxFileSize, this.blacklistType, this.minDelay);
+final Response response = load(request(this.url, false, true), this.cacheStrategy, this.maxFileSize, this.blacklistType, this.minDelay, this.timeout);
 final byte[] b = response.getContent();
 if (this.cache != null) FileUtils.copy(b, this.cache);
 } catch (final MalformedURLException e) {} catch (final IOException e) {}

@@ -2878,7 +2878,7 @@ public final class Switchboard extends serverSwitch {
 // get a scraper to get the title
 Document scraper;
 try {
-scraper = this.loader.loadDocument(url, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay);
+scraper = this.loader.loadDocument(url, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
 } catch (IOException e) {
 return "scraper cannot load URL: " + e.getMessage();
 }
@@ -2985,7 +2985,7 @@ public final class Switchboard extends serverSwitch {
 String urlName = url.toNormalform(true);
 Thread.currentThread().setName("Switchboard.addToIndex:" + urlName);
 try {
-final Response response = Switchboard.this.loader.load(request, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay);
+final Response response = Switchboard.this.loader.load(request, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
 if (response == null) {
 throw new IOException("response == null");
 }
@@ -3372,7 +3372,7 @@ public final class Switchboard extends serverSwitch {
 final Map<DigestURI, String> links;
 searchEvent.oneFeederStarted();
 try {
-links = Switchboard.this.loader.loadLinks(url, CacheStrategy.NOCACHE, BlacklistType.SEARCH, TextSnippet.snippetMinLoadDelay);
+links = Switchboard.this.loader.loadLinks(url, CacheStrategy.NOCACHE, BlacklistType.SEARCH, TextSnippet.snippetMinLoadDelay, 2000);
 if ( links != null ) {
 final Iterator<DigestURI> i = links.keySet().iterator();
 while ( i.hasNext() ) {
@@ -3411,7 +3411,7 @@ public final class Switchboard extends serverSwitch {
 final Map<DigestURI, String> links;
 DigestURI url;
 try {
-links = Switchboard.this.loader.loadLinks(startUrl, CacheStrategy.IFFRESH, BlacklistType.SEARCH, TextSnippet.snippetMinLoadDelay);
+links = Switchboard.this.loader.loadLinks(startUrl, CacheStrategy.IFFRESH, BlacklistType.SEARCH, TextSnippet.snippetMinLoadDelay, 2000);
 if (links != null) {
 if (links.size() < 1000) { // limit to 1000 to skip large index pages
 final Iterator<DigestURI> i = links.keySet().iterator();
@@ -3476,7 +3476,7 @@ public final class Switchboard extends serverSwitch {
 searchEvent.oneFeederStarted();
 try {
 final Response response =
-Switchboard.this.loader.load(Switchboard.this.loader.request(url, true, false), CacheStrategy.NOCACHE, BlacklistType.SEARCH, TextSnippet.snippetMinLoadDelay);
+Switchboard.this.loader.load(Switchboard.this.loader.request(url, true, false), CacheStrategy.NOCACHE, BlacklistType.SEARCH, TextSnippet.snippetMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
 final byte[] resource = (response == null) ? null : response.getContent();
 //System.out.println("BLEKKO: " + UTF8.String(resource));
 rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource);

@@ -48,6 +48,7 @@ import net.yacy.cora.federate.solr.connector.AbstractSolrConnector;
 import net.yacy.cora.federate.yacy.CacheStrategy;
 import net.yacy.cora.order.Base64Order;
 import net.yacy.cora.order.ByteOrder;
+import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.ResponseHeader;
 import net.yacy.cora.storage.HandleSet;
 import net.yacy.cora.util.LookAheadIterator;
@@ -694,7 +695,7 @@ public class Segment {
 try {
 // parse the resource
-final Document document = Document.mergeDocuments(url, null, loader.loadDocuments(loader.request(url, true, false), cacheStrategy, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay));
+final Document document = Document.mergeDocuments(url, null, loader.loadDocuments(loader.request(url, true, false), cacheStrategy, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT));
 if (document == null) {
 // delete just the url entry
 fulltext().remove(urlhash);

@@ -40,6 +40,7 @@ import net.yacy.cora.document.analysis.Classification;
 import net.yacy.cora.document.analysis.Classification.ContentDomain;
 import net.yacy.cora.federate.yacy.CacheStrategy;
 import net.yacy.cora.order.Base64Order;
+import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.storage.HandleSet;
 import net.yacy.cora.util.NumberTools;
 import net.yacy.cora.util.SpaceExceededException;
@@ -142,7 +143,7 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaS
 Document document;
 try {
-document = Document.mergeDocuments(url, null, Switchboard.getSwitchboard().loader.loadDocuments(Switchboard.getSwitchboard().loader.request(url, false, reindexing), cacheStrategy, Integer.MAX_VALUE, BlacklistType.SEARCH, TextSnippet.snippetMinLoadDelay));
+document = Document.mergeDocuments(url, null, Switchboard.getSwitchboard().loader.loadDocuments(Switchboard.getSwitchboard().loader.request(url, false, reindexing), cacheStrategy, Integer.MAX_VALUE, BlacklistType.SEARCH, TextSnippet.snippetMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT));
 } catch (final IOException e) {
 Log.logFine("snippet fetch", "load error: " + e.getMessage());
 return new ArrayList<MediaSnippet>();

@@ -204,7 +204,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
 final Request request = loader == null ? null : loader.request(url, true, reindexing);
 Response response;
 try {
-response = loader == null || request == null ? null : loader.load(request, CacheStrategy.CACHEONLY, BlacklistType.SEARCH, snippetMinLoadDelay);
+response = loader == null || request == null ? null : loader.load(request, CacheStrategy.CACHEONLY, BlacklistType.SEARCH, snippetMinLoadDelay, 3000);
 } catch (IOException e1) {
 response = null;
 }
@@ -258,7 +258,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
 // try to load the resource from the cache
 Response response = null;
 try {
-response = loader == null ? null : loader.load(loader.request(url, true, reindexing), (url.isFile() || url.isSMB()) ? CacheStrategy.NOCACHE : (cacheStrategy == null ? CacheStrategy.CACHEONLY : cacheStrategy), BlacklistType.SEARCH, snippetMinLoadDelay);
+response = loader == null ? null : loader.load(loader.request(url, true, reindexing), (url.isFile() || url.isSMB()) ? CacheStrategy.NOCACHE : (cacheStrategy == null ? CacheStrategy.CACHEONLY : cacheStrategy), BlacklistType.SEARCH, snippetMinLoadDelay, 3000);
 } catch (IOException e) {
 response = null;
 }
