diff --git a/htroot/Bookmarks.java b/htroot/Bookmarks.java
index 881e14dd6..7c942a88a 100644
--- a/htroot/Bookmarks.java
+++ b/htroot/Bookmarks.java
@@ -41,6 +41,7 @@ import java.util.Set;
 import net.yacy.cora.date.ISO8601Formatter;
 import net.yacy.cora.document.ASCII;
 import net.yacy.cora.federate.yacy.CacheStrategy;
+import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.HeaderFramework;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.data.BookmarkHelper;
@@ -201,7 +202,7 @@ public class Bookmarks {
             // try to get the bookmark from the LURL database
             final URIMetadataNode urlentry = sb.index.fulltext().getMetadata(ASCII.getBytes(urlHash));
             if (urlentry != null) try {
-                final Document document = Document.mergeDocuments(urlentry.url(), null, sb.loader.loadDocuments(sb.loader.request(urlentry.url(), true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null, TextSnippet.snippetMinLoadDelay));
+                final Document document = Document.mergeDocuments(urlentry.url(), null, sb.loader.loadDocuments(sb.loader.request(urlentry.url(), true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null, TextSnippet.snippetMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT));
                 prop.put("mode_edit", "0"); // create mode
                 prop.put("mode_url", urlentry.url().toNormalform(false));
                 prop.putHTML("mode_title", urlentry.dc_title());
diff --git a/htroot/CrawlCheck_p.java b/htroot/CrawlCheck_p.java
index 3a7f4689a..7e89a066f 100644
--- a/htroot/CrawlCheck_p.java
+++ b/htroot/CrawlCheck_p.java
@@ -25,6 +25,7 @@ import java.util.Set;
 import java.util.regex.Pattern;
 
 import net.yacy.cora.federate.yacy.CacheStrategy;
+import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.crawler.data.CrawlQueues;
 import net.yacy.crawler.retrieval.Request;
@@ -99,7 +100,7 @@ public class CrawlCheck_p {
                 // try to load the url
                 if (robotsAllowed) try {
                     Request request = sb.loader.request(u, true, false);
-                    final Response response = sb.loader.load(request, CacheStrategy.NOCACHE, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay);
+                    final Response response = sb.loader.load(request, CacheStrategy.NOCACHE, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
                     if (response == null) {
                         prop.put("table_list_" + row + "_access", "no response");
                     } else {
diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java
index 737d3feb7..e57664428 100644
--- a/htroot/Crawler_p.java
+++ b/htroot/Crawler_p.java
@@ -37,6 +37,7 @@ import java.util.regex.PatternSyntaxException;
 
 import net.yacy.cora.document.ASCII;
 import net.yacy.cora.federate.yacy.CacheStrategy;
+import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.util.SpaceExceededException;
 import net.yacy.crawler.data.CrawlProfile;
@@ -280,7 +281,7 @@ public class Crawler_p {
                 // download document
                 Document scraper;
                 try {
-                    scraper = sb.loader.loadDocument(sitelistURL, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay);
+                    scraper = sb.loader.loadDocument(sitelistURL, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
                     // get links and generate filter
                     for (DigestURI u: scraper.getAnchors().keySet()) {
                         newRootURLs.add(u);
diff --git a/htroot/DictionaryLoader_p.java b/htroot/DictionaryLoader_p.java
index 047e4d804..230d97648 100644
--- a/htroot/DictionaryLoader_p.java
+++ b/htroot/DictionaryLoader_p.java
@@ -24,6 +24,7 @@ import java.net.MalformedURLException;
 import net.yacy.cora.federate.yacy.CacheStrategy;
 import net.yacy.cora.geo.GeonamesLocation;
 import net.yacy.cora.geo.OpenGeoDBLocation;
+import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.crawler.data.CrawlQueues;
 import net.yacy.crawler.retrieval.Response;
@@ -66,7 +67,7 @@ public class DictionaryLoader_p {
         if (post.containsKey("geon0Load")) {
             // load from the net
             try {
-                final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay);
+                final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
                 final byte[] b = response.getContent();
                 FileUtils.copy(b, LibraryProvider.Dictionary.GEON0.file());
                 LibraryProvider.geoLoc.activateLocation(LibraryProvider.Dictionary.GEON0.nickname, new GeonamesLocation(LibraryProvider.Dictionary.GEON0.file(), null, -1));
@@ -108,7 +109,7 @@ public class DictionaryLoader_p {
         if (post.containsKey("geon1Load")) {
             // load from the net
             try {
-                final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON1.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay);
+                final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON1.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
                 final byte[] b = response.getContent();
                 FileUtils.copy(b, LibraryProvider.Dictionary.GEON1.file());
                 LibraryProvider.geoLoc.activateLocation(LibraryProvider.Dictionary.GEON1.nickname, new GeonamesLocation(LibraryProvider.Dictionary.GEON1.file(), null, -1));
@@ -150,7 +151,7 @@ public class DictionaryLoader_p {
         if (post.containsKey("geon2Load")) {
             // load from the net
             try {
-                final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON2.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay);
+                final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON2.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
                 final byte[] b = response.getContent();
                 FileUtils.copy(b, LibraryProvider.Dictionary.GEON2.file());
                 LibraryProvider.geoLoc.activateLocation(LibraryProvider.Dictionary.GEON2.nickname, new GeonamesLocation(LibraryProvider.Dictionary.GEON2.file(), null, 100000));
@@ -192,7 +193,7 @@ public class DictionaryLoader_p {
         if (post.containsKey("geo1Load")) {
             // load from the net
             try {
-                final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEODB1.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay);
+                final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEODB1.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
                 final byte[] b = response.getContent();
                 FileUtils.copy(b, LibraryProvider.Dictionary.GEODB1.file());
                 LibraryProvider.geoLoc.deactivateLocalization(LibraryProvider.Dictionary.GEODB1.nickname);
@@ -235,7 +236,7 @@ public class DictionaryLoader_p {
         if (post.containsKey("drw0Load")) {
             // load from the net
             try {
-                final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.DRW0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay);
+                final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.DRW0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
                 final byte[] b = response.getContent();
                 FileUtils.copy(b, LibraryProvider.Dictionary.DRW0.file());
                 LibraryProvider.activateDeReWo();
@@ -279,7 +280,7 @@ public class DictionaryLoader_p {
         if (post.containsKey("pnd0Load")) {
             // load from the net
             try {
-                final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.PND0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay);
+                final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.PND0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
                 final byte[] b = response.getContent();
                 FileUtils.copy(b, LibraryProvider.Dictionary.PND0.file());
                 LibraryProvider.activatePND();
diff --git a/htroot/Load_RSS_p.java b/htroot/Load_RSS_p.java
index 99bce927f..8908cf7cb 100644
--- a/htroot/Load_RSS_p.java
+++ b/htroot/Load_RSS_p.java
@@ -36,6 +36,7 @@ import net.yacy.cora.document.RSSMessage;
 import net.yacy.cora.document.RSSReader;
 import net.yacy.cora.document.UTF8;
 import net.yacy.cora.federate.yacy.CacheStrategy;
+import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.util.CommonPattern;
 import net.yacy.cora.util.SpaceExceededException;
@@ -266,7 +267,7 @@ public class Load_RSS_p {
         RSSReader rss = null;
         if (url != null) try {
             prop.put("url", url.toNormalform(true));
-            final Response response = sb.loader.load(sb.loader.request(url, true, false), CacheStrategy.NOCACHE, Integer.MAX_VALUE, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay);
+            final Response response = sb.loader.load(sb.loader.request(url, true, false), CacheStrategy.NOCACHE, Integer.MAX_VALUE, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
             final byte[] resource = response == null ? null : response.getContent();
             rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource);
         } catch (final IOException e) {
diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java
index da65a4a9d..b7ab8ec67 100644
--- a/htroot/ViewFile.java
+++ b/htroot/ViewFile.java
@@ -42,6 +42,7 @@ import net.yacy.cora.document.UTF8;
 import net.yacy.cora.federate.yacy.CacheStrategy;
 import net.yacy.cora.lod.JenaTripleStore;
 import net.yacy.cora.lod.vocabulary.YaCyMetadata;
+import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.crawler.data.Cache;
 import net.yacy.crawler.data.CrawlQueues;
@@ -164,7 +165,7 @@ public class ViewFile {
 
         Response response = null;
         try {
-            response = sb.loader.load(sb.loader.request(url, true, false), authorized ? CacheStrategy.IFEXIST : CacheStrategy.CACHEONLY, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay);
+            response = sb.loader.load(sb.loader.request(url, true, false), authorized ? CacheStrategy.IFEXIST : CacheStrategy.CACHEONLY, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
         } catch (final IOException e) {
             prop.put("error", "4");
             prop.put("error_errorText", "error loading resource: " + e.getMessage());
diff --git a/htroot/ViewImage.java b/htroot/ViewImage.java
index 9c45f7783..0e3cf8e02 100644
--- a/htroot/ViewImage.java
+++ b/htroot/ViewImage.java
@@ -33,6 +33,7 @@ import java.net.MalformedURLException;
 import java.util.Map;
 
 import net.yacy.cora.federate.yacy.CacheStrategy;
+import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.Domains;
 import net.yacy.cora.protocol.HeaderFramework;
 import net.yacy.cora.protocol.RequestHeader;
@@ -104,7 +105,7 @@ public class ViewImage {
         if (image == null) {
             byte[] resourceb = null;
             if (url != null) try {
-                resourceb = sb.loader.loadContent(sb.loader.request(url, false, true), CacheStrategy.IFEXIST, BlacklistType.SEARCH, CrawlQueues.queuedMinLoadDelay);
+                resourceb = sb.loader.loadContent(sb.loader.request(url, false, true), CacheStrategy.IFEXIST, BlacklistType.SEARCH, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
             } catch (final IOException e) {
                 Log.logFine("ViewImage", "cannot load: " + e.getMessage());
             }
diff --git a/htroot/api/getpageinfo.java b/htroot/api/getpageinfo.java
index 566e26b5e..9f688b7b0 100644
--- a/htroot/api/getpageinfo.java
+++ b/htroot/api/getpageinfo.java
@@ -34,6 +34,7 @@ import javax.xml.parsers.ParserConfigurationException;
 
 import net.yacy.cora.document.MultiProtocolURI;
 import net.yacy.cora.federate.yacy.CacheStrategy;
+import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.crawler.data.CrawlQueues;
 import net.yacy.crawler.robots.RobotsTxtEntry;
@@ -96,7 +97,7 @@ public class getpageinfo {
         }
         net.yacy.document.Document scraper = null;
         if (u != null) try {
-            scraper = sb.loader.loadDocument(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay);
+            scraper = sb.loader.loadDocument(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
         } catch (final IOException e) {
             Log.logException(e);
             // bad things are possible, i.e. that the Server responds with "403 Bad Behavior"
diff --git a/htroot/api/getpageinfo_p.java b/htroot/api/getpageinfo_p.java
index f071b57cd..46e3a77f5 100644
--- a/htroot/api/getpageinfo_p.java
+++ b/htroot/api/getpageinfo_p.java
@@ -34,6 +34,7 @@ import javax.xml.parsers.ParserConfigurationException;
 
 import net.yacy.cora.document.MultiProtocolURI;
 import net.yacy.cora.federate.yacy.CacheStrategy;
+import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.crawler.data.CrawlQueues;
 import net.yacy.crawler.robots.RobotsTxtEntry;
@@ -96,7 +97,7 @@ public class getpageinfo_p {
         }
         net.yacy.document.Document scraper = null;
         if (u != null) try {
-            scraper = sb.loader.loadDocument(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay);
+            scraper = sb.loader.loadDocument(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
         } catch (final IOException e) {
             Log.logException(e);
             // bad things are possible, i.e. that the Server responds with "403 Bad Behavior"
diff --git a/htroot/api/webstructure.java b/htroot/api/webstructure.java
index 5d834b87e..3e7f1bede 100644
--- a/htroot/api/webstructure.java
+++ b/htroot/api/webstructure.java
@@ -32,6 +32,7 @@ import net.yacy.cora.date.GenericFormatter;
 import net.yacy.cora.document.ASCII;
 import net.yacy.cora.federate.yacy.CacheStrategy;
 import net.yacy.cora.order.Base64Order;
+import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.crawler.data.CrawlQueues;
 import net.yacy.kelondro.data.citation.CitationReference;
@@ -97,7 +98,7 @@ public class webstructure {
         prop.put("references", 1);
         net.yacy.document.Document scraper = null;
         if (url != null) try {
-            scraper = sb.loader.loadDocument(url, CacheStrategy.IFEXIST, null, CrawlQueues.queuedMinLoadDelay);
+            scraper = sb.loader.loadDocument(url, CacheStrategy.IFEXIST, null, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
         } catch (final IOException e) {
             Log.logException(e);
         }
diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java
index 824c80f6d..2e03c3a56 100644
--- a/htroot/yacysearch.java
+++ b/htroot/yacysearch.java
@@ -49,6 +49,7 @@ import net.yacy.cora.federate.opensearch.OpenSearchConnector;
 import net.yacy.cora.federate.yacy.CacheStrategy;
 import net.yacy.cora.geo.GeoLocation;
 import net.yacy.cora.lod.vocabulary.Tagging;
+import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.Domains;
 import net.yacy.cora.protocol.HeaderFramework;
 import net.yacy.cora.protocol.RequestHeader;
@@ -569,7 +570,7 @@ public class yacysearch {
                     sb.loader.loadDocuments(
                         sb.loader.request(urlentry.url(), true, false),
                         CacheStrategy.IFEXIST,
-                        Integer.MAX_VALUE, BlacklistType.SEARCH, TextSnippet.snippetMinLoadDelay);
+                        Integer.MAX_VALUE, BlacklistType.SEARCH, TextSnippet.snippetMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
                 } catch ( final IOException e ) {
                 } catch ( final Parser.Failure e ) {
                 }
diff --git a/htroot/yacysearchitem.java b/htroot/yacysearchitem.java
index 2f4486092..e769a7225 100644
--- a/htroot/yacysearchitem.java
+++ b/htroot/yacysearchitem.java
@@ -32,6 +32,7 @@ import net.yacy.cora.document.ASCII;
 import net.yacy.cora.document.RSSMessage;
 import net.yacy.cora.document.analysis.Classification;
 import net.yacy.cora.document.analysis.Classification.ContentDomain;
+import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.Domains;
 import net.yacy.cora.protocol.HeaderFramework;
 import net.yacy.cora.protocol.RequestHeader;
@@ -187,7 +188,7 @@ public class yacysearchitem {
         // END interaction
 
         prop.putHTML("content_target", target);
-        if (faviconURL != null && fileType == FileType.HTML) sb.loader.loadIfNotExistBackground(faviconURL, 1024 * 1024 * 10, null, TextSnippet.snippetMinLoadDelay);
+        if (faviconURL != null && fileType == FileType.HTML) sb.loader.loadIfNotExistBackground(faviconURL, 1024 * 1024 * 10, null, TextSnippet.snippetMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
         prop.putHTML("content_faviconCode", URLLicense.aquireLicense(faviconURL)); // acquire license for favicon url loading
         prop.put("content_urlhash", resulthashString);
         prop.put("content_ranking", result.ranking());
@@ -271,7 +272,7 @@ public class yacysearchitem {
             final String target = sb.getConfig(resultUrlstring.matches(target_special_pattern) ? SwitchboardConstants.SEARCH_TARGET_SPECIAL : SwitchboardConstants.SEARCH_TARGET_DEFAULT, "_self");
             final String license = URLLicense.aquireLicense(ms.url());
-            sb.loader.loadIfNotExistBackground(ms.url(), 1024 * 1024 * 10, null, TextSnippet.snippetMinLoadDelay);
+            sb.loader.loadIfNotExistBackground(ms.url(), 1024 * 1024 * 10, null, TextSnippet.snippetMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
             prop.putHTML("content_item_hrefCache", (auth) ? "/ViewImage.png?url=" + resultUrlstring : resultUrlstring);
             prop.putHTML("content_item_href", resultUrlstring);
             prop.putHTML("content_item_target", target);
diff --git a/source/net/yacy/crawler/Balancer.java b/source/net/yacy/crawler/Balancer.java
index bf6491d75..b245d98de 100644
--- a/source/net/yacy/crawler/Balancer.java
+++ b/source/net/yacy/crawler/Balancer.java
@@ -421,40 +421,40 @@ public class Balancer {
         long sleeptime = 0;
         Request crawlEntry = null;
         CrawlProfile profileEntry = null;
-        synchronized (this) {
-            byte[] failhash = null;
-            while (!this.urlFileIndex.isEmpty()) {
-                byte[] nexthash = getbest(robots);
+        byte[] failhash = null;
+        while (!this.urlFileIndex.isEmpty()) {
+            byte[] nexthash = getbest(robots);
+            synchronized (this) {
                 if (nexthash == null) return null;
-
-                Row.Entry rowEntry = (nexthash == null) ? null : this.urlFileIndex.remove(nexthash);
-                if (rowEntry == null) continue;
-
+
+                Row.Entry rowEntry = (nexthash == null) ? null : this.urlFileIndex.remove(nexthash);
+                if (rowEntry == null) continue;
+
                 crawlEntry = new Request(rowEntry);
                 //Log.logInfo("Balancer", "fetched next url: " + crawlEntry.url().toNormalform(true, false));
-
-                // check blacklist (again) because the user may have created blacklist entries after the queue has been filled
-                if (Switchboard.urlBlacklist.isListed(BlacklistType.CRAWLER, crawlEntry.url())) {
-                    Log.logFine("CRAWLER", "URL '" + crawlEntry.url() + "' is in blacklist.");
-                    continue;
-                }
-
-                // at this point we must check if the crawlEntry has relevance because the crawl profile still exists
-                // if not: return null. A calling method must handle the null value and try again
-                profileEntry = cs.getActive(UTF8.getBytes(crawlEntry.profileHandle()));
-                if (profileEntry == null) {
-                    Log.logWarning("Balancer", "no profile entry for handle " + crawlEntry.profileHandle());
-                    continue;
-                }
-                // depending on the caching policy we need sleep time to avoid DoS-like situations
-                sleeptime = getDomainSleepTime(robots, profileEntry, crawlEntry.url());
-
-                assert Base64Order.enhancedCoder.equal(nexthash, rowEntry.getPrimaryKeyBytes()) : "result = " + ASCII.String(nexthash) + ", rowEntry.getPrimaryKeyBytes() = " + ASCII.String(rowEntry.getPrimaryKeyBytes());
-                assert Base64Order.enhancedCoder.equal(nexthash, crawlEntry.url().hash()) : "result = " + ASCII.String(nexthash) + ", crawlEntry.url().hash() = " + ASCII.String(crawlEntry.url().hash());
-
-                if (failhash != null && Base64Order.enhancedCoder.equal(failhash, nexthash)) break; // prevent endless loops
-                break;
-            }
+
+                // check blacklist (again) because the user may have created blacklist entries after the queue has been filled
+                if (Switchboard.urlBlacklist.isListed(BlacklistType.CRAWLER, crawlEntry.url())) {
+                    Log.logFine("CRAWLER", "URL '" + crawlEntry.url() + "' is in blacklist.");
+                    continue;
+                }
+
+                // at this point we must check if the crawlEntry has relevance because the crawl profile still exists
+                // if not: return null. A calling method must handle the null value and try again
+                profileEntry = cs.getActive(UTF8.getBytes(crawlEntry.profileHandle()));
+                if (profileEntry == null) {
+                    Log.logWarning("Balancer", "no profile entry for handle " + crawlEntry.profileHandle());
+                    continue;
+                }
+                // depending on the caching policy we need sleep time to avoid DoS-like situations
+                sleeptime = getDomainSleepTime(robots, profileEntry, crawlEntry.url());
+
+                assert Base64Order.enhancedCoder.equal(nexthash, rowEntry.getPrimaryKeyBytes()) : "result = " + ASCII.String(nexthash) + ", rowEntry.getPrimaryKeyBytes() = " + ASCII.String(rowEntry.getPrimaryKeyBytes());
+                assert Base64Order.enhancedCoder.equal(nexthash, crawlEntry.url().hash()) : "result = " + ASCII.String(nexthash) + ", crawlEntry.url().hash() = " + ASCII.String(crawlEntry.url().hash());
+
+                if (failhash != null && Base64Order.enhancedCoder.equal(failhash, nexthash)) break; // prevent endless loops
+                break;
+            }
         }
         if (crawlEntry == null) return null;
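The Balancer hunk above is the one change in this patch that is not timeout plumbing: the potentially blocking getbest(robots) call moves out of the synchronized block, so the Balancer monitor is held only for the short queue mutation. A minimal sketch of that locking pattern, with hypothetical names (a simplification, not the actual Balancer code):

    import java.util.concurrent.ConcurrentHashMap;
    import java.util.concurrent.ConcurrentMap;

    // Hypothetical reduction of the Balancer.pop() restructuring: the slow
    // selection runs without the lock; only the removal is guarded.
    class LockScopeSketch {
        private final ConcurrentMap<String, String> queue = new ConcurrentHashMap<>();

        String popNext() {
            while (!queue.isEmpty()) {
                String best = selectBest();   // may block (robots.txt checks, crawl delays)
                synchronized (this) {         // short critical section
                    if (best == null) return null;
                    String entry = queue.remove(best);
                    if (entry == null) continue; // lost a race with another popper; retry
                    return entry;
                }
            }
            return null;
        }

        private String selectBest() {
            // stand-in for getbest(robots): any key will do for the sketch
            for (String key : queue.keySet()) return key;
            return null;
        }
    }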
diff --git a/source/net/yacy/crawler/data/CrawlQueues.java b/source/net/yacy/crawler/data/CrawlQueues.java
index e6785dd8f..35e77b5c4 100644
--- a/source/net/yacy/crawler/data/CrawlQueues.java
+++ b/source/net/yacy/crawler/data/CrawlQueues.java
@@ -41,6 +41,7 @@ import net.yacy.cora.document.RSSFeed;
 import net.yacy.cora.document.UTF8;
 import net.yacy.cora.federate.yacy.CacheStrategy;
 import net.yacy.cora.order.Base64Order;
+import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.ConnectionInfo;
 import net.yacy.crawler.HarvestProcess;
 import net.yacy.crawler.data.NoticedURL.StackType;
@@ -653,7 +654,7 @@ public class CrawlQueues {
             try {
                 this.request.setStatus("loading", WorkflowJob.STATUS_RUNNING);
                 final CrawlProfile e = CrawlQueues.this.sb.crawler.getActive(UTF8.getBytes(this.request.profileHandle()));
-                final Response response = CrawlQueues.this.sb.loader.load(this.request, e == null ? CacheStrategy.IFEXIST : e.cacheStrategy(), BlacklistType.CRAWLER, queuedMinLoadDelay);
+                final Response response = CrawlQueues.this.sb.loader.load(this.request, e == null ? CacheStrategy.IFEXIST : e.cacheStrategy(), BlacklistType.CRAWLER, queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
                 if (response == null) {
                     this.request.setStatus("error", WorkflowJob.STATUS_FINISHED);
                     if (CrawlQueues.this.log.isFine()) {
diff --git a/source/net/yacy/crawler/retrieval/HTTPLoader.java b/source/net/yacy/crawler/retrieval/HTTPLoader.java
index cac26506c..e12e9693e 100644
--- a/source/net/yacy/crawler/retrieval/HTTPLoader.java
+++ b/source/net/yacy/crawler/retrieval/HTTPLoader.java
@@ -70,15 +70,15 @@ public final class HTTPLoader {
         this.socketTimeout = (int) sb.getConfigLong("crawler.clientTimeout", 30000);
     }
 
-    public Response load(final Request entry, final int maxFileSize, final BlacklistType blacklistType) throws IOException {
+    public Response load(final Request entry, final int maxFileSize, final BlacklistType blacklistType, int timeout) throws IOException {
         Latency.updateBeforeLoad(entry.url());
         final long start = System.currentTimeMillis();
-        final Response doc = load(entry, DEFAULT_CRAWLING_RETRY_COUNT, maxFileSize, blacklistType);
+        final Response doc = load(entry, DEFAULT_CRAWLING_RETRY_COUNT, maxFileSize, blacklistType, timeout);
         Latency.updateAfterLoad(entry.url(), System.currentTimeMillis() - start);
         return doc;
     }
 
-    private Response load(final Request request, final int retryCount, final int maxFileSize, final BlacklistType blacklistType) throws IOException {
+    private Response load(final Request request, final int retryCount, final int maxFileSize, final BlacklistType blacklistType, int timeout) throws IOException {
 
         byte[] myHash = ASCII.getBytes(this.sb.peers.mySeed().hash);
 
@@ -127,7 +127,7 @@ public final class HTTPLoader {
         requestHeader.put(HeaderFramework.ACCEPT_ENCODING, this.sb.getConfig("crawler.http.acceptEncoding", DEFAULT_ENCODING));
 
         // HTTP-Client
-        final HTTPClient client = new HTTPClient(ClientIdentification.getUserAgent(), ClientIdentification.DEFAULT_TIMEOUT);
+        final HTTPClient client = new HTTPClient(ClientIdentification.getUserAgent(), timeout);
         client.setRedirecting(false); // we want to handle redirection ourselves, so we don't index pages twice
         client.setTimout(this.socketTimeout);
         client.setHeader(requestHeader.entrySet());
@@ -180,7 +180,7 @@ public final class HTTPLoader {
 
             // retry crawling with new url
             request.redirectURL(redirectionUrl);
-            return load(request, retryCount - 1, maxFileSize, blacklistType);
+            return load(request, retryCount - 1, maxFileSize, blacklistType, timeout);
         }
         // we don't want to follow redirects
         this.sb.crawlQueues.errorURL.push(request, myHash, new Date(), 1, FailCategory.FINAL_PROCESS_CONTEXT, "redirection not wanted", statusCode);
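HTTPLoader is where the new parameter takes effect: the per-call timeout replaces the previously hard-coded ClientIdentification.DEFAULT_TIMEOUT in the HTTPClient constructor, while the socket timeout is still read from the crawler.clientTimeout setting. A hedged sketch of a caller after this change (the BlacklistType import path is an assumption; the load signature is the one added above):

    import java.io.IOException;

    import net.yacy.cora.protocol.ClientIdentification;
    import net.yacy.crawler.retrieval.HTTPLoader;
    import net.yacy.crawler.retrieval.Request;
    import net.yacy.crawler.retrieval.Response;
    import net.yacy.repository.Blacklist.BlacklistType; // assumed import path

    class HttpLoadSketch {
        // Passing DEFAULT_TIMEOUT reproduces the old behaviour; a small literal
        // bounds how long a slow host can stall this particular caller.
        Response fetch(HTTPLoader loader, Request request, int maxFileSize) throws IOException {
            return loader.load(request, maxFileSize, BlacklistType.CRAWLER,
                    ClientIdentification.DEFAULT_TIMEOUT);
        }
    }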
diff --git a/source/net/yacy/crawler/retrieval/RSSLoader.java b/source/net/yacy/crawler/retrieval/RSSLoader.java
index a2879b211..15e723679 100644
--- a/source/net/yacy/crawler/retrieval/RSSLoader.java
+++ b/source/net/yacy/crawler/retrieval/RSSLoader.java
@@ -39,6 +39,7 @@ import net.yacy.cora.document.RSSReader;
 import net.yacy.cora.document.UTF8;
 import net.yacy.cora.federate.yacy.CacheStrategy;
 import net.yacy.cora.order.Base64Order;
+import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.storage.ARC;
 import net.yacy.cora.storage.ComparableARC;
 import net.yacy.cora.util.SpaceExceededException;
@@ -70,7 +71,7 @@ public class RSSLoader extends Thread {
     public void run() {
         RSSReader rss = null;
         try {
-            final Response response = this.sb.loader.load(this.sb.loader.request(this.urlf, true, false), CacheStrategy.NOCACHE, Integer.MAX_VALUE, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay);
+            final Response response = this.sb.loader.load(this.sb.loader.request(this.urlf, true, false), CacheStrategy.NOCACHE, Integer.MAX_VALUE, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
             final byte[] resource = response == null ? null : response.getContent();
             rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource);
         } catch (final MalformedURLException e) {
diff --git a/source/net/yacy/crawler/robots/RobotsTxt.java b/source/net/yacy/crawler/robots/RobotsTxt.java
index dce3ab687..4cf4d5127 100644
--- a/source/net/yacy/crawler/robots/RobotsTxt.java
+++ b/source/net/yacy/crawler/robots/RobotsTxt.java
@@ -165,7 +165,7 @@ public class RobotsTxt {
                 if (log.isDebugEnabled()) log.debug("Trying to download the robots.txt file from URL '" + robotsURL + "'.");
                 Request request = new Request(robotsURL, null);
                 try {
-                    response = this.loader.load(request, CacheStrategy.NOCACHE, null, 0);
+                    response = this.loader.load(request, CacheStrategy.NOCACHE, null, 0, 3000);
                 } catch (Throwable e) {
                     log.info("Trying to download the robots.txt file from URL '" + robotsURL + "' failed - " + e.getMessage());
                     response = null;
@@ -221,7 +221,7 @@ public class RobotsTxt {
                     if (log.isDebugEnabled()) log.debug("Trying to download the robots.txt file from URL '" + robotsURL + "'.");
                     Request request = new Request(robotsURL, null);
                     try {
-                        response = RobotsTxt.this.loader.load(request, CacheStrategy.NOCACHE, null, 0);
+                        response = RobotsTxt.this.loader.load(request, CacheStrategy.NOCACHE, null, 0, 3000);
                     } catch (IOException e) {
                         response = null;
                     }
diff --git a/source/net/yacy/data/ymark/YMarkAutoTagger.java b/source/net/yacy/data/ymark/YMarkAutoTagger.java
index 78ba98851..92ddcf083 100644
--- a/source/net/yacy/data/ymark/YMarkAutoTagger.java
+++ b/source/net/yacy/data/ymark/YMarkAutoTagger.java
@@ -11,6 +11,7 @@ import java.util.TreeSet;
 import java.util.concurrent.ArrayBlockingQueue;
 
 import net.yacy.cora.federate.yacy.CacheStrategy;
+import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.crawler.retrieval.Response;
 import net.yacy.document.Condenser;
 import net.yacy.document.Document;
@@ -67,7 +68,7 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandler {
             Log.logWarning(YMarkTables.BOOKMARKS_LOG, "loadDocument failed due to malformed url: "+url);
             return null;
         }
-        response = loader.load(loader.request(uri, true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null, TextSnippet.snippetMinLoadDelay);
+        response = loader.load(loader.request(uri, true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null, TextSnippet.snippetMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
         try {
             return Document.mergeDocuments(response.url(), response.getMimeType(), response.parse());
         } catch (final Failure e) {
diff --git a/source/net/yacy/data/ymark/YMarkMetadata.java b/source/net/yacy/data/ymark/YMarkMetadata.java
index 8c8f98737..e14812d4f 100644
--- a/source/net/yacy/data/ymark/YMarkMetadata.java
+++ b/source/net/yacy/data/ymark/YMarkMetadata.java
@@ -33,6 +33,7 @@ import java.util.EnumMap;
 import net.yacy.cora.date.ISO8601Formatter;
 import net.yacy.cora.document.UTF8;
 import net.yacy.cora.federate.yacy.CacheStrategy;
+import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.crawler.retrieval.Response;
 import net.yacy.document.Document;
 import net.yacy.document.Parser.Failure;
@@ -98,7 +99,7 @@ public class YMarkMetadata {
     public Document loadDocument(final LoaderDispatcher loader) throws IOException, Failure {
         if(this.document == null) {
             Response response = null;
-            response = loader.load(loader.request(this.uri, true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null, TextSnippet.snippetMinLoadDelay);
+            response = loader.load(loader.request(this.uri, true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null, TextSnippet.snippetMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
             this.document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse());
         }
         return this.document;
diff --git a/source/net/yacy/document/importer/OAIListFriendsLoader.java b/source/net/yacy/document/importer/OAIListFriendsLoader.java
index 418e13130..8f432c206 100644
--- a/source/net/yacy/document/importer/OAIListFriendsLoader.java
+++ b/source/net/yacy/document/importer/OAIListFriendsLoader.java
@@ -41,6 +41,7 @@ import javax.xml.parsers.SAXParserFactory;
 
 import net.yacy.cora.document.UTF8;
 import net.yacy.cora.federate.yacy.CacheStrategy;
+import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.crawler.retrieval.Response;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.logging.Log;
@@ -63,7 +64,7 @@ public class OAIListFriendsLoader implements Serializable {
         listFriends.putAll(moreFriends);
         if (loader != null) for (final Map.Entry<String, File> oaiFriend: listFriends.entrySet()) {
             try {
-                loader.loadIfNotExistBackground(new DigestURI(oaiFriend.getKey()), oaiFriend.getValue(), Integer.MAX_VALUE, null, TextSnippet.snippetMinLoadDelay);
+                loader.loadIfNotExistBackground(new DigestURI(oaiFriend.getKey()), oaiFriend.getValue(), Integer.MAX_VALUE, null, TextSnippet.snippetMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
             } catch (final MalformedURLException e) {
             }
         }
@@ -88,7 +89,7 @@ public class OAIListFriendsLoader implements Serializable {
         Map m;
         for (final Map.Entry<String, File> oaiFriend: listFriends.entrySet()) try {
             if (!oaiFriend.getValue().exists()) {
-                final Response response = loader == null ? null : loader.load(loader.request(new DigestURI(oaiFriend.getKey()), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, TextSnippet.snippetMinLoadDelay);
+                final Response response = loader == null ? null : loader.load(loader.request(new DigestURI(oaiFriend.getKey()), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, TextSnippet.snippetMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
                 if (response != null) FileUtils.copy(response.getContent(), oaiFriend.getValue());
             }
diff --git a/source/net/yacy/document/importer/OAIPMHLoader.java b/source/net/yacy/document/importer/OAIPMHLoader.java
index 52faa86e7..ca00a6df0 100644
--- a/source/net/yacy/document/importer/OAIPMHLoader.java
+++ b/source/net/yacy/document/importer/OAIPMHLoader.java
@@ -26,6 +26,7 @@ import java.io.File;
 import java.io.IOException;
 
 import net.yacy.cora.federate.yacy.CacheStrategy;
+import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.crawler.retrieval.Response;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.logging.Log;
@@ -55,7 +56,7 @@ public class OAIPMHLoader {
         for (int i = 0; i < 5; i++) {
             // make some retries if first attempt fails
             try {
-                response = loader.load(loader.request(source, false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, TextSnippet.snippetMinLoadDelay);
+                response = loader.load(loader.request(source, false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, TextSnippet.snippetMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
                 break;
             } catch (IOException e) {
                 Log.logWarning("OAIPMHLoader", "loading failed at attempt " + (i + 1) + ": " + source.toNormalform(true));
diff --git a/source/net/yacy/peers/graphics/OSMTile.java b/source/net/yacy/peers/graphics/OSMTile.java
index b5466b489..e9b681cc0 100644
--- a/source/net/yacy/peers/graphics/OSMTile.java
+++ b/source/net/yacy/peers/graphics/OSMTile.java
@@ -35,6 +35,7 @@ import java.util.List;
 import javax.imageio.ImageIO;
 
 import net.yacy.cora.federate.yacy.CacheStrategy;
+import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.crawler.data.Cache;
 import net.yacy.crawler.retrieval.Response;
 import net.yacy.kelondro.data.meta.DigestURI;
@@ -113,7 +114,7 @@ public class OSMTile {
         // download resource using the crawler and keep resource in memory if possible
         Response entry = null;
         try {
-            entry = Switchboard.getSwitchboard().loader.load(Switchboard.getSwitchboard().loader.request(tileURL, false, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null, TextSnippet.snippetMinLoadDelay);
+            entry = Switchboard.getSwitchboard().loader.load(Switchboard.getSwitchboard().loader.request(tileURL, false, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null, TextSnippet.snippetMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
         } catch (final IOException e) {
             Log.logWarning("OSMTile", "cannot load: " + e.getMessage());
             return null;
diff --git a/source/net/yacy/peers/operation/yacyRelease.java b/source/net/yacy/peers/operation/yacyRelease.java
index 742e8b243..c627581ec 100644
--- a/source/net/yacy/peers/operation/yacyRelease.java
+++ b/source/net/yacy/peers/operation/yacyRelease.java
@@ -239,7 +239,7 @@ public final class yacyRelease extends yacyVersion {
         try {
             final DigestURI uri = location.getLocationURL();
             Thread.currentThread().setName("allReleaseFrom - host " + uri.getHost()); // makes it more easy to see which release blocks process in thread dump
-            scraper = Switchboard.getSwitchboard().loader.loadDocument(uri, CacheStrategy.NOCACHE, null, CrawlQueues.queuedMinLoadDelay);
+            scraper = Switchboard.getSwitchboard().loader.loadDocument(uri, CacheStrategy.NOCACHE, null, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
         } catch (final IOException e) {
             return null;
         }
diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java
index 7a8b1717f..0453d3b1b 100644
--- a/source/net/yacy/repository/LoaderDispatcher.java
+++ b/source/net/yacy/repository/LoaderDispatcher.java
@@ -132,9 +132,9 @@ public final class LoaderDispatcher {
             0);
     }
 
-    public void load(final DigestURI url, final CacheStrategy cacheStratgy, final int maxFileSize, final File targetFile, BlacklistType blacklistType, final long minDelay) throws IOException {
+    public void load(final DigestURI url, final CacheStrategy cacheStratgy, final int maxFileSize, final File targetFile, BlacklistType blacklistType, final long minDelay, int timeout) throws IOException {
 
-        final byte[] b = load(request(url, false, true), cacheStratgy, maxFileSize, blacklistType, minDelay).getContent();
+        final byte[] b = load(request(url, false, true), cacheStratgy, maxFileSize, blacklistType, minDelay, timeout).getContent();
         if (b == null) throw new IOException("load == null");
         final File tmp = new File(targetFile.getAbsolutePath() + ".tmp");
@@ -145,11 +145,11 @@ public final class LoaderDispatcher {
         tmp.renameTo(targetFile);
     }
 
-    public Response load(final Request request, final CacheStrategy cacheStrategy, final BlacklistType blacklistType, final long minDelay) throws IOException {
-        return load(request, cacheStrategy, protocolMaxFileSize(request.url()), blacklistType, minDelay);
+    public Response load(final Request request, final CacheStrategy cacheStrategy, final BlacklistType blacklistType, final long minDelay, int timeout) throws IOException {
+        return load(request, cacheStrategy, protocolMaxFileSize(request.url()), blacklistType, minDelay, timeout);
     }
 
-    public Response load(final Request request, final CacheStrategy cacheStrategy, final int maxFileSize, final BlacklistType blacklistType, final long minDelay) throws IOException {
+    public Response load(final Request request, final CacheStrategy cacheStrategy, final int maxFileSize, final BlacklistType blacklistType, final long minDelay, int timeout) throws IOException {
         Semaphore check = this.loaderSteering.get(request.url());
         if (check != null) {
             // a loading process may be going on for that url
@@ -160,7 +160,7 @@ public final class LoaderDispatcher {
         this.loaderSteering.put(request.url(), new Semaphore(0));
 
         try {
-            final Response response = loadInternal(request, cacheStrategy, maxFileSize, blacklistType, minDelay);
+            final Response response = loadInternal(request, cacheStrategy, maxFileSize, blacklistType, minDelay, timeout);
             check = this.loaderSteering.remove(request.url());
             if (check != null) check.release(1000);
             return response;
@@ -180,7 +180,7 @@ public final class LoaderDispatcher {
      * @return the loaded entity in a Response object
     * @throws IOException
     */
-    private Response loadInternal(final Request request, CacheStrategy cacheStrategy, final int maxFileSize, final BlacklistType blacklistType, final long minDelay) throws IOException {
+    private Response loadInternal(final Request request, CacheStrategy cacheStrategy, final int maxFileSize, final BlacklistType blacklistType, final long minDelay, int timeout) throws IOException {
         // get the protocol of the next URL
         final DigestURI url = request.url();
         if (url.isFile() || url.isSMB()) cacheStrategy = CacheStrategy.NOCACHE; // load just from the file system
@@ -280,7 +280,7 @@ public final class LoaderDispatcher {
         // load resource from the internet
         Response response = null;
         if (protocol.equals("http") || protocol.equals("https")) {
-            response = this.httpLoader.load(request, maxFileSize, blacklistType);
+            response = this.httpLoader.load(request, maxFileSize, blacklistType, timeout);
         } else if (protocol.equals("ftp")) {
             response = this.ftpLoader.load(request, true);
         } else if (protocol.equals("smb")) {
@@ -335,19 +335,19 @@ public final class LoaderDispatcher {
      * @return the content as {@link byte[]}
     * @throws IOException
     */
-    public byte[] loadContent(final Request request, final CacheStrategy cacheStrategy, BlacklistType blacklistType, final long minDelay) throws IOException {
+    public byte[] loadContent(final Request request, final CacheStrategy cacheStrategy, BlacklistType blacklistType, final long minDelay, int timeout) throws IOException {
         // try to download the resource using the loader
-        final Response entry = load(request, cacheStrategy, blacklistType, minDelay);
+        final Response entry = load(request, cacheStrategy, blacklistType, minDelay, timeout);
         if (entry == null) return null; // not found in web
 
         // read resource body (if it is there)
         return entry.getContent();
     }
 
-    public Document[] loadDocuments(final Request request, final CacheStrategy cacheStrategy, final int maxFileSize, BlacklistType blacklistType, final long minDelay) throws IOException, Parser.Failure {
+    public Document[] loadDocuments(final Request request, final CacheStrategy cacheStrategy, final int maxFileSize, BlacklistType blacklistType, final long minDelay, int timeout) throws IOException, Parser.Failure {
 
         // load resource
-        final Response response = load(request, cacheStrategy, maxFileSize, blacklistType, minDelay);
+        final Response response = load(request, cacheStrategy, maxFileSize, blacklistType, minDelay, timeout);
         final DigestURI url = request.url();
         if (response == null) throw new IOException("no Response for url " + url);
 
@@ -358,10 +358,10 @@ public final class LoaderDispatcher {
         return response.parse();
     }
 
-    public Document loadDocument(final DigestURI location, final CacheStrategy cachePolicy, BlacklistType blacklistType, final long minDelay) throws IOException {
+    public Document loadDocument(final DigestURI location, final CacheStrategy cachePolicy, BlacklistType blacklistType, final long minDelay, int timeout) throws IOException {
         // load resource
         Request request = request(location, true, false);
-        final Response response = this.load(request, cachePolicy, blacklistType, minDelay);
+        final Response response = this.load(request, cachePolicy, blacklistType, minDelay, timeout);
         final DigestURI url = request.url();
         if (response == null) throw new IOException("no Response for url " + url);
 
@@ -384,8 +384,8 @@ public final class LoaderDispatcher {
      * @return a map from URLs to the anchor texts of the urls
     * @throws IOException
     */
-    public final Map<DigestURI, String> loadLinks(final DigestURI url, final CacheStrategy cacheStrategy, BlacklistType blacklistType, final long minDelay) throws IOException {
-        final Response response = load(request(url, true, false), cacheStrategy, Integer.MAX_VALUE, blacklistType, minDelay);
+    public final Map<DigestURI, String> loadLinks(final DigestURI url, final CacheStrategy cacheStrategy, BlacklistType blacklistType, final long minDelay, int timeout) throws IOException {
+        final Response response = load(request(url, true, false), cacheStrategy, Integer.MAX_VALUE, blacklistType, minDelay, timeout);
         if (response == null) throw new IOException("response == null");
         final ResponseHeader responseHeader = response.getResponseHeader();
         if (response.getContent() == null) throw new IOException("resource == null");
@@ -414,12 +414,12 @@ public final class LoaderDispatcher {
         }
     }
 
-    public void loadIfNotExistBackground(final DigestURI url, final File cache, final int maxFileSize, BlacklistType blacklistType, final long minDelay) {
-        new Loader(url, cache, maxFileSize, CacheStrategy.IFEXIST, blacklistType, minDelay).start();
+    public void loadIfNotExistBackground(final DigestURI url, final File cache, final int maxFileSize, BlacklistType blacklistType, final long minDelay, final int timeout) {
+        new Loader(url, cache, maxFileSize, CacheStrategy.IFEXIST, blacklistType, minDelay, timeout).start();
     }
 
-    public void loadIfNotExistBackground(final DigestURI url, final int maxFileSize, BlacklistType blacklistType, final long minDelay) {
-        new Loader(url, null, maxFileSize, CacheStrategy.IFEXIST, blacklistType, minDelay).start();
+    public void loadIfNotExistBackground(final DigestURI url, final int maxFileSize, BlacklistType blacklistType, final long minDelay, int timeout) {
+        new Loader(url, null, maxFileSize, CacheStrategy.IFEXIST, blacklistType, minDelay, timeout).start();
     }
 
     private class Loader extends Thread {
@@ -430,14 +430,16 @@ public final class LoaderDispatcher {
         private final CacheStrategy cacheStrategy;
         private final BlacklistType blacklistType;
         private final long minDelay;
+        private final int timeout;
 
-        public Loader(final DigestURI url, final File cache, final int maxFileSize, final CacheStrategy cacheStrategy, BlacklistType blacklistType, final long minDelay) {
+        public Loader(final DigestURI url, final File cache, final int maxFileSize, final CacheStrategy cacheStrategy, BlacklistType blacklistType, final long minDelay, final int timeout) {
             this.url = url;
             this.cache = cache;
             this.maxFileSize = maxFileSize;
             this.cacheStrategy = cacheStrategy;
             this.blacklistType = blacklistType;
             this.minDelay = minDelay;
+            this.timeout = timeout;
         }
 
         @Override
@@ -445,7 +447,7 @@ public final class LoaderDispatcher {
             if (this.cache != null && this.cache.exists()) return;
             try {
                 // load from the net
-                final Response response = load(request(this.url, false, true), this.cacheStrategy, this.maxFileSize, this.blacklistType, this.minDelay);
+                final Response response = load(request(this.url, false, true), this.cacheStrategy, this.maxFileSize, this.blacklistType, this.minDelay, this.timeout);
                 final byte[] b = response.getContent();
                 if (this.cache != null) FileUtils.copy(b, this.cache);
             } catch (final MalformedURLException e) {} catch (final IOException e) {}
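Every public load entry point of LoaderDispatcher now threads the timeout down through loadInternal into HTTPLoader. A hedged sketch of the resulting call shape (the URL and the minDelay literal are illustrative; real call sites pass constants such as CrawlQueues.queuedMinLoadDelay):

    import java.io.IOException;

    import net.yacy.cora.federate.yacy.CacheStrategy;
    import net.yacy.cora.protocol.ClientIdentification;
    import net.yacy.crawler.retrieval.Response;
    import net.yacy.kelondro.data.meta.DigestURI;
    import net.yacy.repository.LoaderDispatcher;

    class DispatcherCallSketch {
        Response fetch(LoaderDispatcher loader) throws IOException {
            DigestURI url = new DigestURI("http://example.org/"); // illustrative URL
            return loader.load(loader.request(url, true, false),
                    CacheStrategy.IFEXIST,                 // use the cache if the page is already there
                    null,                                  // no blacklist check in this sketch
                    500,                                   // minDelay in ms
                    ClientIdentification.DEFAULT_TIMEOUT); // the new, explicit timeout
        }
    }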
diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java
index 1c6de3277..0aaae2a62 100644
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -2878,7 +2878,7 @@ public final class Switchboard extends serverSwitch {
         // get a scraper to get the title
         Document scraper;
         try {
-            scraper = this.loader.loadDocument(url, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay);
+            scraper = this.loader.loadDocument(url, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
         } catch (IOException e) {
             return "scraper cannot load URL: " + e.getMessage();
         }
@@ -2985,7 +2985,7 @@ public final class Switchboard extends serverSwitch {
                 String urlName = url.toNormalform(true);
                 Thread.currentThread().setName("Switchboard.addToIndex:" + urlName);
                 try {
-                    final Response response = Switchboard.this.loader.load(request, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay);
+                    final Response response = Switchboard.this.loader.load(request, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
                     if (response == null) {
                         throw new IOException("response == null");
                     }
@@ -3372,7 +3372,7 @@ public final class Switchboard extends serverSwitch {
                     final Map<DigestURI, String> links;
                     searchEvent.oneFeederStarted();
                     try {
-                        links = Switchboard.this.loader.loadLinks(url, CacheStrategy.NOCACHE, BlacklistType.SEARCH, TextSnippet.snippetMinLoadDelay);
+                        links = Switchboard.this.loader.loadLinks(url, CacheStrategy.NOCACHE, BlacklistType.SEARCH, TextSnippet.snippetMinLoadDelay, 2000);
                         if ( links != null ) {
                             final Iterator<DigestURI> i = links.keySet().iterator();
                             while ( i.hasNext() ) {
@@ -3411,7 +3411,7 @@ public final class Switchboard extends serverSwitch {
                 final Map<DigestURI, String> links;
                 DigestURI url;
                 try {
-                    links = Switchboard.this.loader.loadLinks(startUrl, CacheStrategy.IFFRESH, BlacklistType.SEARCH, TextSnippet.snippetMinLoadDelay);
+                    links = Switchboard.this.loader.loadLinks(startUrl, CacheStrategy.IFFRESH, BlacklistType.SEARCH, TextSnippet.snippetMinLoadDelay, 2000);
                     if (links != null) {
                         if (links.size() < 1000) { // limit to 1000 to skip large index pages
                             final Iterator<DigestURI> i = links.keySet().iterator();
@@ -3476,7 +3476,7 @@ public final class Switchboard extends serverSwitch {
                 searchEvent.oneFeederStarted();
                 try {
                     final Response response =
-                        Switchboard.this.loader.load(Switchboard.this.loader.request(url, true, false), CacheStrategy.NOCACHE, BlacklistType.SEARCH, TextSnippet.snippetMinLoadDelay);
+                        Switchboard.this.loader.load(Switchboard.this.loader.request(url, true, false), CacheStrategy.NOCACHE, BlacklistType.SEARCH, TextSnippet.snippetMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
                     final byte[] resource = (response == null) ? null : response.getContent();
                     //System.out.println("BLEKKO: " + UTF8.String(resource));
                     rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource);
diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java
index b7baf841f..bd10a7ab1 100644
--- a/source/net/yacy/search/index/Segment.java
+++ b/source/net/yacy/search/index/Segment.java
@@ -48,6 +48,7 @@ import net.yacy.cora.federate.solr.connector.AbstractSolrConnector;
 import net.yacy.cora.federate.yacy.CacheStrategy;
 import net.yacy.cora.order.Base64Order;
 import net.yacy.cora.order.ByteOrder;
+import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.ResponseHeader;
 import net.yacy.cora.storage.HandleSet;
 import net.yacy.cora.util.LookAheadIterator;
@@ -694,7 +695,7 @@ public class Segment {
         try {
 
             // parse the resource
-            final Document document = Document.mergeDocuments(url, null, loader.loadDocuments(loader.request(url, true, false), cacheStrategy, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay));
+            final Document document = Document.mergeDocuments(url, null, loader.loadDocuments(loader.request(url, true, false), cacheStrategy, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT));
             if (document == null) {
                 // delete just the url entry
                 fulltext().remove(urlhash);
diff --git a/source/net/yacy/search/snippet/MediaSnippet.java b/source/net/yacy/search/snippet/MediaSnippet.java
index 351455a12..88376b058 100644
--- a/source/net/yacy/search/snippet/MediaSnippet.java
+++ b/source/net/yacy/search/snippet/MediaSnippet.java
@@ -40,6 +40,7 @@ import net.yacy.cora.document.analysis.Classification;
 import net.yacy.cora.document.analysis.Classification.ContentDomain;
 import net.yacy.cora.federate.yacy.CacheStrategy;
 import net.yacy.cora.order.Base64Order;
+import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.storage.HandleSet;
 import net.yacy.cora.util.NumberTools;
 import net.yacy.cora.util.SpaceExceededException;
@@ -142,7 +143,7 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaSnippet> {
diff --git a/source/net/yacy/search/snippet/TextSnippet.java b/source/net/yacy/search/snippet/TextSnippet.java
index 8ef802a1b..561f118b2 100644
--- a/source/net/yacy/search/snippet/TextSnippet.java
+++ b/source/net/yacy/search/snippet/TextSnippet.java
@@ -204,7 +204,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnippet> {
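Across the whole patch three timeout values appear: ClientIdentification.DEFAULT_TIMEOUT for ordinary page loads, 3000 ms for the robots.txt fetches in RobotsTxt.java, and 2000 ms for the heuristic loadLinks calls in Switchboard.java. A summary sketch; the constant names and the rationale comments are an interpretation, the patch itself passes plain literals at the call sites:

    import net.yacy.cora.protocol.ClientIdentification;

    // Hypothetical grouping of the timeout values used across this patch.
    final class LoaderTimeoutsSketch {
        /** default for ordinary page loads */
        static final int PAGE_LOAD = ClientIdentification.DEFAULT_TIMEOUT;
        /** robots.txt fetches should fail fast so crawling is not stalled */
        static final int ROBOTS_TXT = 3000;
        /** heuristic link harvesting during a search runs on a tight time budget */
        static final int HEURISTIC_LINKS = 2000;

        private LoaderTimeoutsSketch() {}
    }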