From 24d9db161369b34d47c3fb000d932386987c3976 Mon Sep 17 00:00:00 2001
From: Michael Peter Christen
Date: Mon, 30 Jul 2012 10:38:23 +0200
Subject: [PATCH] snippet retrieval loading processes may use a smaller minimum
 load time value than crawling processes. This speeds up the search result
 preparation dramatically.

---
 htroot/Bookmarks.java                            |  3 +-
 htroot/Crawler_p.java                            |  5 +-
 htroot/DictionaryLoader_p.java                   | 13 ++---
 htroot/Load_RSS_p.java                           |  3 +-
 htroot/ViewFile.java                             |  3 +-
 htroot/ViewImage.java                            |  3 +-
 htroot/api/getpageinfo.java                      |  3 +-
 htroot/api/getpageinfo_p.java                    |  3 +-
 htroot/api/webstructure.java                     |  3 +-
 htroot/yacysearch.java                           |  3 +-
 htroot/yacysearchitem.java                       |  4 +-
 source/de/anomic/crawler/CrawlQueues.java        |  3 +-
 source/de/anomic/crawler/RSSLoader.java          |  2 +-
 source/de/anomic/data/ymark/YMarkAutoTagger.java |  3 +-
 source/de/anomic/data/ymark/YMarkMetadata.java   |  3 +-
 .../document/importer/OAIListFriendsLoader.java  |  5 +-
 .../net/yacy/document/importer/OAIPMHLoader.java |  3 +-
 source/net/yacy/peers/graphics/OSMTile.java      |  3 +-
 .../net/yacy/peers/operation/yacyRelease.java    |  3 +-
 .../net/yacy/repository/LoaderDispatcher.java    | 54 ++++++++++---------
 source/net/yacy/search/Switchboard.java          |  9 ++--
 source/net/yacy/search/index/Segment.java        |  3 +-
 .../net/yacy/search/query/SnippetProcess.java    | 14 +++--
 .../net/yacy/search/snippet/MediaSnippet.java    |  2 +-
 .../net/yacy/search/snippet/TextSnippet.java     |  5 +-
 25 files changed, 93 insertions(+), 65 deletions(-)
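Before the per-file diffs: the whole change follows one pattern. LoaderDispatcher used to enforce a single hard-coded minimum delay of 250 ms between two accesses to the same host. Every public load entry point now takes an explicit minDelay argument, so crawler work keeps a strict politeness floor (CrawlQueues.queuedMinLoadDelay, 500 ms) while snippet retrieval during result preparation may use a much smaller one (TextSnippet.snippetMinLoadDelay, 10 ms). A minimal, self-contained sketch of that throttling idea follows; it is illustrative only, not YaCy code, and the class and method names are invented:

    import java.util.concurrent.ConcurrentHashMap;

    // Sketch: a per-host throttle whose politeness is chosen by the caller,
    // assuming only the two delay constants introduced by this patch.
    final class HostThrottle {
        // mirrors CrawlQueues.queuedMinLoadDelay and TextSnippet.snippetMinLoadDelay
        static final long CRAWL_MIN_DELAY_MS = 500;
        static final long SNIPPET_MIN_DELAY_MS = 10;

        private final ConcurrentHashMap<String, Long> lastAccess = new ConcurrentHashMap<String, Long>();

        // sleep just long enough that two accesses to the same host are at
        // least minDelay milliseconds apart, then record this access
        void awaitSlot(final String host, final long minDelay) throws InterruptedException {
            final Long last = this.lastAccess.get(host);
            if (last != null) {
                final long wait = minDelay - (System.currentTimeMillis() - last.longValue());
                if (wait > 0) Thread.sleep(wait);
            }
            this.lastAccess.put(host, Long.valueOf(System.currentTimeMillis()));
        }
    }

A crawler thread would call awaitSlot(host, CRAWL_MIN_DELAY_MS), a snippet worker awaitSlot(host, SNIPPET_MIN_DELAY_MS): the same mechanism, parameterized politeness.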
diff --git a/htroot/Bookmarks.java b/htroot/Bookmarks.java
index 203870d24..2d6a2273c 100644
--- a/htroot/Bookmarks.java
+++ b/htroot/Bookmarks.java
@@ -50,6 +50,7 @@ import net.yacy.kelondro.data.meta.URIMetadata;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.peers.NewsPool;
 import net.yacy.search.Switchboard;
+import net.yacy.search.snippet.TextSnippet;
 import de.anomic.data.BookmarkHelper;
 import de.anomic.data.BookmarksDB;
 import de.anomic.data.BookmarksDB.Bookmark;
@@ -196,7 +197,7 @@ public class Bookmarks {
         // try to get the bookmark from the LURL database
         final URIMetadata urlentry = sb.index.urlMetadata().load(ASCII.getBytes(urlHash));
         if (urlentry != null) try {
-            final Document document = Document.mergeDocuments(urlentry.url(), null, sb.loader.loadDocuments(sb.loader.request(urlentry.url(), true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null));
+            final Document document = Document.mergeDocuments(urlentry.url(), null, sb.loader.loadDocuments(sb.loader.request(urlentry.url(), true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null, TextSnippet.snippetMinLoadDelay));
             prop.put("mode_edit", "0"); // create mode
             prop.put("mode_url", urlentry.url().toNormalform(false, true));
             prop.putHTML("mode_title", urlentry.dc_title());
diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java
index 58d8d4b4c..e74200581 100644
--- a/htroot/Crawler_p.java
+++ b/htroot/Crawler_p.java
@@ -55,6 +55,7 @@ import net.yacy.search.Switchboard;
 import net.yacy.search.SwitchboardConstants;
 import net.yacy.search.index.Segment;
 import de.anomic.crawler.CrawlProfile;
+import de.anomic.crawler.CrawlQueues;
 import de.anomic.crawler.SitemapImporter;
 import de.anomic.crawler.ZURL.FailCategory;
 import de.anomic.crawler.retrieval.Request;
@@ -323,7 +324,7 @@ public class Crawler_p {
                 sb.crawlQueues.errorURL.remove(urlhash);

                 // get a scraper to get the title
-                final Document scraper = sb.loader.loadDocument(url, CacheStrategy.IFFRESH, BlacklistType.CRAWLER);
+                final Document scraper = sb.loader.loadDocument(url, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay);
                 final String title = scraper == null ? url.toNormalform(true, true) : scraper.dc_title();
                 final String description = scraper.dc_description();
@@ -551,7 +552,7 @@ public class Crawler_p {
             try {
                 final DigestURI sitelistURL = new DigestURI(crawlingStart);
                 // download document
-                Document scraper = sb.loader.loadDocument(sitelistURL, CacheStrategy.IFFRESH, BlacklistType.CRAWLER);
+                Document scraper = sb.loader.loadDocument(sitelistURL, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay);
                 // String title = scraper.getTitle();
                 // String description = scraper.getDescription();
diff --git a/htroot/DictionaryLoader_p.java b/htroot/DictionaryLoader_p.java
index 436d4ea20..d14f0efdd 100644
--- a/htroot/DictionaryLoader_p.java
+++ b/htroot/DictionaryLoader_p.java
@@ -30,6 +30,7 @@ import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.kelondro.util.FileUtils;
 import net.yacy.search.Switchboard;
+import de.anomic.crawler.CrawlQueues;
 import de.anomic.crawler.retrieval.Response;
 import de.anomic.server.serverObjects;
 import de.anomic.server.serverSwitch;
@@ -65,7 +66,7 @@ public class DictionaryLoader_p {
     if (post.containsKey("geon0Load")) {
         // load from the net
         try {
-            final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null);
+            final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay);
             final byte[] b = response.getContent();
             FileUtils.copy(b, LibraryProvider.Dictionary.GEON0.file());
             LibraryProvider.geoLoc.activateLocation(LibraryProvider.Dictionary.GEON0.nickname, new GeonamesLocation(LibraryProvider.Dictionary.GEON0.file(), null, -1));
@@ -107,7 +108,7 @@
     if (post.containsKey("geon1Load")) {
         // load from the net
         try {
-            final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON1.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null);
+            final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON1.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay);
             final byte[] b = response.getContent();
             FileUtils.copy(b, LibraryProvider.Dictionary.GEON1.file());
             LibraryProvider.geoLoc.activateLocation(LibraryProvider.Dictionary.GEON1.nickname, new GeonamesLocation(LibraryProvider.Dictionary.GEON1.file(), null, -1));
@@ -149,7 +150,7 @@
     if (post.containsKey("geon2Load")) {
         // load from the net
         try {
-            final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON2.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null);
+            final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEON2.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay);
             final byte[] b = response.getContent();
             FileUtils.copy(b, LibraryProvider.Dictionary.GEON2.file());
             LibraryProvider.geoLoc.activateLocation(LibraryProvider.Dictionary.GEON2.nickname, new GeonamesLocation(LibraryProvider.Dictionary.GEON2.file(), null, 100000));
@@ -191,7 +192,7 @@
     if (post.containsKey("geo1Load")) {
         // load from the net
         try {
-            final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEODB1.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null);
+            final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.GEODB1.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay);
             final byte[] b = response.getContent();
             FileUtils.copy(b, LibraryProvider.Dictionary.GEODB1.file());
             LibraryProvider.geoLoc.deactivateLocalization(LibraryProvider.Dictionary.GEODB1.nickname);
@@ -234,7 +235,7 @@
     if (post.containsKey("drw0Load")) {
         // load from the net
         try {
-            final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.DRW0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null);
+            final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.DRW0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay);
             final byte[] b = response.getContent();
             FileUtils.copy(b, LibraryProvider.Dictionary.DRW0.file());
             LibraryProvider.activateDeReWo();
@@ -278,7 +279,7 @@
     if (post.containsKey("pnd0Load")) {
         // load from the net
         try {
-            final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.PND0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null);
+            final Response response = sb.loader.load(sb.loader.request(new DigestURI(LibraryProvider.Dictionary.PND0.url), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay);
             final byte[] b = response.getContent();
             FileUtils.copy(b, LibraryProvider.Dictionary.PND0.file());
             LibraryProvider.activatePND();
diff --git a/htroot/Load_RSS_p.java b/htroot/Load_RSS_p.java
index 227fe3332..5d50795c2 100644
--- a/htroot/Load_RSS_p.java
+++ b/htroot/Load_RSS_p.java
@@ -43,6 +43,7 @@ import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.repository.Blacklist.BlacklistType;
 import net.yacy.search.Switchboard;
+import de.anomic.crawler.CrawlQueues;
 import de.anomic.crawler.RSSLoader;
 import de.anomic.crawler.retrieval.Response;
 import de.anomic.data.WorkTables;
@@ -256,7 +257,7 @@ public class Load_RSS_p {
         RSSReader rss = null;
         if (url != null) try {
             prop.put("url", url.toNormalform(true, false));
-            final Response response = sb.loader.load(sb.loader.request(url, true, false), CacheStrategy.NOCACHE, Integer.MAX_VALUE, BlacklistType.CRAWLER);
+            final Response response = sb.loader.load(sb.loader.request(url, true, false), CacheStrategy.NOCACHE, Integer.MAX_VALUE, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay);
             final byte[] resource = response == null ? null : response.getContent();
             rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource);
         } catch (final IOException e) {
diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java
index 96bc551af..b23ff18a2 100644
--- a/htroot/ViewFile.java
+++ b/htroot/ViewFile.java
@@ -58,6 +58,7 @@ import net.yacy.search.index.Segment;

 import com.hp.hpl.jena.rdf.model.Model;
 import de.anomic.crawler.Cache;
+import de.anomic.crawler.CrawlQueues;
 import de.anomic.crawler.retrieval.Response;
 import de.anomic.server.serverObjects;
 import de.anomic.server.serverSwitch;
@@ -164,7 +165,7 @@ public class ViewFile {
         Response response = null;
         try {
-            response = sb.loader.load(sb.loader.request(url, true, false), authorized ? CacheStrategy.IFEXIST : CacheStrategy.CACHEONLY, Integer.MAX_VALUE, null);
+            response = sb.loader.load(sb.loader.request(url, true, false), authorized ? CacheStrategy.IFEXIST : CacheStrategy.CACHEONLY, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay);
         } catch (final IOException e) {
             prop.put("error", "4");
             prop.put("error_errorText", "error loading resource: " + e.getMessage());
diff --git a/htroot/ViewImage.java b/htroot/ViewImage.java
index 8ae8dced5..c3d82b70b 100644
--- a/htroot/ViewImage.java
+++ b/htroot/ViewImage.java
@@ -44,6 +44,7 @@ import net.yacy.kelondro.util.FileUtils;
 import net.yacy.kelondro.util.MemoryControl;
 import net.yacy.repository.Blacklist.BlacklistType;
 import net.yacy.search.Switchboard;
+import de.anomic.crawler.CrawlQueues;
 import de.anomic.server.serverObjects;
 import de.anomic.server.serverSwitch;
@@ -96,7 +97,7 @@ public class ViewImage {
     if (image == null) {
         byte[] resourceb = null;
         if (url != null) try {
-            resourceb = sb.loader.loadContent(sb.loader.request(url, false, true), CacheStrategy.IFEXIST, BlacklistType.SEARCH);
+            resourceb = sb.loader.loadContent(sb.loader.request(url, false, true), CacheStrategy.IFEXIST, BlacklistType.SEARCH, CrawlQueues.queuedMinLoadDelay);
         } catch (final IOException e) {
             Log.logFine("ViewImage", "cannot load: " + e.getMessage());
         }
diff --git a/htroot/api/getpageinfo.java b/htroot/api/getpageinfo.java
index c7fae06a4..dc74ed3d5 100644
--- a/htroot/api/getpageinfo.java
+++ b/htroot/api/getpageinfo.java
@@ -45,6 +45,7 @@ import org.w3c.dom.Node;
 import org.w3c.dom.NodeList;
 import org.xml.sax.SAXException;
+import de.anomic.crawler.CrawlQueues;
 import de.anomic.crawler.RobotsTxtEntry;
 import de.anomic.server.serverObjects;
 import de.anomic.server.serverSwitch;
@@ -95,7 +96,7 @@ public class getpageinfo {
         }
         net.yacy.document.Document scraper = null;
         if (u != null) try {
-            scraper = sb.loader.loadDocument(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER);
+            scraper = sb.loader.loadDocument(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay);
         } catch (final IOException e) {
             Log.logException(e);
             // bad things are possible, i.e. that the Server responds with "403 Bad Behavior"
diff --git a/htroot/api/getpageinfo_p.java b/htroot/api/getpageinfo_p.java
index 185b4d524..6cba92e09 100644
--- a/htroot/api/getpageinfo_p.java
+++ b/htroot/api/getpageinfo_p.java
@@ -45,6 +45,7 @@ import org.w3c.dom.Node;
 import org.w3c.dom.NodeList;
 import org.xml.sax.SAXException;
+import de.anomic.crawler.CrawlQueues;
 import de.anomic.crawler.RobotsTxtEntry;
 import de.anomic.server.serverObjects;
 import de.anomic.server.serverSwitch;
@@ -95,7 +96,7 @@ public class getpageinfo_p {
         }
         net.yacy.document.Document scraper = null;
         if (u != null) try {
-            scraper = sb.loader.loadDocument(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER);
+            scraper = sb.loader.loadDocument(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay);
         } catch (final IOException e) {
             Log.logException(e);
             // bad things are possible, i.e. that the Server responds with "403 Bad Behavior"
diff --git a/htroot/api/webstructure.java b/htroot/api/webstructure.java
index 3ae74aad3..a640e54f0 100644
--- a/htroot/api/webstructure.java
+++ b/htroot/api/webstructure.java
@@ -41,6 +41,7 @@ import net.yacy.kelondro.rwi.IndexCell;
 import net.yacy.kelondro.rwi.ReferenceContainer;
 import net.yacy.peers.graphics.WebStructureGraph;
 import net.yacy.search.Switchboard;
+import de.anomic.crawler.CrawlQueues;
 import de.anomic.server.serverObjects;
 import de.anomic.server.serverSwitch;
@@ -97,7 +98,7 @@ public class webstructure {
         prop.put("references", 1);
         net.yacy.document.Document scraper = null;
         if (url != null) try {
-            scraper = sb.loader.loadDocument(url, CacheStrategy.IFEXIST, null);
+            scraper = sb.loader.loadDocument(url, CacheStrategy.IFEXIST, null, CrawlQueues.queuedMinLoadDelay);
         } catch (final IOException e) {
             Log.logException(e);
         }
diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java
index 10df99bae..99dd67395 100644
--- a/htroot/yacysearch.java
+++ b/htroot/yacysearch.java
@@ -80,6 +80,7 @@ import net.yacy.search.query.SearchEvent;
 import net.yacy.search.query.SearchEventCache;
 import net.yacy.search.query.SnippetProcess;
 import net.yacy.search.ranking.RankingProfile;
+import net.yacy.search.snippet.TextSnippet;
 import de.anomic.data.DidYouMean;
 import de.anomic.data.UserDB;
 import de.anomic.data.ymark.YMarkTables;
@@ -668,7 +669,7 @@ public class yacysearch {
                     sb.loader.loadDocuments(
                         sb.loader.request(urlentry.url(), true, false),
                         CacheStrategy.IFEXIST,
-                        Integer.MAX_VALUE, BlacklistType.SEARCH);
+                        Integer.MAX_VALUE, BlacklistType.SEARCH, TextSnippet.snippetMinLoadDelay);
                 } catch ( final IOException e ) {
                 } catch ( final Parser.Failure e ) {
                 }
diff --git a/htroot/yacysearchitem.java b/htroot/yacysearchitem.java
index c215d2fa5..5e300680a 100644
--- a/htroot/yacysearchitem.java
+++ b/htroot/yacysearchitem.java
@@ -183,7 +183,7 @@ public class yacysearchitem {
         // END interaction
         prop.putHTML("content_target", target);
-        if (faviconURL != null && fileType == FileType.HTML) sb.loader.loadIfNotExistBackground(faviconURL, 1024 * 1024 * 10, null);
+        if (faviconURL != null && fileType == FileType.HTML) sb.loader.loadIfNotExistBackground(faviconURL, 1024 * 1024 * 10, null, TextSnippet.snippetMinLoadDelay);
         prop.putHTML("content_faviconCode", sb.licensedURLs.aquireLicense(faviconURL)); // acquire license for favicon url loading
         prop.put("content_urlhash", resulthashString);
         prop.put("content_ranking", result.ranking);
@@ -266,7 +266,7 @@ public class yacysearchitem {
         final String target = sb.getConfig(resultUrlstring.matches(target_special_pattern) ? SwitchboardConstants.SEARCH_TARGET_SPECIAL : SwitchboardConstants.SEARCH_TARGET_DEFAULT, "_self");
         final String license = sb.licensedURLs.aquireLicense(ms.url());
-        sb.loader.loadIfNotExistBackground(ms.url(), 1024 * 1024 * 10, null);
+        sb.loader.loadIfNotExistBackground(ms.url(), 1024 * 1024 * 10, null, TextSnippet.snippetMinLoadDelay);
         prop.putHTML("content_item_hrefCache", (auth) ? "/ViewImage.png?url=" + resultUrlstring : resultUrlstring);
         prop.putHTML("content_item_href", resultUrlstring);
         prop.putHTML("content_item_target", target);
diff --git a/source/de/anomic/crawler/CrawlQueues.java b/source/de/anomic/crawler/CrawlQueues.java
index 88037837d..d7dc86969 100644
--- a/source/de/anomic/crawler/CrawlQueues.java
+++ b/source/de/anomic/crawler/CrawlQueues.java
@@ -60,6 +60,7 @@ import de.anomic.crawler.retrieval.Response;

 public class CrawlQueues {

+    public static final long queuedMinLoadDelay = 500;
     private static final String ERROR_DB_FILENAME = "urlError4.db";
     private static final String DELEGATED_DB_FILENAME = "urlDelegated4.db";
@@ -656,7 +657,7 @@ public class CrawlQueues {
             try {
                 this.request.setStatus("loading", WorkflowJob.STATUS_RUNNING);
                 final CrawlProfile e = CrawlQueues.this.sb.crawler.getActive(UTF8.getBytes(this.request.profileHandle()));
-                final Response response = CrawlQueues.this.sb.loader.load(this.request, e == null ? CacheStrategy.IFEXIST : e.cacheStrategy(), BlacklistType.CRAWLER);
+                final Response response = CrawlQueues.this.sb.loader.load(this.request, e == null ? CacheStrategy.IFEXIST : e.cacheStrategy(), BlacklistType.CRAWLER, queuedMinLoadDelay);
                 if (response == null) {
                     this.request.setStatus("error", WorkflowJob.STATUS_FINISHED);
                     if (CrawlQueues.this.log.isFine()) {
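With queuedMinLoadDelay defined here and snippetMinLoadDelay added in TextSnippet further below, the two kinds of call sites differ only in the constant they pass. A hypothetical contrasting pair, assuming a LoaderDispatcher named loader and a prepared Request named request already in scope (the method shape is the one this patch introduces; the variable names are invented):

    // crawler path: polite, at least 500 ms between hits on one host
    Response crawled = loader.load(request, CacheStrategy.IFEXIST,
            BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay);

    // snippet path: fast, at least 10 ms between hits on one host
    Response forSnippet = loader.load(request, CacheStrategy.IFEXIST,
            BlacklistType.SEARCH, TextSnippet.snippetMinLoadDelay);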
diff --git a/source/de/anomic/crawler/RSSLoader.java b/source/de/anomic/crawler/RSSLoader.java
index 5e2f7cc81..fcd6ee48d 100644
--- a/source/de/anomic/crawler/RSSLoader.java
+++ b/source/de/anomic/crawler/RSSLoader.java
@@ -63,7 +63,7 @@ public class RSSLoader extends Thread {
     public void run() {
         RSSReader rss = null;
         try {
-            final Response response = this.sb.loader.load(this.sb.loader.request(this.urlf, true, false), CacheStrategy.NOCACHE, Integer.MAX_VALUE, BlacklistType.CRAWLER);
+            final Response response = this.sb.loader.load(this.sb.loader.request(this.urlf, true, false), CacheStrategy.NOCACHE, Integer.MAX_VALUE, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay);
             final byte[] resource = response == null ? null : response.getContent();
             rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource);
         } catch (final MalformedURLException e) {
diff --git a/source/de/anomic/data/ymark/YMarkAutoTagger.java b/source/de/anomic/data/ymark/YMarkAutoTagger.java
index bd9070561..387b31b41 100644
--- a/source/de/anomic/data/ymark/YMarkAutoTagger.java
+++ b/source/de/anomic/data/ymark/YMarkAutoTagger.java
@@ -22,6 +22,7 @@ import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.data.word.Word;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.repository.LoaderDispatcher;
+import net.yacy.search.snippet.TextSnippet;
 import de.anomic.crawler.retrieval.Response;

 public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandler {
@@ -68,7 +69,7 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
             return null;
         }
         try {
-            response = loader.load(loader.request(uri, true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null);
+            response = loader.load(loader.request(uri, true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null, TextSnippet.snippetMinLoadDelay);
         } catch (final IOException e) {
             Log.logWarning(YMarkTables.BOOKMARKS_LOG, "loadDocument failed due to IOException for url: "+url);
             return null;
diff --git a/source/de/anomic/data/ymark/YMarkMetadata.java b/source/de/anomic/data/ymark/YMarkMetadata.java
index 20d1719aa..4e4ad09c7 100644
--- a/source/de/anomic/data/ymark/YMarkMetadata.java
+++ b/source/de/anomic/data/ymark/YMarkMetadata.java
@@ -39,6 +39,7 @@ import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.data.meta.URIMetadata;
 import net.yacy.repository.LoaderDispatcher;
 import net.yacy.search.index.Segment;
+import net.yacy.search.snippet.TextSnippet;
 import de.anomic.crawler.retrieval.Response;

 public class YMarkMetadata {
@@ -97,7 +98,7 @@ public class YMarkMetadata {
     public Document loadDocument(final LoaderDispatcher loader) throws IOException, Failure {
         if(this.document == null) {
             Response response = null;
-            response = loader.load(loader.request(this.uri, true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null);
+            response = loader.load(loader.request(this.uri, true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null, TextSnippet.snippetMinLoadDelay);
             this.document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse());
         }
         return this.document;
diff --git a/source/net/yacy/document/importer/OAIListFriendsLoader.java b/source/net/yacy/document/importer/OAIListFriendsLoader.java
index f7b275f97..fea48c6f6 100644
--- a/source/net/yacy/document/importer/OAIListFriendsLoader.java
+++ b/source/net/yacy/document/importer/OAIListFriendsLoader.java
@@ -45,6 +45,7 @@ import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.kelondro.util.FileUtils;
 import net.yacy.repository.LoaderDispatcher;
+import net.yacy.search.snippet.TextSnippet;

 import org.xml.sax.Attributes;
 import org.xml.sax.SAXException;
@@ -62,7 +63,7 @@
         listFriends.putAll(moreFriends);
         if (loader != null) for (final Map.Entry<String, File> oaiFriend: listFriends.entrySet()) {
             try {
-                loader.loadIfNotExistBackground(new DigestURI(oaiFriend.getKey()), oaiFriend.getValue(), Integer.MAX_VALUE, null);
+                loader.loadIfNotExistBackground(new DigestURI(oaiFriend.getKey()), oaiFriend.getValue(), Integer.MAX_VALUE, null, TextSnippet.snippetMinLoadDelay);
             } catch (final MalformedURLException e) {
             }
         }
@@ -87,7 +88,7 @@
         Map m;
         for (final Map.Entry<String, File> oaiFriend: listFriends.entrySet()) try {
             if (!oaiFriend.getValue().exists()) {
-                final Response response = loader == null ? null : loader.load(loader.request(new DigestURI(oaiFriend.getKey()), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null);
+                final Response response = loader == null ? null : loader.load(loader.request(new DigestURI(oaiFriend.getKey()), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, TextSnippet.snippetMinLoadDelay);
                 if (response != null) FileUtils.copy(response.getContent(), oaiFriend.getValue());
             }
diff --git a/source/net/yacy/document/importer/OAIPMHLoader.java b/source/net/yacy/document/importer/OAIPMHLoader.java
index 0296eee40..e5eeee2b2 100644
--- a/source/net/yacy/document/importer/OAIPMHLoader.java
+++ b/source/net/yacy/document/importer/OAIPMHLoader.java
@@ -30,6 +30,7 @@ import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.kelondro.util.FileUtils;
 import net.yacy.repository.LoaderDispatcher;
+import net.yacy.search.snippet.TextSnippet;

 import de.anomic.crawler.retrieval.Response;
@@ -54,7 +55,7 @@
         for (int i = 0; i < 5; i++) {
             // make some retries if first attempt fails
             try {
-                response = loader.load(loader.request(source, false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null);
+                response = loader.load(loader.request(source, false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, TextSnippet.snippetMinLoadDelay);
                 break;
             } catch (IOException e) {
                 Log.logWarning("OAIPMHLoader", "loading failed at attempt " + (i + 1) + ": " + source.toNormalform(true, false));
diff --git a/source/net/yacy/peers/graphics/OSMTile.java b/source/net/yacy/peers/graphics/OSMTile.java
index 0463da8d5..bc202c8ec 100644
--- a/source/net/yacy/peers/graphics/OSMTile.java
+++ b/source/net/yacy/peers/graphics/OSMTile.java
@@ -38,6 +38,7 @@ import net.yacy.cora.services.federated.yacy.CacheStrategy;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.search.Switchboard;
+import net.yacy.search.snippet.TextSnippet;
 import net.yacy.visualization.RasterPlotter;
 import de.anomic.crawler.Cache;
 import de.anomic.crawler.retrieval.Response;
@@ -112,7 +113,7 @@
         // download resource using the crawler and keep resource in memory if possible
         Response entry = null;
         try {
-            entry = Switchboard.getSwitchboard().loader.load(Switchboard.getSwitchboard().loader.request(tileURL, false, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null);
+            entry = Switchboard.getSwitchboard().loader.load(Switchboard.getSwitchboard().loader.request(tileURL, false, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, null, TextSnippet.snippetMinLoadDelay);
        } catch (final IOException e) {
             Log.logWarning("OSMTile", "cannot load: " + e.getMessage());
             return null;
diff --git a/source/net/yacy/peers/operation/yacyRelease.java b/source/net/yacy/peers/operation/yacyRelease.java
index 5f2af9c6f..631379619 100644
--- a/source/net/yacy/peers/operation/yacyRelease.java
+++ b/source/net/yacy/peers/operation/yacyRelease.java
@@ -63,6 +63,7 @@ import net.yacy.kelondro.util.FileUtils;
 import net.yacy.kelondro.util.OS;
 import net.yacy.peers.Network;
 import net.yacy.search.Switchboard;
+import de.anomic.crawler.CrawlQueues;
 import de.anomic.server.serverCore;
 import de.anomic.tools.CryptoLib;
 import de.anomic.tools.SignatureOutputStream;
@@ -240,7 +241,7 @@
         try {
             final DigestURI uri = location.getLocationURL();
             Thread.currentThread().setName("allReleaseFrom - host " + uri.getHost()); // makes it more easy to see which release blocks process in thread dump
-            scraper = Switchboard.getSwitchboard().loader.loadDocument(uri, CacheStrategy.NOCACHE, null);
+            scraper = Switchboard.getSwitchboard().loader.loadDocument(uri, CacheStrategy.NOCACHE, null, CrawlQueues.queuedMinLoadDelay);
         } catch (final IOException e) {
             return null;
         }
diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java
index 28361c1e8..f863aa90c 100644
--- a/source/net/yacy/repository/LoaderDispatcher.java
+++ b/source/net/yacy/repository/LoaderDispatcher.java
@@ -66,7 +66,6 @@ import de.anomic.crawler.retrieval.SMBLoader;

 public final class LoaderDispatcher {

-    private static final long minDelay = 250; // milliseconds; 4 accesses per second
     private static final ConcurrentHashMap<String, Long> accessTime = new ConcurrentHashMap<String, Long>(); // to protect targets from DDoS

     private final Switchboard sb;
@@ -133,9 +132,9 @@
             0);
     }

-    public void load(final DigestURI url, final CacheStrategy cacheStratgy, final int maxFileSize, final File targetFile, BlacklistType blacklistType) throws IOException {
+    public void load(final DigestURI url, final CacheStrategy cacheStratgy, final int maxFileSize, final File targetFile, BlacklistType blacklistType, final long minDelay) throws IOException {

-        final byte[] b = load(request(url, false, true), cacheStratgy, maxFileSize, blacklistType).getContent();
+        final byte[] b = load(request(url, false, true), cacheStratgy, maxFileSize, blacklistType, minDelay).getContent();
         if (b == null) throw new IOException("load == null");
         final File tmp = new File(targetFile.getAbsolutePath() + ".tmp");
@@ -146,11 +145,11 @@
         tmp.renameTo(targetFile);
     }

-    public Response load(final Request request, final CacheStrategy cacheStrategy, final BlacklistType blacklistType) throws IOException {
-        return load(request, cacheStrategy, protocolMaxFileSize(request.url()), blacklistType);
+    public Response load(final Request request, final CacheStrategy cacheStrategy, final BlacklistType blacklistType, final long minDelay) throws IOException {
+        return load(request, cacheStrategy, protocolMaxFileSize(request.url()), blacklistType, minDelay);
     }

-    public Response load(final Request request, final CacheStrategy cacheStrategy, final int maxFileSize, final BlacklistType blacklistType) throws IOException {
+    public Response load(final Request request, final CacheStrategy cacheStrategy, final int maxFileSize, final BlacklistType blacklistType, final long minDelay) throws IOException {
         Semaphore check = this.loaderSteering.get(request.url());
         if (check != null) {
             // a loading process may be going on for that url
@@ -161,7 +160,7 @@
         this.loaderSteering.put(request.url(), new Semaphore(0));
         try {
-            final Response response = loadInternal(request, cacheStrategy, maxFileSize, blacklistType);
+            final Response response = loadInternal(request, cacheStrategy, maxFileSize, blacklistType, minDelay);
             check = this.loaderSteering.remove(request.url());
             if (check != null) check.release(1000);
             return response;
@@ -181,7 +180,7 @@
      * @return the loaded entity in a Response object
      * @throws IOException
      */
-    private Response loadInternal(final Request request, CacheStrategy cacheStrategy, final int maxFileSize, final BlacklistType blacklistType) throws IOException {
+    private Response loadInternal(final Request request, CacheStrategy cacheStrategy, final int maxFileSize, final BlacklistType blacklistType, final long minDelay) throws IOException {
         // get the protocol of the next URL
         final DigestURI url = request.url();
         if (url.isFile() || url.isSMB()) cacheStrategy = CacheStrategy.NOCACHE; // load just from the file system
@@ -264,8 +263,11 @@
                 // force a sleep here. Instead just sleep we clean up the accessTime map
                 final long untilTime = System.currentTimeMillis() + wait;
                 cleanupAccessTimeTable(untilTime);
-                if (System.currentTimeMillis() < untilTime)
-                    try {Thread.sleep(untilTime - System.currentTimeMillis());} catch (final InterruptedException ee) {}
+                if (System.currentTimeMillis() < untilTime) {
+                    long frcdslp = untilTime - System.currentTimeMillis();
+                    this.log.logInfo("Forcing sleep of " + frcdslp + " ms for host " + host);
+                    try {Thread.sleep(frcdslp);} catch (final InterruptedException ee) {}
+                }
             }
         }
@@ -330,19 +332,19 @@
      * @return the content as {@link byte[]}
      * @throws IOException
      */
-    public byte[] loadContent(final Request request, final CacheStrategy cacheStrategy, BlacklistType blacklistType) throws IOException {
+    public byte[] loadContent(final Request request, final CacheStrategy cacheStrategy, BlacklistType blacklistType, final long minDelay) throws IOException {
         // try to download the resource using the loader
-        final Response entry = load(request, cacheStrategy, blacklistType);
+        final Response entry = load(request, cacheStrategy, blacklistType, minDelay);
         if (entry == null) return null; // not found in web

         // read resource body (if it is there)
         return entry.getContent();
     }

-    public Document[] loadDocuments(final Request request, final CacheStrategy cacheStrategy, final int maxFileSize, BlacklistType blacklistType) throws IOException, Parser.Failure {
+    public Document[] loadDocuments(final Request request, final CacheStrategy cacheStrategy, final int maxFileSize, BlacklistType blacklistType, final long minDelay) throws IOException, Parser.Failure {

         // load resource
-        final Response response = load(request, cacheStrategy, maxFileSize, blacklistType);
+        final Response response = load(request, cacheStrategy, maxFileSize, blacklistType, minDelay);
         final DigestURI url = request.url();
         if (response == null) throw new IOException("no Response for url " + url);
@@ -353,10 +355,10 @@
         return response.parse();
     }

-    public Document loadDocument(final DigestURI location, final CacheStrategy cachePolicy, BlacklistType blacklistType) throws IOException {
+    public Document loadDocument(final DigestURI location, final CacheStrategy cachePolicy, BlacklistType blacklistType, final long minDelay) throws IOException {
         // load resource
         Request request = request(location, true, false);
-        final Response response = this.load(request, cachePolicy, blacklistType);
+        final Response response = this.load(request, cachePolicy, blacklistType, minDelay);
         final DigestURI url = request.url();
         if (response == null) throw new IOException("no Response for url " + url);
@@ -379,8 +381,8 @@
      * @return a map from URLs to the anchor texts of the urls
      * @throws IOException
      */
-    public final Map loadLinks(final DigestURI url, final CacheStrategy cacheStrategy, BlacklistType blacklistType) throws IOException {
-        final Response response = load(request(url, true, false), cacheStrategy, Integer.MAX_VALUE, blacklistType);
+    public final Map loadLinks(final DigestURI url, final CacheStrategy cacheStrategy, BlacklistType blacklistType, final long minDelay) throws IOException {
+        final Response response = load(request(url, true, false), cacheStrategy, Integer.MAX_VALUE, blacklistType, minDelay);
         if (response == null) throw new IOException("response == null");
         final ResponseHeader responseHeader = response.getResponseHeader();
         if (response.getContent() == null) throw new IOException("resource == null");
@@ -405,16 +407,16 @@
         while (i.hasNext()) {
             e = i.next();
             if (System.currentTimeMillis() > timeout) break;
-            if (System.currentTimeMillis() - e.getValue().longValue() > minDelay) i.remove();
+            if (System.currentTimeMillis() - e.getValue().longValue() > 1000) i.remove();
         }
     }

-    public void loadIfNotExistBackground(final DigestURI url, final File cache, final int maxFileSize, BlacklistType blacklistType) {
-        new Loader(url, cache, maxFileSize, CacheStrategy.IFEXIST, blacklistType).start();
+    public void loadIfNotExistBackground(final DigestURI url, final File cache, final int maxFileSize, BlacklistType blacklistType, final long minDelay) {
+        new Loader(url, cache, maxFileSize, CacheStrategy.IFEXIST, blacklistType, minDelay).start();
     }

-    public void loadIfNotExistBackground(final DigestURI url, final int maxFileSize, BlacklistType blacklistType) {
-        new Loader(url, null, maxFileSize, CacheStrategy.IFEXIST, blacklistType).start();
+    public void loadIfNotExistBackground(final DigestURI url, final int maxFileSize, BlacklistType blacklistType, final long minDelay) {
+        new Loader(url, null, maxFileSize, CacheStrategy.IFEXIST, blacklistType, minDelay).start();
     }

     private class Loader extends Thread {
@@ -424,13 +426,15 @@
         private final int maxFileSize;
         private final CacheStrategy cacheStrategy;
         private final BlacklistType blacklistType;
+        private final long minDelay;

-        public Loader(final DigestURI url, final File cache, final int maxFileSize, final CacheStrategy cacheStrategy, BlacklistType blacklistType) {
+        public Loader(final DigestURI url, final File cache, final int maxFileSize, final CacheStrategy cacheStrategy, BlacklistType blacklistType, final long minDelay) {
             this.url = url;
             this.cache = cache;
             this.maxFileSize = maxFileSize;
             this.cacheStrategy = cacheStrategy;
             this.blacklistType = blacklistType;
+            this.minDelay = minDelay;
         }

         @Override
@@ -438,7 +442,7 @@
             if (this.cache != null && this.cache.exists()) return;
             try {
                 // load from the net
-                final Response response = load(request(this.url, false, true), this.cacheStrategy, this.maxFileSize, this.blacklistType);
+                final Response response = load(request(this.url, false, true), this.cacheStrategy, this.maxFileSize, this.blacklistType, this.minDelay);
                 final byte[] b = response.getContent();
                 if (this.cache != null) FileUtils.copy(b, this.cache);
             } catch (final MalformedURLException e) {} catch (final IOException e) {}
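Two details of the LoaderDispatcher rework are worth spelling out: the wait before a throttled load is spent cleaning stale entries out of the shared access-time map rather than idling, and the cleanup threshold is now a fixed 1000 ms instead of the removed minDelay constant. A self-contained sketch of that behavior follows (illustrative names, not the actual YaCy signatures):

    import java.util.Iterator;
    import java.util.Map;
    import java.util.concurrent.ConcurrentHashMap;

    final class AccessTimeTable {
        private final ConcurrentHashMap<String, Long> accessTime = new ConcurrentHashMap<String, Long>();

        void record(final String host) {
            this.accessTime.put(host, Long.valueOf(System.currentTimeMillis()));
        }

        // evict entries older than one second, but never work past untilTime
        void cleanup(final long untilTime) {
            final Iterator<Map.Entry<String, Long>> i = this.accessTime.entrySet().iterator();
            while (i.hasNext()) {
                final Map.Entry<String, Long> e = i.next();
                if (System.currentTimeMillis() > untilTime) break;
                if (System.currentTimeMillis() - e.getValue().longValue() > 1000) i.remove();
            }
        }

        // force the remaining per-host delay, doing useful work instead of idle waiting
        void delay(final String host, final long wait) {
            final long untilTime = System.currentTimeMillis() + wait;
            cleanup(untilTime);
            final long sleep = untilTime - System.currentTimeMillis();
            if (sleep > 0) {
                try { Thread.sleep(sleep); } catch (final InterruptedException e) {}
            }
        }
    }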
diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java
index 1f49a50e5..c17d0ca2b 100644
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -152,6 +152,7 @@ import net.yacy.search.query.SearchEvent;
 import net.yacy.search.query.SearchEventCache;
 import net.yacy.search.ranking.BlockRank;
 import net.yacy.search.ranking.RankingProfile;
+import net.yacy.search.snippet.TextSnippet;

 import com.google.common.io.Files;
@@ -2675,7 +2676,7 @@ public final class Switchboard extends serverSwitch
                 Thread.currentThread().setName("Switchboard.addToIndex:" + urls);
                 try {
                     final Response response =
-                        Switchboard.this.loader.load(request, CacheStrategy.IFFRESH, BlacklistType.CRAWLER);
+                        Switchboard.this.loader.load(request, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay);
                     if ( response == null ) {
                         throw new IOException("response == null");
                     }
@@ -3076,7 +3077,7 @@ public final class Switchboard extends serverSwitch
                 final Map links;
                 searchEvent.getRankingResult().oneFeederStarted();
                 try {
-                    links = Switchboard.this.loader.loadLinks(url, CacheStrategy.NOCACHE, BlacklistType.SEARCH);
+                    links = Switchboard.this.loader.loadLinks(url, CacheStrategy.NOCACHE, BlacklistType.SEARCH, TextSnippet.snippetMinLoadDelay);
                     if ( links != null ) {
                         final Iterator i = links.keySet().iterator();
                         while ( i.hasNext() ) {
@@ -3115,7 +3116,7 @@ public final class Switchboard extends serverSwitch
             final Map links;
             DigestURI url;
             try {
-                links = Switchboard.this.loader.loadLinks(startUrl, CacheStrategy.IFFRESH, BlacklistType.SEARCH);
+                links = Switchboard.this.loader.loadLinks(startUrl, CacheStrategy.IFFRESH, BlacklistType.SEARCH, TextSnippet.snippetMinLoadDelay);
                 if (links != null) {
                     if (links.size() < 1000) { // limit to 1000 to skip large index pages
                         final Iterator i = links.keySet().iterator();
@@ -3179,7 +3180,7 @@ public final class Switchboard extends serverSwitch
             searchEvent.getRankingResult().oneFeederStarted();
             try {
                 final Response response =
-                    sb.loader.load(sb.loader.request(url, true, false), CacheStrategy.NOCACHE, BlacklistType.SEARCH);
+                    sb.loader.load(sb.loader.request(url, true, false), CacheStrategy.NOCACHE, BlacklistType.SEARCH, TextSnippet.snippetMinLoadDelay);
                 final byte[] resource = (response == null) ? null : response.getContent();
                 //System.out.println("BLEKKO: " + UTF8.String(resource));
                 rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource);
diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java
index dc0fe7f25..206008870 100644
--- a/source/net/yacy/search/index/Segment.java
+++ b/source/net/yacy/search/index/Segment.java
@@ -69,6 +69,7 @@ import net.yacy.repository.LoaderDispatcher;
 import net.yacy.search.Switchboard;
 import net.yacy.search.query.RWIProcess;
 import net.yacy.search.query.SearchEvent;
+import de.anomic.crawler.CrawlQueues;
 import de.anomic.crawler.retrieval.Response;
@@ -571,7 +572,7 @@ public class Segment {
         try {
             // parse the resource
-            final Document document = Document.mergeDocuments(entry.url(), null, loader.loadDocuments(loader.request(entry.url(), true, false), cacheStrategy, Integer.MAX_VALUE, null));
+            final Document document = Document.mergeDocuments(entry.url(), null, loader.loadDocuments(loader.request(entry.url(), true, false), cacheStrategy, Integer.MAX_VALUE, null, CrawlQueues.queuedMinLoadDelay));
             if (document == null) {
                 // delete just the url entry
                 urlMetadata().remove(urlhash);
diff --git a/source/net/yacy/search/query/SnippetProcess.java b/source/net/yacy/search/query/SnippetProcess.java
index aec342789..a83779fd6 100644
--- a/source/net/yacy/search/query/SnippetProcess.java
+++ b/source/net/yacy/search/query/SnippetProcess.java
@@ -26,6 +26,7 @@
 package net.yacy.search.query;

+import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Iterator;
 import java.util.List;
@@ -501,8 +502,13 @@
             SolrDocument sd = null;
             StringBuilder querystring = new StringBuilder(17);
             querystring.append(SolrField.id.getSolrFieldName()).append(':').append('"').append(ASCII.String(page.hash())).append('"');
-            final SolrDocumentList sdl = this.solr.query(querystring.toString(), 0, 1);
-            if (!sdl.isEmpty()) {
+            SolrDocumentList sdl = null;
+            try {
+                sdl = this.solr.query(querystring.toString(), 0, 1);
+            } catch (IOException e) {
+                Log.logException(e);
+            }
+            if (sdl != null && !sdl.isEmpty()) {
                 sd = sdl.get(0);
             }
             if (sd != null) {
@@ -537,9 +543,7 @@
                     Log.logWarning("SnippetProcess", "worker ended with timeout");
                 }
                 //System.out.println("FINISHED WORKER " + id + " FOR " + this.neededResults + " RESULTS, loops = " + loops);
-            } catch (final Exception e) {
-                Log.logException(e);
-            }
+            } catch (final Exception e) { Log.logException(e); }
             //Log.logInfo("SEARCH", "resultWorker thread " + this.id + " terminated");
         }
diff --git a/source/net/yacy/search/snippet/MediaSnippet.java b/source/net/yacy/search/snippet/MediaSnippet.java
index 74a93acc3..84450d88a 100644
--- a/source/net/yacy/search/snippet/MediaSnippet.java
+++ b/source/net/yacy/search/snippet/MediaSnippet.java
@@ -143,7 +143,7 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaSnippet>
diff --git a/source/net/yacy/search/snippet/TextSnippet.java b/source/net/yacy/search/snippet/TextSnippet.java
index 4070caf6e..9f55a2cad 100644
--- a/source/net/yacy/search/snippet/TextSnippet.java
+++ b/source/net/yacy/search/snippet/TextSnippet.java
@@ -60,6 +60,7 @@ import de.anomic.crawler.retrieval.Response;

 public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnippet> {

+    public static final long snippetMinLoadDelay = 10;
     private static final int MAX_CACHE = 1000;
@@ -213,7 +214,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnippet>
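To see why the 500 ms / 10 ms split speeds up result preparation so much, consider ten queued fetches against a single host. A toy, runnable measurement (standalone, not YaCy code; it simulates only the delay floor, not the fetches themselves):

    public final class DelayDemo {
        public static void main(final String[] args) throws InterruptedException {
            for (final long minDelay : new long[] {500L, 10L}) {
                final long start = System.currentTimeMillis();
                long last = 0L;
                for (int i = 0; i < 10; i++) {
                    if (last != 0L) {
                        // enforce the per-host floor before the next access
                        final long wait = minDelay - (System.currentTimeMillis() - last);
                        if (wait > 0) Thread.sleep(wait);
                    }
                    last = System.currentTimeMillis(); // the actual load would happen here
                }
                System.out.println(minDelay + " ms floor: " + (System.currentTimeMillis() - start) + " ms for 10 loads");
            }
        }
    }

At the crawler floor the ten loads cost roughly 4.5 s of forced waiting; at the snippet floor, under 100 ms. That difference, multiplied across every host touched while snippets are fetched, is the dramatic speedup the commit message refers to.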