From 634a01a9a4c5bc879320de863d90080a2c5779b7 Mon Sep 17 00:00:00 2001 From: orbiter Date: Fri, 24 Jul 2009 14:52:27 +0000 Subject: [PATCH] replaced wget-requests with caching requests git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6242 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/api/util/getpageinfo_p.java | 7 +- .../crawler/retrieval/LoaderDispatcher.java | 79 +++++++++++++------ .../document/parser/html/ContentScraper.java | 24 ++---- source/de/anomic/http/client/Client.java | 28 +------ source/de/anomic/yacy/yacyRelease.java | 3 +- 5 files changed, 67 insertions(+), 74 deletions(-) diff --git a/htroot/api/util/getpageinfo_p.java b/htroot/api/util/getpageinfo_p.java index cd1f4df94..d265da13c 100755 --- a/htroot/api/util/getpageinfo_p.java +++ b/htroot/api/util/getpageinfo_p.java @@ -3,9 +3,8 @@ import java.io.IOException; import java.net.MalformedURLException; import java.util.Set; -import de.anomic.crawler.retrieval.HTTPLoader; +import de.anomic.crawler.CrawlProfile; import de.anomic.document.parser.html.ContentScraper; -import de.anomic.http.metadata.HeaderFramework; import de.anomic.http.metadata.RequestHeader; import de.anomic.search.Switchboard; import de.anomic.server.serverObjects; @@ -43,9 +42,7 @@ public class getpageinfo_p { if (actions.indexOf("title")>=0) { try { final yacyURL u = new yacyURL(url, null); - final RequestHeader reqHeader = new RequestHeader(); - reqHeader.put(HeaderFramework.USER_AGENT, HTTPLoader.yacyUserAgent); // do not set the crawler user agent, because this page was loaded by manual entering of the url - final ContentScraper scraper = ContentScraper.parseResource(u, reqHeader); + final ContentScraper scraper = ContentScraper.parseResource(sb.loader, u, CrawlProfile.CACHE_STRATEGY_IFFRESH); // put the document title prop.putXML("title", scraper.getTitle()); diff --git a/source/de/anomic/crawler/retrieval/LoaderDispatcher.java b/source/de/anomic/crawler/retrieval/LoaderDispatcher.java index 6ba9e6627..c8251600e 100644 --- a/source/de/anomic/crawler/retrieval/LoaderDispatcher.java +++ b/source/de/anomic/crawler/retrieval/LoaderDispatcher.java @@ -75,35 +75,69 @@ public final class LoaderDispatcher { return (HashSet) this.supportedProtocols.clone(); } + public static byte[] toBytes(Response response) { + if (response == null) return null; + return response.getContent(); + } + + public Response load(final yacyURL url) throws IOException { + return load(url, true, false); + } + + public Response load(final yacyURL url, int cachePolicy) throws IOException { + return load(url, true, false, cachePolicy); + } + public Response load( final yacyURL url, final boolean forText, final boolean global + ) throws IOException { + return load(request(url, forText, global)); + } + + public Response load( + final yacyURL url, + final boolean forText, + final boolean global, + int cacheStratgy ) throws IOException { - - final Request centry = new Request( - sb.peers.mySeed().hash, - url, - "", - "", - new Date(), - new Date(), - (forText) ? - ((global) ? - sb.crawler.defaultTextSnippetGlobalProfile.handle() : - sb.crawler.defaultTextSnippetLocalProfile.handle()) - : - ((global) ? - sb.crawler.defaultMediaSnippetGlobalProfile.handle() : - sb.crawler.defaultMediaSnippetLocalProfile.handle()), // crawl profile - 0, - 0, - 0); - - return load(centry); + return load(request(url, forText, global), cacheStratgy); + } + + public Request request( + final yacyURL url, + final boolean forText, + final boolean global + ) throws IOException { + return new Request( + sb.peers.mySeed().hash, + url, + "", + "", + new Date(), + new Date(), + (forText) ? + ((global) ? + sb.crawler.defaultTextSnippetGlobalProfile.handle() : + sb.crawler.defaultTextSnippetLocalProfile.handle()) + : + ((global) ? + sb.crawler.defaultMediaSnippetGlobalProfile.handle() : + sb.crawler.defaultMediaSnippetLocalProfile.handle()), // crawl profile + 0, + 0, + 0); } public Response load(final Request request) throws IOException { + CrawlProfile.entry crawlProfile = sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()); + int cacheStrategy = CrawlProfile.CACHE_STRATEGY_IFFRESH; + if (crawlProfile != null) cacheStrategy = crawlProfile.cacheStrategy(); + return load(request, cacheStrategy); + } + + public Response load(final Request request, int cacheStrategy) throws IOException { // get the protocol of the next URL final String protocol = request.url().getProtocol(); final String host = request.url().getHost(); @@ -115,8 +149,7 @@ public final class LoaderDispatcher { // check if we have the page in the cache CrawlProfile.entry crawlProfile = sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle()); - int cacheStrategy = CrawlProfile.CACHE_STRATEGY_NOCACHE; - if (crawlProfile != null && (cacheStrategy = crawlProfile.cacheStrategy()) != CrawlProfile.CACHE_STRATEGY_NOCACHE) { + if (crawlProfile != null && cacheStrategy != CrawlProfile.CACHE_STRATEGY_NOCACHE) { // we have passed a first test if caching is allowed // now see if there is a cache entry diff --git a/source/de/anomic/document/parser/html/ContentScraper.java b/source/de/anomic/document/parser/html/ContentScraper.java index 2a50f2213..629bcaa94 100644 --- a/source/de/anomic/document/parser/html/ContentScraper.java +++ b/source/de/anomic/document/parser/html/ContentScraper.java @@ -29,7 +29,6 @@ package de.anomic.document.parser.html; import java.io.ByteArrayInputStream; import java.io.File; import java.io.IOException; -import java.io.Reader; import java.io.UnsupportedEncodingException; import java.io.Writer; import java.net.MalformedURLException; @@ -44,11 +43,8 @@ import java.util.Properties; import javax.swing.event.EventListenerList; -import de.anomic.crawler.retrieval.HTTPLoader; +import de.anomic.crawler.retrieval.LoaderDispatcher; import de.anomic.document.parser.htmlParser; -import de.anomic.http.client.Client; -import de.anomic.http.metadata.HeaderFramework; -import de.anomic.http.metadata.RequestHeader; import de.anomic.kelondro.util.FileUtils; import de.anomic.server.serverCharBuffer; import de.anomic.yacy.yacyURL; @@ -511,25 +507,15 @@ public class ContentScraper extends AbstractScraper implements Scraper { return scraper; } - public static ContentScraper parseResource(final yacyURL location) throws IOException { + public static ContentScraper parseResource(final LoaderDispatcher loader, final yacyURL location, int cachePolicy) throws IOException { // load page - final RequestHeader reqHeader = new RequestHeader(); - reqHeader.put(HeaderFramework.USER_AGENT, HTTPLoader.crawlerUserAgent); - return parseResource(location, reqHeader); - } - - public static ContentScraper parseResource(final yacyURL location, final RequestHeader reqHeader) throws IOException { - final Reader pageReader = Client.wgetReader(location.toString(), reqHeader, 10000); - if (pageReader == null) throw new IOException("no response from url " + location.toString()); + byte[] page = LoaderDispatcher.toBytes(loader.load(location, cachePolicy)); + if (page == null) throw new IOException("no response from url " + location.toString()); // scrape content final ContentScraper scraper = new ContentScraper(location); final Writer writer = new TransformerWriter(null, null, scraper, null, false); - try { - FileUtils.copy(pageReader, writer); - } finally { - pageReader.close(); - } + writer.write(new String(page, "UTF-8")); return scraper; } diff --git a/source/de/anomic/http/client/Client.java b/source/de/anomic/http/client/Client.java index 2a3ded4bb..6ba72f6cb 100644 --- a/source/de/anomic/http/client/Client.java +++ b/source/de/anomic/http/client/Client.java @@ -29,9 +29,6 @@ package de.anomic.http.client; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; -import java.io.InputStreamReader; -import java.io.Reader; -import java.nio.charset.Charset; import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -77,7 +74,6 @@ import de.anomic.yacy.logging.Log; * */ public class Client { - /** * "the HttpClient instance and connection manager should be shared among all threads for maximum efficiency." * (Concurrent execution of HTTP methods, http://hc.apache.org/httpclient-3.x/performance.html) @@ -746,9 +742,11 @@ public class Client { public static byte[] wget(final String uri) { return wget(uri, new RequestHeader(), 10000, null); } + public static byte[] wget(final String uri, final RequestHeader header, final int timeout) { return wget(uri, header, timeout, null); } + public static byte[] wget(final String uri, final RequestHeader header, final int timeout, final String vhost) { assert uri != null : "precondition violated: uri != null"; addHostHeader(header, vhost); @@ -769,28 +767,6 @@ public class Client { } return null; } - public static Reader wgetReader(final String uri) { - return wgetReader(uri, new RequestHeader(), 10000, null); - } - public static Reader wgetReader(final String uri, final RequestHeader header, final int timeout) { - return wgetReader(uri, header, timeout, null); - } - public static Reader wgetReader(final String uri, final RequestHeader header, final int timeout, final String vhost) { - assert uri != null : "precondition violated: uri != null"; - addHostHeader(header, vhost); - final Client client = new Client(timeout, header); - - // do the request - ResponseContainer response = null; - try { - response = client.GET(uri); - Charset charset = response.getResponseHeader().getCharSet(); - return new InputStreamReader(response.getDataAsStream(), charset); - } catch (final IOException e) { - Log.logWarning("HTTPC", "wgetReader(" + uri + ") failed: " + e.getMessage()); - } - return null; - } /** * adds a Host-header to the header if vhost is not null diff --git a/source/de/anomic/yacy/yacyRelease.java b/source/de/anomic/yacy/yacyRelease.java index 64946157a..bbccf7ea3 100644 --- a/source/de/anomic/yacy/yacyRelease.java +++ b/source/de/anomic/yacy/yacyRelease.java @@ -44,6 +44,7 @@ import java.util.Map; import java.util.SortedSet; import java.util.TreeSet; +import de.anomic.crawler.CrawlProfile; import de.anomic.crawler.retrieval.HTTPLoader; import de.anomic.document.parser.html.ContentScraper; import de.anomic.http.client.Client; @@ -231,7 +232,7 @@ public final class yacyRelease extends yacyVersion { // returns the version info if successful, null otherwise ContentScraper scraper; try { - scraper = ContentScraper.parseResource(location.getLocationURL()); + scraper = ContentScraper.parseResource(Switchboard.getSwitchboard().loader, location.getLocationURL(), CrawlProfile.CACHE_STRATEGY_NOCACHE); } catch (final IOException e) { return null; }