replaced wget-requests with caching requests

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6242 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 16 years ago
parent c6c97f23ad
commit 634a01a9a4

@ -3,9 +3,8 @@ import java.io.IOException;
import java.net.MalformedURLException;
import java.util.Set;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.crawler.CrawlProfile;
import de.anomic.document.parser.html.ContentScraper;
import de.anomic.http.metadata.HeaderFramework;
import de.anomic.http.metadata.RequestHeader;
import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects;
@ -43,9 +42,7 @@ public class getpageinfo_p {
if (actions.indexOf("title")>=0) {
try {
final yacyURL u = new yacyURL(url, null);
final RequestHeader reqHeader = new RequestHeader();
reqHeader.put(HeaderFramework.USER_AGENT, HTTPLoader.yacyUserAgent); // do not set the crawler user agent, because this page was loaded by manual entering of the url
final ContentScraper scraper = ContentScraper.parseResource(u, reqHeader);
final ContentScraper scraper = ContentScraper.parseResource(sb.loader, u, CrawlProfile.CACHE_STRATEGY_IFFRESH);
// put the document title
prop.putXML("title", scraper.getTitle());

@ -75,35 +75,69 @@ public final class LoaderDispatcher {
return (HashSet<String>) this.supportedProtocols.clone();
}
/**
 * Extracts the content of a response as a byte array.
 *
 * @param response the response to read; may be {@code null}
 * @return the value of {@code response.getContent()}, or {@code null} when
 *         {@code response} itself is {@code null}
 */
public static byte[] toBytes(Response response) {
    return (response == null) ? null : response.getContent();
}
/**
 * Convenience overload: loads the URL with {@code forText = true} and
 * {@code global = false}.
 *
 * @param url the resource to load
 * @return whatever {@code load(url, true, false)} returns
 * @throws IOException if the delegated load fails
 */
public Response load(final yacyURL url) throws IOException {
    final boolean forText = true;
    final boolean global = false;
    return load(url, forText, global);
}
/**
 * Convenience overload: loads the URL with {@code forText = true} and
 * {@code global = false}, using the given cache policy.
 *
 * @param url the resource to load
 * @param cachePolicy cache policy value handed on unchanged to the
 *        four-argument {@code load}
 * @return whatever {@code load(url, true, false, cachePolicy)} returns
 * @throws IOException if the delegated load fails
 */
public Response load(final yacyURL url, int cachePolicy) throws IOException {
    final boolean forText = true;
    final boolean global = false;
    return load(url, forText, global, cachePolicy);
}
/**
 * Builds a request for the URL via {@code request(url, forText, global)}
 * and loads it with {@code load(Request)}.
 *
 * @param url the resource to load
 * @param forText passed through to {@code request(...)}
 * @param global passed through to {@code request(...)}
 * @return the response returned by {@code load(Request)}
 * @throws IOException if building or loading the request fails
 */
public Response load(
        final yacyURL url,
        final boolean forText,
        final boolean global
) throws IOException {
    final Request entry = request(url, forText, global);
    return load(entry);
}
/**
 * Builds a request for the URL and loads it with an explicitly chosen
 * cache strategy.
 *
 * @param url the resource to load
 * @param forText passed through to {@code request(...)}
 * @param global passed through to {@code request(...)}
 * @param cacheStratgy cache strategy handed on unchanged to
 *        {@code load(Request, int)}
 * @return the response returned by {@code load(Request, int)}
 * @throws IOException if building or loading the request fails
 */
public Response load(
        final yacyURL url,
        final boolean forText,
        final boolean global,
        int cacheStratgy
) throws IOException {
    // Delegate to request(...) so the Request is constructed in exactly one
    // place and the caller's cache strategy is actually applied.
    return load(request(url, forText, global), cacheStratgy);
}
/**
 * Creates the {@code Request} object used by the {@code load(...)} overloads.
 *
 * The crawl profile handle is chosen from the four default snippet profiles
 * on {@code sb.crawler}: text vs. media according to {@code forText}, and
 * global vs. local according to {@code global}. The other constructor
 * arguments are this peer's seed hash, the URL, two empty strings, two
 * freshly created {@code Date} instances and three zeros.
 *
 * @param url the resource the request refers to
 * @param forText {@code true} selects a text snippet profile,
 *        {@code false} a media snippet profile
 * @param global {@code true} selects the global variant of the profile,
 *        {@code false} the local variant
 * @return the newly constructed request
 * @throws IOException declared by the signature
 */
public Request request(
        final yacyURL url,
        final boolean forText,
        final boolean global
) throws IOException {
    return new Request(
            sb.peers.mySeed().hash,
            url,
            "",
            "",
            new Date(),
            new Date(),
            // crawl profile: decide on scope first, then on text vs. media
            global
                ? (forText
                    ? sb.crawler.defaultTextSnippetGlobalProfile.handle()
                    : sb.crawler.defaultMediaSnippetGlobalProfile.handle())
                : (forText
                    ? sb.crawler.defaultTextSnippetLocalProfile.handle()
                    : sb.crawler.defaultMediaSnippetLocalProfile.handle()),
            0,
            0,
            0);
}
/**
 * Loads the request using the cache strategy of its crawl profile.
 *
 * The profile is looked up in {@code sb.crawler.profilesActiveCrawls} by the
 * request's profile handle. When no entry is found,
 * {@code CrawlProfile.CACHE_STRATEGY_IFFRESH} is used instead.
 *
 * @param request the request to load
 * @return the response returned by {@code load(Request, int)}
 * @throws IOException if the delegated load fails
 */
public Response load(final Request request) throws IOException {
    final CrawlProfile.entry profile = sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle());
    final int strategy = (profile == null)
            ? CrawlProfile.CACHE_STRATEGY_IFFRESH
            : profile.cacheStrategy();
    return load(request, strategy);
}
public Response load(final Request request, int cacheStrategy) throws IOException {
// get the protocol of the next URL
final String protocol = request.url().getProtocol();
final String host = request.url().getHost();
@ -115,8 +149,7 @@ public final class LoaderDispatcher {
// check if we have the page in the cache
CrawlProfile.entry crawlProfile = sb.crawler.profilesActiveCrawls.getEntry(request.profileHandle());
int cacheStrategy = CrawlProfile.CACHE_STRATEGY_NOCACHE;
if (crawlProfile != null && (cacheStrategy = crawlProfile.cacheStrategy()) != CrawlProfile.CACHE_STRATEGY_NOCACHE) {
if (crawlProfile != null && cacheStrategy != CrawlProfile.CACHE_STRATEGY_NOCACHE) {
// we have passed a first test if caching is allowed
// now see if there is a cache entry

@ -29,7 +29,6 @@ package de.anomic.document.parser.html;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.net.MalformedURLException;
@ -44,11 +43,8 @@ import java.util.Properties;
import javax.swing.event.EventListenerList;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.crawler.retrieval.LoaderDispatcher;
import de.anomic.document.parser.htmlParser;
import de.anomic.http.client.Client;
import de.anomic.http.metadata.HeaderFramework;
import de.anomic.http.metadata.RequestHeader;
import de.anomic.kelondro.util.FileUtils;
import de.anomic.server.serverCharBuffer;
import de.anomic.yacy.yacyURL;
@ -511,25 +507,15 @@ public class ContentScraper extends AbstractScraper implements Scraper {
return scraper;
}
public static ContentScraper parseResource(final yacyURL location) throws IOException {
public static ContentScraper parseResource(final LoaderDispatcher loader, final yacyURL location, int cachePolicy) throws IOException {
// load page
final RequestHeader reqHeader = new RequestHeader();
reqHeader.put(HeaderFramework.USER_AGENT, HTTPLoader.crawlerUserAgent);
return parseResource(location, reqHeader);
}
public static ContentScraper parseResource(final yacyURL location, final RequestHeader reqHeader) throws IOException {
final Reader pageReader = Client.wgetReader(location.toString(), reqHeader, 10000);
if (pageReader == null) throw new IOException("no response from url " + location.toString());
byte[] page = LoaderDispatcher.toBytes(loader.load(location, cachePolicy));
if (page == null) throw new IOException("no response from url " + location.toString());
// scrape content
final ContentScraper scraper = new ContentScraper(location);
final Writer writer = new TransformerWriter(null, null, scraper, null, false);
try {
FileUtils.copy(pageReader, writer);
} finally {
pageReader.close();
}
writer.write(new String(page, "UTF-8"));
return scraper;
}

@ -29,9 +29,6 @@ package de.anomic.http.client;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
@ -77,7 +74,6 @@ import de.anomic.yacy.logging.Log;
*
*/
public class Client {
/**
* "the HttpClient instance and connection manager should be shared among all threads for maximum efficiency."
* (Concurrent execution of HTTP methods, http://hc.apache.org/httpclient-3.x/performance.html)
@ -746,9 +742,11 @@ public class Client {
/**
 * Fetches a URI with an empty request header, a timeout value of 10000
 * (presumably milliseconds; confirm against the {@code Client} constructor)
 * and no virtual host.
 *
 * @param uri the address to fetch
 * @return whatever the four-argument {@code wget} returns
 */
public static byte[] wget(final String uri) {
    final RequestHeader emptyHeader = new RequestHeader();
    final int defaultTimeout = 10000;
    return wget(uri, emptyHeader, defaultTimeout, null);
}
/**
 * Fetches a URI with the given request header and timeout, without a
 * virtual host.
 *
 * @param uri the address to fetch
 * @param header request header passed on to the four-argument {@code wget}
 * @param timeout timeout value passed on to the four-argument {@code wget}
 * @return whatever the four-argument {@code wget} returns
 */
public static byte[] wget(final String uri, final RequestHeader header, final int timeout) {
    final String noVhost = null;
    return wget(uri, header, timeout, noVhost);
}
public static byte[] wget(final String uri, final RequestHeader header, final int timeout, final String vhost) {
assert uri != null : "precondition violated: uri != null";
addHostHeader(header, vhost);
@ -769,28 +767,6 @@ public class Client {
}
return null;
}
/**
 * Opens a reader on a URI with an empty request header, a timeout value of
 * 10000 (presumably milliseconds; confirm against the {@code Client}
 * constructor) and no virtual host.
 *
 * @param uri the address to fetch
 * @return whatever the four-argument {@code wgetReader} returns
 */
public static Reader wgetReader(final String uri) {
    final RequestHeader emptyHeader = new RequestHeader();
    final int defaultTimeout = 10000;
    return wgetReader(uri, emptyHeader, defaultTimeout, null);
}
/**
 * Opens a reader on a URI with the given request header and timeout,
 * without a virtual host.
 *
 * @param uri the address to fetch
 * @param header request header passed on to the four-argument {@code wgetReader}
 * @param timeout timeout value passed on to the four-argument {@code wgetReader}
 * @return whatever the four-argument {@code wgetReader} returns
 */
public static Reader wgetReader(final String uri, final RequestHeader header, final int timeout) {
    final String noVhost = null;
    return wgetReader(uri, header, timeout, noVhost);
}
/**
 * Issues a GET request and returns a character reader over the response
 * body, decoded with the charset reported by the response header.
 *
 * @param uri the address to fetch; must not be {@code null} (checked with an
 *        {@code assert})
 * @param header request header; it is handed to {@code addHostHeader}
 *        together with {@code vhost} before the client is created
 * @param timeout timeout value passed to the {@code Client} constructor
 * @param vhost virtual host handed to {@code addHostHeader}; may be {@code null}
 * @return a reader over the response data, or {@code null} if the request
 *         raised an {@code IOException} (a warning is logged in that case)
 */
public static Reader wgetReader(final String uri, final RequestHeader header, final int timeout, final String vhost) {
    assert uri != null : "precondition violated: uri != null";
    addHostHeader(header, vhost);
    final Client httpClient = new Client(timeout, header);
    try {
        // do the request
        final ResponseContainer result = httpClient.GET(uri);
        final Charset encoding = result.getResponseHeader().getCharSet();
        return new InputStreamReader(result.getDataAsStream(), encoding);
    } catch (final IOException e) {
        Log.logWarning("HTTPC", "wgetReader(" + uri + ") failed: " + e.getMessage());
        return null;
    }
}
/**
* adds a Host-header to the header if vhost is not null

@ -44,6 +44,7 @@ import java.util.Map;
import java.util.SortedSet;
import java.util.TreeSet;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.document.parser.html.ContentScraper;
import de.anomic.http.client.Client;
@ -231,7 +232,7 @@ public final class yacyRelease extends yacyVersion {
// returns the version info if successful, null otherwise
ContentScraper scraper;
try {
scraper = ContentScraper.parseResource(location.getLocationURL());
scraper = ContentScraper.parseResource(Switchboard.getSwitchboard().loader, location.getLocationURL(), CrawlProfile.CACHE_STRATEGY_NOCACHE);
} catch (final IOException e) {
return null;
}

Loading…
Cancel
Save