diff --git a/htroot/api/util/getpageinfo_p.java b/htroot/api/util/getpageinfo_p.java
index 0e3a98ca5..cc5e9ccb8 100755
--- a/htroot/api/util/getpageinfo_p.java
+++ b/htroot/api/util/getpageinfo_p.java
@@ -1,15 +1,11 @@
 import java.io.IOException;
-import java.io.Writer;
 import java.net.MalformedURLException;
 import java.util.Set;
 
 import de.anomic.crawler.HTTPLoader;
 import de.anomic.htmlFilter.htmlFilterContentScraper;
-import de.anomic.htmlFilter.htmlFilterWriter;
-import de.anomic.http.httpClient;
 import de.anomic.http.httpRequestHeader;
-import de.anomic.kelondro.util.FileUtils;
 import de.anomic.plasma.plasmaSwitchboard;
 import de.anomic.server.serverObjects;
 import de.anomic.server.serverSwitch;
 
@@ -48,15 +44,7 @@ public class getpageinfo_p {
             final yacyURL u = new yacyURL(url, null);
             final httpRequestHeader reqHeader = new httpRequestHeader();
             reqHeader.put(httpRequestHeader.USER_AGENT, HTTPLoader.yacyUserAgent); // do not set the crawler user agent, because this page was loaded by manual entering of the url
-            final byte[] r = httpClient.wget(u.toString(), reqHeader, 5000);
-            if (r == null) return prop;
-            final String contentString=new String(r);
-
-            final htmlFilterContentScraper scraper = new htmlFilterContentScraper(u);
-            //OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
-            final Writer writer = new htmlFilterWriter(null,null,scraper,null,false);
-            FileUtils.copy(contentString,writer);
-            writer.close();
+            final htmlFilterContentScraper scraper = htmlFilterContentScraper.parseResource(u, reqHeader);
 
             // put the document title
             prop.putXML("title", scraper.getTitle());
diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
index fab9809d1..be114593d 100644
--- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
+++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
@@ -29,6 +29,7 @@ package de.anomic.htmlFilter;
 import java.io.ByteArrayInputStream;
 import java.io.File;
 import java.io.IOException;
+import java.io.Reader;
 import java.io.UnsupportedEncodingException;
 import java.io.Writer;
 import java.net.MalformedURLException;
@@ -503,13 +504,21 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
         // load page
         final httpRequestHeader reqHeader = new httpRequestHeader();
         reqHeader.put(httpRequestHeader.USER_AGENT, HTTPLoader.crawlerUserAgent);
-        final byte[] page = httpClient.wget(location.toString(), reqHeader, 10000);
-        if (page == null) throw new IOException("no response from url " + location.toString());
+        return parseResource(location, reqHeader);
+    }
+
+    public static htmlFilterContentScraper parseResource(final yacyURL location, final httpRequestHeader reqHeader) throws IOException {
+        final Reader pageReader = httpClient.wgetReader(location.toString(), reqHeader, 10000);
+        if (pageReader == null) throw new IOException("no response from url " + location.toString());
 
         // scrape content
         final htmlFilterContentScraper scraper = new htmlFilterContentScraper(location);
         final Writer writer = new htmlFilterWriter(null, null, scraper, null, false);
-        FileUtils.copy(new ByteArrayInputStream(page), writer, Charset.forName("UTF-8"));
+        try {
+            FileUtils.copy(pageReader, writer);
+        } finally {
+            pageReader.close();
+        }
         return scraper;
     }
 
diff --git a/source/de/anomic/http/httpClient.java b/source/de/anomic/http/httpClient.java
index 1f96f1bae..c02050645 100644
--- a/source/de/anomic/http/httpClient.java
+++ b/source/de/anomic/http/httpClient.java
@@ -29,6 +29,9 @@ package de.anomic.http;
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.nio.charset.Charset;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
@@ -765,6 +768,28 @@ public class httpClient {
         }
         return null;
     }
+    public static Reader wgetReader(final String uri) {
+        return wgetReader(uri, new httpRequestHeader(), 10000, null);
+    }
+    public static Reader wgetReader(final String uri, final httpRequestHeader header, final int timeout) {
+        return wgetReader(uri, header, timeout, null);
+    }
+    public static Reader wgetReader(final String uri, final httpRequestHeader header, final int timeout, final String vhost) {
+        assert uri != null : "precondition violated: uri != null";
+        addHostHeader(header, vhost);
+        final httpClient client = new httpClient(timeout, header);
+
+        // do the request
+        httpResponse response = null;
+        try {
+            response = client.GET(uri);
+            Charset charset = response.getResponseHeader().getCharSet();
+            return new InputStreamReader(response.getDataAsStream(), charset);
+        } catch (final IOException e) {
+            Log.logWarning("HTTPC", "wgetReader(" + uri + ") failed: " + e.getMessage());
+        }
+        return null;
+    }
 
     /**
      * adds a Host-header to the header if vhost is not null
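
For reference, this is how a caller uses the new factory after the patch: one call replaces the old fetch-bytes/wire-up-htmlFilterWriter dance. A minimal sketch, assuming the yacyURL import path de.anomic.yacy.yacyURL; the demo class and URL are placeholders, and only parseResource, httpRequestHeader, and HTTPLoader.yacyUserAgent come from the diff itself:

    import java.io.IOException;

    import de.anomic.crawler.HTTPLoader;
    import de.anomic.htmlFilter.htmlFilterContentScraper;
    import de.anomic.http.httpRequestHeader;
    import de.anomic.yacy.yacyURL;

    // Hypothetical demo class, not part of the patch.
    public class ParseResourceExample {
        public static void main(final String[] args) throws IOException {
            // placeholder URL; yacyURL's MalformedURLException is an IOException
            final yacyURL location = new yacyURL("http://example.net/", null);
            final httpRequestHeader reqHeader = new httpRequestHeader();
            reqHeader.put(httpRequestHeader.USER_AGENT, HTTPLoader.yacyUserAgent);
            // one call now fetches the page (charset-aware via wgetReader) and scrapes it
            final htmlFilterContentScraper scraper = htmlFilterContentScraper.parseResource(location, reqHeader);
            System.out.println(scraper.getTitle());
        }
    }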
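One behavioral caveat in the new wgetReader: the Charset obtained from response.getResponseHeader().getCharSet() is handed directly to the InputStreamReader constructor, which throws a NullPointerException for a null charset. The patch does not show getCharSet()'s contract, so whether a response without a declared charset can yield null here is an assumption; if it can, a defensive variant of the try block could fall back to the UTF-8 default that the replaced wget-based code hard-coded:

    response = client.GET(uri);
    Charset charset = response.getResponseHeader().getCharSet();
    // assumption: getCharSet() may return null when the server sends no
    // charset parameter; reuse the UTF-8 fallback from the replaced code
    if (charset == null) charset = Charset.forName("UTF-8");
    return new InputStreamReader(response.getDataAsStream(), charset);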