* fix missing charset handling in getpageinfo_p

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5811 6c8d7289-2bf4-0310-a012-ef5d649a1542
f1ori 16 years ago
parent bd5f4c78d8
commit dd6b5005ff
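
What the fix addresses: the old getpageinfo_p code (first hunk below) fetched raw bytes with httpClient.wget and decoded them via new String(r), i.e. with the JVM's platform default charset rather than the charset the server declared. A minimal, self-contained sketch of the failure mode; the sample text and charset are invented for illustration and are not from the patch:

    public class CharsetMismatchDemo {
        public static void main(final String[] args) throws Exception {
            // a response body served as ISO-8859-1, containing one non-ASCII character
            final byte[] body = "café".getBytes("ISO-8859-1");

            // old behaviour: new String(bytes) decodes with the JVM's platform
            // default charset -- mojibake whenever that default is not ISO-8859-1
            final String wrong = new String(body);

            // fixed behaviour: decode with the charset the server declared
            final String right = new String(body, "ISO-8859-1");

            System.out.println("platform default: " + wrong);
            System.out.println("declared charset: " + right);
        }
    }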

@@ -1,15 +1,11 @@
 import java.io.IOException;
-import java.io.Writer;
 import java.net.MalformedURLException;
 import java.util.Set;
 
 import de.anomic.crawler.HTTPLoader;
 import de.anomic.htmlFilter.htmlFilterContentScraper;
-import de.anomic.htmlFilter.htmlFilterWriter;
-import de.anomic.http.httpClient;
 import de.anomic.http.httpRequestHeader;
-import de.anomic.kelondro.util.FileUtils;
 import de.anomic.plasma.plasmaSwitchboard;
 import de.anomic.server.serverObjects;
 import de.anomic.server.serverSwitch;
@@ -48,15 +44,7 @@ public class getpageinfo_p {
         final yacyURL u = new yacyURL(url, null);
         final httpRequestHeader reqHeader = new httpRequestHeader();
         reqHeader.put(httpRequestHeader.USER_AGENT, HTTPLoader.yacyUserAgent); // do not set the crawler user agent, because this page was loaded by manual entering of the url
-        final byte[] r = httpClient.wget(u.toString(), reqHeader, 5000);
-        if (r == null) return prop;
-        final String contentString=new String(r);
-        final htmlFilterContentScraper scraper = new htmlFilterContentScraper(u);
-        //OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
-        final Writer writer = new htmlFilterWriter(null,null,scraper,null,false);
-        FileUtils.copy(contentString,writer);
-        writer.close();
+        final htmlFilterContentScraper scraper = htmlFilterContentScraper.parseResource(u, reqHeader);
 
         // put the document title
         prop.putXML("title", scraper.getTitle());
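
After this hunk the servlet no longer does its own fetching and decoding; it delegates to the new htmlFilterContentScraper.parseResource helper (second file below), so charset handling lives in one place. A hypothetical call sketch, assuming only the YaCy classes visible in this diff, with error handling elided:

    // hypothetical usage; yacyURL, httpRequestHeader, HTTPLoader and
    // htmlFilterContentScraper are the YaCy classes shown in this diff
    final yacyURL u = new yacyURL("http://example.net/", null);
    final httpRequestHeader reqHeader = new httpRequestHeader();
    reqHeader.put(httpRequestHeader.USER_AGENT, HTTPLoader.yacyUserAgent);
    final htmlFilterContentScraper scraper = htmlFilterContentScraper.parseResource(u, reqHeader);
    System.out.println(scraper.getTitle());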

@@ -29,6 +29,7 @@ package de.anomic.htmlFilter;
 import java.io.ByteArrayInputStream;
 import java.io.File;
 import java.io.IOException;
+import java.io.Reader;
 import java.io.UnsupportedEncodingException;
 import java.io.Writer;
 import java.net.MalformedURLException;
@@ -503,13 +504,21 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
         // load page
         final httpRequestHeader reqHeader = new httpRequestHeader();
         reqHeader.put(httpRequestHeader.USER_AGENT, HTTPLoader.crawlerUserAgent);
-        final byte[] page = httpClient.wget(location.toString(), reqHeader, 10000);
-        if (page == null) throw new IOException("no response from url " + location.toString());
+        return parseResource(location, reqHeader);
+    }
+
+    public static htmlFilterContentScraper parseResource(final yacyURL location, final httpRequestHeader reqHeader) throws IOException {
+        final Reader pageReader = httpClient.wgetReader(location.toString(), reqHeader, 10000);
+        if (pageReader == null) throw new IOException("no response from url " + location.toString());
 
         // scrape content
         final htmlFilterContentScraper scraper = new htmlFilterContentScraper(location);
         final Writer writer = new htmlFilterWriter(null, null, scraper, null, false);
-        FileUtils.copy(new ByteArrayInputStream(page), writer, Charset.forName("UTF-8"));
+        try {
+            FileUtils.copy(pageReader, writer);
+        } finally {
+            pageReader.close();
+        }
 
         return scraper;
     }
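
The try/finally around FileUtils.copy guarantees that the Reader, and with it the underlying HTTP connection, is closed even when the copy throws; the old byte[] version had no stream to leak. On Java 7+ the same pattern would be written as try-with-resources; a sketch under that assumption, reusing the variables from parseResource above (the project predates this syntax):

    // equivalent resource handling on Java 7+; close() runs automatically
    try (final Reader pageReader = httpClient.wgetReader(location.toString(), reqHeader, 10000)) {
        if (pageReader == null) throw new IOException("no response from url " + location);
        FileUtils.copy(pageReader, writer);
    }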

@@ -29,6 +29,9 @@ package de.anomic.http;
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.nio.charset.Charset;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
@@ -765,6 +768,28 @@ public class httpClient {
         }
         return null;
     }
+
+    public static Reader wgetReader(final String uri) {
+        return wgetReader(uri, new httpRequestHeader(), 10000, null);
+    }
+    public static Reader wgetReader(final String uri, final httpRequestHeader header, final int timeout) {
+        return wgetReader(uri, header, timeout, null);
+    }
+    public static Reader wgetReader(final String uri, final httpRequestHeader header, final int timeout, final String vhost) {
+        assert uri != null : "precondition violated: uri != null";
+        addHostHeader(header, vhost);
+        final httpClient client = new httpClient(timeout, header);
+        // do the request
+        httpResponse response = null;
+        try {
+            response = client.GET(uri);
+            Charset charset = response.getResponseHeader().getCharSet();
+            return new InputStreamReader(response.getDataAsStream(), charset);
+        } catch (final IOException e) {
+            Log.logWarning("HTTPC", "wgetReader(" + uri + ") failed: " + e.getMessage());
+        }
+        return null;
+    }
 
     /**
      * adds a Host-header to the header if vhost is not null
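
The key line is response.getResponseHeader().getCharSet(): the Reader is built with the charset the server declared instead of a hard-coded UTF-8. That parsing is not part of this diff; what follows is an illustrative sketch of what such a lookup typically does with a Content-Type value like "text/html; charset=ISO-8859-1". The helper name and the UTF-8 fallback are assumptions, not YaCy code, and java.nio.charset.Charset is assumed imported as the diff adds:

    // illustrative only: not the actual getCharSet() implementation
    static Charset charsetFromContentType(final String contentType) {
        if (contentType != null) {
            for (final String param : contentType.split(";")) {
                final String p = param.trim();
                if (p.toLowerCase().startsWith("charset=")) {
                    try {
                        return Charset.forName(p.substring("charset=".length()).replace("\"", "").trim());
                    } catch (final IllegalArgumentException e) {
                        break; // unknown or malformed charset name, use the fallback
                    }
                }
            }
        }
        return Charset.forName("UTF-8"); // assumed fallback when nothing is declared
    }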
