- New class htmlFilterInputStream.java, which pre-analyzes the HTML header to extract the charset metadata. This is currently enabled only for the crawler; integration into the proxy needs more testing. - Added an event-listener interface to the HTML scraper so that other classes can be informed about detected tags (used by htmlFilterInputStream.java). git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2624 6c8d7289-2bf4-0310-a012-ef5d649a1542 pull/1/head
parent
33f0f703c0
commit
5c6251bced
@ -0,0 +1,9 @@
|
||||
package de.anomic.htmlFilter;
|
||||
|
||||
import java.util.EventListener;
|
||||
import java.util.Properties;
|
||||
|
||||
/**
 * Callback interface for objects that want to be told about tags
 * encountered while HTML is being scraped.
 *
 * Implementations are handed the tag name together with the tag's
 * attributes as a {@link Properties} object.
 */
public interface htmlFilterEventListener extends EventListener {

    /**
     * Reports a tag for which no enclosed text is supplied.
     * NOTE(review): presumably used for tags without a body (e.g. meta) - confirm against the scraper.
     *
     * @param tagname name of the reported tag
     * @param tagopts attributes of the reported tag
     */
    void scrapeTag0(String tagname, Properties tagopts);

    /**
     * Reports a tag together with text belonging to it.
     * NOTE(review): presumably used for tags that enclose content (e.g. head) - confirm against the scraper.
     *
     * @param tagname name of the reported tag
     * @param tagopts attributes of the reported tag
     * @param text    characters supplied along with the tag
     */
    void scrapeTag1(String tagname, Properties tagopts, char[] text);

}
|
@ -0,0 +1,99 @@
|
||||
package de.anomic.htmlFilter;
|
||||
|
||||
import java.io.BufferedInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.Reader;
|
||||
import java.io.UnsupportedEncodingException;
|
||||
import java.io.Writer;
|
||||
import java.util.Properties;
|
||||
|
||||
import de.anomic.http.httpHeader;
|
||||
import de.anomic.net.URL;
|
||||
|
||||
|
||||
public class htmlFilterInputStream extends InputStream implements htmlFilterEventListener {
|
||||
|
||||
private long preBufferSize = 143336;
|
||||
private long preRead = 0;
|
||||
private BufferedInputStream bufferedIn;
|
||||
|
||||
private String detectedCharset;
|
||||
private int mode = 0;
|
||||
private boolean charsetChanged = false;
|
||||
private boolean endOfHead = false;
|
||||
|
||||
private Reader reader;
|
||||
private Writer writer;
|
||||
|
||||
public htmlFilterInputStream(
|
||||
InputStream inStream,
|
||||
String inputStreamCharset,
|
||||
URL rooturl,
|
||||
htmlFilterTransformer transformer,
|
||||
boolean passbyIfBinarySuspect
|
||||
) throws UnsupportedEncodingException {
|
||||
// create a input stream for buffereing
|
||||
this.bufferedIn = new BufferedInputStream(inStream,(int)this.preBufferSize);
|
||||
this.bufferedIn.mark((int)this.preBufferSize);
|
||||
|
||||
htmlFilterContentScraper scraper = new htmlFilterContentScraper(rooturl);
|
||||
scraper.registerHtmlFilterEventListener(this);
|
||||
|
||||
this.reader = new InputStreamReader(this,inputStreamCharset);
|
||||
this.writer = new htmlFilterWriter(null,null,scraper,transformer,passbyIfBinarySuspect);
|
||||
}
|
||||
|
||||
public void scrapeTag0(String tagname, Properties tagopts) {
|
||||
if (tagname == null || tagname.length() == 0) return;
|
||||
|
||||
if (tagname.equalsIgnoreCase("meta")) {
|
||||
if (tagopts.containsKey("http-equiv")) {
|
||||
String value = tagopts.getProperty("http-equiv");
|
||||
if (value.equalsIgnoreCase("Content-Type")) {
|
||||
String contentType = tagopts.getProperty("content");
|
||||
this.detectedCharset = httpHeader.extractCharsetFromMimetyeHeader(contentType);
|
||||
this.charsetChanged = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public void scrapeTag1(String tagname, Properties tagopts, char[] text) {
|
||||
if (tagname == null || tagname.length() == 0) return;
|
||||
|
||||
if (tagname.equalsIgnoreCase("head")) {
|
||||
this.endOfHead = true;
|
||||
}
|
||||
}
|
||||
|
||||
public String detectCharset() throws IOException {
|
||||
// loop until we have detected the header element or the charset data
|
||||
int c;
|
||||
while ((c = this.reader.read())!= -1) {
|
||||
this.writer.write(c);
|
||||
}
|
||||
|
||||
// free writer
|
||||
this.writer = null;
|
||||
// don't close writer here, otherwise it will shutdown our source stream
|
||||
|
||||
return (this.charsetChanged) ? this.detectedCharset : null;
|
||||
}
|
||||
|
||||
public int read() throws IOException {
|
||||
// mode 0 is called from within the detectCharset function
|
||||
if (this.mode == 0) {
|
||||
if (this.endOfHead || this.charsetChanged || this.preRead >= this.preBufferSize) {
|
||||
this.mode++;
|
||||
this.bufferedIn.reset();
|
||||
return -1;
|
||||
}
|
||||
this.preRead++;
|
||||
}
|
||||
return this.bufferedIn.read();
|
||||
}
|
||||
|
||||
|
||||
}
|
Loading…
Reference in new issue