*) some improvements for extended html document charset support

- new class htmlFilterInputStream.java which allows pre-analyzing the html header to extract
  the charset metadata. This is only enabled for the crawler at the moment; integration into
  the proxy needs more testing.
- adding event listener interfaces to the html scraper so that other classes can be informed
  about detected tags (used by htmlFilterInputStream.java)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2624 6c8d7289-2bf4-0310-a012-ef5d649a1542
Author: theli
Parent: 33f0f703c0
Commit: 5c6251bced
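For orientation, a minimal sketch of the detection flow this commit introduces
(hypothetical demo class; the file name, URL, and UTF-8 fall-back are assumptions,
while the htmlFilterInputStream constructor and detectCharset() are taken from the diff below):

import java.io.FileInputStream;
import java.io.InputStream;
import de.anomic.htmlFilter.htmlFilterInputStream;
import de.anomic.net.URL;

// Hypothetical demo: pre-scan the <head> for a charset meta tag, then reuse
// the rewound stream for the real parsing pass (see parseHtml below).
public class CharsetDetectDemo {
    public static void main(String[] args) throws Exception {
        URL location = new URL("http://localhost/index.html");   // assumed URL
        InputStream fileIn = new FileInputStream("index.html");  // assumed local file
        htmlFilterInputStream htmlFilter =
                new htmlFilterInputStream(fileIn, "UTF-8", location, null, false);
        String charset = htmlFilter.detectCharset();  // null if no meta charset was found
        if (charset == null) charset = "UTF-8";       // keep the originally assumed charset
        System.out.println("using charset: " + charset);
    }
}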

@@ -54,6 +54,8 @@ import java.util.Map;
import java.util.Properties;
import java.util.TreeSet;
import javax.swing.event.EventListenerList;
import de.anomic.net.URL;
import de.anomic.server.serverCharBuffer;
@@ -94,6 +96,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implements htmlFilterScraper
//private String headline;
private List[] headlines;
private serverCharBuffer content;
private EventListenerList htmlFilterEventListeners = new EventListenerList();
private URL root;
@@ -111,23 +114,6 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implements htmlFilterScraper
this.content = new serverCharBuffer(1024);
}
public void transformCharset(String oldCharset, String newCharset) throws UnsupportedEncodingException {
// // convert the content back to the old bytearray
// ByteArrayInputStream temp = new ByteArrayInputStream(new String(this.content.getChars()).getBytes(oldCharset));
//
// // create a reader with the new charset
// serverCharBuffer newContent = new serverCharBuffer(this.content.length());
// try {
// InputStreamReader reader = new InputStreamReader(temp,newCharset);
// serverFileUtils.copy(reader, newContent);
// reader.close();
// } catch (IOException e) {
// // ignore this
// }
//
// this.content = newContent;
}
public void scrapeText(char[] newtext) {
// System.out.println("SCRAPE: " + new String(newtext));
if ((content.length() != 0) && (content.charAt(content.length() - 1) != 32)) content.append(32);
@@ -172,12 +158,11 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implements htmlFilterScraper
String name = tagopts.getProperty("name", "");
if (name.length() > 0) {
    metas.put(name.toLowerCase(), tagopts.getProperty("content",""));
-     return;
- }
- name = tagopts.getProperty("http-equiv", "");
- if (name.length() > 0) {
-     metas.put(name.toLowerCase(), tagopts.getProperty("content",""));
-     return;
+ } else {
+     name = tagopts.getProperty("http-equiv", "");
+     if (name.length() > 0) {
+         metas.put(name.toLowerCase(), tagopts.getProperty("content",""));
+     }
}
}
if (tagname.equalsIgnoreCase("area")) {
@@ -186,6 +171,9 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implements htmlFilterScraper
String href = tagopts.getProperty("href", "");
if (href.length() > 0) anchors.put(absolutePath(href), areatitle);
}
// fire event
fireScrapeTag0(tagname, tagopts);
}
public void scrapeTag1(String tagname, Properties tagopts, char[] text) {
@@ -211,8 +199,12 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implements htmlFilterScraper
h = cleanLine(super.stripAll(new serverCharBuffer(text)).toString());
if (h.length() > 0) headlines[3].add(h);
}
if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024))
title = cleanLine(super.stripAll(new serverCharBuffer(text)).toString());
if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024)) {
title = cleanLine(super.stripAll(new serverCharBuffer(text)).toString());
}
// fire event
fireScrapeTag1(tagname, tagopts, text);
}
private static String cleanLine(String s) {
@@ -365,16 +357,34 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implements htmlFilterScraper
System.out.println("METAS :" + metas.toString());
System.out.println("TEXT :" + content.toString());
}
public void registerHtmlFilterEventListener(htmlFilterEventListener listener) {
if (listener != null) {
this.htmlFilterEventListeners.add(htmlFilterEventListener.class, listener);
}
}
public void deregisterHtmlFilterEventListener(htmlFilterEventListener listener) {
if (listener != null) {
this.htmlFilterEventListeners.remove(htmlFilterEventListener.class, listener);
}
}
void fireScrapeTag0(String tagname, Properties tagopts) {
Object[] listeners = this.htmlFilterEventListeners.getListenerList();
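// getListenerList() returns the listeners as (type, listener) pairs, hence the step width of 2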
for (int i=0; i<listeners.length; i+=2) {
if (listeners[i]==htmlFilterEventListener.class) {
((htmlFilterEventListener)listeners[i+1]).scrapeTag0(tagname, tagopts);
}
}
}
/*
public static void main(String[] args) {
try {
htmlFilterContentScraper scraper = new htmlFilterContentScraper(new URL("http://localhost"));
scraper.scrapeText(test.getBytes());
System.out.println(new String(scraper.getText()));
} catch (MalformedURLException e) {}
}
*/
void fireScrapeTag1(String tagname, Properties tagopts, char[] text) {
Object[] listeners = this.htmlFilterEventListeners.getListenerList();
for (int i=0; i<listeners.length; i+=2) {
if (listeners[i]==htmlFilterEventListener.class) {
((htmlFilterEventListener)listeners[i+1]).scrapeTag1(tagname, tagopts, text);
}
}
}
}

@@ -0,0 +1,9 @@
package de.anomic.htmlFilter;
import java.util.EventListener;
import java.util.Properties;
public interface htmlFilterEventListener extends EventListener {
public void scrapeTag0(String tagname, Properties tagopts);
public void scrapeTag1(String tagname, Properties tagopts, char[] text);
}
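As an illustration (not part of this commit), a class implementing the new interface
might look like this; it would be attached through the registerHtmlFilterEventListener
method shown above:

import java.util.Properties;
import de.anomic.htmlFilter.htmlFilterEventListener;

// Hypothetical example listener: counts the <meta> tags reported by the scraper.
public class MetaTagCounter implements htmlFilterEventListener {
    private int metaCount = 0;

    public void scrapeTag0(String tagname, Properties tagopts) {
        // tags without body text, e.g. <meta ...> or <img ...>
        if (tagname.equalsIgnoreCase("meta")) this.metaCount++;
    }

    public void scrapeTag1(String tagname, Properties tagopts, char[] text) {
        // tags with body text, e.g. <title>...</title>; not needed for counting
    }

    public int getMetaCount() { return this.metaCount; }
}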

@@ -0,0 +1,99 @@
package de.anomic.htmlFilter;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.util.Properties;
import de.anomic.http.httpHeader;
import de.anomic.net.URL;
public class htmlFilterInputStream extends InputStream implements htmlFilterEventListener {
private long preBufferSize = 143336;
private long preRead = 0;
private BufferedInputStream bufferedIn;
private String detectedCharset;
private int mode = 0;
private boolean charsetChanged = false;
private boolean endOfHead = false;
private Reader reader;
private Writer writer;
public htmlFilterInputStream(
InputStream inStream,
String inputStreamCharset,
URL rooturl,
htmlFilterTransformer transformer,
boolean passbyIfBinarySuspect
) throws UnsupportedEncodingException {
// create an input stream for buffering
this.bufferedIn = new BufferedInputStream(inStream,(int)this.preBufferSize);
this.bufferedIn.mark((int)this.preBufferSize);
htmlFilterContentScraper scraper = new htmlFilterContentScraper(rooturl);
scraper.registerHtmlFilterEventListener(this);
this.reader = new InputStreamReader(this,inputStreamCharset);
this.writer = new htmlFilterWriter(null,null,scraper,transformer,passbyIfBinarySuspect);
}
public void scrapeTag0(String tagname, Properties tagopts) {
if (tagname == null || tagname.length() == 0) return;
if (tagname.equalsIgnoreCase("meta")) {
if (tagopts.containsKey("http-equiv")) {
String value = tagopts.getProperty("http-equiv");
if (value.equalsIgnoreCase("Content-Type")) {
String contentType = tagopts.getProperty("content");
this.detectedCharset = httpHeader.extractCharsetFromMimetyeHeader(contentType);
this.charsetChanged = true;
}
}
}
}
public void scrapeTag1(String tagname, Properties tagopts, char[] text) {
if (tagname == null || tagname.length() == 0) return;
if (tagname.equalsIgnoreCase("head")) {
this.endOfHead = true;
}
}
public String detectCharset() throws IOException {
// loop until the end of the head element or a charset meta tag has been
// detected; read() in mode 0 then signals EOF to this reader
int c;
while ((c = this.reader.read())!= -1) {
this.writer.write(c);
}
// release the writer; don't close it here, otherwise it would shut down our source stream
this.writer = null;
return (this.charsetChanged) ? this.detectedCharset : null;
}
public int read() throws IOException {
// mode 0 is called from within the detectCharset function
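// it counts the pre-read bytes and returns -1 (EOF for the detecting reader) as soon as
// the head element ends, a charset was found, or the pre-buffer is exhausted; the
// buffered stream is then rewound so that mode 1 delivers the document from the start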
if (this.mode == 0) {
if (this.endOfHead || this.charsetChanged || this.preRead >= this.preBufferSize) {
this.mode++;
this.bufferedIn.reset();
return -1;
}
this.preRead++;
}
return this.bufferedIn.read();
}
}
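The rewind in read() relies on the standard BufferedInputStream mark/reset contract:
mark(preBufferSize) remembers the stream start, and reset() makes the buffered bytes
readable again. A self-contained sketch of that pattern, independent of the YaCy classes:

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.IOException;

// Demonstrates the pre-read/rewind pattern used by htmlFilterInputStream.
public class MarkResetDemo {
    public static void main(String[] args) throws IOException {
        byte[] data = "<html><head><title>x</title></head></html>".getBytes("US-ASCII");
        int preBufferSize = 4096;
        BufferedInputStream in = new BufferedInputStream(new ByteArrayInputStream(data), preBufferSize);
        in.mark(preBufferSize);                 // valid as long as at most preBufferSize bytes are read

        for (int i = 0; i < 16; i++) in.read(); // phase 1: pre-read a few bytes

        in.reset();                             // phase 2: the stream starts from the beginning again
        System.out.println((char) in.read());   // prints '<'
        in.close();
    }
}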

@@ -55,4 +55,8 @@ public interface htmlFilterScraper {
public void scrapeTag1(String tagname, Properties tagopts, char[] text);
public void close();
public void registerHtmlFilterEventListener(htmlFilterEventListener listener);
public void deregisterHtmlFilterEventListener(htmlFilterEventListener listener);
}

@@ -50,10 +50,9 @@ import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URI;
import de.anomic.net.URL;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
@@ -70,8 +69,11 @@ import org.apache.commons.pool.impl.GenericKeyedObjectPool;
import org.apache.commons.pool.impl.GenericObjectPool;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterInputStream;
import de.anomic.htmlFilter.htmlFilterWriter;
import de.anomic.http.httpHeader;
import de.anomic.http.httpc;
import de.anomic.net.URL;
import de.anomic.plasma.parser.Parser;
import de.anomic.plasma.parser.ParserException;
import de.anomic.plasma.parser.ParserInfo;
@@ -556,29 +558,7 @@ public final class plasmaParser {
if (theParser != null) {
return theParser.parse(location, mimeType,documentCharset,sourceFile);
} else if (realtimeParsableMimeTypesContains(mimeType)) {
- // ...otherwise we make a scraper and transformer
- htmlFilterContentScraper scraper = new htmlFilterContentScraper(location);
- // set the charset if known
- /*if (charset != null) {
-     try {
-         scraper.setCharset(charset);
-     } catch (UnsupportedCharsetException e) {
-         serverLog.logWarning("PARSER", "parseSource2: unknown or unsupported charset '" + charset + "'");
-         return null;
-     }
- }*/
- htmlFilterWriter writer = new htmlFilterWriter(null,null,scraper,null,false);
- serverFileUtils.copy(sourceFile, documentCharset, writer);
- writer.close();
- //OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);
- //serverFileUtils.copy(sourceFile, hfos);
- //hfos.close();
- if (writer.binarySuspect()) {
-     this.theLogger.logInfo("Binary data found in URL " + location);
-     return null;
- }
- return transformScraper(location, mimeType, documentCharset, scraper);
+ return parseHtml(location, mimeType, documentCharset, sourceFile);
} else {
serverLog.logWarning("PARSER", "parseSource2: wrong mime type");
return null;
@@ -594,16 +574,36 @@ public final class plasmaParser {
}
}
private plasmaParserDocument parseHtml(URL location, String mimeType, String documentCharset, File sourceFile) throws IOException {
// ...otherwise we make a scraper and transformer
FileInputStream fileIn = new FileInputStream(sourceFile);
htmlFilterInputStream htmlFilter = new htmlFilterInputStream(fileIn,documentCharset,location,null,false);
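// pre-scan the head section for a charset meta tag; the stream is rewound afterwards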
String charset = htmlFilter.detectCharset();
if (charset == null) {
charset = documentCharset;
}
if (!documentCharset.equalsIgnoreCase(charset)) {
this.theLogger.logInfo("Charset transformation needed from '" + documentCharset + "' to '" + charset + "'");
}
// parsing the content
htmlFilterContentScraper scraper = new htmlFilterContentScraper(location);
htmlFilterWriter writer = new htmlFilterWriter(null,null,scraper,null,false);
serverFileUtils.copy(htmlFilter, writer, charset);
writer.close();
//OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);
//serverFileUtils.copy(sourceFile, hfos);
//hfos.close();
if (writer.binarySuspect()) {
this.theLogger.logInfo("Binary data found in URL " + location);
return null;
}
return transformScraper(location, mimeType, documentCharset, scraper);
}
public plasmaParserDocument transformScraper(URL location, String mimeType, String charSet, htmlFilterContentScraper scraper) {
try {
if (scraper.getMetas().containsKey("content-type")) {
String newCharset = (String) scraper.getMetas().get("content-type");
if (!charSet.equals(newCharset)) {
// TODO: transformation of content needed
this.theLogger.logFine("Charset transformation needed from '" + charSet + "' to '" + newCharset + "'");
}
}
String[] sections = new String[scraper.getHeadlines(1).length + scraper.getHeadlines(2).length + scraper.getHeadlines(3).length + scraper.getHeadlines(4).length];
int p = 0;
for (int i = 1; i <= 4; i++) for (int j = 0; j < scraper.getHeadlines(i).length; j++) sections[p++] = scraper.getHeadlines(i)[j];
