From 5c6251bcedf9150f111e420f255d8cab95f9386f Mon Sep 17 00:00:00 2001
From: theli
Date: Mon, 18 Sep 2006 15:36:04 +0000
Subject: [PATCH] *) some improvements for extended html document charset
 support

- new class htmlFilterInputStream.java which allows pre-analyzing the html
  header to extract the charset meta data. This is only enabled for the
  crawler at the moment; integration into the proxy needs more testing.
- adding event listener interfaces to the htmlscraper to allow other classes
  to get informed about detected tags (used by htmlFilterInputStream.java)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2624 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 .../htmlFilter/htmlFilterContentScraper.java  | 80 ++++++++-------
 .../htmlFilter/htmlFilterEventListener.java   |  9 ++
 .../htmlFilter/htmlFilterInputStream.java     | 99 +++++++++++++++++++
 .../anomic/htmlFilter/htmlFilterScraper.java  |  4 +
 source/de/anomic/plasma/plasmaParser.java     | 66 ++++++-------
 5 files changed, 190 insertions(+), 68 deletions(-)
 create mode 100644 source/de/anomic/htmlFilter/htmlFilterEventListener.java
 create mode 100644 source/de/anomic/htmlFilter/htmlFilterInputStream.java

diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
index 1883d9129..5f5209d61 100644
--- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
+++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
@@ -54,6 +54,8 @@ import java.util.Map;
 import java.util.Properties;
 import java.util.TreeSet;
 
+import javax.swing.event.EventListenerList;
+
 import de.anomic.net.URL;
 import de.anomic.server.serverCharBuffer;
 
@@ -94,6 +96,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
     //private String headline;
     private List[] headlines;
     private serverCharBuffer content;
+    private EventListenerList htmlFilterEventListeners = new EventListenerList();
     
     private URL root;
 
@@ -111,23 +114,6 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
         this.content = new serverCharBuffer(1024);
     }
     
-    public void transformCharset(String oldCharset, String newCharset) throws UnsupportedEncodingException {
-//        // convert the content back to the old bytearray
-//        ByteArrayInputStream temp = new ByteArrayInputStream(new String(this.content.getChars()).getBytes(oldCharset));
-//        
-//        // create a reader with the new charset
-//        serverCharBuffer newContent = new serverCharBuffer(this.content.length());
-//        try {
-//            InputStreamReader reader = new InputStreamReader(temp,newCharset);
-//            serverFileUtils.copy(reader, newContent);
-//            reader.close();
-//        } catch (IOException e) {
-//            // ignore this
-//        }
-//        
-//        this.content = newContent;
-    }
-    
     public void scrapeText(char[] newtext) {
         // System.out.println("SCRAPE: " + new String(newtext));
         if ((content.length() != 0) && (content.charAt(content.length() - 1) != 32)) content.append(32);
@@ -172,12 +158,11 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
             String name = tagopts.getProperty("name", "");
             if (name.length() > 0) {
                 metas.put(name.toLowerCase(), tagopts.getProperty("content",""));
-                return;
-            }
-            name = tagopts.getProperty("http-equiv", "");
-            if (name.length() > 0) {
-                metas.put(name.toLowerCase(), tagopts.getProperty("content",""));
-                return;
+            } else {
+                name = tagopts.getProperty("http-equiv", "");
+                if (name.length() > 0) {
+                    metas.put(name.toLowerCase(), tagopts.getProperty("content",""));
+                }
             }
         }
         if (tagname.equalsIgnoreCase("area")) {
@@ -186,6 +171,9 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
             String href = tagopts.getProperty("href", "");
             if (href.length() > 0) anchors.put(absolutePath(href), areatitle);
         }
+        
+        // fire event
+        fireScrapeTag0(tagname, tagopts);
     }
 
     public void scrapeTag1(String tagname, Properties tagopts, char[] text) {
@@ -211,8 +199,12 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
             h = cleanLine(super.stripAll(new serverCharBuffer(text)).toString());
             if (h.length() > 0) headlines[3].add(h);
         }
-        if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024))
-            title = cleanLine(super.stripAll(new serverCharBuffer(text)).toString());
+        if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024)) {
+            title = cleanLine(super.stripAll(new serverCharBuffer(text)).toString());
+        }
+        
+        // fire event
+        fireScrapeTag1(tagname, tagopts, text);
     }
 
     private static String cleanLine(String s) {
@@ -365,16 +357,34 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
         System.out.println("METAS :" + metas.toString());
         System.out.println("TEXT :" + content.toString());
     }
+    
+    public void registerHtmlFilterEventListener(htmlFilterEventListener listener) {
+        if (listener != null) {
+            this.htmlFilterEventListeners.add(htmlFilterEventListener.class, listener);
+        }
+    }
+    
+    public void deregisterHtmlFilterEventListener(htmlFilterEventListener listener) {
+        if (listener != null) {
+            this.htmlFilterEventListeners.remove(htmlFilterEventListener.class, listener);
+        }
+    }
+    void fireScrapeTag0(String tagname, Properties tagopts) {
+        Object[] listeners = this.htmlFilterEventListeners.getListenerList();
+        for (int i=0; i<listeners.length; i+=2) {
+            if (listeners[i]==htmlFilterEventListener.class) {
+                ((htmlFilterEventListener)listeners[i+1]).scrapeTag0(tagname, tagopts);
+            }
+        }
+    }
+    
+    void fireScrapeTag1(String tagname, Properties tagopts, char[] text) {
+        Object[] listeners = this.htmlFilterEventListeners.getListenerList();
+        for (int i=0; i<listeners.length; i+=2) {
+            if (listeners[i]==htmlFilterEventListener.class) {
+                ((htmlFilterEventListener)listeners[i+1]).scrapeTag1(tagname, tagopts, text);
+            }
+        }
+    }
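Note (illustration, not part of the patch): the register/deregister/fire
methods above follow the standard javax.swing.event.EventListenerList idiom;
the loop bodies of fireScrapeTag0/fireScrapeTag1 were completed from that
idiom where this copy of the diff lost text. A minimal consumer could look
like the sketch below; the class name MetaWatcher is invented for
illustration, and it assumes de.anomic.net.URL offers the usual String
constructor:

    import java.util.Properties;

    import de.anomic.htmlFilter.htmlFilterContentScraper;
    import de.anomic.htmlFilter.htmlFilterEventListener;
    import de.anomic.net.URL;

    // Hypothetical listener that watches meta tags as the scraper reports them
    public class MetaWatcher implements htmlFilterEventListener {

        // fired for unpaired tags such as <meta ...> or <img ...>
        public void scrapeTag0(String tagname, Properties tagopts) {
            if (tagname.equalsIgnoreCase("meta")) {
                String httpEquiv = tagopts.getProperty("http-equiv", "");
                if (httpEquiv.equalsIgnoreCase("Content-Type")) {
                    System.out.println("content-type: " + tagopts.getProperty("content", ""));
                }
            }
        }

        // fired for paired tags such as <title>...</title>
        public void scrapeTag1(String tagname, Properties tagopts, char[] text) {
            // not needed for meta detection
        }

        public static void main(String[] args) throws Exception {
            htmlFilterContentScraper scraper = new htmlFilterContentScraper(new URL("http://example.org/"));
            scraper.registerHtmlFilterEventListener(new MetaWatcher());
            // feed the document through a htmlFilterWriter, as plasmaParser does below
        }
    }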
diff --git a/source/de/anomic/htmlFilter/htmlFilterEventListener.java b/source/de/anomic/htmlFilter/htmlFilterEventListener.java
new file mode 100644
--- /dev/null
+++ b/source/de/anomic/htmlFilter/htmlFilterEventListener.java
@@ -0,0 +1,9 @@
+package de.anomic.htmlFilter;
+
+import java.util.EventListener;
+import java.util.Properties;
+
+public interface htmlFilterEventListener extends EventListener {
+    public void scrapeTag0(String tagname, Properties tagopts);
+    public void scrapeTag1(String tagname, Properties tagopts, char[] text);
+}
diff --git a/source/de/anomic/htmlFilter/htmlFilterInputStream.java b/source/de/anomic/htmlFilter/htmlFilterInputStream.java
new file mode 100644
--- /dev/null
+++ b/source/de/anomic/htmlFilter/htmlFilterInputStream.java
@@ -0,0 +1,99 @@
[... the first ~87 lines of this new file are missing from this copy of the
patch; only the tail of the read() method survives ...]
+            if (this.preRead >= this.preBufferSize) {
+                this.mode++;
+                this.bufferedIn.reset();
+                return -1;
+            }
+            this.preRead++;
+        }
+        return this.bufferedIn.read();
+    }
+    
+    
+}
diff --git a/source/de/anomic/htmlFilter/htmlFilterScraper.java b/source/de/anomic/htmlFilter/htmlFilterScraper.java
index ad054233d..678eaaf4c 100644
--- a/source/de/anomic/htmlFilter/htmlFilterScraper.java
+++ b/source/de/anomic/htmlFilter/htmlFilterScraper.java
@@ -55,4 +55,8 @@ public interface htmlFilterScraper {
     public void scrapeTag1(String tagname, Properties tagopts, char[] text);
     
     public void close();
+    
+    public void registerHtmlFilterEventListener(htmlFilterEventListener listener);
+    
+    public void deregisterHtmlFilterEventListener(htmlFilterEventListener listener);
 }
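Note (reconstruction, not part of the patch): the htmlFilterEventListener
interface above was rebuilt from its usage, since EventListenerList accepts
only java.util.EventListener subtypes and the scraper fires
scrapeTag0/scrapeTag1 with exactly these signatures; the body of
htmlFilterInputStream.java is largely missing from this copy. From the
surviving tail (the mode/preRead/preBufferSize/bufferedIn fields) and the
call site in plasmaParser.java below (new htmlFilterInputStream(fileIn,
documentCharset, location, null, false) followed by detectCharset()), the
underlying technique appears to be: mark() a BufferedInputStream, let a
scraper pre-scan at most preBufferSize bytes for a charset meta tag, then
reset() so the same bytes are re-read for the real parse. A self-contained
sketch of that technique, under those assumptions and with invented names
(PreScanInputStream, endPreScan):

    import java.io.BufferedInputStream;
    import java.io.IOException;
    import java.io.InputStream;

    // Sketch: expose at most preBufferSize bytes in pre-scan mode, then
    // rewind so the full stream can be read again for the real parse.
    public class PreScanInputStream extends InputStream {

        private static final int MODE_PRESCAN = 0;

        private final BufferedInputStream bufferedIn;
        private final int preBufferSize;
        private int preRead = 0;
        private int mode = MODE_PRESCAN;

        public PreScanInputStream(InputStream in, int preBufferSize) {
            this.preBufferSize = preBufferSize;
            this.bufferedIn = new BufferedInputStream(in, preBufferSize);
            // remember the stream start so it can be re-read later
            this.bufferedIn.mark(preBufferSize);
        }

        // mirrors the surviving tail of htmlFilterInputStream.read()
        public int read() throws IOException {
            if (this.mode == MODE_PRESCAN) {
                if (this.preRead >= this.preBufferSize) {
                    this.mode++;               // leave pre-scan mode
                    this.bufferedIn.reset();   // rewind to the mark
                    return -1;                 // pretend EOF to the pre-scanner
                }
                this.preRead++;
            }
            return this.bufferedIn.read();
        }

        // call once the pre-scan is finished (charset found or real EOF)
        public void endPreScan() throws IOException {
            if (this.mode == MODE_PRESCAN) {
                this.mode++;
                this.bufferedIn.reset();
            }
        }
    }

Returning -1 at the pre-buffer boundary makes the pre-scanning reader believe
the document ended, which terminates the scrape cleanly; the reset() rewinds
the buffer so no bytes are lost for the second pass.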
diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java
index 3f36f0a80..9c22a93ca 100644
--- a/source/de/anomic/plasma/plasmaParser.java
+++ b/source/de/anomic/plasma/plasmaParser.java
@@ -50,10 +50,9 @@ import java.io.FileFilter;
 import java.io.FileInputStream;
 import java.io.FilenameFilter;
 import java.io.IOException;
+import java.io.UnsupportedEncodingException;
 import java.net.MalformedURLException;
 import java.net.URI;
-
-import de.anomic.net.URL;
 import java.util.Arrays;
 import java.util.HashMap;
 import java.util.HashSet;
@@ -70,8 +69,11 @@ import org.apache.commons.pool.impl.GenericKeyedObjectPool;
 import org.apache.commons.pool.impl.GenericObjectPool;
 
 import de.anomic.htmlFilter.htmlFilterContentScraper;
+import de.anomic.htmlFilter.htmlFilterInputStream;
 import de.anomic.htmlFilter.htmlFilterWriter;
+import de.anomic.http.httpHeader;
 import de.anomic.http.httpc;
+import de.anomic.net.URL;
 import de.anomic.plasma.parser.Parser;
 import de.anomic.plasma.parser.ParserException;
 import de.anomic.plasma.parser.ParserInfo;
@@ -556,29 +558,7 @@ public final class plasmaParser {
         if (theParser != null) {
             return theParser.parse(location, mimeType,documentCharset,sourceFile);
         } else if (realtimeParsableMimeTypesContains(mimeType)) {
-            // ...otherwise we make a scraper and transformer
-            htmlFilterContentScraper scraper = new htmlFilterContentScraper(location);
-            
-            // set the charset if known
-            /*if (charset != null) {
-                try {
-                    scraper.setCharset(charset);
-                } catch (UnsupportedCharsetException e) {
-                    serverLog.logWarning("PARSER", "parseSource2: unknown or unsupported charset '" + charset + "'");
-                    return null;
-                }
-            }*/
-            htmlFilterWriter writer = new htmlFilterWriter(null,null,scraper,null,false);
-            serverFileUtils.copy(sourceFile, documentCharset, writer);
-            writer.close();
-            //OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);
-            //serverFileUtils.copy(sourceFile, hfos);
-            //hfos.close();
-            if (writer.binarySuspect()) {
-                this.theLogger.logInfo("Binary data found in URL " + location);
-                return null;
-            }
-            return transformScraper(location, mimeType, documentCharset, scraper);
+            return parseHtml(location, mimeType, documentCharset, sourceFile);
         } else {
             serverLog.logWarning("PARSER", "parseSource2: wrong mime type");
             return null;
@@ -594,16 +574,36 @@ public final class plasmaParser {
         }
     }
     
+    private plasmaParserDocument parseHtml(URL location, String mimeType, String documentCharset, File sourceFile) throws IOException {
+        
+        // ...otherwise we make a scraper and transformer
+        FileInputStream fileIn = new FileInputStream(sourceFile);
+        htmlFilterInputStream htmlFilter = new htmlFilterInputStream(fileIn,documentCharset,location,null,false);
+        String charset = htmlFilter.detectCharset();
+        if (charset == null) {
+            charset = documentCharset;
+        }
+        if (!documentCharset.equalsIgnoreCase(charset)) {
+            this.theLogger.logInfo("Charset transformation needed from '" + documentCharset + "' to '" + charset + "'");
+        }
+        
+        // parsing the content
+        htmlFilterContentScraper scraper = new htmlFilterContentScraper(location);
+        htmlFilterWriter writer = new htmlFilterWriter(null,null,scraper,null,false);
+        serverFileUtils.copy(htmlFilter, writer, charset);
+        writer.close();
+        //OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);
+        //serverFileUtils.copy(sourceFile, hfos);
+        //hfos.close();
+        if (writer.binarySuspect()) {
+            this.theLogger.logInfo("Binary data found in URL " + location);
+            return null;
+        }
+        return transformScraper(location, mimeType, documentCharset, scraper);
+    }
+    
     public plasmaParserDocument transformScraper(URL location, String mimeType, String charSet, htmlFilterContentScraper scraper) {
         try {
-            if (scraper.getMetas().containsKey("content-type")) {
-                String newCharset = (String) scraper.getMetas().get("content-type");
-                if (!charSet.equals(newCharset)) {
-                    // TODO: transformation of content needed
-                    this.theLogger.logFine("Charset transformation needed from '" + charSet + "' to '" + newCharset + "'");
-                }
-            }
-            
             String[] sections = new String[scraper.getHeadlines(1).length + scraper.getHeadlines(2).length + scraper.getHeadlines(3).length + scraper.getHeadlines(4).length];
             int p = 0;
             for (int i = 1; i <= 4; i++) for (int j = 0; j < scraper.getHeadlines(i).length; j++) sections[p++] = scraper.getHeadlines(i)[j];
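Note (illustration, not part of the patch): the extraction of the charset
value itself happens inside the missing part of htmlFilterInputStream.java.
The declaration it has to handle looks like
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">,
and given the tagopts Properties delivered by the scraper, a helper in the
style of this codebase might look as follows; all names here are invented
for illustration:

    import java.util.Properties;

    // Illustrative only: pulls "iso-8859-1" out of a scraped meta tag like
    //   <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
    public final class charsetFromMeta {

        public static String detect(String tagname, Properties tagopts) {
            if (!tagname.equalsIgnoreCase("meta")) return null;
            if (!tagopts.getProperty("http-equiv", "").equalsIgnoreCase("Content-Type")) return null;
            String content = tagopts.getProperty("content", "");
            int pos = content.toLowerCase().indexOf("charset=");
            if (pos < 0) return null;
            String charset = content.substring(pos + 8).trim();
            // cut at the first delimiter, if any
            int end = 0;
            while ((end < charset.length()) && (";\"' ".indexOf(charset.charAt(end)) < 0)) end++;
            return (end > 0) ? charset.substring(0, end) : null;
        }
    }

With detection moved ahead of parsing, transformScraper() no longer needs its
content-type check (removed in the last hunk above): by the time the real
parse starts, the stream has been rewound and is decoded with the detected
charset from the first byte.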