From 5c6251bcedf9150f111e420f255d8cab95f9386f Mon Sep 17 00:00:00 2001
From: theli
Date: Mon, 18 Sep 2006 15:36:04 +0000
Subject: [PATCH] *) some improvements for extended html document charset
 support

- new class htmlFilterInputStream.java which allows pre-analyzing the html
  header to extract the charset meta data. This is only enabled for the
  crawler at the moment; integration into the proxy needs more testing.
- adding event listener interfaces to the htmlscraper to allow other classes
  to get informed about detected tags (used by htmlFilterInputStream.java)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2624 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 .../htmlFilter/htmlFilterContentScraper.java  | 80 ++++++++-------
 .../htmlFilter/htmlFilterEventListener.java   |  9 ++
 .../htmlFilter/htmlFilterInputStream.java     | 99 +++++++++++++++++++
 .../anomic/htmlFilter/htmlFilterScraper.java  |  4 +
 source/de/anomic/plasma/plasmaParser.java     | 66 ++++++-------
 5 files changed, 190 insertions(+), 68 deletions(-)
 create mode 100644 source/de/anomic/htmlFilter/htmlFilterEventListener.java
 create mode 100644 source/de/anomic/htmlFilter/htmlFilterInputStream.java

diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
index 1883d9129..5f5209d61 100644
--- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
+++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java
@@ -54,6 +54,8 @@ import java.util.Map;
 import java.util.Properties;
 import java.util.TreeSet;
 
+import javax.swing.event.EventListenerList;
+
 import de.anomic.net.URL;
 import de.anomic.server.serverCharBuffer;
 
@@ -94,6 +96,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
     //private String headline;
     private List[] headlines;
     private serverCharBuffer content;
+    private EventListenerList htmlFilterEventListeners = new EventListenerList();
     
     private URL root;
 
@@ -111,23 +114,6 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
         this.content = new serverCharBuffer(1024);
     }
     
-    public void transformCharset(String oldCharset, String newCharset) throws UnsupportedEncodingException {
-//        // convert the content back to the old bytearray
-//        ByteArrayInputStream temp = new ByteArrayInputStream(new String(this.content.getChars()).getBytes(oldCharset));
-//        
-//        // create a reader with the new charset
-//        serverCharBuffer newContent = new serverCharBuffer(this.content.length());
-//        try {
-//            InputStreamReader reader = new InputStreamReader(temp,newCharset);
-//            serverFileUtils.copy(reader, newContent);
-//            reader.close();
-//        } catch (IOException e) {
-//            // ignore this
-//        }
-//        
-//        this.content = newContent;
-    }
-    
     public void scrapeText(char[] newtext) {
         // System.out.println("SCRAPE: " + new String(newtext));
         if ((content.length() != 0) && (content.charAt(content.length() - 1) != 32)) content.append(32);
@@ -172,12 +158,11 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
             String name = tagopts.getProperty("name", "");
             if (name.length() > 0) {
                 metas.put(name.toLowerCase(), tagopts.getProperty("content",""));
-                return;
-            }
-            name = tagopts.getProperty("http-equiv", "");
-            if (name.length() > 0) {
-                metas.put(name.toLowerCase(), tagopts.getProperty("content",""));
-                return;
+            } else {
+                name = tagopts.getProperty("http-equiv", "");
+                if (name.length() > 0) {
+                    metas.put(name.toLowerCase(), tagopts.getProperty("content",""));
+                }
             }
         }
         if (tagname.equalsIgnoreCase("area")) {
@@ -186,6 +171,9 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
             String href = tagopts.getProperty("href", "");
             if (href.length() > 0) anchors.put(absolutePath(href), areatitle);
         }
+        
+        // fire event
+        fireScrapeTag0(tagname, tagopts);
     }
 
     public void scrapeTag1(String tagname, Properties tagopts, char[] text) {
@@ -211,8 +199,12 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
             h = cleanLine(super.stripAll(new serverCharBuffer(text)).toString());
             if (h.length() > 0) headlines[3].add(h);
         }
-        if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024))
-            title = cleanLine(super.stripAll(new serverCharBuffer(text)).toString());
+        if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024)) {
+            title = cleanLine(super.stripAll(new serverCharBuffer(text)).toString());
+        }
+        
+        // fire event
+        fireScrapeTag1(tagname, tagopts, text);
     }
 
     private static String cleanLine(String s) {
@@ -365,16 +357,34 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
         System.out.println("METAS :" + metas.toString());
         System.out.println("TEXT :" + content.toString());
     }
+    
+    public void registerHtmlFilterEventListener(htmlFilterEventListener listener) {
+        if (listener != null) {
+            this.htmlFilterEventListeners.add(htmlFilterEventListener.class, listener);
+        }
+    }
+    
+    public void deregisterHtmlFilterEventListener(htmlFilterEventListener listener) {
+        if (listener != null) {
+            this.htmlFilterEventListeners.remove(htmlFilterEventListener.class, listener);
+        }
+    }
+    void fireScrapeTag0(String tagname, Properties tagopts) {
+        Object[] listeners = this.htmlFilterEventListeners.getListenerList();
+        for (int i=0; i<listeners.length; i+=2) {
+            if (listeners[i]==htmlFilterEventListener.class) {
+                ((htmlFilterEventListener)listeners[i+1]).scrapeTag0(tagname, tagopts);
+            }
+        }
+    }
+    
+    void fireScrapeTag1(String tagname, Properties tagopts, char[] text) {
+        Object[] listeners = this.htmlFilterEventListeners.getListenerList();
+        for (int i=0; i<listeners.length; i+=2) {
+            if (listeners[i]==htmlFilterEventListener.class) {
+                ((htmlFilterEventListener)listeners[i+1]).scrapeTag1(tagname, tagopts, text);
+            }
+        }
+    }
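Note (illustration, not part of the patch): the register/deregister/fire
methods above follow the standard javax.swing.event.EventListenerList idiom;
the loop bodies of fireScrapeTag0/fireScrapeTag1 were completed from that
idiom where this copy of the diff lost text. A minimal consumer could look
like the sketch below; the class name MetaWatcher is invented for
illustration, and it assumes de.anomic.net.URL offers the usual String
constructor:

    import java.util.Properties;

    import de.anomic.htmlFilter.htmlFilterContentScraper;
    import de.anomic.htmlFilter.htmlFilterEventListener;
    import de.anomic.net.URL;

    // Hypothetical listener that watches meta tags as the scraper reports them
    public class MetaWatcher implements htmlFilterEventListener {

        // fired for unpaired tags such as <meta ...> or <img ...>
        public void scrapeTag0(String tagname, Properties tagopts) {
            if (tagname.equalsIgnoreCase("meta")) {
                String httpEquiv = tagopts.getProperty("http-equiv", "");
                if (httpEquiv.equalsIgnoreCase("Content-Type")) {
                    System.out.println("content-type: " + tagopts.getProperty("content", ""));
                }
            }
        }

        // fired for paired tags such as <title>...</title>
        public void scrapeTag1(String tagname, Properties tagopts, char[] text) {
            // not needed for meta detection
        }

        public static void main(String[] args) throws Exception {
            htmlFilterContentScraper scraper = new htmlFilterContentScraper(new URL("http://example.org/"));
            scraper.registerHtmlFilterEventListener(new MetaWatcher());
            // feed the document through a htmlFilterWriter, as plasmaParser does below
        }
    }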
diff --git a/source/de/anomic/htmlFilter/htmlFilterEventListener.java b/source/de/anomic/htmlFilter/htmlFilterEventListener.java
new file mode 100644
--- /dev/null
+++ b/source/de/anomic/htmlFilter/htmlFilterEventListener.java
@@ -0,0 +1,9 @@
+package de.anomic.htmlFilter;
+
+import java.util.EventListener;
+import java.util.Properties;
+
+public interface htmlFilterEventListener extends EventListener {
+    public void scrapeTag0(String tagname, Properties tagopts);
+    public void scrapeTag1(String tagname, Properties tagopts, char[] text);
+}
diff --git a/source/de/anomic/htmlFilter/htmlFilterInputStream.java b/source/de/anomic/htmlFilter/htmlFilterInputStream.java
new file mode 100644
--- /dev/null
+++ b/source/de/anomic/htmlFilter/htmlFilterInputStream.java
@@ -0,0 +1,99 @@
[... the first ~87 lines of this new file are missing from this copy of the
patch; only the tail of the read() method survives ...]
+            if (this.preRead >= this.preBufferSize) {
+                this.mode++;
+                this.bufferedIn.reset();
+                return -1;
+            }
+            this.preRead++;
+        }
+        return this.bufferedIn.read();
+    }
+    
+    
+}
diff --git a/source/de/anomic/htmlFilter/htmlFilterScraper.java b/source/de/anomic/htmlFilter/htmlFilterScraper.java
index ad054233d..678eaaf4c 100644
--- a/source/de/anomic/htmlFilter/htmlFilterScraper.java
+++ b/source/de/anomic/htmlFilter/htmlFilterScraper.java
@@ -55,4 +55,8 @@ public interface htmlFilterScraper {
     public void scrapeTag1(String tagname, Properties tagopts, char[] text);
     
     public void close();
+    
+    public void registerHtmlFilterEventListener(htmlFilterEventListener listener);
+    
+    public void deregisterHtmlFilterEventListener(htmlFilterEventListener listener);
 }
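Note (reconstruction, not part of the patch): the htmlFilterEventListener
interface above was rebuilt from its usage, since EventListenerList accepts
only java.util.EventListener subtypes and the scraper fires
scrapeTag0/scrapeTag1 with exactly these signatures; the body of
htmlFilterInputStream.java is largely missing from this copy. From the
surviving tail (the mode/preRead/preBufferSize/bufferedIn fields) and the
call site in plasmaParser.java below (new htmlFilterInputStream(fileIn,
documentCharset, location, null, false) followed by detectCharset()), the
underlying technique appears to be: mark() a BufferedInputStream, let a
scraper pre-scan at most preBufferSize bytes for a charset meta tag, then
reset() so the same bytes are re-read for the real parse. A self-contained
sketch of that technique, under those assumptions and with invented names
(PreScanInputStream, endPreScan):

    import java.io.BufferedInputStream;
    import java.io.IOException;
    import java.io.InputStream;

    // Sketch: expose at most preBufferSize bytes in pre-scan mode, then
    // rewind so the full stream can be read again for the real parse.
    public class PreScanInputStream extends InputStream {

        private static final int MODE_PRESCAN = 0;

        private final BufferedInputStream bufferedIn;
        private final int preBufferSize;
        private int preRead = 0;
        private int mode = MODE_PRESCAN;

        public PreScanInputStream(InputStream in, int preBufferSize) {
            this.preBufferSize = preBufferSize;
            this.bufferedIn = new BufferedInputStream(in, preBufferSize);
            // remember the stream start so it can be re-read later
            this.bufferedIn.mark(preBufferSize);
        }

        // mirrors the surviving tail of htmlFilterInputStream.read()
        public int read() throws IOException {
            if (this.mode == MODE_PRESCAN) {
                if (this.preRead >= this.preBufferSize) {
                    this.mode++;               // leave pre-scan mode
                    this.bufferedIn.reset();   // rewind to the mark
                    return -1;                 // pretend EOF to the pre-scanner
                }
                this.preRead++;
            }
            return this.bufferedIn.read();
        }

        // call once the pre-scan is finished (charset found or real EOF)
        public void endPreScan() throws IOException {
            if (this.mode == MODE_PRESCAN) {
                this.mode++;
                this.bufferedIn.reset();
            }
        }
    }

Returning -1 at the pre-buffer boundary makes the pre-scanning reader believe
the document ended, which terminates the scrape cleanly; the reset() rewinds
the buffer so no bytes are lost for the second pass.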
diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java
index 3f36f0a80..9c22a93ca 100644
--- a/source/de/anomic/plasma/plasmaParser.java
+++ b/source/de/anomic/plasma/plasmaParser.java
@@ -50,10 +50,9 @@ import java.io.FileFilter;
 import java.io.FileInputStream;
 import java.io.FilenameFilter;
 import java.io.IOException;
+import java.io.UnsupportedEncodingException;
 import java.net.MalformedURLException;
 import java.net.URI;
-
-import de.anomic.net.URL;
 import java.util.Arrays;
 import java.util.HashMap;
 import java.util.HashSet;
@@ -70,8 +69,11 @@ import org.apache.commons.pool.impl.GenericKeyedObjectPool;
 import org.apache.commons.pool.impl.GenericObjectPool;
 
 import de.anomic.htmlFilter.htmlFilterContentScraper;
+import de.anomic.htmlFilter.htmlFilterInputStream;
 import de.anomic.htmlFilter.htmlFilterWriter;
+import de.anomic.http.httpHeader;
 import de.anomic.http.httpc;
+import de.anomic.net.URL;
 import de.anomic.plasma.parser.Parser;
 import de.anomic.plasma.parser.ParserException;
 import de.anomic.plasma.parser.ParserInfo;
@@ -556,29 +558,7 @@ public final class plasmaParser {
         if (theParser != null) {
             return theParser.parse(location, mimeType,documentCharset,sourceFile);
         } else if (realtimeParsableMimeTypesContains(mimeType)) {
-            // ...otherwise we make a scraper and transformer
-            htmlFilterContentScraper scraper = new htmlFilterContentScraper(location);
-            
-            // set the charset if known
-            /*if (charset != null) {
-                try {
-                    scraper.setCharset(charset);
-                } catch (UnsupportedCharsetException e) {
-                    serverLog.logWarning("PARSER", "parseSource2: unknown or unsupported charset '" + charset + "'");
-                    return null;
-                }
-            }*/
-            htmlFilterWriter writer = new htmlFilterWriter(null,null,scraper,null,false);
-            serverFileUtils.copy(sourceFile, documentCharset, writer);
-            writer.close();
-            //OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);
-            //serverFileUtils.copy(sourceFile, hfos);
-            //hfos.close();
-            if (writer.binarySuspect()) {
-                this.theLogger.logInfo("Binary data found in URL " + location);
-                return null;
-            }
-            return transformScraper(location, mimeType, documentCharset, scraper);
+            return parseHtml(location, mimeType, documentCharset, sourceFile);
         } else {
             serverLog.logWarning("PARSER", "parseSource2: wrong mime type");
             return null;
@@ -594,16 +574,36 @@ public final class plasmaParser {
         }
     }
     
+    private plasmaParserDocument parseHtml(URL location, String mimeType, String documentCharset, File sourceFile) throws IOException {
+        
+        // ...otherwise we make a scraper and transformer
+        FileInputStream fileIn = new FileInputStream(sourceFile);
+        htmlFilterInputStream htmlFilter = new htmlFilterInputStream(fileIn,documentCharset,location,null,false);
+        String charset = htmlFilter.detectCharset();
+        if (charset == null) {
+            charset = documentCharset;
+        }
+        if (!documentCharset.equalsIgnoreCase(charset)) {
+            this.theLogger.logInfo("Charset transformation needed from '" + documentCharset + "' to '" + charset + "'");
+        }
+        
+        // parsing the content
+        htmlFilterContentScraper scraper = new htmlFilterContentScraper(location);
+        htmlFilterWriter writer = new htmlFilterWriter(null,null,scraper,null,false);
+        serverFileUtils.copy(htmlFilter, writer, charset);
+        writer.close();
+        //OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);
+        //serverFileUtils.copy(sourceFile, hfos);
+        //hfos.close();
+        if (writer.binarySuspect()) {
+            this.theLogger.logInfo("Binary data found in URL " + location);
+            return null;
+        }
+        return transformScraper(location, mimeType, documentCharset, scraper);
+    }
+    
     public plasmaParserDocument transformScraper(URL location, String mimeType, String charSet, htmlFilterContentScraper scraper) {
         try {
-            if (scraper.getMetas().containsKey("content-type")) {
-                String newCharset = (String) scraper.getMetas().get("content-type");
-                if (!charSet.equals(newCharset)) {
-                    // TODO: transformation of content needed
-                    this.theLogger.logFine("Charset transformation needed from '" + charSet + "' to '" + newCharset + "'");
-                }
-            }
-            
             String[] sections = new String[scraper.getHeadlines(1).length + scraper.getHeadlines(2).length + scraper.getHeadlines(3).length + scraper.getHeadlines(4).length];
             int p = 0;
             for (int i = 1; i <= 4; i++) for (int j = 0; j < scraper.getHeadlines(i).length; j++) sections[p++] = scraper.getHeadlines(i)[j];
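Note (illustration, not part of the patch): the extraction of the charset
value itself happens inside the missing part of htmlFilterInputStream.java.
The declaration it has to handle looks like
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">,
and given the tagopts Properties delivered by the scraper, a helper in the
style of this codebase might look as follows; all names here are invented
for illustration:

    import java.util.Properties;

    // Illustrative only: pulls "iso-8859-1" out of a scraped meta tag like
    //   <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
    public final class charsetFromMeta {

        public static String detect(String tagname, Properties tagopts) {
            if (!tagname.equalsIgnoreCase("meta")) return null;
            if (!tagopts.getProperty("http-equiv", "").equalsIgnoreCase("Content-Type")) return null;
            String content = tagopts.getProperty("content", "");
            int pos = content.toLowerCase().indexOf("charset=");
            if (pos < 0) return null;
            String charset = content.substring(pos + 8).trim();
            // cut at the first delimiter, if any
            int end = 0;
            while ((end < charset.length()) && (";\"' ".indexOf(charset.charAt(end)) < 0)) end++;
            return (end > 0) ? charset.substring(0, end) : null;
        }
    }

With detection moved ahead of parsing, transformScraper() no longer needs its
content-type check (removed in the last hunk above): by the time the real
parse starts, the stream has been rewound and is decoded with the detected
charset from the first byte.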