*) Bugfix: htmlFilterInputStream document analysis did not work properly for documents smaller than the default InputStream buffer size.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2629 6c8d7289-2bf4-0310-a012-ef5d649a1542
19 years ago · cf6acff2c2
parent f18304ddd3
commit cf6acff2c2
2 changed files with 20 additions and 5 deletions
--- a/source/de/anomic/htmlFilter/htmlFilterInputStream.java
+++ b/source/de/anomic/htmlFilter/htmlFilterInputStream.java
@@ -15,12 +15,15 @@ import de.anomic.net.URL;
 public class htmlFilterInputStream extends InputStream implements htmlFilterEventListener {
    private static final int MODE_PRESCAN = 0;
    private static final int MODE_PRESCAN_FINISHED = 1;
    private int mode = 1;
    private long preBufferSize = 143336;
    private long preRead = 0;
    private BufferedInputStream bufferedIn;
    private String detectedCharset;
    private int mode = 0;
    private boolean charsetChanged = false;
    private boolean endOfHead = false;
@@ -52,13 +55,19 @@ public class htmlFilterInputStream extends InputStream implements htmlFilterEven
            if (tagopts.containsKey("http-equiv")) {
                String value = tagopts.getProperty("http-equiv");
                if (value.equalsIgnoreCase("Content-Type")) {
-                    String contentType = tagopts.getProperty("content");
+                    String contentType = tagopts.getProperty("content","");
                    this.detectedCharset = httpHeader.extractCharsetFromMimetyeHeader(contentType);
                    if (this.detectedCharset != null && this.detectedCharset.length() > 0) {
                        this.charsetChanged = true;
                    } else if (tagopts.containsKey("charset")) { 
                        // sometimes the charset property is configured as extra attribut. try it ...
                        this.detectedCharset = tagopts.getProperty("charset");
                        this.charsetChanged = true;
                    }
                }
            }
        }
    }
    public void scrapeTag1(String tagname, Properties tagopts, char[] text) {
        if (tagname == null || tagname.length() == 0) return;
@@ -69,6 +78,8 @@ public class htmlFilterInputStream extends InputStream implements htmlFilterEven
    }
    public String detectCharset() throws IOException {
        this.mode = MODE_PRESCAN; 
        // loop until we have detected the header element or the charset data
        int c;
        while ((c = this.reader.read())!= -1) {
@@ -79,12 +90,16 @@ public class htmlFilterInputStream extends InputStream implements htmlFilterEven
        this.writer = null;        
        // don't close writer here, otherwise it will shutdown our source stream 
        // reset the buffer if not already done
        if (this.mode != MODE_PRESCAN_FINISHED) this.bufferedIn.reset();
        // return scanning result
        return (this.charsetChanged) ? this.detectedCharset : null;
    }
    public int read() throws IOException {
        // mode 0 is called from within the detectCharset function
-        if (this.mode == 0) {      
+        if (this.mode == MODE_PRESCAN) {      
            if (this.endOfHead || this.charsetChanged || this.preRead >= this.preBufferSize) {
                this.mode++;
                this.bufferedIn.reset();
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@@ -1706,7 +1706,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
                            log.logInfo("*Indexed " + words + " words in URL " + entry.url() +
                                    " [" + entry.urlHash() + "]" +
                                    "\n\tDescription:  " + docDescription +
-                                    "\n\tMimeType: "  + document.getMimeType() + " | " +
+                                    "\n\tMimeType: "  + document.getMimeType() + " | Charset: " + document.getCharset() + " | " +
                                    "Size: " + document.text.length + " bytes | " +
                                    "Anchors: " + ((document.anchors==null)?0:document.anchors.size()) +
                                    "\n\tStackingTime:  " + (stackEndTime-stackStartTime) + " ms | " +