*) Bugfix: htmlFilterInputStream document analysis did not work properly for documents smaller than the default InputStream buffer size.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2629 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
theli 19 years ago
parent f18304ddd3
commit cf6acff2c2

@ -15,12 +15,15 @@ import de.anomic.net.URL;
public class htmlFilterInputStream extends InputStream implements htmlFilterEventListener { public class htmlFilterInputStream extends InputStream implements htmlFilterEventListener {
private static final int MODE_PRESCAN = 0;
private static final int MODE_PRESCAN_FINISHED = 1;
private int mode = 1;
private long preBufferSize = 143336; private long preBufferSize = 143336;
private long preRead = 0; private long preRead = 0;
private BufferedInputStream bufferedIn; private BufferedInputStream bufferedIn;
private String detectedCharset; private String detectedCharset;
private int mode = 0;
private boolean charsetChanged = false; private boolean charsetChanged = false;
private boolean endOfHead = false; private boolean endOfHead = false;
@ -52,13 +55,19 @@ public class htmlFilterInputStream extends InputStream implements htmlFilterEven
if (tagopts.containsKey("http-equiv")) { if (tagopts.containsKey("http-equiv")) {
String value = tagopts.getProperty("http-equiv"); String value = tagopts.getProperty("http-equiv");
if (value.equalsIgnoreCase("Content-Type")) { if (value.equalsIgnoreCase("Content-Type")) {
String contentType = tagopts.getProperty("content"); String contentType = tagopts.getProperty("content","");
this.detectedCharset = httpHeader.extractCharsetFromMimetyeHeader(contentType); this.detectedCharset = httpHeader.extractCharsetFromMimetyeHeader(contentType);
if (this.detectedCharset != null && this.detectedCharset.length() > 0) {
this.charsetChanged = true;
} else if (tagopts.containsKey("charset")) {
// sometimes the charset property is configured as an extra attribute. try it ...
this.detectedCharset = tagopts.getProperty("charset");
this.charsetChanged = true; this.charsetChanged = true;
} }
} }
} }
} }
}
public void scrapeTag1(String tagname, Properties tagopts, char[] text) { public void scrapeTag1(String tagname, Properties tagopts, char[] text) {
if (tagname == null || tagname.length() == 0) return; if (tagname == null || tagname.length() == 0) return;
@ -69,6 +78,8 @@ public class htmlFilterInputStream extends InputStream implements htmlFilterEven
} }
public String detectCharset() throws IOException { public String detectCharset() throws IOException {
this.mode = MODE_PRESCAN;
// loop until we have detected the header element or the charset data // loop until we have detected the header element or the charset data
int c; int c;
while ((c = this.reader.read())!= -1) { while ((c = this.reader.read())!= -1) {
@ -79,12 +90,16 @@ public class htmlFilterInputStream extends InputStream implements htmlFilterEven
this.writer = null; this.writer = null;
// don't close writer here, otherwise it will shutdown our source stream // don't close writer here, otherwise it will shutdown our source stream
// reset the buffer if not already done
if (this.mode != MODE_PRESCAN_FINISHED) this.bufferedIn.reset();
// return scanning result
return (this.charsetChanged) ? this.detectedCharset : null; return (this.charsetChanged) ? this.detectedCharset : null;
} }
public int read() throws IOException { public int read() throws IOException {
// mode 0 is called from within the detectCharset function // mode 0 is called from within the detectCharset function
if (this.mode == 0) { if (this.mode == MODE_PRESCAN) {
if (this.endOfHead || this.charsetChanged || this.preRead >= this.preBufferSize) { if (this.endOfHead || this.charsetChanged || this.preRead >= this.preBufferSize) {
this.mode++; this.mode++;
this.bufferedIn.reset(); this.bufferedIn.reset();

@ -1706,7 +1706,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
log.logInfo("*Indexed " + words + " words in URL " + entry.url() + log.logInfo("*Indexed " + words + " words in URL " + entry.url() +
" [" + entry.urlHash() + "]" + " [" + entry.urlHash() + "]" +
"\n\tDescription: " + docDescription + "\n\tDescription: " + docDescription +
"\n\tMimeType: " + document.getMimeType() + " | " + "\n\tMimeType: " + document.getMimeType() + " | Charset: " + document.getCharset() + " | " +
"Size: " + document.text.length + " bytes | " + "Size: " + document.text.length + " bytes | " +
"Anchors: " + ((document.anchors==null)?0:document.anchors.size()) + "Anchors: " + ((document.anchors==null)?0:document.anchors.size()) +
"\n\tStackingTime: " + (stackEndTime-stackStartTime) + " ms | " + "\n\tStackingTime: " + (stackEndTime-stackStartTime) + " ms | " +

Loading…
Cancel
Save