*) charset-aware metadata parsing

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2598 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
theli 19 years ago
parent 3ac30bdf22
commit 8115ac47b5

@ -185,16 +185,16 @@ public final class htmlFilterOutputStream extends OutputStream {
if (opening) {
if ((scraper != null) && (scraper.isTag0(tag))) {
// this single tag is collected at once here
scraper.scrapeTag0(tag, new serverByteBuffer(content).propParser());
scraper.scrapeTag0(tag, new serverByteBuffer(content).propParser(scraper.getCharset()));
}
if ((transformer != null) && (transformer.isTag0(tag))) {
// this single tag is collected at once here
return transformer.transformTag0(tag, new serverByteBuffer(content).propParser(), quotechar);
return transformer.transformTag0(tag, new serverByteBuffer(content).propParser(scraper.getCharset()), quotechar);
} else if (((scraper != null) && (scraper.isTag1(tag))) ||
((transformer != null) && (transformer.isTag1(tag)))) {
// ok, start collecting
filterTag = tag;
filterOpts = new serverByteBuffer(content).propParser();
filterOpts = new serverByteBuffer(content).propParser(scraper.getCharset());
filterCont = new serverByteBuffer();
return new byte[0];
} else {

@ -40,6 +40,7 @@
package de.anomic.htmlFilter;
import java.nio.charset.UnsupportedCharsetException;
import java.util.Properties;
public interface htmlFilterScraper {
@ -55,5 +56,15 @@ public interface htmlFilterScraper {
public void scrapeTag1(String tagname, Properties tagopts, byte[] text);
public void close();
/**
* Sets the charset of the source document.
* @param charset the charset of the source document
* @throws UnsupportedCharsetException if the given charset is not supported
*/
public void setCharset(String charset) throws UnsupportedCharsetException;
/**
* @return the charset of the source document
*/
public String getCharset();
}

@ -359,7 +359,7 @@ public final class serverByteBuffer extends OutputStream {
return new String(buffer, offset + left, rightbound - left);
}
public Properties propParser() {
public Properties propParser(String charset) {
// extract a=b or a="b" - relations from the buffer
int pos = offset;
int start;
@ -372,7 +372,11 @@ public final class serverByteBuffer extends OutputStream {
start = pos;
while ((pos < length) && (buffer[pos] != equal)) pos++;
if (pos >= length) break; // this is the case if we found no equal
key = new String(buffer, start, pos - start).trim().toLowerCase();
try {
key = new String(buffer, start, pos - start,charset).trim().toLowerCase();
} catch (UnsupportedEncodingException e1) {
key = new String(buffer, start, pos - start).trim().toLowerCase();
}
// we have a key
pos++;
// find start of value
@ -389,7 +393,7 @@ public final class serverByteBuffer extends OutputStream {
while ((pos < length) && (buffer[pos] != doublequote)) pos++;
if (pos >= length) break; // this is the case if we found no parent doublequote
try {
p.setProperty(key, new String(buffer, start, pos - start,"UTF-8").trim());
p.setProperty(key, new String(buffer, start, pos - start,charset).trim());
} catch (UnsupportedEncodingException e) {
p.setProperty(key, new String(buffer, start, pos - start).trim());
}
@ -400,13 +404,21 @@ public final class serverByteBuffer extends OutputStream {
start = pos;
while ((pos < length) && (buffer[pos] != singlequote)) pos++;
if (pos >= length) break; // this is the case if we found no parent singlequote
p.setProperty(key, new String(buffer, start, pos - start).trim());
try {
p.setProperty(key, new String(buffer, start, pos - start,charset).trim());
} catch (UnsupportedEncodingException e) {
p.setProperty(key, new String(buffer, start, pos - start).trim());
}
pos++;
} else {
// search next whitespace
start = pos;
while ((pos < length) && (buffer[pos] > 32)) pos++;
p.setProperty(key, new String(buffer, start, pos - start).trim());
try {
p.setProperty(key, new String(buffer, start, pos - start,charset).trim());
} catch (UnsupportedEncodingException e) {
p.setProperty(key, new String(buffer, start, pos - start).trim());
}
}
// pos should point now to a whitespace: eat up spaces
while ((pos < length) && (buffer[pos] <= 32)) pos++;

Loading…
Cancel
Save