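Enhanced the surrogate parser: the SAX input is now read through a UTF-8 reader (an InputSource wrapping an InputStreamReader) for better reading of UTF-8 characters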

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7634 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 14 years ago
parent 156cf02703
commit 564184909a

@ -164,7 +164,6 @@ public class RSSMessage implements Hit, Comparable<RSSMessage>, Comparator<RSSMe
String subject = Token.subject.valueFrom(this.map, "");
if (subject.indexOf(',') >= 0) return subject.split(",");
if (subject.indexOf(';') >= 0) return subject.split(";");
if (subject.indexOf('|') >= 0) return subject.split("|");
return subject.split(" ");
}

@ -25,10 +25,13 @@
package net.yacy.document.content;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.zip.GZIPInputStream;
@ -41,6 +44,7 @@ import javax.xml.parsers.SAXParserFactory;
import net.yacy.kelondro.logging.Log;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import org.xml.sax.helpers.DefaultHandler;
@ -55,7 +59,8 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
private String elementName;
private final BlockingQueue<DCEntry> surrogates;
private SAXParser saxParser;
private final InputStream stream;
private final InputSource inputSource;
private final InputStream inputStream;
public SurrogateReader(final InputStream stream, int queueSize) throws IOException {
this.buffer = new StringBuilder(300);
@ -63,7 +68,12 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
this.surrogate = null;
this.elementName = null;
this.surrogates = new ArrayBlockingQueue<DCEntry>(queueSize);
this.stream = stream;
Reader reader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
this.inputSource = new InputSource(reader);
this.inputSource.setEncoding("UTF-8");
this.inputStream = stream;
final SAXParserFactory factory = SAXParserFactory.newInstance();
try {
this.saxParser = factory.newSAXParser();
@ -78,7 +88,7 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
public void run() {
try {
this.saxParser.parse(this.stream, this);
this.saxParser.parse(this.inputSource, this);
} catch (SAXParseException e) {
Log.logException(e);
} catch (SAXException e) {
@ -92,7 +102,7 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
Log.logException(e1);
}
try {
this.stream.close();
this.inputStream.close();
} catch (IOException e) {
Log.logException(e);
}

Loading…
Cancel
Save