From 564184909a3ba14a43211c32546fcc5e984d3f72 Mon Sep 17 00:00:00 2001 From: orbiter Date: Fri, 1 Apr 2011 11:05:42 +0000 Subject: [PATCH] enhanced the surrogate parser: better reading of UTF-8 characters git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7634 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- source/net/yacy/cora/document/RSSMessage.java | 1 - .../yacy/document/content/SurrogateReader.java | 18 ++++++++++++++---- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/source/net/yacy/cora/document/RSSMessage.java b/source/net/yacy/cora/document/RSSMessage.java index 83306a77b..d47156976 100644 --- a/source/net/yacy/cora/document/RSSMessage.java +++ b/source/net/yacy/cora/document/RSSMessage.java @@ -164,7 +164,6 @@ public class RSSMessage implements Hit, Comparable, Comparator= 0) return subject.split(","); if (subject.indexOf(';') >= 0) return subject.split(";"); - if (subject.indexOf('|') >= 0) return subject.split("|"); return subject.split(" "); } diff --git a/source/net/yacy/document/content/SurrogateReader.java b/source/net/yacy/document/content/SurrogateReader.java index 680139f19..d52e436d5 100644 --- a/source/net/yacy/document/content/SurrogateReader.java +++ b/source/net/yacy/document/content/SurrogateReader.java @@ -25,10 +25,13 @@ package net.yacy.document.content; import java.io.BufferedInputStream; +import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.BlockingQueue; import java.util.zip.GZIPInputStream; @@ -41,6 +44,7 @@ import javax.xml.parsers.SAXParserFactory; import net.yacy.kelondro.logging.Log; import org.xml.sax.Attributes; +import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.SAXParseException; import org.xml.sax.helpers.DefaultHandler; @@ -55,7 +59,8 @@ public class SurrogateReader extends DefaultHandler implements Runnable { private String elementName; private final BlockingQueue surrogates; private SAXParser saxParser; - private final InputStream stream; + private final InputSource inputSource; + private final InputStream inputStream; public SurrogateReader(final InputStream stream, int queueSize) throws IOException { this.buffer = new StringBuilder(300); @@ -63,7 +68,12 @@ public class SurrogateReader extends DefaultHandler implements Runnable { this.surrogate = null; this.elementName = null; this.surrogates = new ArrayBlockingQueue(queueSize); - this.stream = stream; + + Reader reader = new BufferedReader(new InputStreamReader(stream, "UTF-8")); + this.inputSource = new InputSource(reader); + this.inputSource.setEncoding("UTF-8"); + this.inputStream = stream; + final SAXParserFactory factory = SAXParserFactory.newInstance(); try { this.saxParser = factory.newSAXParser(); @@ -78,7 +88,7 @@ public class SurrogateReader extends DefaultHandler implements Runnable { public void run() { try { - this.saxParser.parse(this.stream, this); + this.saxParser.parse(this.inputSource, this); } catch (SAXParseException e) { Log.logException(e); } catch (SAXException e) { @@ -92,7 +102,7 @@ public class SurrogateReader extends DefaultHandler implements Runnable { Log.logException(e1); } try { - this.stream.close(); + this.inputStream.close(); } catch (IOException e) { Log.logException(e); }