refactoring and new usage of SentenceReader: this class appeared as one

of the major CPU users during snippet verification. The class was not
efficient for several reasons:
- it used an overly complex input stream, generated from sources and UTF-8
byte conversions. The BufferedReader added significant overhead.
- to feed data into the SentenceReader, multiple toString/getBytes
conversions were applied before a buffered Reader could be created from an
input stream. These superfluous conversions have been removed.
- the best source for the SentenceReader is a String. Therefore the
production of Strings has been moved into the Document class.
pull/1/head
orbiter 13 years ago
parent bb8dcb4911
commit 78fc3cf8f8

@ -27,7 +27,6 @@
//javac -classpath .:../Classes Status.java
//if the shell's current path is HTROOT
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.Collection;
@ -47,6 +46,7 @@ import net.yacy.document.Condenser;
import net.yacy.document.Document;
import net.yacy.document.LibraryProvider;
import net.yacy.document.Parser;
import net.yacy.document.SentenceReader;
import net.yacy.document.WordTokenizer;
import net.yacy.document.parser.html.CharacterCoding;
import net.yacy.document.parser.html.ImageEntry;
@ -232,7 +232,7 @@ public class ViewFile {
}
if (viewMode.equals("parsed")) {
final String content = UTF8.String(document.getTextBytes());
final String content = document.getTextString();
// content = wikiCode.replaceHTML(content); // added by Marc Nause
prop.put("viewMode", VIEW_MODE_AS_PARSED_TEXT);
prop.put("viewMode_title", document.dc_title());
@ -284,7 +284,7 @@ public class ViewFile {
for (final StringBuilder s: sentences) {
sentence = s.toString();
Enumeration<StringBuilder> tokens = null;
tokens = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(sentence)), LibraryProvider.dymLib);
tokens = new WordTokenizer(new SentenceReader(sentence), LibraryProvider.dymLib);
while (tokens.hasMoreElements()) {
token = tokens.nextElement();
if (token.length() > 0) {

@ -1,6 +1,5 @@
package de.anomic.data.ymark;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.Arrays;
@ -11,12 +10,12 @@ import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.ArrayBlockingQueue;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.document.Condenser;
import net.yacy.document.Document;
import net.yacy.document.LibraryProvider;
import net.yacy.document.Parser.Failure;
import net.yacy.document.SentenceReader;
import net.yacy.document.WordTokenizer;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.word.Word;
@ -100,7 +99,7 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
buffer.append(document.dc_title().toLowerCase());
buffer.append(document.dc_description().toLowerCase());
buffer.append(document.dc_subject(' ').toLowerCase());
final WordTokenizer tokens = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(buffer.toString())), LibraryProvider.dymLib);
final WordTokenizer tokens = new WordTokenizer(new SentenceReader(buffer.toString()), LibraryProvider.dymLib);
try {
int score = 0;
@ -177,7 +176,7 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
private static TreeMap<String, YMarkTag> getPhrases(final Document document, final int size) {
final TreeMap<String, YMarkTag> phrases = new TreeMap<String, YMarkTag>();
final StringBuilder phrase = new StringBuilder(128);
final WordTokenizer tokens = new WordTokenizer(document.getText(), LibraryProvider.dymLib);
final WordTokenizer tokens = new WordTokenizer(new SentenceReader(document.getTextString()), LibraryProvider.dymLib);
try {
StringBuilder token;
int count = 0;

@ -20,12 +20,10 @@
package net.yacy.document;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
@ -39,7 +37,6 @@ import java.util.TreeMap;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.Classification.ContentDomain;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.lod.vocabulary.Tagging;
import net.yacy.document.language.Identificator;
import net.yacy.document.parser.html.ImageEntry;
@ -133,7 +130,7 @@ public final class Condenser {
Map.Entry<MultiProtocolURI, String> entry;
if (indexText) {
createCondensement(document.getText(), meaningLib, doAutotagging);
createCondensement(document.getTextString(), meaningLib, doAutotagging);
// the phrase counter:
// phrase 0 are words taken from the URL
// phrase 1 is the MainTitle
@ -146,16 +143,15 @@ public final class Condenser {
// phrase 98 is taken from the embedded anchor/hyperlinks description (REMOVED!)
// phrase 99 is taken from the media Link url and anchor description
// phrase 100 and above are lines from the text
insertTextToWords(document.dc_title(), 1, WordReferenceRow.flag_app_dc_title, this.RESULT_FLAGS, true, meaningLib);
insertTextToWords(document.dc_description(), 3, WordReferenceRow.flag_app_dc_description, this.RESULT_FLAGS, true, meaningLib);
insertTextToWords(document.dc_creator(), 4, WordReferenceRow.flag_app_dc_creator, this.RESULT_FLAGS, true, meaningLib);
insertTextToWords(document.dc_publisher(), 5, WordReferenceRow.flag_app_dc_creator, this.RESULT_FLAGS, true, meaningLib);
insertTextToWords(document.dc_subject(' '), 6, WordReferenceRow.flag_app_dc_description, this.RESULT_FLAGS, true, meaningLib);
insertTextToWords(new SentenceReader(document.dc_title()), 1, WordReferenceRow.flag_app_dc_title, this.RESULT_FLAGS, true, meaningLib);
insertTextToWords(new SentenceReader(document.dc_description()), 3, WordReferenceRow.flag_app_dc_description, this.RESULT_FLAGS, true, meaningLib);
insertTextToWords(new SentenceReader(document.dc_creator()), 4, WordReferenceRow.flag_app_dc_creator, this.RESULT_FLAGS, true, meaningLib);
insertTextToWords(new SentenceReader(document.dc_publisher()), 5, WordReferenceRow.flag_app_dc_creator, this.RESULT_FLAGS, true, meaningLib);
insertTextToWords(new SentenceReader(document.dc_subject(' ')), 6, WordReferenceRow.flag_app_dc_description, this.RESULT_FLAGS, true, meaningLib);
// missing: tags!
final String[] titles = document.getSectionTitles();
for (int i = 0; i < titles.length; i++) {
insertTextToWords(titles[i], i + 10, WordReferenceRow.flag_app_emphasized, this.RESULT_FLAGS, true, meaningLib);
insertTextToWords(new SentenceReader(titles[i]), i + 10, WordReferenceRow.flag_app_emphasized, this.RESULT_FLAGS, true, meaningLib);
}
// anchors: for text indexing we add only the anchor description
@ -180,7 +176,7 @@ public final class Condenser {
}
// add the URL components to the word list
insertTextToWords(document.dc_source().toNormalform(false, true), 0, WordReferenceRow.flag_app_dc_identifier, this.RESULT_FLAGS, false, meaningLib);
insertTextToWords(new SentenceReader(document.dc_source().toNormalform(false, true)), 0, WordReferenceRow.flag_app_dc_identifier, this.RESULT_FLAGS, false, meaningLib);
if (indexMedia) {
// add anchor descriptions: here, we also add the url components
@ -188,24 +184,24 @@ public final class Condenser {
Iterator<Map.Entry<MultiProtocolURI, String>> i = document.getAudiolinks().entrySet().iterator();
while (i.hasNext()) {
entry = i.next();
insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasaudio, this.RESULT_FLAGS, false, meaningLib);
insertTextToWords(entry.getValue(), 99, flag_cat_hasaudio, this.RESULT_FLAGS, true, meaningLib);
insertTextToWords(new SentenceReader(entry.getKey().toNormalform(false, false)), 99, flag_cat_hasaudio, this.RESULT_FLAGS, false, meaningLib);
insertTextToWords(new SentenceReader(entry.getValue()), 99, flag_cat_hasaudio, this.RESULT_FLAGS, true, meaningLib);
}
// video
i = document.getVideolinks().entrySet().iterator();
while (i.hasNext()) {
entry = i.next();
insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasvideo, this.RESULT_FLAGS, false, meaningLib);
insertTextToWords(entry.getValue(), 99, flag_cat_hasvideo, this.RESULT_FLAGS, true, meaningLib);
insertTextToWords(new SentenceReader(entry.getKey().toNormalform(false, false)), 99, flag_cat_hasvideo, this.RESULT_FLAGS, false, meaningLib);
insertTextToWords(new SentenceReader(entry.getValue()), 99, flag_cat_hasvideo, this.RESULT_FLAGS, true, meaningLib);
}
// applications
i = document.getApplinks().entrySet().iterator();
while (i.hasNext()) {
entry = i.next();
insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasapp, this.RESULT_FLAGS, false, meaningLib);
insertTextToWords(entry.getValue(), 99, flag_cat_hasapp, this.RESULT_FLAGS, true, meaningLib);
insertTextToWords(new SentenceReader(entry.getKey().toNormalform(false, false)), 99, flag_cat_hasapp, this.RESULT_FLAGS, false, meaningLib);
insertTextToWords(new SentenceReader(entry.getValue()), 99, flag_cat_hasapp, this.RESULT_FLAGS, true, meaningLib);
}
// images
@ -216,8 +212,8 @@ public final class Condenser {
ientry = j.next();
url = ientry.url();
if (url == null) continue;
insertTextToWords(url.toNormalform(false, false), 99, flag_cat_hasimage, this.RESULT_FLAGS, false, meaningLib);
insertTextToWords(ientry.alt(), 99, flag_cat_hasimage, this.RESULT_FLAGS, true, meaningLib);
insertTextToWords(new SentenceReader(url.toNormalform(false, false)), 99, flag_cat_hasimage, this.RESULT_FLAGS, false, meaningLib);
insertTextToWords(new SentenceReader(ientry.alt()), 99, flag_cat_hasimage, this.RESULT_FLAGS, true, meaningLib);
}
// finally check all words for missing flag entry
@ -241,7 +237,7 @@ public final class Condenser {
}
private void insertTextToWords(
final String text,
final SentenceReader text,
final int phrase,
final int flagpos,
final Bitfield flagstemplate,
@ -250,7 +246,7 @@ public final class Condenser {
if (text == null) return;
String word;
Word wprop;
WordTokenizer wordenum = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(text)), meaningLib);
WordTokenizer wordenum = new WordTokenizer(text, meaningLib);
try {
int pip = 0;
while (wordenum.hasMoreElements()) {
@ -271,7 +267,7 @@ public final class Condenser {
}
}
public Condenser(final InputStream text, final WordCache meaningLib, boolean doAutotagging) {
public Condenser(final String text, final WordCache meaningLib, boolean doAutotagging) {
this.languageIdentificator = null; // we don't need that here
// analysis = new Properties();
this.words = new TreeMap<String, Word>();
@ -295,8 +291,8 @@ public final class Condenser {
return this.languageIdentificator.getLanguage();
}
private void createCondensement(final InputStream is, final WordCache meaningLib, boolean doAutotagging) {
assert is != null;
private void createCondensement(final String text, final WordCache meaningLib, boolean doAutotagging) {
assert text != null;
final Set<String> currsentwords = new HashSet<String>();
String word = "";
String[] wordcache = new String[LibraryProvider.autotagging.getMaxWordsInTerm() - 1];
@ -317,7 +313,7 @@ public final class Condenser {
if (LibraryProvider.autotagging.size() == 0) doAutotagging = false;
// read source
final WordTokenizer wordenum = new WordTokenizer(is, meaningLib);
final WordTokenizer wordenum = new WordTokenizer(new SentenceReader(text), meaningLib);
try {
while (wordenum.hasMoreElements()) {
word = wordenum.nextElement().toString().toLowerCase(Locale.ENGLISH);
@ -430,9 +426,7 @@ public final class Condenser {
public static Map<String, Word> getWords(final String text, final WordCache meaningLib) {
// returns a word/indexWord relation map
if (text == null) return null;
ByteArrayInputStream buffer;
buffer = new ByteArrayInputStream(UTF8.getBytes(text));
return new Condenser(buffer, meaningLib, false).words();
return new Condenser(text, meaningLib, false).words();
}
public static void main(final String[] args) {

@ -133,7 +133,7 @@ public class Document {
this.outboundlinks = null;
this.languages = languages;
this.indexingDenied = indexingDenied;
this.text = text == null ? new ByteArrayOutputStream() : text;
this.text = text == null ? "" : text;
}
public Object getParserObject() {
@ -299,7 +299,7 @@ dc_rights
return this.sections.toArray(new String[this.sections.size()]);
}
public InputStream getText() {
public InputStream getTextStream() {
try {
if (this.text == null) return new ByteArrayInputStream(UTF8.getBytes(""));
if (this.text instanceof String) {
@ -322,26 +322,26 @@ dc_rights
return new ByteArrayInputStream(UTF8.getBytes(""));
}
public byte[] getTextBytes() {
public String getTextString() {
try {
if (this.text == null) return new byte[0];
if (this.text == null) return "";
if (this.text instanceof String) {
return UTF8.getBytes((String) this.text);
return (String) this.text;
} else if (this.text instanceof InputStream) {
return FileUtils.read((InputStream) this.text);
return UTF8.String(FileUtils.read((InputStream) this.text));
} else if (this.text instanceof File) {
return FileUtils.read((File) this.text);
return UTF8.String(FileUtils.read((File) this.text));
} else if (this.text instanceof byte[]) {
return (byte[]) this.text;
return UTF8.String((byte[]) this.text);
} else if (this.text instanceof ByteArrayOutputStream) {
return ((ByteArrayOutputStream) this.text).toByteArray();
return UTF8.String(((ByteArrayOutputStream) this.text).toByteArray());
}
assert false : this.text.getClass().toString();
return null;
} catch (final Exception e) {
Log.logException(e);
}
return new byte[0];
return "";
}
public long getTextLength() {
@ -367,16 +367,11 @@ dc_rights
}
public List<StringBuilder> getSentences(final boolean pre) {
return getSentences(pre, getText());
}
public static List<StringBuilder> getSentences(final boolean pre, final InputStream text) {
if (text == null) return null;
final SentenceReader e = new SentenceReader(text);
e.pre(pre);
final List<StringBuilder> sentences = new ArrayList<StringBuilder>();
while (e.hasNext()) {
sentences.add(e.next());
final SentenceReader sr = new SentenceReader(getTextString());
sr.pre(pre);
List<StringBuilder> sentences = new ArrayList<StringBuilder>();
while (sr.hasNext()) {
sentences.add(sr.next());
}
return sentences;
}
@ -638,7 +633,7 @@ dc_rights
if (!(this.text instanceof ByteArrayOutputStream)) {
this.text = new ByteArrayOutputStream();
}
FileUtils.copy(doc.getText(), (ByteArrayOutputStream) this.text);
FileUtils.copy(doc.getTextStream(), (ByteArrayOutputStream) this.text);
this.anchors.putAll(doc.getAnchors());
this.rss.putAll(doc.getRSS());
@ -707,11 +702,7 @@ dc_rights
if (subject != null && subject.length() > 0) os.write("<dc:subject><![CDATA[" + subject + "]]></dc:subject>\n");
if (this.text != null) {
os.write("<dc:description><![CDATA[");
final byte[] buffer = new byte[1000];
int c = 0;
final InputStream is = getText();
while ((c = is.read(buffer)) > 0) os.write(UTF8.String(buffer, 0, c));
is.close();
os.write(getTextString());
os.write("]]></dc:description>\n");
}
final String language = dc_language();
@ -811,7 +802,7 @@ dc_rights
if (doc.getTextLength() > 0) {
if (docTextLength > 0) content.write('\n');
try {
docTextLength += FileUtils.copy(doc.getText(), content);
docTextLength += FileUtils.copy(doc.getTextStream(), content);
} catch (final IOException e) {
Log.logException(e);
}

@ -24,12 +24,9 @@
package net.yacy.document;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.io.StringReader;
import java.util.Iterator;
public class SentenceReader implements Iterator<StringBuilder> {
@ -37,17 +34,13 @@ public class SentenceReader implements Iterator<StringBuilder> {
// this enumerates StringBuilder objects
private StringBuilder buffer;
private BufferedReader raf;
private Reader raf;
private int counter = 0;
private boolean pre = false;
public SentenceReader(final InputStream is) {
assert is != null;
try {
raf = new BufferedReader(new InputStreamReader(is, "UTF-8"));
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
}
public SentenceReader(final String text) {
assert text != null;
raf = new StringReader(text);
buffer = nextElement0();
counter = 0;
pre = false;
@ -144,9 +137,8 @@ public class SentenceReader implements Iterator<StringBuilder> {
public synchronized void close() {
try {
raf.close();
} catch(IOException ioe) {
// Ignore IO Exceptions
}
raf.close();
} catch (IOException e) {
}
}
}

@ -178,8 +178,7 @@ public final class TextParser {
} finally {
if (sourceStream != null) try { sourceStream.close(); } catch (final Exception ex) {}
}
for (final Document d: docs) { assert d.getText() != null; } // verify docs
return docs;
}
@ -261,7 +260,6 @@ public final class TextParser {
if (log.isFine()) log.logInfo("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + fileExt + "'.");
try {
final Document[] docs = parser.parse(location, mimeType, documentCharset, sourceStream);
for (final Document d: docs) { assert d != null && d.getText() != null; } // verify docs
return docs;
} catch (final Exception e) {
throw new Parser.Failure("parser failed: " + parser.getName(), location);
@ -324,7 +322,7 @@ public final class TextParser {
throw new Parser.Failure("All parser failed: " + failedParsers, location);
}
}
for (final Document d: docs) { assert d.getText() != null : "mimeType = " + mimeType; } // verify docs
for (final Document d: docs) { assert d.getTextStream() != null : "mimeType = " + mimeType; } // verify docs
return docs;
}

@ -24,15 +24,12 @@
package net.yacy.document;
import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.List;
import java.util.SortedMap;
import java.util.TreeMap;
import net.yacy.cora.document.UTF8;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.order.Base64Order;
@ -44,9 +41,9 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
private final unsievedWordsEnum e;
private final WordCache meaningLib;
public WordTokenizer(final InputStream is, final WordCache meaningLib) {
assert is != null;
this.e = new unsievedWordsEnum(is);
public WordTokenizer(final SentenceReader sr, final WordCache meaningLib) {
assert sr != null;
this.e = new unsievedWordsEnum(sr);
this.buffer = nextElement0();
this.meaningLib = meaningLib;
}
@ -89,20 +86,20 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
private static class unsievedWordsEnum implements Enumeration<StringBuilder> {
// returns an enumeration of StringBuilder Objects
private StringBuilder buffer = null;
private final SentenceReader e;
private final SentenceReader sr;
private final List<StringBuilder> s;
private int sIndex;
public unsievedWordsEnum(final InputStream is) {
assert is != null;
this.e = new SentenceReader(is);
public unsievedWordsEnum(final SentenceReader sr0) {
assert sr0 != null;
this.sr = sr0;
this.s = new ArrayList<StringBuilder>();
this.sIndex = 0;
this.buffer = nextElement0();
}
public void pre(final boolean x) {
this.e.pre(x);
this.sr.pre(x);
}
private StringBuilder nextElement0() {
@ -114,8 +111,8 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
this.s.clear();
}
while (this.s.isEmpty()) {
if (!this.e.hasNext()) return null;
r = this.e.next();
if (!this.sr.hasNext()) return null;
r = this.sr.next();
if (r == null) return null;
r = trim(r);
sb = new StringBuilder(20);
@ -154,7 +151,7 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
}
public synchronized void close() {
this.e.close();
this.sr.close();
}
}
@ -183,7 +180,7 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
*/
public static SortedMap<byte[], Integer> hashSentence(final String sentence, final WordCache meaningLib, int maxlength) {
final SortedMap<byte[], Integer> map = new TreeMap<byte[], Integer>(Base64Order.enhancedCoder);
final WordTokenizer words = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(sentence)), meaningLib);
final WordTokenizer words = new WordTokenizer(new SentenceReader(sentence), meaningLib);
try {
int pos = 0;
StringBuilder word;

@ -36,12 +36,10 @@ import java.util.Locale;
import java.util.TreeMap;
import net.yacy.cora.date.ISO8601Formatter;
import net.yacy.cora.document.UTF8;
import net.yacy.document.Document;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
public class DCEntry extends TreeMap<String, String> {
private static final long serialVersionUID = -2050291583515701559L;
@ -277,7 +275,7 @@ public class DCEntry extends TreeMap<String, String> {
null,
"",
getLon(), getLat(),
UTF8.getBytes(getDescription()),
getDescription(),
null,
null,
null,

@ -81,7 +81,7 @@ public class AugmentParser extends AbstractParser implements Parser {
all = "yacylatest";
newDoc = new Document(url, mimeType, charset, null, null, null, "", "",
"", null, "", 0, 0, all.getBytes(), null, null, null, false);
"", null, "", 0, 0, all, null, null, null, false);
}
return newDoc;
@ -94,7 +94,7 @@ public class AugmentParser extends AbstractParser implements Parser {
String all = "";
Document newDoc = new Document(url, mimeType, charset, null, null, null, "", "",
"", null, "", 0, 0, all.getBytes(), null, null, null, false);
"", null, "", 0, 0, all, null, null, null, false);
Iterator<net.yacy.kelondro.blob.Tables.Row> it;

@ -60,28 +60,24 @@ public class csvParser extends AbstractParser implements Parser {
for (final String[] row: table) {
sb.append(concatRow(row)).append(' ');
}
try {
return new Document[]{new Document(
location,
mimeType,
charset,
this,
null,
null,
concatRow(table.get(0)),
"",
"",
null,
null,
0.0f, 0.0f,
sb.toString().getBytes(charset),
null,
null,
null,
false)};
} catch (UnsupportedEncodingException e) {
throw new Parser.Failure("error in csvParser, getBytes: " + e.getMessage(), location);
}
return new Document[]{new Document(
location,
mimeType,
charset,
this,
null,
null,
concatRow(table.get(0)),
"",
"",
null,
null,
0.0f, 0.0f,
sb.toString(),
null,
null,
null,
false)};
}
private String concatRow(String[] columns) {

@ -28,13 +28,10 @@
package net.yacy.document.parser;
import java.io.InputStream;
import net.yacy.cora.document.UTF8;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.kelondro.data.meta.DigestURI;
import org.apache.poi.hwpf.extractor.WordExtractor;
public class docParser extends AbstractParser implements Parser {
@ -99,7 +96,7 @@ public class docParser extends AbstractParser implements Parser {
null,
null,
0.0f, 0.0f,
UTF8.getBytes(contents.toString()),
contents.toString(),
null,
null,
null,

@ -65,9 +65,6 @@ public class genericParser extends AbstractParser implements Parser {
null,
null,
false)};
for (final Document d: docs) {
assert d.getText() != null : "mimeType = " + mimeType;
} // verify docs
return docs;
}
}

@ -626,12 +626,12 @@ public class ContentScraper extends AbstractScraper implements Scraper {
return false;
}
public byte[] getText() {
public String getText() {
try {
return this.content.getBytes();
return this.content.toString();
} catch (final OutOfMemoryError e) {
Log.logException(e);
return new byte[0];
return "";
}
}

@ -43,7 +43,6 @@ import java.util.Set;
import javax.imageio.ImageIO;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
@ -206,7 +205,7 @@ public class genericImageParser extends AbstractParser implements Parser {
new String[]{}, // sections
description == null ? "" : description, // description
0.0f, 0.0f, // TODO parse location
UTF8.getBytes(infoString), // content text
infoString, // content text
anchors, // anchors
null,
images,

@ -250,7 +250,7 @@ public class pdfParser extends AbstractParser implements Parser {
System.out.println("\tParsed text with " + document.getTextLength() + " chars of text and " + document.getAnchors().size() + " anchors");
try {
// write file
FileUtils.copy(document.getText(), new File("parsedPdf.txt"));
FileUtils.copy(document.getTextStream(), new File("parsedPdf.txt"));
} catch (final IOException e) {
System.err.println("error saving parsed document");
Log.logException(e);

@ -30,7 +30,6 @@ package net.yacy.document.parser;
import java.io.BufferedInputStream;
import java.io.InputStream;
import net.yacy.cora.document.UTF8;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
@ -95,7 +94,7 @@ public class pptParser extends AbstractParser implements Parser {
null,
null,
0.0f, 0.0f,
UTF8.getBytes(contents),
contents,
null,
null,
null,

@ -58,7 +58,7 @@ public class rdfParser extends AbstractParser implements Parser {
String all = "rdfdatasource";
doc = new Document(url, mimeType, charset, null, null, null, "", "",
"", null, "", 0, 0, all.getBytes(), null, null, null, false);
"", null, "", 0, 0, all, null, null, null, false);
docs.add(doc);

@ -142,7 +142,7 @@ public class RDFaParser extends AbstractParser implements Parser {
}
Document doc = new Document(url, mimeType, charset, null, null, null, "", "",
"", null, "", 0, 0, all.getBytes(), null, null, null, false);
"", null, "", 0, 0, all, null, null, null, false);
return doc;
}

@ -32,7 +32,6 @@ import java.io.InputStream;
import javax.swing.text.DefaultStyledDocument;
import javax.swing.text.rtf.RTFEditorKit;
import net.yacy.cora.document.UTF8;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
@ -80,7 +79,7 @@ public class rtfParser extends AbstractParser implements Parser {
null,
null,
0.0f, 0.0f,
UTF8.getBytes(bodyText),
bodyText,
null,
null,
null,

@ -34,7 +34,6 @@ import java.util.Map;
import java.util.Properties;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
@ -121,7 +120,7 @@ public class swfParser extends AbstractParser implements Parser {
sections, // an array of section headlines
abstrct, // an abstract
0.0f, 0.0f,
UTF8.getBytes(contents), // the parsed document text
contents, // the parsed document text
anchors, // a map of extracted anchors
null,
null,

@ -28,7 +28,6 @@ import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.List;
import java.util.Map;
@ -94,28 +93,24 @@ public class torrentParser extends AbstractParser implements Parser {
if (nameo != null) title = UTF8.String(nameo.getString());
}
if (title == null || title.length() == 0) title = MultiProtocolURI.unescape(location.getFileName());
try {
return new Document[]{new Document(
location,
mimeType,
charset,
this,
null,
null,
title, // title
comment, // author
location.getHost(),
null,
null,
0.0f, 0.0f,
filenames.toString().getBytes(charset),
null,
null,
null,
false)};
} catch (UnsupportedEncodingException e) {
throw new Parser.Failure("error in torrentParser, getBytes: " + e.getMessage(), location);
}
return new Document[]{new Document(
location,
mimeType,
charset,
this,
null,
null,
title, // title
comment, // author
location.getHost(),
null,
null,
0.0f, 0.0f,
filenames.toString(),
null,
null,
null,
false)};
}
public static void main(String[] args) {

@ -29,7 +29,6 @@ package net.yacy.document.parser;
import java.io.InputStream;
import net.yacy.cora.document.UTF8;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
@ -115,7 +114,7 @@ public class vsdParser extends AbstractParser implements Parser {
null, // an array of section headlines
abstrct, // an abstract
0.0f, 0.0f,
UTF8.getBytes(contents), // the parsed document text
contents, // the parsed document text
null, // a map of extracted anchors
null,
null, // a treeset of image URLs

@ -29,7 +29,6 @@ package net.yacy.document.parser;
import java.io.InputStream;
import net.yacy.cora.document.UTF8;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
@ -126,7 +125,7 @@ public class xlsParser extends AbstractParser implements Parser {
null,
null,
0.0f, 0.0f,
UTF8.getBytes(contents),
contents,
null,
null,
null,

@ -24,7 +24,6 @@
package net.yacy.search.snippet;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
@ -36,12 +35,12 @@ import java.util.SortedMap;
import java.util.regex.Pattern;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.cora.storage.ARC;
import net.yacy.cora.storage.ConcurrentARC;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.SentenceReader;
import net.yacy.document.SnippetExtractor;
import net.yacy.document.WordTokenizer;
import net.yacy.document.parser.html.CharacterCoding;
@ -183,7 +182,13 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
// try the solr text first
if (solrText != null) {
// compute sentences from solr query
sentences = Document.getSentences(pre, new ByteArrayInputStream(UTF8.getBytes(solrText)));
final SentenceReader sr = new SentenceReader(solrText);
sr.pre(pre);
sentences = new ArrayList<StringBuilder>();
while (sr.hasNext()) {
sentences.add(sr.next());
}
if (sentences != null) {
try {
final SnippetExtractor tsr = new SnippetExtractor(sentences, remainingHashes, snippetMaxLength);

Loading…
Cancel
Save