From 78fc3cf8f8684a42980504313abb877eea2a2162 Mon Sep 17 00:00:00 2001 From: orbiter Date: Wed, 4 Jul 2012 21:15:10 +0200 Subject: [PATCH] refactoring and new usage of SentenceReader: this class appeared as one of the major CPU users during snippet verification. The class was not efficient for two reasons: - it used an overly complex input stream, generated from sources and UTF8 byte-conversions. The BufferedReader added significant overhead. - to feed data into the SentenceReader, multiple toString/getBytes conversions were applied until a buffered Reader from an input stream was possible. These superfluous conversions have been removed. As a consequence: - the best source for the SentenceReader is a String. Therefore the production of Strings is now enforced inside the Document class. --- htroot/ViewFile.java | 6 +-- .../de/anomic/data/ymark/YMarkAutoTagger.java | 7 ++- source/net/yacy/document/Condenser.java | 52 ++++++++----------- source/net/yacy/document/Document.java | 45 +++++++--------- source/net/yacy/document/SentenceReader.java | 26 ++++------ source/net/yacy/document/TextParser.java | 6 +-- source/net/yacy/document/WordTokenizer.java | 27 +++++----- source/net/yacy/document/content/DCEntry.java | 4 +- .../parser/augment/AugmentParser.java | 4 +- .../net/yacy/document/parser/csvParser.java | 40 +++++++------- .../net/yacy/document/parser/docParser.java | 5 +- .../yacy/document/parser/genericParser.java | 3 -- .../document/parser/html/ContentScraper.java | 6 +-- .../parser/images/genericImageParser.java | 3 +- .../net/yacy/document/parser/pdfParser.java | 2 +- .../net/yacy/document/parser/pptParser.java | 3 +- .../net/yacy/document/parser/rdfParser.java | 2 +- .../document/parser/rdfa/impl/RDFaParser.java | 2 +- .../net/yacy/document/parser/rtfParser.java | 3 +- .../net/yacy/document/parser/swfParser.java | 3 +- .../yacy/document/parser/torrentParser.java | 41 +++++++-------- .../net/yacy/document/parser/vsdParser.java | 3 +- .../net/yacy/document/parser/xlsParser.java | 3 +-
.../net/yacy/search/snippet/TextSnippet.java | 11 ++-- 24 files changed, 130 insertions(+), 177 deletions(-) diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java index 1139c217e..f4a192fe2 100644 --- a/htroot/ViewFile.java +++ b/htroot/ViewFile.java @@ -27,7 +27,6 @@ //javac -classpath .:../Classes Status.java //if the shell's current path is HTROOT -import java.io.ByteArrayInputStream; import java.io.IOException; import java.net.MalformedURLException; import java.util.Collection; @@ -47,6 +46,7 @@ import net.yacy.document.Condenser; import net.yacy.document.Document; import net.yacy.document.LibraryProvider; import net.yacy.document.Parser; +import net.yacy.document.SentenceReader; import net.yacy.document.WordTokenizer; import net.yacy.document.parser.html.CharacterCoding; import net.yacy.document.parser.html.ImageEntry; @@ -232,7 +232,7 @@ public class ViewFile { } if (viewMode.equals("parsed")) { - final String content = UTF8.String(document.getTextBytes()); + final String content = document.getTextString(); // content = wikiCode.replaceHTML(content); // added by Marc Nause prop.put("viewMode", VIEW_MODE_AS_PARSED_TEXT); prop.put("viewMode_title", document.dc_title()); @@ -284,7 +284,7 @@ public class ViewFile { for (final StringBuilder s: sentences) { sentence = s.toString(); Enumeration tokens = null; - tokens = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(sentence)), LibraryProvider.dymLib); + tokens = new WordTokenizer(new SentenceReader(sentence), LibraryProvider.dymLib); while (tokens.hasMoreElements()) { token = tokens.nextElement(); if (token.length() > 0) { diff --git a/source/de/anomic/data/ymark/YMarkAutoTagger.java b/source/de/anomic/data/ymark/YMarkAutoTagger.java index 82f5deaa5..81b439787 100644 --- a/source/de/anomic/data/ymark/YMarkAutoTagger.java +++ b/source/de/anomic/data/ymark/YMarkAutoTagger.java @@ -1,6 +1,5 @@ package de.anomic.data.ymark; -import java.io.ByteArrayInputStream; import java.io.IOException; import 
java.net.MalformedURLException; import java.util.Arrays; @@ -11,12 +10,12 @@ import java.util.TreeMap; import java.util.TreeSet; import java.util.concurrent.ArrayBlockingQueue; -import net.yacy.cora.document.UTF8; import net.yacy.cora.services.federated.yacy.CacheStrategy; import net.yacy.document.Condenser; import net.yacy.document.Document; import net.yacy.document.LibraryProvider; import net.yacy.document.Parser.Failure; +import net.yacy.document.SentenceReader; import net.yacy.document.WordTokenizer; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.word.Word; @@ -100,7 +99,7 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle buffer.append(document.dc_title().toLowerCase()); buffer.append(document.dc_description().toLowerCase()); buffer.append(document.dc_subject(' ').toLowerCase()); - final WordTokenizer tokens = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(buffer.toString())), LibraryProvider.dymLib); + final WordTokenizer tokens = new WordTokenizer(new SentenceReader(buffer.toString()), LibraryProvider.dymLib); try { int score = 0; @@ -177,7 +176,7 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle private static TreeMap getPhrases(final Document document, final int size) { final TreeMap phrases = new TreeMap(); final StringBuilder phrase = new StringBuilder(128); - final WordTokenizer tokens = new WordTokenizer(document.getText(), LibraryProvider.dymLib); + final WordTokenizer tokens = new WordTokenizer(new SentenceReader(document.getTextString()), LibraryProvider.dymLib); try { StringBuilder token; int count = 0; diff --git a/source/net/yacy/document/Condenser.java b/source/net/yacy/document/Condenser.java index 03390220f..5188d483d 100644 --- a/source/net/yacy/document/Condenser.java +++ b/source/net/yacy/document/Condenser.java @@ -20,12 +20,10 @@ package net.yacy.document; -import java.io.ByteArrayInputStream; import java.io.File; import 
java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; -import java.io.InputStream; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; @@ -39,7 +37,6 @@ import java.util.TreeMap; import net.yacy.cora.document.ASCII; import net.yacy.cora.document.Classification.ContentDomain; import net.yacy.cora.document.MultiProtocolURI; -import net.yacy.cora.document.UTF8; import net.yacy.cora.lod.vocabulary.Tagging; import net.yacy.document.language.Identificator; import net.yacy.document.parser.html.ImageEntry; @@ -133,7 +130,7 @@ public final class Condenser { Map.Entry entry; if (indexText) { - createCondensement(document.getText(), meaningLib, doAutotagging); + createCondensement(document.getTextString(), meaningLib, doAutotagging); // the phrase counter: // phrase 0 are words taken from the URL // phrase 1 is the MainTitle @@ -146,16 +143,15 @@ public final class Condenser { // phrase 98 is taken from the embedded anchor/hyperlinks description (REMOVED!) 
// phrase 99 is taken from the media Link url and anchor description // phrase 100 and above are lines from the text - - insertTextToWords(document.dc_title(), 1, WordReferenceRow.flag_app_dc_title, this.RESULT_FLAGS, true, meaningLib); - insertTextToWords(document.dc_description(), 3, WordReferenceRow.flag_app_dc_description, this.RESULT_FLAGS, true, meaningLib); - insertTextToWords(document.dc_creator(), 4, WordReferenceRow.flag_app_dc_creator, this.RESULT_FLAGS, true, meaningLib); - insertTextToWords(document.dc_publisher(), 5, WordReferenceRow.flag_app_dc_creator, this.RESULT_FLAGS, true, meaningLib); - insertTextToWords(document.dc_subject(' '), 6, WordReferenceRow.flag_app_dc_description, this.RESULT_FLAGS, true, meaningLib); + insertTextToWords(new SentenceReader(document.dc_title()), 1, WordReferenceRow.flag_app_dc_title, this.RESULT_FLAGS, true, meaningLib); + insertTextToWords(new SentenceReader(document.dc_description()), 3, WordReferenceRow.flag_app_dc_description, this.RESULT_FLAGS, true, meaningLib); + insertTextToWords(new SentenceReader(document.dc_creator()), 4, WordReferenceRow.flag_app_dc_creator, this.RESULT_FLAGS, true, meaningLib); + insertTextToWords(new SentenceReader(document.dc_publisher()), 5, WordReferenceRow.flag_app_dc_creator, this.RESULT_FLAGS, true, meaningLib); + insertTextToWords(new SentenceReader(document.dc_subject(' ')), 6, WordReferenceRow.flag_app_dc_description, this.RESULT_FLAGS, true, meaningLib); // missing: tags! 
final String[] titles = document.getSectionTitles(); for (int i = 0; i < titles.length; i++) { - insertTextToWords(titles[i], i + 10, WordReferenceRow.flag_app_emphasized, this.RESULT_FLAGS, true, meaningLib); + insertTextToWords(new SentenceReader(titles[i]), i + 10, WordReferenceRow.flag_app_emphasized, this.RESULT_FLAGS, true, meaningLib); } // anchors: for text indexing we add only the anchor description @@ -180,7 +176,7 @@ public final class Condenser { } // add the URL components to the word list - insertTextToWords(document.dc_source().toNormalform(false, true), 0, WordReferenceRow.flag_app_dc_identifier, this.RESULT_FLAGS, false, meaningLib); + insertTextToWords(new SentenceReader(document.dc_source().toNormalform(false, true)), 0, WordReferenceRow.flag_app_dc_identifier, this.RESULT_FLAGS, false, meaningLib); if (indexMedia) { // add anchor descriptions: here, we also add the url components @@ -188,24 +184,24 @@ public final class Condenser { Iterator> i = document.getAudiolinks().entrySet().iterator(); while (i.hasNext()) { entry = i.next(); - insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasaudio, this.RESULT_FLAGS, false, meaningLib); - insertTextToWords(entry.getValue(), 99, flag_cat_hasaudio, this.RESULT_FLAGS, true, meaningLib); + insertTextToWords(new SentenceReader(entry.getKey().toNormalform(false, false)), 99, flag_cat_hasaudio, this.RESULT_FLAGS, false, meaningLib); + insertTextToWords(new SentenceReader(entry.getValue()), 99, flag_cat_hasaudio, this.RESULT_FLAGS, true, meaningLib); } // video i = document.getVideolinks().entrySet().iterator(); while (i.hasNext()) { entry = i.next(); - insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasvideo, this.RESULT_FLAGS, false, meaningLib); - insertTextToWords(entry.getValue(), 99, flag_cat_hasvideo, this.RESULT_FLAGS, true, meaningLib); + insertTextToWords(new SentenceReader(entry.getKey().toNormalform(false, false)), 99, flag_cat_hasvideo, 
this.RESULT_FLAGS, false, meaningLib); + insertTextToWords(new SentenceReader(entry.getValue()), 99, flag_cat_hasvideo, this.RESULT_FLAGS, true, meaningLib); } // applications i = document.getApplinks().entrySet().iterator(); while (i.hasNext()) { entry = i.next(); - insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasapp, this.RESULT_FLAGS, false, meaningLib); - insertTextToWords(entry.getValue(), 99, flag_cat_hasapp, this.RESULT_FLAGS, true, meaningLib); + insertTextToWords(new SentenceReader(entry.getKey().toNormalform(false, false)), 99, flag_cat_hasapp, this.RESULT_FLAGS, false, meaningLib); + insertTextToWords(new SentenceReader(entry.getValue()), 99, flag_cat_hasapp, this.RESULT_FLAGS, true, meaningLib); } // images @@ -216,8 +212,8 @@ public final class Condenser { ientry = j.next(); url = ientry.url(); if (url == null) continue; - insertTextToWords(url.toNormalform(false, false), 99, flag_cat_hasimage, this.RESULT_FLAGS, false, meaningLib); - insertTextToWords(ientry.alt(), 99, flag_cat_hasimage, this.RESULT_FLAGS, true, meaningLib); + insertTextToWords(new SentenceReader(url.toNormalform(false, false)), 99, flag_cat_hasimage, this.RESULT_FLAGS, false, meaningLib); + insertTextToWords(new SentenceReader(ientry.alt()), 99, flag_cat_hasimage, this.RESULT_FLAGS, true, meaningLib); } // finally check all words for missing flag entry @@ -241,7 +237,7 @@ public final class Condenser { } private void insertTextToWords( - final String text, + final SentenceReader text, final int phrase, final int flagpos, final Bitfield flagstemplate, @@ -250,7 +246,7 @@ public final class Condenser { if (text == null) return; String word; Word wprop; - WordTokenizer wordenum = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(text)), meaningLib); + WordTokenizer wordenum = new WordTokenizer(text, meaningLib); try { int pip = 0; while (wordenum.hasMoreElements()) { @@ -271,7 +267,7 @@ public final class Condenser { } } - public Condenser(final 
InputStream text, final WordCache meaningLib, boolean doAutotagging) { + public Condenser(final String text, final WordCache meaningLib, boolean doAutotagging) { this.languageIdentificator = null; // we don't need that here // analysis = new Properties(); this.words = new TreeMap(); @@ -295,8 +291,8 @@ public final class Condenser { return this.languageIdentificator.getLanguage(); } - private void createCondensement(final InputStream is, final WordCache meaningLib, boolean doAutotagging) { - assert is != null; + private void createCondensement(final String text, final WordCache meaningLib, boolean doAutotagging) { + assert text != null; final Set currsentwords = new HashSet(); String word = ""; String[] wordcache = new String[LibraryProvider.autotagging.getMaxWordsInTerm() - 1]; @@ -317,7 +313,7 @@ public final class Condenser { if (LibraryProvider.autotagging.size() == 0) doAutotagging = false; // read source - final WordTokenizer wordenum = new WordTokenizer(is, meaningLib); + final WordTokenizer wordenum = new WordTokenizer(new SentenceReader(text), meaningLib); try { while (wordenum.hasMoreElements()) { word = wordenum.nextElement().toString().toLowerCase(Locale.ENGLISH); @@ -430,9 +426,7 @@ public final class Condenser { public static Map getWords(final String text, final WordCache meaningLib) { // returns a word/indexWord relation map if (text == null) return null; - ByteArrayInputStream buffer; - buffer = new ByteArrayInputStream(UTF8.getBytes(text)); - return new Condenser(buffer, meaningLib, false).words(); + return new Condenser(text, meaningLib, false).words(); } public static void main(final String[] args) { diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java index e10f8b44c..d6146c69f 100644 --- a/source/net/yacy/document/Document.java +++ b/source/net/yacy/document/Document.java @@ -133,7 +133,7 @@ public class Document { this.outboundlinks = null; this.languages = languages; this.indexingDenied = 
indexingDenied; - this.text = text == null ? new ByteArrayOutputStream() : text; + this.text = text == null ? "" : text; } public Object getParserObject() { @@ -299,7 +299,7 @@ dc_rights return this.sections.toArray(new String[this.sections.size()]); } - public InputStream getText() { + public InputStream getTextStream() { try { if (this.text == null) return new ByteArrayInputStream(UTF8.getBytes("")); if (this.text instanceof String) { @@ -322,26 +322,26 @@ dc_rights return new ByteArrayInputStream(UTF8.getBytes("")); } - public byte[] getTextBytes() { + public String getTextString() { try { - if (this.text == null) return new byte[0]; + if (this.text == null) return ""; if (this.text instanceof String) { - return UTF8.getBytes((String) this.text); + return (String) this.text; } else if (this.text instanceof InputStream) { - return FileUtils.read((InputStream) this.text); + return UTF8.String(FileUtils.read((InputStream) this.text)); } else if (this.text instanceof File) { - return FileUtils.read((File) this.text); + return UTF8.String(FileUtils.read((File) this.text)); } else if (this.text instanceof byte[]) { - return (byte[]) this.text; + return UTF8.String((byte[]) this.text); } else if (this.text instanceof ByteArrayOutputStream) { - return ((ByteArrayOutputStream) this.text).toByteArray(); + return UTF8.String(((ByteArrayOutputStream) this.text).toByteArray()); } assert false : this.text.getClass().toString(); return null; } catch (final Exception e) { Log.logException(e); } - return new byte[0]; + return ""; } public long getTextLength() { @@ -367,16 +367,11 @@ dc_rights } public List getSentences(final boolean pre) { - return getSentences(pre, getText()); - } - - public static List getSentences(final boolean pre, final InputStream text) { - if (text == null) return null; - final SentenceReader e = new SentenceReader(text); - e.pre(pre); - final List sentences = new ArrayList(); - while (e.hasNext()) { - sentences.add(e.next()); + final SentenceReader sr = 
new SentenceReader(getTextString()); + sr.pre(pre); + List sentences = new ArrayList(); + while (sr.hasNext()) { + sentences.add(sr.next()); } return sentences; } @@ -638,7 +633,7 @@ dc_rights if (!(this.text instanceof ByteArrayOutputStream)) { this.text = new ByteArrayOutputStream(); } - FileUtils.copy(doc.getText(), (ByteArrayOutputStream) this.text); + FileUtils.copy(doc.getTextStream(), (ByteArrayOutputStream) this.text); this.anchors.putAll(doc.getAnchors()); this.rss.putAll(doc.getRSS()); @@ -707,11 +702,7 @@ dc_rights if (subject != null && subject.length() > 0) os.write("\n"); if (this.text != null) { os.write(" 0) os.write(UTF8.String(buffer, 0, c)); - is.close(); + os.write(getTextString()); os.write("]]>\n"); } final String language = dc_language(); @@ -811,7 +802,7 @@ dc_rights if (doc.getTextLength() > 0) { if (docTextLength > 0) content.write('\n'); try { - docTextLength += FileUtils.copy(doc.getText(), content); + docTextLength += FileUtils.copy(doc.getTextStream(), content); } catch (final IOException e) { Log.logException(e); } diff --git a/source/net/yacy/document/SentenceReader.java b/source/net/yacy/document/SentenceReader.java index 9878a9fb7..97ed8b166 100644 --- a/source/net/yacy/document/SentenceReader.java +++ b/source/net/yacy/document/SentenceReader.java @@ -24,12 +24,9 @@ package net.yacy.document; -import java.io.BufferedReader; import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; import java.io.Reader; -import java.io.UnsupportedEncodingException; +import java.io.StringReader; import java.util.Iterator; public class SentenceReader implements Iterator { @@ -37,17 +34,13 @@ public class SentenceReader implements Iterator { // this enumerates StringBuilder objects private StringBuilder buffer; - private BufferedReader raf; + private Reader raf; private int counter = 0; private boolean pre = false; - - public SentenceReader(final InputStream is) { - assert is != null; - try { - raf = new 
BufferedReader(new InputStreamReader(is, "UTF-8")); - } catch (UnsupportedEncodingException e) { - e.printStackTrace(); - } + + public SentenceReader(final String text) { + assert text != null; + raf = new StringReader(text); buffer = nextElement0(); counter = 0; pre = false; @@ -144,9 +137,8 @@ public class SentenceReader implements Iterator { public synchronized void close() { try { - raf.close(); - } catch(IOException ioe) { - // Ignore IO Exceptions - } + raf.close(); + } catch (IOException e) { + } } } diff --git a/source/net/yacy/document/TextParser.java b/source/net/yacy/document/TextParser.java index 0d7356b7e..366cfa4c9 100644 --- a/source/net/yacy/document/TextParser.java +++ b/source/net/yacy/document/TextParser.java @@ -178,8 +178,7 @@ public final class TextParser { } finally { if (sourceStream != null) try { sourceStream.close(); } catch (final Exception ex) {} } - for (final Document d: docs) { assert d.getText() != null; } // verify docs - + return docs; } @@ -261,7 +260,6 @@ public final class TextParser { if (log.isFine()) log.logInfo("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + fileExt + "'."); try { final Document[] docs = parser.parse(location, mimeType, documentCharset, sourceStream); - for (final Document d: docs) { assert d != null && d.getText() != null; } // verify docs return docs; } catch (final Exception e) { throw new Parser.Failure("parser failed: " + parser.getName(), location); @@ -324,7 +322,7 @@ public final class TextParser { throw new Parser.Failure("All parser failed: " + failedParsers, location); } } - for (final Document d: docs) { assert d.getText() != null : "mimeType = " + mimeType; } // verify docs + for (final Document d: docs) { assert d.getTextStream() != null : "mimeType = " + mimeType; } // verify docs return docs; } diff --git a/source/net/yacy/document/WordTokenizer.java b/source/net/yacy/document/WordTokenizer.java index 4f5db3833..1644a0046 100644 --- 
a/source/net/yacy/document/WordTokenizer.java +++ b/source/net/yacy/document/WordTokenizer.java @@ -24,15 +24,12 @@ package net.yacy.document; -import java.io.ByteArrayInputStream; -import java.io.InputStream; import java.util.ArrayList; import java.util.Enumeration; import java.util.List; import java.util.SortedMap; import java.util.TreeMap; -import net.yacy.cora.document.UTF8; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.order.Base64Order; @@ -44,9 +41,9 @@ public class WordTokenizer implements Enumeration { private final unsievedWordsEnum e; private final WordCache meaningLib; - public WordTokenizer(final InputStream is, final WordCache meaningLib) { - assert is != null; - this.e = new unsievedWordsEnum(is); + public WordTokenizer(final SentenceReader sr, final WordCache meaningLib) { + assert sr != null; + this.e = new unsievedWordsEnum(sr); this.buffer = nextElement0(); this.meaningLib = meaningLib; } @@ -89,20 +86,20 @@ public class WordTokenizer implements Enumeration { private static class unsievedWordsEnum implements Enumeration { // returns an enumeration of StringBuilder Objects private StringBuilder buffer = null; - private final SentenceReader e; + private final SentenceReader sr; private final List s; private int sIndex; - public unsievedWordsEnum(final InputStream is) { - assert is != null; - this.e = new SentenceReader(is); + public unsievedWordsEnum(final SentenceReader sr0) { + assert sr0 != null; + this.sr = sr0; this.s = new ArrayList(); this.sIndex = 0; this.buffer = nextElement0(); } public void pre(final boolean x) { - this.e.pre(x); + this.sr.pre(x); } private StringBuilder nextElement0() { @@ -114,8 +111,8 @@ public class WordTokenizer implements Enumeration { this.s.clear(); } while (this.s.isEmpty()) { - if (!this.e.hasNext()) return null; - r = this.e.next(); + if (!this.sr.hasNext()) return null; + r = this.sr.next(); if (r == null) return null; r = trim(r); sb = new StringBuilder(20); @@ -154,7 +151,7 @@ public 
class WordTokenizer implements Enumeration { } public synchronized void close() { - this.e.close(); + this.sr.close(); } } @@ -183,7 +180,7 @@ public class WordTokenizer implements Enumeration { */ public static SortedMap hashSentence(final String sentence, final WordCache meaningLib, int maxlength) { final SortedMap map = new TreeMap(Base64Order.enhancedCoder); - final WordTokenizer words = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(sentence)), meaningLib); + final WordTokenizer words = new WordTokenizer(new SentenceReader(sentence), meaningLib); try { int pos = 0; StringBuilder word; diff --git a/source/net/yacy/document/content/DCEntry.java b/source/net/yacy/document/content/DCEntry.java index 0149047ad..34816611d 100644 --- a/source/net/yacy/document/content/DCEntry.java +++ b/source/net/yacy/document/content/DCEntry.java @@ -36,12 +36,10 @@ import java.util.Locale; import java.util.TreeMap; import net.yacy.cora.date.ISO8601Formatter; -import net.yacy.cora.document.UTF8; import net.yacy.document.Document; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; - public class DCEntry extends TreeMap { private static final long serialVersionUID = -2050291583515701559L; @@ -277,7 +275,7 @@ public class DCEntry extends TreeMap { null, "", getLon(), getLat(), - UTF8.getBytes(getDescription()), + getDescription(), null, null, null, diff --git a/source/net/yacy/document/parser/augment/AugmentParser.java b/source/net/yacy/document/parser/augment/AugmentParser.java index 1eb8a5361..a457d7d76 100644 --- a/source/net/yacy/document/parser/augment/AugmentParser.java +++ b/source/net/yacy/document/parser/augment/AugmentParser.java @@ -81,7 +81,7 @@ public class AugmentParser extends AbstractParser implements Parser { all = "yacylatest"; newDoc = new Document(url, mimeType, charset, null, null, null, "", "", - "", null, "", 0, 0, all.getBytes(), null, null, null, false); + "", null, "", 0, 0, all, null, null, null, false); } return 
newDoc; @@ -94,7 +94,7 @@ public class AugmentParser extends AbstractParser implements Parser { String all = ""; Document newDoc = new Document(url, mimeType, charset, null, null, null, "", "", - "", null, "", 0, 0, all.getBytes(), null, null, null, false); + "", null, "", 0, 0, all, null, null, null, false); Iterator it; diff --git a/source/net/yacy/document/parser/csvParser.java b/source/net/yacy/document/parser/csvParser.java index ee92b45f2..4c6d22f8a 100644 --- a/source/net/yacy/document/parser/csvParser.java +++ b/source/net/yacy/document/parser/csvParser.java @@ -60,28 +60,24 @@ public class csvParser extends AbstractParser implements Parser { for (final String[] row: table) { sb.append(concatRow(row)).append(' '); } - try { - return new Document[]{new Document( - location, - mimeType, - charset, - this, - null, - null, - concatRow(table.get(0)), - "", - "", - null, - null, - 0.0f, 0.0f, - sb.toString().getBytes(charset), - null, - null, - null, - false)}; - } catch (UnsupportedEncodingException e) { - throw new Parser.Failure("error in csvParser, getBytes: " + e.getMessage(), location); - } + return new Document[]{new Document( + location, + mimeType, + charset, + this, + null, + null, + concatRow(table.get(0)), + "", + "", + null, + null, + 0.0f, 0.0f, + sb.toString(), + null, + null, + null, + false)}; } private String concatRow(String[] columns) { diff --git a/source/net/yacy/document/parser/docParser.java b/source/net/yacy/document/parser/docParser.java index 4ecddf499..f33940d5c 100644 --- a/source/net/yacy/document/parser/docParser.java +++ b/source/net/yacy/document/parser/docParser.java @@ -28,13 +28,10 @@ package net.yacy.document.parser; import java.io.InputStream; - -import net.yacy.cora.document.UTF8; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; import net.yacy.kelondro.data.meta.DigestURI; - import org.apache.poi.hwpf.extractor.WordExtractor; public class docParser extends 
AbstractParser implements Parser { @@ -99,7 +96,7 @@ public class docParser extends AbstractParser implements Parser { null, null, 0.0f, 0.0f, - UTF8.getBytes(contents.toString()), + contents.toString(), null, null, null, diff --git a/source/net/yacy/document/parser/genericParser.java b/source/net/yacy/document/parser/genericParser.java index a0971bd22..93a8714cc 100644 --- a/source/net/yacy/document/parser/genericParser.java +++ b/source/net/yacy/document/parser/genericParser.java @@ -65,9 +65,6 @@ public class genericParser extends AbstractParser implements Parser { null, null, false)}; - for (final Document d: docs) { - assert d.getText() != null : "mimeType = " + mimeType; - } // verify docs return docs; } } diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index 8c3ec03c0..f7e944f6e 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -626,12 +626,12 @@ public class ContentScraper extends AbstractScraper implements Scraper { return false; } - public byte[] getText() { + public String getText() { try { - return this.content.getBytes(); + return this.content.toString(); } catch (final OutOfMemoryError e) { Log.logException(e); - return new byte[0]; + return ""; } } diff --git a/source/net/yacy/document/parser/images/genericImageParser.java b/source/net/yacy/document/parser/images/genericImageParser.java index 6083d280b..ee9e8026b 100644 --- a/source/net/yacy/document/parser/images/genericImageParser.java +++ b/source/net/yacy/document/parser/images/genericImageParser.java @@ -43,7 +43,6 @@ import java.util.Set; import javax.imageio.ImageIO; import net.yacy.cora.document.MultiProtocolURI; -import net.yacy.cora.document.UTF8; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; @@ -206,7 +205,7 @@ public class genericImageParser extends 
AbstractParser implements Parser { new String[]{}, // sections description == null ? "" : description, // description 0.0f, 0.0f, // TODO parse location - UTF8.getBytes(infoString), // content text + infoString, // content text anchors, // anchors null, images, diff --git a/source/net/yacy/document/parser/pdfParser.java b/source/net/yacy/document/parser/pdfParser.java index e2b6472dc..827567980 100644 --- a/source/net/yacy/document/parser/pdfParser.java +++ b/source/net/yacy/document/parser/pdfParser.java @@ -250,7 +250,7 @@ public class pdfParser extends AbstractParser implements Parser { System.out.println("\tParsed text with " + document.getTextLength() + " chars of text and " + document.getAnchors().size() + " anchors"); try { // write file - FileUtils.copy(document.getText(), new File("parsedPdf.txt")); + FileUtils.copy(document.getTextStream(), new File("parsedPdf.txt")); } catch (final IOException e) { System.err.println("error saving parsed document"); Log.logException(e); diff --git a/source/net/yacy/document/parser/pptParser.java b/source/net/yacy/document/parser/pptParser.java index 33d66cc07..64cd39090 100644 --- a/source/net/yacy/document/parser/pptParser.java +++ b/source/net/yacy/document/parser/pptParser.java @@ -30,7 +30,6 @@ package net.yacy.document.parser; import java.io.BufferedInputStream; import java.io.InputStream; -import net.yacy.cora.document.UTF8; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; @@ -95,7 +94,7 @@ public class pptParser extends AbstractParser implements Parser { null, null, 0.0f, 0.0f, - UTF8.getBytes(contents), + contents, null, null, null, diff --git a/source/net/yacy/document/parser/rdfParser.java b/source/net/yacy/document/parser/rdfParser.java index f2b735e56..1bb48bcd0 100644 --- a/source/net/yacy/document/parser/rdfParser.java +++ b/source/net/yacy/document/parser/rdfParser.java @@ -58,7 +58,7 @@ public class rdfParser extends AbstractParser implements 
Parser { String all = "rdfdatasource"; doc = new Document(url, mimeType, charset, null, null, null, "", "", - "", null, "", 0, 0, all.getBytes(), null, null, null, false); + "", null, "", 0, 0, all, null, null, null, false); docs.add(doc); diff --git a/source/net/yacy/document/parser/rdfa/impl/RDFaParser.java b/source/net/yacy/document/parser/rdfa/impl/RDFaParser.java index 0b3b9c09c..ee58f68d6 100644 --- a/source/net/yacy/document/parser/rdfa/impl/RDFaParser.java +++ b/source/net/yacy/document/parser/rdfa/impl/RDFaParser.java @@ -142,7 +142,7 @@ public class RDFaParser extends AbstractParser implements Parser { } Document doc = new Document(url, mimeType, charset, null, null, null, "", "", - "", null, "", 0, 0, all.getBytes(), null, null, null, false); + "", null, "", 0, 0, all, null, null, null, false); return doc; } diff --git a/source/net/yacy/document/parser/rtfParser.java b/source/net/yacy/document/parser/rtfParser.java index ac7ca6d4b..7b0994b99 100644 --- a/source/net/yacy/document/parser/rtfParser.java +++ b/source/net/yacy/document/parser/rtfParser.java @@ -32,7 +32,6 @@ import java.io.InputStream; import javax.swing.text.DefaultStyledDocument; import javax.swing.text.rtf.RTFEditorKit; -import net.yacy.cora.document.UTF8; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; @@ -80,7 +79,7 @@ public class rtfParser extends AbstractParser implements Parser { null, null, 0.0f, 0.0f, - UTF8.getBytes(bodyText), + bodyText, null, null, null, diff --git a/source/net/yacy/document/parser/swfParser.java b/source/net/yacy/document/parser/swfParser.java index 1e94558d5..70fa053da 100644 --- a/source/net/yacy/document/parser/swfParser.java +++ b/source/net/yacy/document/parser/swfParser.java @@ -34,7 +34,6 @@ import java.util.Map; import java.util.Properties; import net.yacy.cora.document.MultiProtocolURI; -import net.yacy.cora.document.UTF8; import net.yacy.document.AbstractParser; import 
net.yacy.document.Document; import net.yacy.document.Parser; @@ -121,7 +120,7 @@ public class swfParser extends AbstractParser implements Parser { sections, // an array of section headlines abstrct, // an abstract 0.0f, 0.0f, - UTF8.getBytes(contents), // the parsed document text + contents, // the parsed document text anchors, // a map of extracted anchors null, null, diff --git a/source/net/yacy/document/parser/torrentParser.java b/source/net/yacy/document/parser/torrentParser.java index 0a9c35634..51c5556ae 100644 --- a/source/net/yacy/document/parser/torrentParser.java +++ b/source/net/yacy/document/parser/torrentParser.java @@ -28,7 +28,6 @@ import java.io.ByteArrayInputStream; import java.io.File; import java.io.IOException; import java.io.InputStream; -import java.io.UnsupportedEncodingException; import java.util.List; import java.util.Map; @@ -94,28 +93,24 @@ public class torrentParser extends AbstractParser implements Parser { if (nameo != null) title = UTF8.String(nameo.getString()); } if (title == null || title.length() == 0) title = MultiProtocolURI.unescape(location.getFileName()); - try { - return new Document[]{new Document( - location, - mimeType, - charset, - this, - null, - null, - title, // title - comment, // author - location.getHost(), - null, - null, - 0.0f, 0.0f, - filenames.toString().getBytes(charset), - null, - null, - null, - false)}; - } catch (UnsupportedEncodingException e) { - throw new Parser.Failure("error in torrentParser, getBytes: " + e.getMessage(), location); - } + return new Document[]{new Document( + location, + mimeType, + charset, + this, + null, + null, + title, // title + comment, // author + location.getHost(), + null, + null, + 0.0f, 0.0f, + filenames.toString(), + null, + null, + null, + false)}; } public static void main(String[] args) { diff --git a/source/net/yacy/document/parser/vsdParser.java b/source/net/yacy/document/parser/vsdParser.java index 50f444f72..f18aeb763 100644 --- 
a/source/net/yacy/document/parser/vsdParser.java +++ b/source/net/yacy/document/parser/vsdParser.java @@ -29,7 +29,6 @@ package net.yacy.document.parser; import java.io.InputStream; -import net.yacy.cora.document.UTF8; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; @@ -115,7 +114,7 @@ public class vsdParser extends AbstractParser implements Parser { null, // an array of section headlines abstrct, // an abstract 0.0f, 0.0f, - UTF8.getBytes(contents), // the parsed document text + contents, // the parsed document text null, // a map of extracted anchors null, null, // a treeset of image URLs diff --git a/source/net/yacy/document/parser/xlsParser.java b/source/net/yacy/document/parser/xlsParser.java index a1f711b88..2cfe8eb52 100644 --- a/source/net/yacy/document/parser/xlsParser.java +++ b/source/net/yacy/document/parser/xlsParser.java @@ -29,7 +29,6 @@ package net.yacy.document.parser; import java.io.InputStream; -import net.yacy.cora.document.UTF8; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; @@ -126,7 +125,7 @@ public class xlsParser extends AbstractParser implements Parser { null, null, 0.0f, 0.0f, - UTF8.getBytes(contents), + contents, null, null, null, diff --git a/source/net/yacy/search/snippet/TextSnippet.java b/source/net/yacy/search/snippet/TextSnippet.java index 3fc977ffc..9e5190bd8 100644 --- a/source/net/yacy/search/snippet/TextSnippet.java +++ b/source/net/yacy/search/snippet/TextSnippet.java @@ -24,7 +24,6 @@ package net.yacy.search.snippet; -import java.io.ByteArrayInputStream; import java.io.IOException; import java.util.ArrayList; import java.util.Collection; @@ -36,12 +35,12 @@ import java.util.SortedMap; import java.util.regex.Pattern; import net.yacy.cora.document.ASCII; -import net.yacy.cora.document.UTF8; import net.yacy.cora.services.federated.yacy.CacheStrategy; import net.yacy.cora.storage.ARC; import 
net.yacy.cora.storage.ConcurrentARC; import net.yacy.document.Document; import net.yacy.document.Parser; +import net.yacy.document.SentenceReader; import net.yacy.document.SnippetExtractor; import net.yacy.document.WordTokenizer; import net.yacy.document.parser.html.CharacterCoding; @@ -183,7 +182,13 @@ public class TextSnippet implements Comparable, Comparator(); + while (sr.hasNext()) { + sentences.add(sr.next()); + } + if (sentences != null) { try { final SnippetExtractor tsr = new SnippetExtractor(sentences, remainingHashes, snippetMaxLength);