refactoring and new usage of SentenceReader: this class appeared as one

of the major CPU users during snippet verification. The class was not
efficient for two reasons:
- it used an overly complex input stream, generated from sources and UTF-8
byte conversions. The BufferedReader added significant overhead.
- to feed data into the SentenceReader, multiple toString/getBytes calls were
applied until a buffered Reader could be built from an input stream.
These superfluous conversions have been removed.
- the best source for the SentenceReader is a String. Therefore the
production of Strings is now enforced inside the Document class.
pull/1/head
orbiter 13 years ago
parent bb8dcb4911
commit 78fc3cf8f8

@ -27,7 +27,6 @@
//javac -classpath .:../Classes Status.java //javac -classpath .:../Classes Status.java
//if the shell's current path is HTROOT //if the shell's current path is HTROOT
import java.io.ByteArrayInputStream;
import java.io.IOException; import java.io.IOException;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import java.util.Collection; import java.util.Collection;
@ -47,6 +46,7 @@ import net.yacy.document.Condenser;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.LibraryProvider; import net.yacy.document.LibraryProvider;
import net.yacy.document.Parser; import net.yacy.document.Parser;
import net.yacy.document.SentenceReader;
import net.yacy.document.WordTokenizer; import net.yacy.document.WordTokenizer;
import net.yacy.document.parser.html.CharacterCoding; import net.yacy.document.parser.html.CharacterCoding;
import net.yacy.document.parser.html.ImageEntry; import net.yacy.document.parser.html.ImageEntry;
@ -232,7 +232,7 @@ public class ViewFile {
} }
if (viewMode.equals("parsed")) { if (viewMode.equals("parsed")) {
final String content = UTF8.String(document.getTextBytes()); final String content = document.getTextString();
// content = wikiCode.replaceHTML(content); // added by Marc Nause // content = wikiCode.replaceHTML(content); // added by Marc Nause
prop.put("viewMode", VIEW_MODE_AS_PARSED_TEXT); prop.put("viewMode", VIEW_MODE_AS_PARSED_TEXT);
prop.put("viewMode_title", document.dc_title()); prop.put("viewMode_title", document.dc_title());
@ -284,7 +284,7 @@ public class ViewFile {
for (final StringBuilder s: sentences) { for (final StringBuilder s: sentences) {
sentence = s.toString(); sentence = s.toString();
Enumeration<StringBuilder> tokens = null; Enumeration<StringBuilder> tokens = null;
tokens = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(sentence)), LibraryProvider.dymLib); tokens = new WordTokenizer(new SentenceReader(sentence), LibraryProvider.dymLib);
while (tokens.hasMoreElements()) { while (tokens.hasMoreElements()) {
token = tokens.nextElement(); token = tokens.nextElement();
if (token.length() > 0) { if (token.length() > 0) {

@ -1,6 +1,5 @@
package de.anomic.data.ymark; package de.anomic.data.ymark;
import java.io.ByteArrayInputStream;
import java.io.IOException; import java.io.IOException;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import java.util.Arrays; import java.util.Arrays;
@ -11,12 +10,12 @@ import java.util.TreeMap;
import java.util.TreeSet; import java.util.TreeSet;
import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.ArrayBlockingQueue;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.services.federated.yacy.CacheStrategy; import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.document.Condenser; import net.yacy.document.Condenser;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.LibraryProvider; import net.yacy.document.LibraryProvider;
import net.yacy.document.Parser.Failure; import net.yacy.document.Parser.Failure;
import net.yacy.document.SentenceReader;
import net.yacy.document.WordTokenizer; import net.yacy.document.WordTokenizer;
import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.Word;
@ -100,7 +99,7 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
buffer.append(document.dc_title().toLowerCase()); buffer.append(document.dc_title().toLowerCase());
buffer.append(document.dc_description().toLowerCase()); buffer.append(document.dc_description().toLowerCase());
buffer.append(document.dc_subject(' ').toLowerCase()); buffer.append(document.dc_subject(' ').toLowerCase());
final WordTokenizer tokens = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(buffer.toString())), LibraryProvider.dymLib); final WordTokenizer tokens = new WordTokenizer(new SentenceReader(buffer.toString()), LibraryProvider.dymLib);
try { try {
int score = 0; int score = 0;
@ -177,7 +176,7 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
private static TreeMap<String, YMarkTag> getPhrases(final Document document, final int size) { private static TreeMap<String, YMarkTag> getPhrases(final Document document, final int size) {
final TreeMap<String, YMarkTag> phrases = new TreeMap<String, YMarkTag>(); final TreeMap<String, YMarkTag> phrases = new TreeMap<String, YMarkTag>();
final StringBuilder phrase = new StringBuilder(128); final StringBuilder phrase = new StringBuilder(128);
final WordTokenizer tokens = new WordTokenizer(document.getText(), LibraryProvider.dymLib); final WordTokenizer tokens = new WordTokenizer(new SentenceReader(document.getTextString()), LibraryProvider.dymLib);
try { try {
StringBuilder token; StringBuilder token;
int count = 0; int count = 0;

@ -20,12 +20,10 @@
package net.yacy.document; package net.yacy.document;
import java.io.ByteArrayInputStream;
import java.io.File; import java.io.File;
import java.io.FileInputStream; import java.io.FileInputStream;
import java.io.FileNotFoundException; import java.io.FileNotFoundException;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap; import java.util.HashMap;
import java.util.HashSet; import java.util.HashSet;
import java.util.Iterator; import java.util.Iterator;
@ -39,7 +37,6 @@ import java.util.TreeMap;
import net.yacy.cora.document.ASCII; import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.Classification.ContentDomain; import net.yacy.cora.document.Classification.ContentDomain;
import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.lod.vocabulary.Tagging; import net.yacy.cora.lod.vocabulary.Tagging;
import net.yacy.document.language.Identificator; import net.yacy.document.language.Identificator;
import net.yacy.document.parser.html.ImageEntry; import net.yacy.document.parser.html.ImageEntry;
@ -133,7 +130,7 @@ public final class Condenser {
Map.Entry<MultiProtocolURI, String> entry; Map.Entry<MultiProtocolURI, String> entry;
if (indexText) { if (indexText) {
createCondensement(document.getText(), meaningLib, doAutotagging); createCondensement(document.getTextString(), meaningLib, doAutotagging);
// the phrase counter: // the phrase counter:
// phrase 0 are words taken from the URL // phrase 0 are words taken from the URL
// phrase 1 is the MainTitle // phrase 1 is the MainTitle
@ -146,16 +143,15 @@ public final class Condenser {
// phrase 98 is taken from the embedded anchor/hyperlinks description (REMOVED!) // phrase 98 is taken from the embedded anchor/hyperlinks description (REMOVED!)
// phrase 99 is taken from the media Link url and anchor description // phrase 99 is taken from the media Link url and anchor description
// phrase 100 and above are lines from the text // phrase 100 and above are lines from the text
insertTextToWords(new SentenceReader(document.dc_title()), 1, WordReferenceRow.flag_app_dc_title, this.RESULT_FLAGS, true, meaningLib);
insertTextToWords(document.dc_title(), 1, WordReferenceRow.flag_app_dc_title, this.RESULT_FLAGS, true, meaningLib); insertTextToWords(new SentenceReader(document.dc_description()), 3, WordReferenceRow.flag_app_dc_description, this.RESULT_FLAGS, true, meaningLib);
insertTextToWords(document.dc_description(), 3, WordReferenceRow.flag_app_dc_description, this.RESULT_FLAGS, true, meaningLib); insertTextToWords(new SentenceReader(document.dc_creator()), 4, WordReferenceRow.flag_app_dc_creator, this.RESULT_FLAGS, true, meaningLib);
insertTextToWords(document.dc_creator(), 4, WordReferenceRow.flag_app_dc_creator, this.RESULT_FLAGS, true, meaningLib); insertTextToWords(new SentenceReader(document.dc_publisher()), 5, WordReferenceRow.flag_app_dc_creator, this.RESULT_FLAGS, true, meaningLib);
insertTextToWords(document.dc_publisher(), 5, WordReferenceRow.flag_app_dc_creator, this.RESULT_FLAGS, true, meaningLib); insertTextToWords(new SentenceReader(document.dc_subject(' ')), 6, WordReferenceRow.flag_app_dc_description, this.RESULT_FLAGS, true, meaningLib);
insertTextToWords(document.dc_subject(' '), 6, WordReferenceRow.flag_app_dc_description, this.RESULT_FLAGS, true, meaningLib);
// missing: tags! // missing: tags!
final String[] titles = document.getSectionTitles(); final String[] titles = document.getSectionTitles();
for (int i = 0; i < titles.length; i++) { for (int i = 0; i < titles.length; i++) {
insertTextToWords(titles[i], i + 10, WordReferenceRow.flag_app_emphasized, this.RESULT_FLAGS, true, meaningLib); insertTextToWords(new SentenceReader(titles[i]), i + 10, WordReferenceRow.flag_app_emphasized, this.RESULT_FLAGS, true, meaningLib);
} }
// anchors: for text indexing we add only the anchor description // anchors: for text indexing we add only the anchor description
@ -180,7 +176,7 @@ public final class Condenser {
} }
// add the URL components to the word list // add the URL components to the word list
insertTextToWords(document.dc_source().toNormalform(false, true), 0, WordReferenceRow.flag_app_dc_identifier, this.RESULT_FLAGS, false, meaningLib); insertTextToWords(new SentenceReader(document.dc_source().toNormalform(false, true)), 0, WordReferenceRow.flag_app_dc_identifier, this.RESULT_FLAGS, false, meaningLib);
if (indexMedia) { if (indexMedia) {
// add anchor descriptions: here, we also add the url components // add anchor descriptions: here, we also add the url components
@ -188,24 +184,24 @@ public final class Condenser {
Iterator<Map.Entry<MultiProtocolURI, String>> i = document.getAudiolinks().entrySet().iterator(); Iterator<Map.Entry<MultiProtocolURI, String>> i = document.getAudiolinks().entrySet().iterator();
while (i.hasNext()) { while (i.hasNext()) {
entry = i.next(); entry = i.next();
insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasaudio, this.RESULT_FLAGS, false, meaningLib); insertTextToWords(new SentenceReader(entry.getKey().toNormalform(false, false)), 99, flag_cat_hasaudio, this.RESULT_FLAGS, false, meaningLib);
insertTextToWords(entry.getValue(), 99, flag_cat_hasaudio, this.RESULT_FLAGS, true, meaningLib); insertTextToWords(new SentenceReader(entry.getValue()), 99, flag_cat_hasaudio, this.RESULT_FLAGS, true, meaningLib);
} }
// video // video
i = document.getVideolinks().entrySet().iterator(); i = document.getVideolinks().entrySet().iterator();
while (i.hasNext()) { while (i.hasNext()) {
entry = i.next(); entry = i.next();
insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasvideo, this.RESULT_FLAGS, false, meaningLib); insertTextToWords(new SentenceReader(entry.getKey().toNormalform(false, false)), 99, flag_cat_hasvideo, this.RESULT_FLAGS, false, meaningLib);
insertTextToWords(entry.getValue(), 99, flag_cat_hasvideo, this.RESULT_FLAGS, true, meaningLib); insertTextToWords(new SentenceReader(entry.getValue()), 99, flag_cat_hasvideo, this.RESULT_FLAGS, true, meaningLib);
} }
// applications // applications
i = document.getApplinks().entrySet().iterator(); i = document.getApplinks().entrySet().iterator();
while (i.hasNext()) { while (i.hasNext()) {
entry = i.next(); entry = i.next();
insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasapp, this.RESULT_FLAGS, false, meaningLib); insertTextToWords(new SentenceReader(entry.getKey().toNormalform(false, false)), 99, flag_cat_hasapp, this.RESULT_FLAGS, false, meaningLib);
insertTextToWords(entry.getValue(), 99, flag_cat_hasapp, this.RESULT_FLAGS, true, meaningLib); insertTextToWords(new SentenceReader(entry.getValue()), 99, flag_cat_hasapp, this.RESULT_FLAGS, true, meaningLib);
} }
// images // images
@ -216,8 +212,8 @@ public final class Condenser {
ientry = j.next(); ientry = j.next();
url = ientry.url(); url = ientry.url();
if (url == null) continue; if (url == null) continue;
insertTextToWords(url.toNormalform(false, false), 99, flag_cat_hasimage, this.RESULT_FLAGS, false, meaningLib); insertTextToWords(new SentenceReader(url.toNormalform(false, false)), 99, flag_cat_hasimage, this.RESULT_FLAGS, false, meaningLib);
insertTextToWords(ientry.alt(), 99, flag_cat_hasimage, this.RESULT_FLAGS, true, meaningLib); insertTextToWords(new SentenceReader(ientry.alt()), 99, flag_cat_hasimage, this.RESULT_FLAGS, true, meaningLib);
} }
// finally check all words for missing flag entry // finally check all words for missing flag entry
@ -241,7 +237,7 @@ public final class Condenser {
} }
private void insertTextToWords( private void insertTextToWords(
final String text, final SentenceReader text,
final int phrase, final int phrase,
final int flagpos, final int flagpos,
final Bitfield flagstemplate, final Bitfield flagstemplate,
@ -250,7 +246,7 @@ public final class Condenser {
if (text == null) return; if (text == null) return;
String word; String word;
Word wprop; Word wprop;
WordTokenizer wordenum = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(text)), meaningLib); WordTokenizer wordenum = new WordTokenizer(text, meaningLib);
try { try {
int pip = 0; int pip = 0;
while (wordenum.hasMoreElements()) { while (wordenum.hasMoreElements()) {
@ -271,7 +267,7 @@ public final class Condenser {
} }
} }
public Condenser(final InputStream text, final WordCache meaningLib, boolean doAutotagging) { public Condenser(final String text, final WordCache meaningLib, boolean doAutotagging) {
this.languageIdentificator = null; // we don't need that here this.languageIdentificator = null; // we don't need that here
// analysis = new Properties(); // analysis = new Properties();
this.words = new TreeMap<String, Word>(); this.words = new TreeMap<String, Word>();
@ -295,8 +291,8 @@ public final class Condenser {
return this.languageIdentificator.getLanguage(); return this.languageIdentificator.getLanguage();
} }
private void createCondensement(final InputStream is, final WordCache meaningLib, boolean doAutotagging) { private void createCondensement(final String text, final WordCache meaningLib, boolean doAutotagging) {
assert is != null; assert text != null;
final Set<String> currsentwords = new HashSet<String>(); final Set<String> currsentwords = new HashSet<String>();
String word = ""; String word = "";
String[] wordcache = new String[LibraryProvider.autotagging.getMaxWordsInTerm() - 1]; String[] wordcache = new String[LibraryProvider.autotagging.getMaxWordsInTerm() - 1];
@ -317,7 +313,7 @@ public final class Condenser {
if (LibraryProvider.autotagging.size() == 0) doAutotagging = false; if (LibraryProvider.autotagging.size() == 0) doAutotagging = false;
// read source // read source
final WordTokenizer wordenum = new WordTokenizer(is, meaningLib); final WordTokenizer wordenum = new WordTokenizer(new SentenceReader(text), meaningLib);
try { try {
while (wordenum.hasMoreElements()) { while (wordenum.hasMoreElements()) {
word = wordenum.nextElement().toString().toLowerCase(Locale.ENGLISH); word = wordenum.nextElement().toString().toLowerCase(Locale.ENGLISH);
@ -430,9 +426,7 @@ public final class Condenser {
public static Map<String, Word> getWords(final String text, final WordCache meaningLib) { public static Map<String, Word> getWords(final String text, final WordCache meaningLib) {
// returns a word/indexWord relation map // returns a word/indexWord relation map
if (text == null) return null; if (text == null) return null;
ByteArrayInputStream buffer; return new Condenser(text, meaningLib, false).words();
buffer = new ByteArrayInputStream(UTF8.getBytes(text));
return new Condenser(buffer, meaningLib, false).words();
} }
public static void main(final String[] args) { public static void main(final String[] args) {

@ -133,7 +133,7 @@ public class Document {
this.outboundlinks = null; this.outboundlinks = null;
this.languages = languages; this.languages = languages;
this.indexingDenied = indexingDenied; this.indexingDenied = indexingDenied;
this.text = text == null ? new ByteArrayOutputStream() : text; this.text = text == null ? "" : text;
} }
public Object getParserObject() { public Object getParserObject() {
@ -299,7 +299,7 @@ dc_rights
return this.sections.toArray(new String[this.sections.size()]); return this.sections.toArray(new String[this.sections.size()]);
} }
public InputStream getText() { public InputStream getTextStream() {
try { try {
if (this.text == null) return new ByteArrayInputStream(UTF8.getBytes("")); if (this.text == null) return new ByteArrayInputStream(UTF8.getBytes(""));
if (this.text instanceof String) { if (this.text instanceof String) {
@ -322,26 +322,26 @@ dc_rights
return new ByteArrayInputStream(UTF8.getBytes("")); return new ByteArrayInputStream(UTF8.getBytes(""));
} }
public byte[] getTextBytes() { public String getTextString() {
try { try {
if (this.text == null) return new byte[0]; if (this.text == null) return "";
if (this.text instanceof String) { if (this.text instanceof String) {
return UTF8.getBytes((String) this.text); return (String) this.text;
} else if (this.text instanceof InputStream) { } else if (this.text instanceof InputStream) {
return FileUtils.read((InputStream) this.text); return UTF8.String(FileUtils.read((InputStream) this.text));
} else if (this.text instanceof File) { } else if (this.text instanceof File) {
return FileUtils.read((File) this.text); return UTF8.String(FileUtils.read((File) this.text));
} else if (this.text instanceof byte[]) { } else if (this.text instanceof byte[]) {
return (byte[]) this.text; return UTF8.String((byte[]) this.text);
} else if (this.text instanceof ByteArrayOutputStream) { } else if (this.text instanceof ByteArrayOutputStream) {
return ((ByteArrayOutputStream) this.text).toByteArray(); return UTF8.String(((ByteArrayOutputStream) this.text).toByteArray());
} }
assert false : this.text.getClass().toString(); assert false : this.text.getClass().toString();
return null; return null;
} catch (final Exception e) { } catch (final Exception e) {
Log.logException(e); Log.logException(e);
} }
return new byte[0]; return "";
} }
public long getTextLength() { public long getTextLength() {
@ -367,16 +367,11 @@ dc_rights
} }
public List<StringBuilder> getSentences(final boolean pre) { public List<StringBuilder> getSentences(final boolean pre) {
return getSentences(pre, getText()); final SentenceReader sr = new SentenceReader(getTextString());
} sr.pre(pre);
List<StringBuilder> sentences = new ArrayList<StringBuilder>();
public static List<StringBuilder> getSentences(final boolean pre, final InputStream text) { while (sr.hasNext()) {
if (text == null) return null; sentences.add(sr.next());
final SentenceReader e = new SentenceReader(text);
e.pre(pre);
final List<StringBuilder> sentences = new ArrayList<StringBuilder>();
while (e.hasNext()) {
sentences.add(e.next());
} }
return sentences; return sentences;
} }
@ -638,7 +633,7 @@ dc_rights
if (!(this.text instanceof ByteArrayOutputStream)) { if (!(this.text instanceof ByteArrayOutputStream)) {
this.text = new ByteArrayOutputStream(); this.text = new ByteArrayOutputStream();
} }
FileUtils.copy(doc.getText(), (ByteArrayOutputStream) this.text); FileUtils.copy(doc.getTextStream(), (ByteArrayOutputStream) this.text);
this.anchors.putAll(doc.getAnchors()); this.anchors.putAll(doc.getAnchors());
this.rss.putAll(doc.getRSS()); this.rss.putAll(doc.getRSS());
@ -707,11 +702,7 @@ dc_rights
if (subject != null && subject.length() > 0) os.write("<dc:subject><![CDATA[" + subject + "]]></dc:subject>\n"); if (subject != null && subject.length() > 0) os.write("<dc:subject><![CDATA[" + subject + "]]></dc:subject>\n");
if (this.text != null) { if (this.text != null) {
os.write("<dc:description><![CDATA["); os.write("<dc:description><![CDATA[");
final byte[] buffer = new byte[1000]; os.write(getTextString());
int c = 0;
final InputStream is = getText();
while ((c = is.read(buffer)) > 0) os.write(UTF8.String(buffer, 0, c));
is.close();
os.write("]]></dc:description>\n"); os.write("]]></dc:description>\n");
} }
final String language = dc_language(); final String language = dc_language();
@ -811,7 +802,7 @@ dc_rights
if (doc.getTextLength() > 0) { if (doc.getTextLength() > 0) {
if (docTextLength > 0) content.write('\n'); if (docTextLength > 0) content.write('\n');
try { try {
docTextLength += FileUtils.copy(doc.getText(), content); docTextLength += FileUtils.copy(doc.getTextStream(), content);
} catch (final IOException e) { } catch (final IOException e) {
Log.logException(e); Log.logException(e);
} }

@ -24,12 +24,9 @@
package net.yacy.document; package net.yacy.document;
import java.io.BufferedReader;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader; import java.io.Reader;
import java.io.UnsupportedEncodingException; import java.io.StringReader;
import java.util.Iterator; import java.util.Iterator;
public class SentenceReader implements Iterator<StringBuilder> { public class SentenceReader implements Iterator<StringBuilder> {
@ -37,17 +34,13 @@ public class SentenceReader implements Iterator<StringBuilder> {
// this enumerates StringBuilder objects // this enumerates StringBuilder objects
private StringBuilder buffer; private StringBuilder buffer;
private BufferedReader raf; private Reader raf;
private int counter = 0; private int counter = 0;
private boolean pre = false; private boolean pre = false;
public SentenceReader(final InputStream is) { public SentenceReader(final String text) {
assert is != null; assert text != null;
try { raf = new StringReader(text);
raf = new BufferedReader(new InputStreamReader(is, "UTF-8"));
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
}
buffer = nextElement0(); buffer = nextElement0();
counter = 0; counter = 0;
pre = false; pre = false;
@ -145,8 +138,7 @@ public class SentenceReader implements Iterator<StringBuilder> {
public synchronized void close() { public synchronized void close() {
try { try {
raf.close(); raf.close();
} catch(IOException ioe) { } catch (IOException e) {
// Ignore IO Exceptions
} }
} }
} }

@ -178,7 +178,6 @@ public final class TextParser {
} finally { } finally {
if (sourceStream != null) try { sourceStream.close(); } catch (final Exception ex) {} if (sourceStream != null) try { sourceStream.close(); } catch (final Exception ex) {}
} }
for (final Document d: docs) { assert d.getText() != null; } // verify docs
return docs; return docs;
} }
@ -261,7 +260,6 @@ public final class TextParser {
if (log.isFine()) log.logInfo("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + fileExt + "'."); if (log.isFine()) log.logInfo("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + fileExt + "'.");
try { try {
final Document[] docs = parser.parse(location, mimeType, documentCharset, sourceStream); final Document[] docs = parser.parse(location, mimeType, documentCharset, sourceStream);
for (final Document d: docs) { assert d != null && d.getText() != null; } // verify docs
return docs; return docs;
} catch (final Exception e) { } catch (final Exception e) {
throw new Parser.Failure("parser failed: " + parser.getName(), location); throw new Parser.Failure("parser failed: " + parser.getName(), location);
@ -324,7 +322,7 @@ public final class TextParser {
throw new Parser.Failure("All parser failed: " + failedParsers, location); throw new Parser.Failure("All parser failed: " + failedParsers, location);
} }
} }
for (final Document d: docs) { assert d.getText() != null : "mimeType = " + mimeType; } // verify docs for (final Document d: docs) { assert d.getTextStream() != null : "mimeType = " + mimeType; } // verify docs
return docs; return docs;
} }

@ -24,15 +24,12 @@
package net.yacy.document; package net.yacy.document;
import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Enumeration; import java.util.Enumeration;
import java.util.List; import java.util.List;
import java.util.SortedMap; import java.util.SortedMap;
import java.util.TreeMap; import java.util.TreeMap;
import net.yacy.cora.document.UTF8;
import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.order.Base64Order; import net.yacy.kelondro.order.Base64Order;
@ -44,9 +41,9 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
private final unsievedWordsEnum e; private final unsievedWordsEnum e;
private final WordCache meaningLib; private final WordCache meaningLib;
public WordTokenizer(final InputStream is, final WordCache meaningLib) { public WordTokenizer(final SentenceReader sr, final WordCache meaningLib) {
assert is != null; assert sr != null;
this.e = new unsievedWordsEnum(is); this.e = new unsievedWordsEnum(sr);
this.buffer = nextElement0(); this.buffer = nextElement0();
this.meaningLib = meaningLib; this.meaningLib = meaningLib;
} }
@ -89,20 +86,20 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
private static class unsievedWordsEnum implements Enumeration<StringBuilder> { private static class unsievedWordsEnum implements Enumeration<StringBuilder> {
// returns an enumeration of StringBuilder Objects // returns an enumeration of StringBuilder Objects
private StringBuilder buffer = null; private StringBuilder buffer = null;
private final SentenceReader e; private final SentenceReader sr;
private final List<StringBuilder> s; private final List<StringBuilder> s;
private int sIndex; private int sIndex;
public unsievedWordsEnum(final InputStream is) { public unsievedWordsEnum(final SentenceReader sr0) {
assert is != null; assert sr0 != null;
this.e = new SentenceReader(is); this.sr = sr0;
this.s = new ArrayList<StringBuilder>(); this.s = new ArrayList<StringBuilder>();
this.sIndex = 0; this.sIndex = 0;
this.buffer = nextElement0(); this.buffer = nextElement0();
} }
public void pre(final boolean x) { public void pre(final boolean x) {
this.e.pre(x); this.sr.pre(x);
} }
private StringBuilder nextElement0() { private StringBuilder nextElement0() {
@ -114,8 +111,8 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
this.s.clear(); this.s.clear();
} }
while (this.s.isEmpty()) { while (this.s.isEmpty()) {
if (!this.e.hasNext()) return null; if (!this.sr.hasNext()) return null;
r = this.e.next(); r = this.sr.next();
if (r == null) return null; if (r == null) return null;
r = trim(r); r = trim(r);
sb = new StringBuilder(20); sb = new StringBuilder(20);
@ -154,7 +151,7 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
} }
public synchronized void close() { public synchronized void close() {
this.e.close(); this.sr.close();
} }
} }
@ -183,7 +180,7 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
*/ */
public static SortedMap<byte[], Integer> hashSentence(final String sentence, final WordCache meaningLib, int maxlength) { public static SortedMap<byte[], Integer> hashSentence(final String sentence, final WordCache meaningLib, int maxlength) {
final SortedMap<byte[], Integer> map = new TreeMap<byte[], Integer>(Base64Order.enhancedCoder); final SortedMap<byte[], Integer> map = new TreeMap<byte[], Integer>(Base64Order.enhancedCoder);
final WordTokenizer words = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(sentence)), meaningLib); final WordTokenizer words = new WordTokenizer(new SentenceReader(sentence), meaningLib);
try { try {
int pos = 0; int pos = 0;
StringBuilder word; StringBuilder word;

@ -36,12 +36,10 @@ import java.util.Locale;
import java.util.TreeMap; import java.util.TreeMap;
import net.yacy.cora.date.ISO8601Formatter; import net.yacy.cora.date.ISO8601Formatter;
import net.yacy.cora.document.UTF8;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.logging.Log;
public class DCEntry extends TreeMap<String, String> { public class DCEntry extends TreeMap<String, String> {
private static final long serialVersionUID = -2050291583515701559L; private static final long serialVersionUID = -2050291583515701559L;
@ -277,7 +275,7 @@ public class DCEntry extends TreeMap<String, String> {
null, null,
"", "",
getLon(), getLat(), getLon(), getLat(),
UTF8.getBytes(getDescription()), getDescription(),
null, null,
null, null,
null, null,

@ -81,7 +81,7 @@ public class AugmentParser extends AbstractParser implements Parser {
all = "yacylatest"; all = "yacylatest";
newDoc = new Document(url, mimeType, charset, null, null, null, "", "", newDoc = new Document(url, mimeType, charset, null, null, null, "", "",
"", null, "", 0, 0, all.getBytes(), null, null, null, false); "", null, "", 0, 0, all, null, null, null, false);
} }
return newDoc; return newDoc;
@ -94,7 +94,7 @@ public class AugmentParser extends AbstractParser implements Parser {
String all = ""; String all = "";
Document newDoc = new Document(url, mimeType, charset, null, null, null, "", "", Document newDoc = new Document(url, mimeType, charset, null, null, null, "", "",
"", null, "", 0, 0, all.getBytes(), null, null, null, false); "", null, "", 0, 0, all, null, null, null, false);
Iterator<net.yacy.kelondro.blob.Tables.Row> it; Iterator<net.yacy.kelondro.blob.Tables.Row> it;

@ -60,7 +60,6 @@ public class csvParser extends AbstractParser implements Parser {
for (final String[] row: table) { for (final String[] row: table) {
sb.append(concatRow(row)).append(' '); sb.append(concatRow(row)).append(' ');
} }
try {
return new Document[]{new Document( return new Document[]{new Document(
location, location,
mimeType, mimeType,
@ -74,14 +73,11 @@ public class csvParser extends AbstractParser implements Parser {
null, null,
null, null,
0.0f, 0.0f, 0.0f, 0.0f,
sb.toString().getBytes(charset), sb.toString(),
null, null,
null, null,
null, null,
false)}; false)};
} catch (UnsupportedEncodingException e) {
throw new Parser.Failure("error in csvParser, getBytes: " + e.getMessage(), location);
}
} }
private String concatRow(String[] columns) { private String concatRow(String[] columns) {

@ -28,13 +28,10 @@
package net.yacy.document.parser; package net.yacy.document.parser;
import java.io.InputStream; import java.io.InputStream;
import net.yacy.cora.document.UTF8;
import net.yacy.document.AbstractParser; import net.yacy.document.AbstractParser;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.DigestURI;
import org.apache.poi.hwpf.extractor.WordExtractor; import org.apache.poi.hwpf.extractor.WordExtractor;
public class docParser extends AbstractParser implements Parser { public class docParser extends AbstractParser implements Parser {
@ -99,7 +96,7 @@ public class docParser extends AbstractParser implements Parser {
null, null,
null, null,
0.0f, 0.0f, 0.0f, 0.0f,
UTF8.getBytes(contents.toString()), contents.toString(),
null, null,
null, null,
null, null,

@ -65,9 +65,6 @@ public class genericParser extends AbstractParser implements Parser {
null, null,
null, null,
false)}; false)};
for (final Document d: docs) {
assert d.getText() != null : "mimeType = " + mimeType;
} // verify docs
return docs; return docs;
} }
} }

@ -626,12 +626,12 @@ public class ContentScraper extends AbstractScraper implements Scraper {
return false; return false;
} }
public byte[] getText() { public String getText() {
try { try {
return this.content.getBytes(); return this.content.toString();
} catch (final OutOfMemoryError e) { } catch (final OutOfMemoryError e) {
Log.logException(e); Log.logException(e);
return new byte[0]; return "";
} }
} }

@ -43,7 +43,6 @@ import java.util.Set;
import javax.imageio.ImageIO; import javax.imageio.ImageIO;
import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
import net.yacy.document.AbstractParser; import net.yacy.document.AbstractParser;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
@ -206,7 +205,7 @@ public class genericImageParser extends AbstractParser implements Parser {
new String[]{}, // sections new String[]{}, // sections
description == null ? "" : description, // description description == null ? "" : description, // description
0.0f, 0.0f, // TODO parse location 0.0f, 0.0f, // TODO parse location
UTF8.getBytes(infoString), // content text infoString, // content text
anchors, // anchors anchors, // anchors
null, null,
images, images,

@ -250,7 +250,7 @@ public class pdfParser extends AbstractParser implements Parser {
System.out.println("\tParsed text with " + document.getTextLength() + " chars of text and " + document.getAnchors().size() + " anchors"); System.out.println("\tParsed text with " + document.getTextLength() + " chars of text and " + document.getAnchors().size() + " anchors");
try { try {
// write file // write file
FileUtils.copy(document.getText(), new File("parsedPdf.txt")); FileUtils.copy(document.getTextStream(), new File("parsedPdf.txt"));
} catch (final IOException e) { } catch (final IOException e) {
System.err.println("error saving parsed document"); System.err.println("error saving parsed document");
Log.logException(e); Log.logException(e);

@ -30,7 +30,6 @@ package net.yacy.document.parser;
import java.io.BufferedInputStream; import java.io.BufferedInputStream;
import java.io.InputStream; import java.io.InputStream;
import net.yacy.cora.document.UTF8;
import net.yacy.document.AbstractParser; import net.yacy.document.AbstractParser;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
@ -95,7 +94,7 @@ public class pptParser extends AbstractParser implements Parser {
null, null,
null, null,
0.0f, 0.0f, 0.0f, 0.0f,
UTF8.getBytes(contents), contents,
null, null,
null, null,
null, null,

@ -58,7 +58,7 @@ public class rdfParser extends AbstractParser implements Parser {
String all = "rdfdatasource"; String all = "rdfdatasource";
doc = new Document(url, mimeType, charset, null, null, null, "", "", doc = new Document(url, mimeType, charset, null, null, null, "", "",
"", null, "", 0, 0, all.getBytes(), null, null, null, false); "", null, "", 0, 0, all, null, null, null, false);
docs.add(doc); docs.add(doc);

@ -142,7 +142,7 @@ public class RDFaParser extends AbstractParser implements Parser {
} }
Document doc = new Document(url, mimeType, charset, null, null, null, "", "", Document doc = new Document(url, mimeType, charset, null, null, null, "", "",
"", null, "", 0, 0, all.getBytes(), null, null, null, false); "", null, "", 0, 0, all, null, null, null, false);
return doc; return doc;
} }

@ -32,7 +32,6 @@ import java.io.InputStream;
import javax.swing.text.DefaultStyledDocument; import javax.swing.text.DefaultStyledDocument;
import javax.swing.text.rtf.RTFEditorKit; import javax.swing.text.rtf.RTFEditorKit;
import net.yacy.cora.document.UTF8;
import net.yacy.document.AbstractParser; import net.yacy.document.AbstractParser;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
@ -80,7 +79,7 @@ public class rtfParser extends AbstractParser implements Parser {
null, null,
null, null,
0.0f, 0.0f, 0.0f, 0.0f,
UTF8.getBytes(bodyText), bodyText,
null, null,
null, null,
null, null,

@ -34,7 +34,6 @@ import java.util.Map;
import java.util.Properties; import java.util.Properties;
import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
import net.yacy.document.AbstractParser; import net.yacy.document.AbstractParser;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
@ -121,7 +120,7 @@ public class swfParser extends AbstractParser implements Parser {
sections, // an array of section headlines sections, // an array of section headlines
abstrct, // an abstract abstrct, // an abstract
0.0f, 0.0f, 0.0f, 0.0f,
UTF8.getBytes(contents), // the parsed document text contents, // the parsed document text
anchors, // a map of extracted anchors anchors, // a map of extracted anchors
null, null,
null, null,

@ -28,7 +28,6 @@ import java.io.ByteArrayInputStream;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
@ -94,7 +93,6 @@ public class torrentParser extends AbstractParser implements Parser {
if (nameo != null) title = UTF8.String(nameo.getString()); if (nameo != null) title = UTF8.String(nameo.getString());
} }
if (title == null || title.length() == 0) title = MultiProtocolURI.unescape(location.getFileName()); if (title == null || title.length() == 0) title = MultiProtocolURI.unescape(location.getFileName());
try {
return new Document[]{new Document( return new Document[]{new Document(
location, location,
mimeType, mimeType,
@ -108,14 +106,11 @@ public class torrentParser extends AbstractParser implements Parser {
null, null,
null, null,
0.0f, 0.0f, 0.0f, 0.0f,
filenames.toString().getBytes(charset), filenames.toString(),
null, null,
null, null,
null, null,
false)}; false)};
} catch (UnsupportedEncodingException e) {
throw new Parser.Failure("error in torrentParser, getBytes: " + e.getMessage(), location);
}
} }
public static void main(String[] args) { public static void main(String[] args) {

@ -29,7 +29,6 @@ package net.yacy.document.parser;
import java.io.InputStream; import java.io.InputStream;
import net.yacy.cora.document.UTF8;
import net.yacy.document.AbstractParser; import net.yacy.document.AbstractParser;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
@ -115,7 +114,7 @@ public class vsdParser extends AbstractParser implements Parser {
null, // an array of section headlines null, // an array of section headlines
abstrct, // an abstract abstrct, // an abstract
0.0f, 0.0f, 0.0f, 0.0f,
UTF8.getBytes(contents), // the parsed document text contents, // the parsed document text
null, // a map of extracted anchors null, // a map of extracted anchors
null, null,
null, // a treeset of image URLs null, // a treeset of image URLs

@ -29,7 +29,6 @@ package net.yacy.document.parser;
import java.io.InputStream; import java.io.InputStream;
import net.yacy.cora.document.UTF8;
import net.yacy.document.AbstractParser; import net.yacy.document.AbstractParser;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
@ -126,7 +125,7 @@ public class xlsParser extends AbstractParser implements Parser {
null, null,
null, null,
0.0f, 0.0f, 0.0f, 0.0f,
UTF8.getBytes(contents), contents,
null, null,
null, null,
null, null,

@ -24,7 +24,6 @@
package net.yacy.search.snippet; package net.yacy.search.snippet;
import java.io.ByteArrayInputStream;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collection; import java.util.Collection;
@ -36,12 +35,12 @@ import java.util.SortedMap;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import net.yacy.cora.document.ASCII; import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.services.federated.yacy.CacheStrategy; import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.cora.storage.ARC; import net.yacy.cora.storage.ARC;
import net.yacy.cora.storage.ConcurrentARC; import net.yacy.cora.storage.ConcurrentARC;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
import net.yacy.document.SentenceReader;
import net.yacy.document.SnippetExtractor; import net.yacy.document.SnippetExtractor;
import net.yacy.document.WordTokenizer; import net.yacy.document.WordTokenizer;
import net.yacy.document.parser.html.CharacterCoding; import net.yacy.document.parser.html.CharacterCoding;
@ -183,7 +182,13 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
// try the solr text first // try the solr text first
if (solrText != null) { if (solrText != null) {
// compute sentences from solr query // compute sentences from solr query
sentences = Document.getSentences(pre, new ByteArrayInputStream(UTF8.getBytes(solrText))); final SentenceReader sr = new SentenceReader(solrText);
sr.pre(pre);
sentences = new ArrayList<StringBuilder>();
while (sr.hasNext()) {
sentences.add(sr.next());
}
if (sentences != null) { if (sentences != null) {
try { try {
final SnippetExtractor tsr = new SnippetExtractor(sentences, remainingHashes, snippetMaxLength); final SnippetExtractor tsr = new SnippetExtractor(sentences, remainingHashes, snippetMaxLength);

Loading…
Cancel
Save