refactoring and new usage of SentenceReader: this class appeared as one
of the major CPU consumers during snippet verification. The class was not
efficient for two reasons:
- it used an overly complex input stream, generated from the source text via
UTF-8 byte conversions; the wrapping BufferedReader added considerable overhead.
- to feed data into the SentenceReader, multiple toString/getBytes conversions
were applied until a buffered Reader over an input stream could be built.
These superfluous conversions have been removed.
- the best source for the SentenceReader is a String; therefore the
production of Strings is now enforced inside the Document class. A usage
sketch follows the commit header below.
Branch: pull/1/head
Author: orbiter, 13 years ago
Parent: bb8dcb4911
Commit: 78fc3cf8f8
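For orientation, here is a minimal, hypothetical usage sketch of the pattern this commit establishes: hand a String directly to SentenceReader and WordTokenizer instead of wrapping it in a ByteArrayInputStream with UTF-8 byte conversions. The class and method names are taken from the diff below; the standalone main method and the sample text are illustrative assumptions, and LibraryProvider.dymLib is assumed to be initialized.

import net.yacy.document.LibraryProvider;
import net.yacy.document.SentenceReader;
import net.yacy.document.WordTokenizer;

public class SentenceReaderSketch {
    public static void main(final String[] args) {
        // the String itself is now the tokenizer input; no ByteArrayInputStream
        // and no UTF8.getBytes()/UTF8.String() round trip is needed any more
        final String text = "YaCy is a distributed search engine. It builds a peer-to-peer index.";

        // old pattern (removed): new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(text)), LibraryProvider.dymLib)
        final WordTokenizer tokens = new WordTokenizer(new SentenceReader(text), LibraryProvider.dymLib);
        try {
            while (tokens.hasMoreElements()) {
                // each element is one word as a StringBuilder
                System.out.println(tokens.nextElement());
            }
        } finally {
            tokens.close();
        }
    }
}

The same shape recurs throughout the diff: every former ByteArrayInputStream/UTF8 round trip is replaced by handing the String straight to SentenceReader, and Document now exposes getTextString() alongside getTextStream() for this purpose.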

@@ -27,7 +27,6 @@
 //javac -classpath .:../Classes Status.java
 //if the shell's current path is HTROOT
-import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.net.MalformedURLException;
 import java.util.Collection;
@@ -47,6 +46,7 @@ import net.yacy.document.Condenser;
 import net.yacy.document.Document;
 import net.yacy.document.LibraryProvider;
 import net.yacy.document.Parser;
+import net.yacy.document.SentenceReader;
 import net.yacy.document.WordTokenizer;
 import net.yacy.document.parser.html.CharacterCoding;
 import net.yacy.document.parser.html.ImageEntry;
@@ -232,7 +232,7 @@ public class ViewFile {
 }
 if (viewMode.equals("parsed")) {
-final String content = UTF8.String(document.getTextBytes());
+final String content = document.getTextString();
 // content = wikiCode.replaceHTML(content); // added by Marc Nause
 prop.put("viewMode", VIEW_MODE_AS_PARSED_TEXT);
 prop.put("viewMode_title", document.dc_title());
@@ -284,7 +284,7 @@
 for (final StringBuilder s: sentences) {
 sentence = s.toString();
 Enumeration<StringBuilder> tokens = null;
-tokens = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(sentence)), LibraryProvider.dymLib);
+tokens = new WordTokenizer(new SentenceReader(sentence), LibraryProvider.dymLib);
 while (tokens.hasMoreElements()) {
 token = tokens.nextElement();
 if (token.length() > 0) {

@@ -1,6 +1,5 @@
 package de.anomic.data.ymark;
-import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.net.MalformedURLException;
 import java.util.Arrays;
@@ -11,12 +10,12 @@ import java.util.TreeMap;
 import java.util.TreeSet;
 import java.util.concurrent.ArrayBlockingQueue;
-import net.yacy.cora.document.UTF8;
 import net.yacy.cora.services.federated.yacy.CacheStrategy;
 import net.yacy.document.Condenser;
 import net.yacy.document.Document;
 import net.yacy.document.LibraryProvider;
 import net.yacy.document.Parser.Failure;
+import net.yacy.document.SentenceReader;
 import net.yacy.document.WordTokenizer;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.data.word.Word;
@@ -100,7 +99,7 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
 buffer.append(document.dc_title().toLowerCase());
 buffer.append(document.dc_description().toLowerCase());
 buffer.append(document.dc_subject(' ').toLowerCase());
-final WordTokenizer tokens = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(buffer.toString())), LibraryProvider.dymLib);
+final WordTokenizer tokens = new WordTokenizer(new SentenceReader(buffer.toString()), LibraryProvider.dymLib);
 try {
 int score = 0;
@@ -177,7 +176,7 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
 private static TreeMap<String, YMarkTag> getPhrases(final Document document, final int size) {
 final TreeMap<String, YMarkTag> phrases = new TreeMap<String, YMarkTag>();
 final StringBuilder phrase = new StringBuilder(128);
-final WordTokenizer tokens = new WordTokenizer(document.getText(), LibraryProvider.dymLib);
+final WordTokenizer tokens = new WordTokenizer(new SentenceReader(document.getTextString()), LibraryProvider.dymLib);
 try {
 StringBuilder token;
 int count = 0;

@@ -20,12 +20,10 @@
 package net.yacy.document;
-import java.io.ByteArrayInputStream;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.FileNotFoundException;
 import java.io.IOException;
-import java.io.InputStream;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
@@ -39,7 +37,6 @@ import java.util.TreeMap;
 import net.yacy.cora.document.ASCII;
 import net.yacy.cora.document.Classification.ContentDomain;
 import net.yacy.cora.document.MultiProtocolURI;
-import net.yacy.cora.document.UTF8;
 import net.yacy.cora.lod.vocabulary.Tagging;
 import net.yacy.document.language.Identificator;
 import net.yacy.document.parser.html.ImageEntry;
@@ -133,7 +130,7 @@ public final class Condenser {
 Map.Entry<MultiProtocolURI, String> entry;
 if (indexText) {
-createCondensement(document.getText(), meaningLib, doAutotagging);
+createCondensement(document.getTextString(), meaningLib, doAutotagging);
 // the phrase counter:
 // phrase 0 are words taken from the URL
 // phrase 1 is the MainTitle
@@ -146,16 +143,15 @@
 // phrase 98 is taken from the embedded anchor/hyperlinks description (REMOVED!)
 // phrase 99 is taken from the media Link url and anchor description
 // phrase 100 and above are lines from the text
-
-insertTextToWords(document.dc_title(), 1, WordReferenceRow.flag_app_dc_title, this.RESULT_FLAGS, true, meaningLib);
-insertTextToWords(document.dc_description(), 3, WordReferenceRow.flag_app_dc_description, this.RESULT_FLAGS, true, meaningLib);
-insertTextToWords(document.dc_creator(), 4, WordReferenceRow.flag_app_dc_creator, this.RESULT_FLAGS, true, meaningLib);
-insertTextToWords(document.dc_publisher(), 5, WordReferenceRow.flag_app_dc_creator, this.RESULT_FLAGS, true, meaningLib);
-insertTextToWords(document.dc_subject(' '), 6, WordReferenceRow.flag_app_dc_description, this.RESULT_FLAGS, true, meaningLib);
+insertTextToWords(new SentenceReader(document.dc_title()), 1, WordReferenceRow.flag_app_dc_title, this.RESULT_FLAGS, true, meaningLib);
+insertTextToWords(new SentenceReader(document.dc_description()), 3, WordReferenceRow.flag_app_dc_description, this.RESULT_FLAGS, true, meaningLib);
+insertTextToWords(new SentenceReader(document.dc_creator()), 4, WordReferenceRow.flag_app_dc_creator, this.RESULT_FLAGS, true, meaningLib);
+insertTextToWords(new SentenceReader(document.dc_publisher()), 5, WordReferenceRow.flag_app_dc_creator, this.RESULT_FLAGS, true, meaningLib);
+insertTextToWords(new SentenceReader(document.dc_subject(' ')), 6, WordReferenceRow.flag_app_dc_description, this.RESULT_FLAGS, true, meaningLib);
 // missing: tags!
 final String[] titles = document.getSectionTitles();
 for (int i = 0; i < titles.length; i++) {
-insertTextToWords(titles[i], i + 10, WordReferenceRow.flag_app_emphasized, this.RESULT_FLAGS, true, meaningLib);
+insertTextToWords(new SentenceReader(titles[i]), i + 10, WordReferenceRow.flag_app_emphasized, this.RESULT_FLAGS, true, meaningLib);
 }
 // anchors: for text indexing we add only the anchor description
@@ -180,7 +176,7 @@
 }
 // add the URL components to the word list
-insertTextToWords(document.dc_source().toNormalform(false, true), 0, WordReferenceRow.flag_app_dc_identifier, this.RESULT_FLAGS, false, meaningLib);
+insertTextToWords(new SentenceReader(document.dc_source().toNormalform(false, true)), 0, WordReferenceRow.flag_app_dc_identifier, this.RESULT_FLAGS, false, meaningLib);
 if (indexMedia) {
 // add anchor descriptions: here, we also add the url components
@@ -188,24 +184,24 @@
 Iterator<Map.Entry<MultiProtocolURI, String>> i = document.getAudiolinks().entrySet().iterator();
 while (i.hasNext()) {
 entry = i.next();
-insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasaudio, this.RESULT_FLAGS, false, meaningLib);
-insertTextToWords(entry.getValue(), 99, flag_cat_hasaudio, this.RESULT_FLAGS, true, meaningLib);
+insertTextToWords(new SentenceReader(entry.getKey().toNormalform(false, false)), 99, flag_cat_hasaudio, this.RESULT_FLAGS, false, meaningLib);
+insertTextToWords(new SentenceReader(entry.getValue()), 99, flag_cat_hasaudio, this.RESULT_FLAGS, true, meaningLib);
 }
 // video
 i = document.getVideolinks().entrySet().iterator();
 while (i.hasNext()) {
 entry = i.next();
-insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasvideo, this.RESULT_FLAGS, false, meaningLib);
-insertTextToWords(entry.getValue(), 99, flag_cat_hasvideo, this.RESULT_FLAGS, true, meaningLib);
+insertTextToWords(new SentenceReader(entry.getKey().toNormalform(false, false)), 99, flag_cat_hasvideo, this.RESULT_FLAGS, false, meaningLib);
+insertTextToWords(new SentenceReader(entry.getValue()), 99, flag_cat_hasvideo, this.RESULT_FLAGS, true, meaningLib);
 }
 // applications
 i = document.getApplinks().entrySet().iterator();
 while (i.hasNext()) {
 entry = i.next();
-insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasapp, this.RESULT_FLAGS, false, meaningLib);
-insertTextToWords(entry.getValue(), 99, flag_cat_hasapp, this.RESULT_FLAGS, true, meaningLib);
+insertTextToWords(new SentenceReader(entry.getKey().toNormalform(false, false)), 99, flag_cat_hasapp, this.RESULT_FLAGS, false, meaningLib);
+insertTextToWords(new SentenceReader(entry.getValue()), 99, flag_cat_hasapp, this.RESULT_FLAGS, true, meaningLib);
 }
 // images
@@ -216,8 +212,8 @@ public final class Condenser {
 ientry = j.next();
 url = ientry.url();
 if (url == null) continue;
-insertTextToWords(url.toNormalform(false, false), 99, flag_cat_hasimage, this.RESULT_FLAGS, false, meaningLib);
-insertTextToWords(ientry.alt(), 99, flag_cat_hasimage, this.RESULT_FLAGS, true, meaningLib);
+insertTextToWords(new SentenceReader(url.toNormalform(false, false)), 99, flag_cat_hasimage, this.RESULT_FLAGS, false, meaningLib);
+insertTextToWords(new SentenceReader(ientry.alt()), 99, flag_cat_hasimage, this.RESULT_FLAGS, true, meaningLib);
 }
 // finally check all words for missing flag entry
@@ -241,7 +237,7 @@ public final class Condenser {
 }
 private void insertTextToWords(
-final String text,
+final SentenceReader text,
 final int phrase,
 final int flagpos,
 final Bitfield flagstemplate,
@@ -250,7 +246,7 @@ public final class Condenser {
 if (text == null) return;
 String word;
 Word wprop;
-WordTokenizer wordenum = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(text)), meaningLib);
+WordTokenizer wordenum = new WordTokenizer(text, meaningLib);
 try {
 int pip = 0;
 while (wordenum.hasMoreElements()) {
@@ -271,7 +267,7 @@ public final class Condenser {
 }
 }
-public Condenser(final InputStream text, final WordCache meaningLib, boolean doAutotagging) {
+public Condenser(final String text, final WordCache meaningLib, boolean doAutotagging) {
 this.languageIdentificator = null; // we don't need that here
 // analysis = new Properties();
 this.words = new TreeMap<String, Word>();
@@ -295,8 +291,8 @@ public final class Condenser {
 return this.languageIdentificator.getLanguage();
 }
-private void createCondensement(final InputStream is, final WordCache meaningLib, boolean doAutotagging) {
-assert is != null;
+private void createCondensement(final String text, final WordCache meaningLib, boolean doAutotagging) {
+assert text != null;
 final Set<String> currsentwords = new HashSet<String>();
 String word = "";
 String[] wordcache = new String[LibraryProvider.autotagging.getMaxWordsInTerm() - 1];
@@ -317,7 +313,7 @@ public final class Condenser {
 if (LibraryProvider.autotagging.size() == 0) doAutotagging = false;
 // read source
-final WordTokenizer wordenum = new WordTokenizer(is, meaningLib);
+final WordTokenizer wordenum = new WordTokenizer(new SentenceReader(text), meaningLib);
 try {
 while (wordenum.hasMoreElements()) {
 word = wordenum.nextElement().toString().toLowerCase(Locale.ENGLISH);
@@ -430,9 +426,7 @@ public final class Condenser {
 public static Map<String, Word> getWords(final String text, final WordCache meaningLib) {
 // returns a word/indexWord relation map
 if (text == null) return null;
-ByteArrayInputStream buffer;
-buffer = new ByteArrayInputStream(UTF8.getBytes(text));
-return new Condenser(buffer, meaningLib, false).words();
+return new Condenser(text, meaningLib, false).words();
 }
 public static void main(final String[] args) {

@@ -133,7 +133,7 @@ public class Document {
 this.outboundlinks = null;
 this.languages = languages;
 this.indexingDenied = indexingDenied;
-this.text = text == null ? new ByteArrayOutputStream() : text;
+this.text = text == null ? "" : text;
 }
 public Object getParserObject() {
@@ -299,7 +299,7 @@ dc_rights
 return this.sections.toArray(new String[this.sections.size()]);
 }
-public InputStream getText() {
+public InputStream getTextStream() {
 try {
 if (this.text == null) return new ByteArrayInputStream(UTF8.getBytes(""));
 if (this.text instanceof String) {
@@ -322,26 +322,26 @@ dc_rights
 return new ByteArrayInputStream(UTF8.getBytes(""));
 }
-public byte[] getTextBytes() {
+public String getTextString() {
 try {
-if (this.text == null) return new byte[0];
+if (this.text == null) return "";
 if (this.text instanceof String) {
-return UTF8.getBytes((String) this.text);
+return (String) this.text;
 } else if (this.text instanceof InputStream) {
-return FileUtils.read((InputStream) this.text);
+return UTF8.String(FileUtils.read((InputStream) this.text));
 } else if (this.text instanceof File) {
-return FileUtils.read((File) this.text);
+return UTF8.String(FileUtils.read((File) this.text));
 } else if (this.text instanceof byte[]) {
-return (byte[]) this.text;
+return UTF8.String((byte[]) this.text);
 } else if (this.text instanceof ByteArrayOutputStream) {
-return ((ByteArrayOutputStream) this.text).toByteArray();
+return UTF8.String(((ByteArrayOutputStream) this.text).toByteArray());
 }
 assert false : this.text.getClass().toString();
 return null;
 } catch (final Exception e) {
 Log.logException(e);
 }
-return new byte[0];
+return "";
 }
 public long getTextLength() {
@@ -367,16 +367,11 @@ dc_rights
 }
 public List<StringBuilder> getSentences(final boolean pre) {
-return getSentences(pre, getText());
-}
-
-public static List<StringBuilder> getSentences(final boolean pre, final InputStream text) {
-if (text == null) return null;
-final SentenceReader e = new SentenceReader(text);
-e.pre(pre);
-final List<StringBuilder> sentences = new ArrayList<StringBuilder>();
-while (e.hasNext()) {
-sentences.add(e.next());
+final SentenceReader sr = new SentenceReader(getTextString());
+sr.pre(pre);
+List<StringBuilder> sentences = new ArrayList<StringBuilder>();
+while (sr.hasNext()) {
+sentences.add(sr.next());
 }
 return sentences;
 }
@@ -638,7 +633,7 @@ dc_rights
 if (!(this.text instanceof ByteArrayOutputStream)) {
 this.text = new ByteArrayOutputStream();
 }
-FileUtils.copy(doc.getText(), (ByteArrayOutputStream) this.text);
+FileUtils.copy(doc.getTextStream(), (ByteArrayOutputStream) this.text);
 this.anchors.putAll(doc.getAnchors());
 this.rss.putAll(doc.getRSS());
@@ -707,11 +702,7 @@ dc_rights
 if (subject != null && subject.length() > 0) os.write("<dc:subject><![CDATA[" + subject + "]]></dc:subject>\n");
 if (this.text != null) {
 os.write("<dc:description><![CDATA[");
-final byte[] buffer = new byte[1000];
-int c = 0;
-final InputStream is = getText();
-while ((c = is.read(buffer)) > 0) os.write(UTF8.String(buffer, 0, c));
-is.close();
+os.write(getTextString());
 os.write("]]></dc:description>\n");
 }
 final String language = dc_language();
@@ -811,7 +802,7 @@ dc_rights
 if (doc.getTextLength() > 0) {
 if (docTextLength > 0) content.write('\n');
 try {
-docTextLength += FileUtils.copy(doc.getText(), content);
+docTextLength += FileUtils.copy(doc.getTextStream(), content);
 } catch (final IOException e) {
 Log.logException(e);
 }

@@ -24,12 +24,9 @@
 package net.yacy.document;
-import java.io.BufferedReader;
 import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
 import java.io.Reader;
-import java.io.UnsupportedEncodingException;
+import java.io.StringReader;
 import java.util.Iterator;
 public class SentenceReader implements Iterator<StringBuilder> {
@@ -37,17 +34,13 @@ public class SentenceReader implements Iterator<StringBuilder> {
 // this enumerates StringBuilder objects
 private StringBuilder buffer;
-private BufferedReader raf;
+private Reader raf;
 private int counter = 0;
 private boolean pre = false;
-public SentenceReader(final InputStream is) {
-assert is != null;
-try {
-raf = new BufferedReader(new InputStreamReader(is, "UTF-8"));
-} catch (UnsupportedEncodingException e) {
-e.printStackTrace();
-}
+public SentenceReader(final String text) {
+assert text != null;
+raf = new StringReader(text);
 buffer = nextElement0();
 counter = 0;
 pre = false;
@@ -144,9 +137,8 @@ public class SentenceReader implements Iterator<StringBuilder> {
 public synchronized void close() {
 try {
 raf.close();
-} catch(IOException ioe) {
-// Ignore IO Exceptions
-}
+} catch (IOException e) {
+}
 }
 }

@@ -178,8 +178,7 @@ public final class TextParser {
 } finally {
 if (sourceStream != null) try { sourceStream.close(); } catch (final Exception ex) {}
 }
-for (final Document d: docs) { assert d.getText() != null; } // verify docs
 return docs;
 }
@@ -261,7 +260,6 @@ public final class TextParser {
 if (log.isFine()) log.logInfo("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + fileExt + "'.");
 try {
 final Document[] docs = parser.parse(location, mimeType, documentCharset, sourceStream);
-for (final Document d: docs) { assert d != null && d.getText() != null; } // verify docs
 return docs;
 } catch (final Exception e) {
 throw new Parser.Failure("parser failed: " + parser.getName(), location);
@@ -324,7 +322,7 @@ public final class TextParser {
 throw new Parser.Failure("All parser failed: " + failedParsers, location);
 }
 }
-for (final Document d: docs) { assert d.getText() != null : "mimeType = " + mimeType; } // verify docs
+for (final Document d: docs) { assert d.getTextStream() != null : "mimeType = " + mimeType; } // verify docs
 return docs;
 }

@@ -24,15 +24,12 @@
 package net.yacy.document;
-import java.io.ByteArrayInputStream;
-import java.io.InputStream;
 import java.util.ArrayList;
 import java.util.Enumeration;
 import java.util.List;
 import java.util.SortedMap;
 import java.util.TreeMap;
-import net.yacy.cora.document.UTF8;
 import net.yacy.kelondro.data.word.Word;
 import net.yacy.kelondro.order.Base64Order;
@@ -44,9 +41,9 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
 private final unsievedWordsEnum e;
 private final WordCache meaningLib;
-public WordTokenizer(final InputStream is, final WordCache meaningLib) {
-assert is != null;
-this.e = new unsievedWordsEnum(is);
+public WordTokenizer(final SentenceReader sr, final WordCache meaningLib) {
+assert sr != null;
+this.e = new unsievedWordsEnum(sr);
 this.buffer = nextElement0();
 this.meaningLib = meaningLib;
 }
@@ -89,20 +86,20 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
 private static class unsievedWordsEnum implements Enumeration<StringBuilder> {
 // returns an enumeration of StringBuilder Objects
 private StringBuilder buffer = null;
-private final SentenceReader e;
+private final SentenceReader sr;
 private final List<StringBuilder> s;
 private int sIndex;
-public unsievedWordsEnum(final InputStream is) {
-assert is != null;
-this.e = new SentenceReader(is);
+public unsievedWordsEnum(final SentenceReader sr0) {
+assert sr0 != null;
+this.sr = sr0;
 this.s = new ArrayList<StringBuilder>();
 this.sIndex = 0;
 this.buffer = nextElement0();
 }
 public void pre(final boolean x) {
-this.e.pre(x);
+this.sr.pre(x);
 }
 private StringBuilder nextElement0() {
@@ -114,8 +111,8 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
 this.s.clear();
 }
 while (this.s.isEmpty()) {
-if (!this.e.hasNext()) return null;
-r = this.e.next();
+if (!this.sr.hasNext()) return null;
+r = this.sr.next();
 if (r == null) return null;
 r = trim(r);
 sb = new StringBuilder(20);
@@ -154,7 +151,7 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
 }
 public synchronized void close() {
-this.e.close();
+this.sr.close();
 }
 }
@@ -183,7 +180,7 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
 */
 public static SortedMap<byte[], Integer> hashSentence(final String sentence, final WordCache meaningLib, int maxlength) {
 final SortedMap<byte[], Integer> map = new TreeMap<byte[], Integer>(Base64Order.enhancedCoder);
-final WordTokenizer words = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(sentence)), meaningLib);
+final WordTokenizer words = new WordTokenizer(new SentenceReader(sentence), meaningLib);
 try {
 int pos = 0;
 StringBuilder word;

@@ -36,12 +36,10 @@ import java.util.Locale;
 import java.util.TreeMap;
 import net.yacy.cora.date.ISO8601Formatter;
-import net.yacy.cora.document.UTF8;
 import net.yacy.document.Document;
 import net.yacy.kelondro.data.meta.DigestURI;
 import net.yacy.kelondro.logging.Log;
 public class DCEntry extends TreeMap<String, String> {
 private static final long serialVersionUID = -2050291583515701559L;
@@ -277,7 +275,7 @@ public class DCEntry extends TreeMap<String, String> {
 null,
 "",
 getLon(), getLat(),
-UTF8.getBytes(getDescription()),
+getDescription(),
 null,
 null,
 null,

@@ -81,7 +81,7 @@ public class AugmentParser extends AbstractParser implements Parser {
 all = "yacylatest";
 newDoc = new Document(url, mimeType, charset, null, null, null, "", "",
-"", null, "", 0, 0, all.getBytes(), null, null, null, false);
+"", null, "", 0, 0, all, null, null, null, false);
 }
 return newDoc;
@@ -94,7 +94,7 @@ public class AugmentParser extends AbstractParser implements Parser {
 String all = "";
 Document newDoc = new Document(url, mimeType, charset, null, null, null, "", "",
-"", null, "", 0, 0, all.getBytes(), null, null, null, false);
+"", null, "", 0, 0, all, null, null, null, false);
 Iterator<net.yacy.kelondro.blob.Tables.Row> it;

@@ -60,28 +60,24 @@ public class csvParser extends AbstractParser implements Parser {
 for (final String[] row: table) {
 sb.append(concatRow(row)).append(' ');
 }
-try {
-return new Document[]{new Document(
-location,
-mimeType,
-charset,
-this,
-null,
-null,
-concatRow(table.get(0)),
-"",
-"",
-null,
-null,
-0.0f, 0.0f,
-sb.toString().getBytes(charset),
-null,
-null,
-null,
-false)};
-} catch (UnsupportedEncodingException e) {
-throw new Parser.Failure("error in csvParser, getBytes: " + e.getMessage(), location);
-}
+return new Document[]{new Document(
+location,
+mimeType,
+charset,
+this,
+null,
+null,
+concatRow(table.get(0)),
+"",
+"",
+null,
+null,
+0.0f, 0.0f,
+sb.toString(),
+null,
+null,
+null,
+false)};
 }
 private String concatRow(String[] columns) {

@@ -28,13 +28,10 @@
 package net.yacy.document.parser;
 import java.io.InputStream;
-import net.yacy.cora.document.UTF8;
 import net.yacy.document.AbstractParser;
 import net.yacy.document.Document;
 import net.yacy.document.Parser;
 import net.yacy.kelondro.data.meta.DigestURI;
 import org.apache.poi.hwpf.extractor.WordExtractor;
 public class docParser extends AbstractParser implements Parser {
@@ -99,7 +96,7 @@ public class docParser extends AbstractParser implements Parser {
 null,
 null,
 0.0f, 0.0f,
-UTF8.getBytes(contents.toString()),
+contents.toString(),
 null,
 null,
 null,

@@ -65,9 +65,6 @@ public class genericParser extends AbstractParser implements Parser {
 null,
 null,
 false)};
-for (final Document d: docs) {
-assert d.getText() != null : "mimeType = " + mimeType;
-} // verify docs
 return docs;
 }
 }

@@ -626,12 +626,12 @@ public class ContentScraper extends AbstractScraper implements Scraper {
 return false;
 }
-public byte[] getText() {
+public String getText() {
 try {
-return this.content.getBytes();
+return this.content.toString();
 } catch (final OutOfMemoryError e) {
 Log.logException(e);
-return new byte[0];
+return "";
 }
 }

@@ -43,7 +43,6 @@ import java.util.Set;
 import javax.imageio.ImageIO;
 import net.yacy.cora.document.MultiProtocolURI;
-import net.yacy.cora.document.UTF8;
 import net.yacy.document.AbstractParser;
 import net.yacy.document.Document;
 import net.yacy.document.Parser;
@@ -206,7 +205,7 @@ public class genericImageParser extends AbstractParser implements Parser {
 new String[]{}, // sections
 description == null ? "" : description, // description
 0.0f, 0.0f, // TODO parse location
-UTF8.getBytes(infoString), // content text
+infoString, // content text
 anchors, // anchors
 null,
 images,

@@ -250,7 +250,7 @@ public class pdfParser extends AbstractParser implements Parser {
 System.out.println("\tParsed text with " + document.getTextLength() + " chars of text and " + document.getAnchors().size() + " anchors");
 try {
 // write file
-FileUtils.copy(document.getText(), new File("parsedPdf.txt"));
+FileUtils.copy(document.getTextStream(), new File("parsedPdf.txt"));
 } catch (final IOException e) {
 System.err.println("error saving parsed document");
 Log.logException(e);

@@ -30,7 +30,6 @@ package net.yacy.document.parser;
 import java.io.BufferedInputStream;
 import java.io.InputStream;
-import net.yacy.cora.document.UTF8;
 import net.yacy.document.AbstractParser;
 import net.yacy.document.Document;
 import net.yacy.document.Parser;
@@ -95,7 +94,7 @@ public class pptParser extends AbstractParser implements Parser {
 null,
 null,
 0.0f, 0.0f,
-UTF8.getBytes(contents),
+contents,
 null,
 null,
 null,

@@ -58,7 +58,7 @@ public class rdfParser extends AbstractParser implements Parser {
 String all = "rdfdatasource";
 doc = new Document(url, mimeType, charset, null, null, null, "", "",
-"", null, "", 0, 0, all.getBytes(), null, null, null, false);
+"", null, "", 0, 0, all, null, null, null, false);
 docs.add(doc);

@@ -142,7 +142,7 @@ public class RDFaParser extends AbstractParser implements Parser {
 }
 Document doc = new Document(url, mimeType, charset, null, null, null, "", "",
-"", null, "", 0, 0, all.getBytes(), null, null, null, false);
+"", null, "", 0, 0, all, null, null, null, false);
 return doc;
 }

@@ -32,7 +32,6 @@ import java.io.InputStream;
 import javax.swing.text.DefaultStyledDocument;
 import javax.swing.text.rtf.RTFEditorKit;
-import net.yacy.cora.document.UTF8;
 import net.yacy.document.AbstractParser;
 import net.yacy.document.Document;
 import net.yacy.document.Parser;
@@ -80,7 +79,7 @@ public class rtfParser extends AbstractParser implements Parser {
 null,
 null,
 0.0f, 0.0f,
-UTF8.getBytes(bodyText),
+bodyText,
 null,
 null,
 null,

@@ -34,7 +34,6 @@ import java.util.Map;
 import java.util.Properties;
 import net.yacy.cora.document.MultiProtocolURI;
-import net.yacy.cora.document.UTF8;
 import net.yacy.document.AbstractParser;
 import net.yacy.document.Document;
 import net.yacy.document.Parser;
@@ -121,7 +120,7 @@ public class swfParser extends AbstractParser implements Parser {
 sections, // an array of section headlines
 abstrct, // an abstract
 0.0f, 0.0f,
-UTF8.getBytes(contents), // the parsed document text
+contents, // the parsed document text
 anchors, // a map of extracted anchors
 null,
 null,

@@ -28,7 +28,6 @@ import java.io.ByteArrayInputStream;
 import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
-import java.io.UnsupportedEncodingException;
 import java.util.List;
 import java.util.Map;
@@ -94,28 +93,24 @@ public class torrentParser extends AbstractParser implements Parser {
 if (nameo != null) title = UTF8.String(nameo.getString());
 }
 if (title == null || title.length() == 0) title = MultiProtocolURI.unescape(location.getFileName());
-try {
-return new Document[]{new Document(
-location,
-mimeType,
-charset,
-this,
-null,
-null,
-title, // title
-comment, // author
-location.getHost(),
-null,
-null,
-0.0f, 0.0f,
-filenames.toString().getBytes(charset),
-null,
-null,
-null,
-false)};
-} catch (UnsupportedEncodingException e) {
-throw new Parser.Failure("error in torrentParser, getBytes: " + e.getMessage(), location);
-}
+return new Document[]{new Document(
+location,
+mimeType,
+charset,
+this,
+null,
+null,
+title, // title
+comment, // author
+location.getHost(),
+null,
+null,
+0.0f, 0.0f,
+filenames.toString(),
+null,
+null,
+null,
+false)};
 }
 public static void main(String[] args) {

@@ -29,7 +29,6 @@ package net.yacy.document.parser;
 import java.io.InputStream;
-import net.yacy.cora.document.UTF8;
 import net.yacy.document.AbstractParser;
 import net.yacy.document.Document;
 import net.yacy.document.Parser;
@@ -115,7 +114,7 @@ public class vsdParser extends AbstractParser implements Parser {
 null, // an array of section headlines
 abstrct, // an abstract
 0.0f, 0.0f,
-UTF8.getBytes(contents), // the parsed document text
+contents, // the parsed document text
 null, // a map of extracted anchors
 null,
 null, // a treeset of image URLs

@@ -29,7 +29,6 @@ package net.yacy.document.parser;
 import java.io.InputStream;
-import net.yacy.cora.document.UTF8;
 import net.yacy.document.AbstractParser;
 import net.yacy.document.Document;
 import net.yacy.document.Parser;
@@ -126,7 +125,7 @@ public class xlsParser extends AbstractParser implements Parser {
 null,
 null,
 0.0f, 0.0f,
-UTF8.getBytes(contents),
+contents,
 null,
 null,
 null,

@@ -24,7 +24,6 @@
 package net.yacy.search.snippet;
-import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Collection;
@@ -36,12 +35,12 @@ import java.util.SortedMap;
 import java.util.regex.Pattern;
 import net.yacy.cora.document.ASCII;
-import net.yacy.cora.document.UTF8;
 import net.yacy.cora.services.federated.yacy.CacheStrategy;
 import net.yacy.cora.storage.ARC;
 import net.yacy.cora.storage.ConcurrentARC;
 import net.yacy.document.Document;
 import net.yacy.document.Parser;
+import net.yacy.document.SentenceReader;
 import net.yacy.document.SnippetExtractor;
 import net.yacy.document.WordTokenizer;
 import net.yacy.document.parser.html.CharacterCoding;
@@ -183,7 +182,13 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
 // try the solr text first
 if (solrText != null) {
 // compute sentences from solr query
-sentences = Document.getSentences(pre, new ByteArrayInputStream(UTF8.getBytes(solrText)));
+final SentenceReader sr = new SentenceReader(solrText);
+sr.pre(pre);
+sentences = new ArrayList<StringBuilder>();
+while (sr.hasNext()) {
+sentences.add(sr.next());
+}
 if (sentences != null) {
 try {
 final SnippetExtractor tsr = new SnippetExtractor(sentences, remainingHashes, snippetMaxLength);
