refactoring and new usage of SentenceReader: this class appeared as one

of the major CPU users during snippet verification. The class was not
efficient for several reasons:
- it used an overly complex input stream, generated from sources and UTF-8
byte conversions. The BufferedReader added significant overhead.
- to feed data into the SentenceReader, multiple toString/getBytes
conversions were applied before a buffered Reader could be created from an
input stream. These superfluous conversions have been removed.
- the best source for the SentenceReader is a String. Therefore the
production of Strings has been moved into the Document class.
pull/1/head
orbiter 13 years ago
parent bb8dcb4911
commit 78fc3cf8f8

@ -27,7 +27,6 @@
//javac -classpath .:../Classes Status.java
//if the shell's current path is HTROOT
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.Collection;
@ -47,6 +46,7 @@ import net.yacy.document.Condenser;
import net.yacy.document.Document;
import net.yacy.document.LibraryProvider;
import net.yacy.document.Parser;
import net.yacy.document.SentenceReader;
import net.yacy.document.WordTokenizer;
import net.yacy.document.parser.html.CharacterCoding;
import net.yacy.document.parser.html.ImageEntry;
@ -232,7 +232,7 @@ public class ViewFile {
}
if (viewMode.equals("parsed")) {
final String content = UTF8.String(document.getTextBytes());
final String content = document.getTextString();
// content = wikiCode.replaceHTML(content); // added by Marc Nause
prop.put("viewMode", VIEW_MODE_AS_PARSED_TEXT);
prop.put("viewMode_title", document.dc_title());
@ -284,7 +284,7 @@ public class ViewFile {
for (final StringBuilder s: sentences) {
sentence = s.toString();
Enumeration<StringBuilder> tokens = null;
tokens = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(sentence)), LibraryProvider.dymLib);
tokens = new WordTokenizer(new SentenceReader(sentence), LibraryProvider.dymLib);
while (tokens.hasMoreElements()) {
token = tokens.nextElement();
if (token.length() > 0) {

@ -1,6 +1,5 @@
package de.anomic.data.ymark;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.Arrays;
@ -11,12 +10,12 @@ import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.ArrayBlockingQueue;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.document.Condenser;
import net.yacy.document.Document;
import net.yacy.document.LibraryProvider;
import net.yacy.document.Parser.Failure;
import net.yacy.document.SentenceReader;
import net.yacy.document.WordTokenizer;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.word.Word;
@ -100,7 +99,7 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
buffer.append(document.dc_title().toLowerCase());
buffer.append(document.dc_description().toLowerCase());
buffer.append(document.dc_subject(' ').toLowerCase());
final WordTokenizer tokens = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(buffer.toString())), LibraryProvider.dymLib);
final WordTokenizer tokens = new WordTokenizer(new SentenceReader(buffer.toString()), LibraryProvider.dymLib);
try {
int score = 0;
@ -177,7 +176,7 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
private static TreeMap<String, YMarkTag> getPhrases(final Document document, final int size) {
final TreeMap<String, YMarkTag> phrases = new TreeMap<String, YMarkTag>();
final StringBuilder phrase = new StringBuilder(128);
final WordTokenizer tokens = new WordTokenizer(document.getText(), LibraryProvider.dymLib);
final WordTokenizer tokens = new WordTokenizer(new SentenceReader(document.getTextString()), LibraryProvider.dymLib);
try {
StringBuilder token;
int count = 0;

@ -20,12 +20,10 @@
package net.yacy.document;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
@ -39,7 +37,6 @@ import java.util.TreeMap;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.Classification.ContentDomain;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.lod.vocabulary.Tagging;
import net.yacy.document.language.Identificator;
import net.yacy.document.parser.html.ImageEntry;
@ -133,7 +130,7 @@ public final class Condenser {
Map.Entry<MultiProtocolURI, String> entry;
if (indexText) {
createCondensement(document.getText(), meaningLib, doAutotagging);
createCondensement(document.getTextString(), meaningLib, doAutotagging);
// the phrase counter:
// phrase 0 are words taken from the URL
// phrase 1 is the MainTitle
@ -146,16 +143,15 @@ public final class Condenser {
// phrase 98 is taken from the embedded anchor/hyperlinks description (REMOVED!)
// phrase 99 is taken from the media Link url and anchor description
// phrase 100 and above are lines from the text
insertTextToWords(document.dc_title(), 1, WordReferenceRow.flag_app_dc_title, this.RESULT_FLAGS, true, meaningLib);
insertTextToWords(document.dc_description(), 3, WordReferenceRow.flag_app_dc_description, this.RESULT_FLAGS, true, meaningLib);
insertTextToWords(document.dc_creator(), 4, WordReferenceRow.flag_app_dc_creator, this.RESULT_FLAGS, true, meaningLib);
insertTextToWords(document.dc_publisher(), 5, WordReferenceRow.flag_app_dc_creator, this.RESULT_FLAGS, true, meaningLib);
insertTextToWords(document.dc_subject(' '), 6, WordReferenceRow.flag_app_dc_description, this.RESULT_FLAGS, true, meaningLib);
insertTextToWords(new SentenceReader(document.dc_title()), 1, WordReferenceRow.flag_app_dc_title, this.RESULT_FLAGS, true, meaningLib);
insertTextToWords(new SentenceReader(document.dc_description()), 3, WordReferenceRow.flag_app_dc_description, this.RESULT_FLAGS, true, meaningLib);
insertTextToWords(new SentenceReader(document.dc_creator()), 4, WordReferenceRow.flag_app_dc_creator, this.RESULT_FLAGS, true, meaningLib);
insertTextToWords(new SentenceReader(document.dc_publisher()), 5, WordReferenceRow.flag_app_dc_creator, this.RESULT_FLAGS, true, meaningLib);
insertTextToWords(new SentenceReader(document.dc_subject(' ')), 6, WordReferenceRow.flag_app_dc_description, this.RESULT_FLAGS, true, meaningLib);
// missing: tags!
final String[] titles = document.getSectionTitles();
for (int i = 0; i < titles.length; i++) {
insertTextToWords(titles[i], i + 10, WordReferenceRow.flag_app_emphasized, this.RESULT_FLAGS, true, meaningLib);
insertTextToWords(new SentenceReader(titles[i]), i + 10, WordReferenceRow.flag_app_emphasized, this.RESULT_FLAGS, true, meaningLib);
}
// anchors: for text indexing we add only the anchor description
@ -180,7 +176,7 @@ public final class Condenser {
}
// add the URL components to the word list
insertTextToWords(document.dc_source().toNormalform(false, true), 0, WordReferenceRow.flag_app_dc_identifier, this.RESULT_FLAGS, false, meaningLib);
insertTextToWords(new SentenceReader(document.dc_source().toNormalform(false, true)), 0, WordReferenceRow.flag_app_dc_identifier, this.RESULT_FLAGS, false, meaningLib);
if (indexMedia) {
// add anchor descriptions: here, we also add the url components
@ -188,24 +184,24 @@ public final class Condenser {
Iterator<Map.Entry<MultiProtocolURI, String>> i = document.getAudiolinks().entrySet().iterator();
while (i.hasNext()) {
entry = i.next();
insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasaudio, this.RESULT_FLAGS, false, meaningLib);
insertTextToWords(entry.getValue(), 99, flag_cat_hasaudio, this.RESULT_FLAGS, true, meaningLib);
insertTextToWords(new SentenceReader(entry.getKey().toNormalform(false, false)), 99, flag_cat_hasaudio, this.RESULT_FLAGS, false, meaningLib);
insertTextToWords(new SentenceReader(entry.getValue()), 99, flag_cat_hasaudio, this.RESULT_FLAGS, true, meaningLib);
}
// video
i = document.getVideolinks().entrySet().iterator();
while (i.hasNext()) {
entry = i.next();
insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasvideo, this.RESULT_FLAGS, false, meaningLib);
insertTextToWords(entry.getValue(), 99, flag_cat_hasvideo, this.RESULT_FLAGS, true, meaningLib);
insertTextToWords(new SentenceReader(entry.getKey().toNormalform(false, false)), 99, flag_cat_hasvideo, this.RESULT_FLAGS, false, meaningLib);
insertTextToWords(new SentenceReader(entry.getValue()), 99, flag_cat_hasvideo, this.RESULT_FLAGS, true, meaningLib);
}
// applications
i = document.getApplinks().entrySet().iterator();
while (i.hasNext()) {
entry = i.next();
insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasapp, this.RESULT_FLAGS, false, meaningLib);
insertTextToWords(entry.getValue(), 99, flag_cat_hasapp, this.RESULT_FLAGS, true, meaningLib);
insertTextToWords(new SentenceReader(entry.getKey().toNormalform(false, false)), 99, flag_cat_hasapp, this.RESULT_FLAGS, false, meaningLib);
insertTextToWords(new SentenceReader(entry.getValue()), 99, flag_cat_hasapp, this.RESULT_FLAGS, true, meaningLib);
}
// images
@ -216,8 +212,8 @@ public final class Condenser {
ientry = j.next();
url = ientry.url();
if (url == null) continue;
insertTextToWords(url.toNormalform(false, false), 99, flag_cat_hasimage, this.RESULT_FLAGS, false, meaningLib);
insertTextToWords(ientry.alt(), 99, flag_cat_hasimage, this.RESULT_FLAGS, true, meaningLib);
insertTextToWords(new SentenceReader(url.toNormalform(false, false)), 99, flag_cat_hasimage, this.RESULT_FLAGS, false, meaningLib);
insertTextToWords(new SentenceReader(ientry.alt()), 99, flag_cat_hasimage, this.RESULT_FLAGS, true, meaningLib);
}
// finally check all words for missing flag entry
@ -241,7 +237,7 @@ public final class Condenser {
}
private void insertTextToWords(
final String text,
final SentenceReader text,
final int phrase,
final int flagpos,
final Bitfield flagstemplate,
@ -250,7 +246,7 @@ public final class Condenser {
if (text == null) return;
String word;
Word wprop;
WordTokenizer wordenum = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(text)), meaningLib);
WordTokenizer wordenum = new WordTokenizer(text, meaningLib);
try {
int pip = 0;
while (wordenum.hasMoreElements()) {
@ -271,7 +267,7 @@ public final class Condenser {
}
}
public Condenser(final InputStream text, final WordCache meaningLib, boolean doAutotagging) {
public Condenser(final String text, final WordCache meaningLib, boolean doAutotagging) {
this.languageIdentificator = null; // we don't need that here
// analysis = new Properties();
this.words = new TreeMap<String, Word>();
@ -295,8 +291,8 @@ public final class Condenser {
return this.languageIdentificator.getLanguage();
}
private void createCondensement(final InputStream is, final WordCache meaningLib, boolean doAutotagging) {
assert is != null;
private void createCondensement(final String text, final WordCache meaningLib, boolean doAutotagging) {
assert text != null;
final Set<String> currsentwords = new HashSet<String>();
String word = "";
String[] wordcache = new String[LibraryProvider.autotagging.getMaxWordsInTerm() - 1];
@ -317,7 +313,7 @@ public final class Condenser {
if (LibraryProvider.autotagging.size() == 0) doAutotagging = false;
// read source
final WordTokenizer wordenum = new WordTokenizer(is, meaningLib);
final WordTokenizer wordenum = new WordTokenizer(new SentenceReader(text), meaningLib);
try {
while (wordenum.hasMoreElements()) {
word = wordenum.nextElement().toString().toLowerCase(Locale.ENGLISH);
@ -430,9 +426,7 @@ public final class Condenser {
public static Map<String, Word> getWords(final String text, final WordCache meaningLib) {
// returns a word/indexWord relation map
if (text == null) return null;
ByteArrayInputStream buffer;
buffer = new ByteArrayInputStream(UTF8.getBytes(text));
return new Condenser(buffer, meaningLib, false).words();
return new Condenser(text, meaningLib, false).words();
}
public static void main(final String[] args) {

@ -133,7 +133,7 @@ public class Document {
this.outboundlinks = null;
this.languages = languages;
this.indexingDenied = indexingDenied;
this.text = text == null ? new ByteArrayOutputStream() : text;
this.text = text == null ? "" : text;
}
public Object getParserObject() {
@ -299,7 +299,7 @@ dc_rights
return this.sections.toArray(new String[this.sections.size()]);
}
public InputStream getText() {
public InputStream getTextStream() {
try {
if (this.text == null) return new ByteArrayInputStream(UTF8.getBytes(""));
if (this.text instanceof String) {
@ -322,26 +322,26 @@ dc_rights
return new ByteArrayInputStream(UTF8.getBytes(""));
}
public byte[] getTextBytes() {
public String getTextString() {
try {
if (this.text == null) return new byte[0];
if (this.text == null) return "";
if (this.text instanceof String) {
return UTF8.getBytes((String) this.text);
return (String) this.text;
} else if (this.text instanceof InputStream) {
return FileUtils.read((InputStream) this.text);
return UTF8.String(FileUtils.read((InputStream) this.text));
} else if (this.text instanceof File) {
return FileUtils.read((File) this.text);
return UTF8.String(FileUtils.read((File) this.text));
} else if (this.text instanceof byte[]) {
return (byte[]) this.text;
return UTF8.String((byte[]) this.text);
} else if (this.text instanceof ByteArrayOutputStream) {
return ((ByteArrayOutputStream) this.text).toByteArray();
return UTF8.String(((ByteArrayOutputStream) this.text).toByteArray());
}
assert false : this.text.getClass().toString();
return null;
} catch (final Exception e) {
Log.logException(e);
}
return new byte[0];
return "";
}
public long getTextLength() {
@ -367,16 +367,11 @@ dc_rights
}
public List<StringBuilder> getSentences(final boolean pre) {
return getSentences(pre, getText());
}
public static List<StringBuilder> getSentences(final boolean pre, final InputStream text) {
if (text == null) return null;
final SentenceReader e = new SentenceReader(text);
e.pre(pre);
final List<StringBuilder> sentences = new ArrayList<StringBuilder>();
while (e.hasNext()) {
sentences.add(e.next());
final SentenceReader sr = new SentenceReader(getTextString());
sr.pre(pre);
List<StringBuilder> sentences = new ArrayList<StringBuilder>();
while (sr.hasNext()) {
sentences.add(sr.next());
}
return sentences;
}
@ -638,7 +633,7 @@ dc_rights
if (!(this.text instanceof ByteArrayOutputStream)) {
this.text = new ByteArrayOutputStream();
}
FileUtils.copy(doc.getText(), (ByteArrayOutputStream) this.text);
FileUtils.copy(doc.getTextStream(), (ByteArrayOutputStream) this.text);
this.anchors.putAll(doc.getAnchors());
this.rss.putAll(doc.getRSS());
@ -707,11 +702,7 @@ dc_rights
if (subject != null && subject.length() > 0) os.write("<dc:subject><![CDATA[" + subject + "]]></dc:subject>\n");
if (this.text != null) {
os.write("<dc:description><![CDATA[");
final byte[] buffer = new byte[1000];
int c = 0;
final InputStream is = getText();
while ((c = is.read(buffer)) > 0) os.write(UTF8.String(buffer, 0, c));
is.close();
os.write(getTextString());
os.write("]]></dc:description>\n");
}
final String language = dc_language();
@ -811,7 +802,7 @@ dc_rights
if (doc.getTextLength() > 0) {
if (docTextLength > 0) content.write('\n');
try {
docTextLength += FileUtils.copy(doc.getText(), content);
docTextLength += FileUtils.copy(doc.getTextStream(), content);
} catch (final IOException e) {
Log.logException(e);
}

@ -24,12 +24,9 @@
package net.yacy.document;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.io.StringReader;
import java.util.Iterator;
public class SentenceReader implements Iterator<StringBuilder> {
@ -37,17 +34,13 @@ public class SentenceReader implements Iterator<StringBuilder> {
// this enumerates StringBuilder objects
private StringBuilder buffer;
private BufferedReader raf;
private Reader raf;
private int counter = 0;
private boolean pre = false;
public SentenceReader(final InputStream is) {
assert is != null;
try {
raf = new BufferedReader(new InputStreamReader(is, "UTF-8"));
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
}
public SentenceReader(final String text) {
assert text != null;
raf = new StringReader(text);
buffer = nextElement0();
counter = 0;
pre = false;
@ -144,9 +137,8 @@ public class SentenceReader implements Iterator<StringBuilder> {
public synchronized void close() {
try {
raf.close();
} catch(IOException ioe) {
// Ignore IO Exceptions
}
raf.close();
} catch (IOException e) {
}
}
}

@ -178,8 +178,7 @@ public final class TextParser {
} finally {
if (sourceStream != null) try { sourceStream.close(); } catch (final Exception ex) {}
}
for (final Document d: docs) { assert d.getText() != null; } // verify docs
return docs;
}
@ -261,7 +260,6 @@ public final class TextParser {
if (log.isFine()) log.logInfo("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + fileExt + "'.");
try {
final Document[] docs = parser.parse(location, mimeType, documentCharset, sourceStream);
for (final Document d: docs) { assert d != null && d.getText() != null; } // verify docs
return docs;
} catch (final Exception e) {
throw new Parser.Failure("parser failed: " + parser.getName(), location);
@ -324,7 +322,7 @@ public final class TextParser {
throw new Parser.Failure("All parser failed: " + failedParsers, location);
}
}
for (final Document d: docs) { assert d.getText() != null : "mimeType = " + mimeType; } // verify docs
for (final Document d: docs) { assert d.getTextStream() != null : "mimeType = " + mimeType; } // verify docs
return docs;
}

@ -24,15 +24,12 @@
package net.yacy.document;
import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.List;
import java.util.SortedMap;
import java.util.TreeMap;
import net.yacy.cora.document.UTF8;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.order.Base64Order;
@ -44,9 +41,9 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
private final unsievedWordsEnum e;
private final WordCache meaningLib;
public WordTokenizer(final InputStream is, final WordCache meaningLib) {
assert is != null;
this.e = new unsievedWordsEnum(is);
public WordTokenizer(final SentenceReader sr, final WordCache meaningLib) {
assert sr != null;
this.e = new unsievedWordsEnum(sr);
this.buffer = nextElement0();
this.meaningLib = meaningLib;
}
@ -89,20 +86,20 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
private static class unsievedWordsEnum implements Enumeration<StringBuilder> {
// returns an enumeration of StringBuilder Objects
private StringBuilder buffer = null;
private final SentenceReader e;
private final SentenceReader sr;
private final List<StringBuilder> s;
private int sIndex;
public unsievedWordsEnum(final InputStream is) {
assert is != null;
this.e = new SentenceReader(is);
public unsievedWordsEnum(final SentenceReader sr0) {
assert sr0 != null;
this.sr = sr0;
this.s = new ArrayList<StringBuilder>();
this.sIndex = 0;
this.buffer = nextElement0();
}
public void pre(final boolean x) {
this.e.pre(x);
this.sr.pre(x);
}
private StringBuilder nextElement0() {
@ -114,8 +111,8 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
this.s.clear();
}
while (this.s.isEmpty()) {
if (!this.e.hasNext()) return null;
r = this.e.next();
if (!this.sr.hasNext()) return null;
r = this.sr.next();
if (r == null) return null;
r = trim(r);
sb = new StringBuilder(20);
@ -154,7 +151,7 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
}
public synchronized void close() {
this.e.close();
this.sr.close();
}
}
@ -183,7 +180,7 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
*/
public static SortedMap<byte[], Integer> hashSentence(final String sentence, final WordCache meaningLib, int maxlength) {
final SortedMap<byte[], Integer> map = new TreeMap<byte[], Integer>(Base64Order.enhancedCoder);
final WordTokenizer words = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(sentence)), meaningLib);
final WordTokenizer words = new WordTokenizer(new SentenceReader(sentence), meaningLib);
try {
int pos = 0;
StringBuilder word;

@ -36,12 +36,10 @@ import java.util.Locale;
import java.util.TreeMap;
import net.yacy.cora.date.ISO8601Formatter;
import net.yacy.cora.document.UTF8;
import net.yacy.document.Document;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
public class DCEntry extends TreeMap<String, String> {
private static final long serialVersionUID = -2050291583515701559L;
@ -277,7 +275,7 @@ public class DCEntry extends TreeMap<String, String> {
null,
"",
getLon(), getLat(),
UTF8.getBytes(getDescription()),
getDescription(),
null,
null,
null,

@ -81,7 +81,7 @@ public class AugmentParser extends AbstractParser implements Parser {
all = "yacylatest";
newDoc = new Document(url, mimeType, charset, null, null, null, "", "",
"", null, "", 0, 0, all.getBytes(), null, null, null, false);
"", null, "", 0, 0, all, null, null, null, false);
}
return newDoc;
@ -94,7 +94,7 @@ public class AugmentParser extends AbstractParser implements Parser {
String all = "";
Document newDoc = new Document(url, mimeType, charset, null, null, null, "", "",
"", null, "", 0, 0, all.getBytes(), null, null, null, false);
"", null, "", 0, 0, all, null, null, null, false);
Iterator<net.yacy.kelondro.blob.Tables.Row> it;

@ -60,28 +60,24 @@ public class csvParser extends AbstractParser implements Parser {
for (final String[] row: table) {
sb.append(concatRow(row)).append(' ');
}
try {
return new Document[]{new Document(
location,
mimeType,
charset,
this,
null,
null,
concatRow(table.get(0)),
"",
"",
null,
null,
0.0f, 0.0f,
sb.toString().getBytes(charset),
null,
null,
null,
false)};
} catch (UnsupportedEncodingException e) {
throw new Parser.Failure("error in csvParser, getBytes: " + e.getMessage(), location);
}
return new Document[]{new Document(
location,
mimeType,
charset,
this,
null,
null,
concatRow(table.get(0)),
"",
"",
null,
null,
0.0f, 0.0f,
sb.toString(),
null,
null,
null,
false)};
}
private String concatRow(String[] columns) {

@ -28,13 +28,10 @@
package net.yacy.document.parser;
import java.io.InputStream;
import net.yacy.cora.document.UTF8;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.kelondro.data.meta.DigestURI;
import org.apache.poi.hwpf.extractor.WordExtractor;
public class docParser extends AbstractParser implements Parser {
@ -99,7 +96,7 @@ public class docParser extends AbstractParser implements Parser {
null,
null,
0.0f, 0.0f,
UTF8.getBytes(contents.toString()),
contents.toString(),
null,
null,
null,

@ -65,9 +65,6 @@ public class genericParser extends AbstractParser implements Parser {
null,
null,
false)};
for (final Document d: docs) {
assert d.getText() != null : "mimeType = " + mimeType;
} // verify docs
return docs;
}
}

@ -626,12 +626,12 @@ public class ContentScraper extends AbstractScraper implements Scraper {
return false;
}
public byte[] getText() {
public String getText() {
try {
return this.content.getBytes();
return this.content.toString();
} catch (final OutOfMemoryError e) {
Log.logException(e);
return new byte[0];
return "";
}
}

@ -43,7 +43,6 @@ import java.util.Set;
import javax.imageio.ImageIO;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
@ -206,7 +205,7 @@ public class genericImageParser extends AbstractParser implements Parser {
new String[]{}, // sections
description == null ? "" : description, // description
0.0f, 0.0f, // TODO parse location
UTF8.getBytes(infoString), // content text
infoString, // content text
anchors, // anchors
null,
images,

@ -250,7 +250,7 @@ public class pdfParser extends AbstractParser implements Parser {
System.out.println("\tParsed text with " + document.getTextLength() + " chars of text and " + document.getAnchors().size() + " anchors");
try {
// write file
FileUtils.copy(document.getText(), new File("parsedPdf.txt"));
FileUtils.copy(document.getTextStream(), new File("parsedPdf.txt"));
} catch (final IOException e) {
System.err.println("error saving parsed document");
Log.logException(e);

@ -30,7 +30,6 @@ package net.yacy.document.parser;
import java.io.BufferedInputStream;
import java.io.InputStream;
import net.yacy.cora.document.UTF8;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
@ -95,7 +94,7 @@ public class pptParser extends AbstractParser implements Parser {
null,
null,
0.0f, 0.0f,
UTF8.getBytes(contents),
contents,
null,
null,
null,

@ -58,7 +58,7 @@ public class rdfParser extends AbstractParser implements Parser {
String all = "rdfdatasource";
doc = new Document(url, mimeType, charset, null, null, null, "", "",
"", null, "", 0, 0, all.getBytes(), null, null, null, false);
"", null, "", 0, 0, all, null, null, null, false);
docs.add(doc);

@ -142,7 +142,7 @@ public class RDFaParser extends AbstractParser implements Parser {
}
Document doc = new Document(url, mimeType, charset, null, null, null, "", "",
"", null, "", 0, 0, all.getBytes(), null, null, null, false);
"", null, "", 0, 0, all, null, null, null, false);
return doc;
}

@ -32,7 +32,6 @@ import java.io.InputStream;
import javax.swing.text.DefaultStyledDocument;
import javax.swing.text.rtf.RTFEditorKit;
import net.yacy.cora.document.UTF8;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
@ -80,7 +79,7 @@ public class rtfParser extends AbstractParser implements Parser {
null,
null,
0.0f, 0.0f,
UTF8.getBytes(bodyText),
bodyText,
null,
null,
null,

@ -34,7 +34,6 @@ import java.util.Map;
import java.util.Properties;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
@ -121,7 +120,7 @@ public class swfParser extends AbstractParser implements Parser {
sections, // an array of section headlines
abstrct, // an abstract
0.0f, 0.0f,
UTF8.getBytes(contents), // the parsed document text
contents, // the parsed document text
anchors, // a map of extracted anchors
null,
null,

@ -28,7 +28,6 @@ import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.List;
import java.util.Map;
@ -94,28 +93,24 @@ public class torrentParser extends AbstractParser implements Parser {
if (nameo != null) title = UTF8.String(nameo.getString());
}
if (title == null || title.length() == 0) title = MultiProtocolURI.unescape(location.getFileName());
try {
return new Document[]{new Document(
location,
mimeType,
charset,
this,
null,
null,
title, // title
comment, // author
location.getHost(),
null,
null,
0.0f, 0.0f,
filenames.toString().getBytes(charset),
null,
null,
null,
false)};
} catch (UnsupportedEncodingException e) {
throw new Parser.Failure("error in torrentParser, getBytes: " + e.getMessage(), location);
}
return new Document[]{new Document(
location,
mimeType,
charset,
this,
null,
null,
title, // title
comment, // author
location.getHost(),
null,
null,
0.0f, 0.0f,
filenames.toString(),
null,
null,
null,
false)};
}
public static void main(String[] args) {

@ -29,7 +29,6 @@ package net.yacy.document.parser;
import java.io.InputStream;
import net.yacy.cora.document.UTF8;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
@ -115,7 +114,7 @@ public class vsdParser extends AbstractParser implements Parser {
null, // an array of section headlines
abstrct, // an abstract
0.0f, 0.0f,
UTF8.getBytes(contents), // the parsed document text
contents, // the parsed document text
null, // a map of extracted anchors
null,
null, // a treeset of image URLs

@ -29,7 +29,6 @@ package net.yacy.document.parser;
import java.io.InputStream;
import net.yacy.cora.document.UTF8;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
@ -126,7 +125,7 @@ public class xlsParser extends AbstractParser implements Parser {
null,
null,
0.0f, 0.0f,
UTF8.getBytes(contents),
contents,
null,
null,
null,

@ -24,7 +24,6 @@
package net.yacy.search.snippet;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
@ -36,12 +35,12 @@ import java.util.SortedMap;
import java.util.regex.Pattern;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.cora.storage.ARC;
import net.yacy.cora.storage.ConcurrentARC;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.SentenceReader;
import net.yacy.document.SnippetExtractor;
import net.yacy.document.WordTokenizer;
import net.yacy.document.parser.html.CharacterCoding;
@ -183,7 +182,13 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
// try the solr text first
if (solrText != null) {
// compute sentences from solr query
sentences = Document.getSentences(pre, new ByteArrayInputStream(UTF8.getBytes(solrText)));
final SentenceReader sr = new SentenceReader(solrText);
sr.pre(pre);
sentences = new ArrayList<StringBuilder>();
while (sr.hasNext()) {
sentences.add(sr.next());
}
if (sentences != null) {
try {
final SnippetExtractor tsr = new SnippetExtractor(sentences, remainingHashes, snippetMaxLength);

Loading…
Cancel
Save