added pull request from als plus an NPE fix

pull/1/head
Michael Christen 13 years ago
commit 9cd469e6d6

@ -1,17 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<classpath>
<classpathentry excluding="env/|htdocsdefault/|proxymsg/|yacy/|env/|yacy/user/|yacy/user/|yacy/ui/|processing/domaingraph/applet/|processing/domaingraph/|api/|api/bookmarks/posts/|api/bookmarks/|api/util/|api/bookmarks/xbel/|api/bookmarks/tags/|api/ymarks/" kind="src" path="htroot"/>
<classpathentry kind="src" path="test"/>
<classpathentry excluding="user/|user/|ui/" kind="src" path="htroot/yacy"/>
<classpathentry kind="src" path="htroot/env"/>
<classpathentry kind="src" path="source"/>
<classpathentry kind="src" path="htroot/yacy/ui"/>
<classpathentry excluding="bookmarks/posts/|bookmarks/|util/|bookmarks/xbel/|bookmarks/tags/|ymarks/" kind="src" path="htroot/api"/>
<classpathentry kind="src" path="htroot/api/bookmarks/posts"/>
<classpathentry excluding="posts/|xbel/|tags/" kind="src" path="htroot/api/bookmarks"/>
<classpathentry kind="src" path="htroot/api/bookmarks/xbel"/>
<classpathentry kind="src" path="htroot/api/bookmarks/tags"/>
<classpathentry kind="src" path="htroot/api/ymarks"/>
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
<classpathentry kind="lib" path="lib/commons-logging-1.1.1.jar"/>
<classpathentry kind="lib" path="lib/servlet-api.jar"/>

@ -7,7 +7,7 @@ releaseVersion=1.0
stdReleaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
sourceReleaseFile=yacy_src_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
releaseFileParentDir=yacy
releaseNr=$Revision: 8134 $
releaseNr=$Revision: 8135 $
privateKeyFile=private.key
# defining some file/directory access rights

@ -27,6 +27,9 @@ import de.anomic.crawler.retrieval.Response;
public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandler {
private static final String EMPTY_STRING = new String();
public final static String SPACE = " ";
public final static String POISON = "";
public final static HashSet<String> stopwords = new HashSet<String>(Arrays.asList(".", "!", "?", "nbsp", "uuml", "ouml", "auml", "amp", "quot", "laquo", "raquo",
@ -83,19 +86,22 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
final TreeSet<YMarkTag> topwords = new TreeSet<YMarkTag>();
StringBuilder token;
if(document != null) {
if(document == null) {
return EMPTY_STRING;
}
//get words from document
final Map<String, Word> words = new Condenser(document, true, true, LibraryProvider.dymLib).words();
// generate potential tags from document title, description and subject
final int bufferSize = document.dc_title().length() + document.dc_description().length() + document.dc_subject(' ').length() + 32;
final StringBuilder buffer = new StringBuilder(bufferSize);
final StringBuilder pwords = new StringBuilder(1000);
buffer.append(document.dc_title().toLowerCase());
buffer.append(document.dc_description().toLowerCase());
buffer.append(document.dc_subject(' ').toLowerCase());
final Enumeration<StringBuilder> tokens = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(buffer.toString())), LibraryProvider.dymLib);
//get words from document
final Map<String, Word> words = new Condenser(document, true, true, LibraryProvider.dymLib).words();
// generate potential tags from document title, description and subject
final int bufferSize = document.dc_title().length() + document.dc_description().length() + document.dc_subject(' ').length() + 32;
final StringBuilder buffer = new StringBuilder(bufferSize);
final StringBuilder pwords = new StringBuilder(1000);
buffer.append(document.dc_title().toLowerCase());
buffer.append(document.dc_description().toLowerCase());
buffer.append(document.dc_subject(' ').toLowerCase());
final WordTokenizer tokens = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(buffer.toString())), LibraryProvider.dymLib);
try {
int score = 0;
// get phrases
@ -163,44 +169,49 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
return document.getFileExtension();
}
return clean;
} finally {
tokens.close();
}
return new String();
}
private static TreeMap<String, YMarkTag> getPhrases(final Document document, final int size) {
final TreeMap<String, YMarkTag> phrases = new TreeMap<String, YMarkTag>();
final StringBuilder phrase = new StringBuilder(128);
final Enumeration<StringBuilder> tokens = new WordTokenizer(document.getText(), LibraryProvider.dymLib);
StringBuilder token;
int count = 0;
// loop through text
while(tokens.hasMoreElements()) {
token = tokens.nextElement();
if(stopwords.contains(token.toString()) || isDigitSpace(token.toString()))
continue;
final WordTokenizer tokens = new WordTokenizer(document.getText(), LibraryProvider.dymLib);
try {
StringBuilder token;
int count = 0;
// if we have a full phrase, delete the first token
count++;
if(count > size)
phrase.delete(0, phrase.indexOf(SPACE)+1);
// loop through text
while(tokens.hasMoreElements()) {
token = tokens.nextElement();
if(stopwords.contains(token.toString()) || isDigitSpace(token.toString()))
continue;
// if we have a full phrase, delete the first token
count++;
if(count > size)
phrase.delete(0, phrase.indexOf(SPACE)+1);
// append new token
if(phrase.length() > 1)
phrase.append(SPACE);
phrase.append(token);
if(count >= size) { // make sure we really have a phrase
if(phrases.containsKey(phrase.toString())) {
phrases.get(phrase.toString()).inc();
} else {
phrases.put(phrase.toString(), new YMarkTag(phrase.toString()));
}
}
}
// append new token
if(phrase.length() > 1)
phrase.append(SPACE);
phrase.append(token);
if(count >= size) { // make sure we really have a phrase
if(phrases.containsKey(phrase.toString())) {
phrases.get(phrase.toString()).inc();
} else {
phrases.put(phrase.toString(), new YMarkTag(phrase.toString()));
}
}
return phrases;
} finally {
tokens.close();
}
return phrases;
}
public static String autoTag(final String url, final LoaderDispatcher loader, final int max, final TreeMap<String, YMarkTag> tags) {

@ -235,21 +235,24 @@ public final class Condenser {
if (text == null) return;
String word;
Word wprop;
WordTokenizer wordenum;
wordenum = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(text)), meaningLib);
int pip = 0;
while (wordenum.hasMoreElements()) {
word = (wordenum.nextElement().toString()).toLowerCase(Locale.ENGLISH);
if (useForLanguageIdentification) this.languageIdentificator.add(word);
if (word.length() < 2) continue;
wprop = this.words.get(word);
if (wprop == null) wprop = new Word(0, pip, phrase);
if (wprop.flags == null) wprop.flags = flagstemplate.clone();
wprop.flags.set(flagpos, true);
this.words.put(word, wprop);
pip++;
this.RESULT_NUMB_WORDS++;
this.RESULT_DIFF_WORDS++;
WordTokenizer wordenum = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(text)), meaningLib);
try {
int pip = 0;
while (wordenum.hasMoreElements()) {
word = (wordenum.nextElement().toString()).toLowerCase(Locale.ENGLISH);
if (useForLanguageIdentification) this.languageIdentificator.add(word);
if (word.length() < 2) continue;
wprop = this.words.get(word);
if (wprop == null) wprop = new Word(0, pip, phrase);
if (wprop.flags == null) wprop.flags = flagstemplate.clone();
wprop.flags.set(flagpos, true);
this.words.put(word, wprop);
pip++;
this.RESULT_NUMB_WORDS++;
this.RESULT_DIFF_WORDS++;
}
} finally {
wordenum.close();
}
}
@ -296,45 +299,49 @@ public final class Condenser {
// read source
final WordTokenizer wordenum = new WordTokenizer(is, meaningLib);
while (wordenum.hasMoreElements()) {
word = wordenum.nextElement().toString().toLowerCase(Locale.ENGLISH);
if (this.languageIdentificator != null) this.languageIdentificator.add(word);
if (word.length() < wordminsize) continue;
// distinguish punctuation and words
wordlen = word.length();
if (wordlen == 1 && SentenceReader.punctuation(word.charAt(0))) {
// store sentence
currsentwords.clear();
wordInSentenceCounter = 1;
} else {
// check index.of detection
if (last_last && comb_indexof && word.equals("modified")) {
this.RESULT_FLAGS.set(flag_cat_indexof, true);
wordenum.pre(true); // parse lines as they come with CRLF
}
if (last_index && (wordminsize > 2 || word.equals("of"))) comb_indexof = true;
last_last = word.equals("last");
last_index = word.equals("index");
// store word
allwordcounter++;
currsentwords.add(word);
wsp = this.words.get(word);
if (wsp != null) {
// word already exists
wordHandle = wsp.posInText;
wsp.inc();
} else {
// word does not yet exist, create new word entry
wordHandle = wordHandleCount++;
wsp = new Word(wordHandle, wordInSentenceCounter, sentences.size() + 100);
wsp.flags = this.RESULT_FLAGS.clone();
this.words.put(word, wsp);
}
// we now have the unique handle of the word, put it into the sentence:
wordInSentenceCounter++;
}
try {
while (wordenum.hasMoreElements()) {
word = wordenum.nextElement().toString().toLowerCase(Locale.ENGLISH);
if (this.languageIdentificator != null) this.languageIdentificator.add(word);
if (word.length() < wordminsize) continue;
// distinguish punctuation and words
wordlen = word.length();
if (wordlen == 1 && SentenceReader.punctuation(word.charAt(0))) {
// store sentence
currsentwords.clear();
wordInSentenceCounter = 1;
} else {
// check index.of detection
if (last_last && comb_indexof && word.equals("modified")) {
this.RESULT_FLAGS.set(flag_cat_indexof, true);
wordenum.pre(true); // parse lines as they come with CRLF
}
if (last_index && (wordminsize > 2 || word.equals("of"))) comb_indexof = true;
last_last = word.equals("last");
last_index = word.equals("index");
// store word
allwordcounter++;
currsentwords.add(word);
wsp = this.words.get(word);
if (wsp != null) {
// word already exists
wordHandle = wsp.posInText;
wsp.inc();
} else {
// word does not yet exist, create new word entry
wordHandle = wordHandleCount++;
wsp = new Word(wordHandle, wordInSentenceCounter, sentences.size() + 100);
wsp.flags = this.RESULT_FLAGS.clone();
this.words.put(word, wsp);
}
// we now have the unique handle of the word, put it into the sentence:
wordInSentenceCounter++;
}
}
} finally {
wordenum.close();
}
if (pseudostemming) {

@ -141,4 +141,12 @@ public class SentenceReader implements Iterator<StringBuilder> {
/**
 * Unsupported operation: this method unconditionally throws.
 *
 * @throws UnsupportedOperationException on every call
 */
public void remove() {
throw new UnsupportedOperationException();
}
/**
 * Closes the {@code raf} field on a best-effort basis.
 * An {@link IOException} raised while closing is deliberately discarded,
 * so this method never propagates a close failure to the caller.
 */
public void close() {
    try {
        raf.close();
    } catch (final IOException ignored) {
        // Best-effort cleanup: a failure while closing is intentionally not reported.
    }
}
}

@ -280,14 +280,21 @@ public final class TextParser {
final HashMap<Parser, Parser.Failure> failedParser = new HashMap<Parser, Parser.Failure>();
if (MemoryControl.request(sourceArray.length * 6, false)) {
for (final Parser parser: parsers) {
ByteArrayInputStream bis = new ByteArrayInputStream(sourceArray);
try {
docs = parser.parse(location, mimeType, documentCharset, new ByteArrayInputStream(sourceArray));
docs = parser.parse(location, mimeType, documentCharset, bis);
} catch (final Parser.Failure e) {
failedParser.put(parser, e);
//log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e);
} catch (final Exception e) {
failedParser.put(parser, new Parser.Failure(e.getMessage(), location));
//log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e);
} finally {
try {
bis.close();
} catch(IOException ioe) {
// Ignore.
}
}
if (docs != null) break;
}

@ -79,6 +79,10 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
if (this.meaningLib != null) WordCache.learn(r);
return r;
}
/**
 * Closes this tokenizer by delegating to {@code e.close()}.
 * NOTE(review): {@code e} is presumably the wrapped word enumeration held by
 * this tokenizer - confirm against the field declaration, which is not visible here.
 */
public void close() {
e.close();
}
private static class unsievedWordsEnum implements Enumeration<StringBuilder> {
// returns an enumeration of StringBuilder Objects
@ -145,6 +149,9 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
return r;
}
/**
 * Closes this enumeration by delegating to {@code e.close()}.
 * NOTE(review): {@code e} is presumably the underlying sentence source of this
 * enumeration - confirm against the field declaration, which is not visible here.
 */
public void close() {
e.close();
}
}
public static StringBuilder trim(final StringBuilder sb) {
@ -172,23 +179,27 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
*/
public static SortedMap<byte[], Integer> hashSentence(final String sentence, final WordCache meaningLib) {
final SortedMap<byte[], Integer> map = new TreeMap<byte[], Integer>(Base64Order.enhancedCoder);
final Enumeration<StringBuilder> words = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(sentence)), meaningLib);
int pos = 0;
StringBuilder word;
byte[] hash;
Integer oldpos;
while (words.hasMoreElements()) {
word = words.nextElement();
hash = Word.word2hash(word);
// don't overwrite old values, that leads to too far word distances
oldpos = map.put(hash, LargeNumberCache.valueOf(pos));
if (oldpos != null) {
map.put(hash, oldpos);
}
pos += word.length() + 1;
final WordTokenizer words = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(sentence)), meaningLib);
try {
int pos = 0;
StringBuilder word;
byte[] hash;
Integer oldpos;
while (words.hasMoreElements()) {
word = words.nextElement();
hash = Word.word2hash(word);
// don't overwrite old values, that leads to too far word distances
oldpos = map.put(hash, LargeNumberCache.valueOf(pos));
if (oldpos != null) {
map.put(hash, oldpos);
}
pos += word.length() + 1;
}
return map;
} finally {
words.close();
}
return map;
}
}

@ -58,7 +58,8 @@ import net.yacy.kelondro.util.MemoryControl;
public class ContentScraper extends AbstractScraper implements Scraper {
private static final String EMPTY_STRING = new String();
private final char degree = '\u00B0';
private final char[] minuteCharsHTML = "&#039;".toCharArray();
@ -159,7 +160,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.iframes = new HashSet<MultiProtocolURI>();
this.metas = new HashMap<String, String>();
this.script = new HashSet<MultiProtocolURI>();
this.title = "";
this.title = EMPTY_STRING;
this.headlines = new ArrayList[6];
for (int i = 0; i < this.headlines.length; i++) this.headlines[i] = new ArrayList<String>();
this.bold = new ClusteredScoreMap<String>();
@ -318,14 +319,14 @@ public class ContentScraper extends AbstractScraper implements Scraper {
public void scrapeTag0(final String tagname, final Properties tagopts) {
if (tagname.equalsIgnoreCase("img")) {
final String src = tagopts.getProperty("src", "");
final String src = tagopts.getProperty("src", EMPTY_STRING);
try {
final int width = Integer.parseInt(tagopts.getProperty("width", "-1"));
final int height = Integer.parseInt(tagopts.getProperty("height", "-1"));
if (src.length() > 0) {
final MultiProtocolURI url = absolutePath(src);
if (url != null) {
final ImageEntry ie = new ImageEntry(url, tagopts.getProperty("alt", ""), width, height, -1);
final ImageEntry ie = new ImageEntry(url, tagopts.getProperty("alt", EMPTY_STRING), width, height, -1);
addImage(this.images, ie);
}
}
@ -333,47 +334,47 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.evaluationScores.match(Element.imgpath, src);
} else if(tagname.equalsIgnoreCase("base")) {
try {
this.root = new MultiProtocolURI(tagopts.getProperty("href", ""));
this.root = new MultiProtocolURI(tagopts.getProperty("href", EMPTY_STRING));
} catch (final MalformedURLException e) {}
} else if (tagname.equalsIgnoreCase("frame")) {
final MultiProtocolURI src = absolutePath(tagopts.getProperty("src", ""));
final MultiProtocolURI src = absolutePath(tagopts.getProperty("src", EMPTY_STRING));
mergeAnchors(src, tagopts /* with property "name" */);
this.frames.add(src);
this.evaluationScores.match(Element.framepath, src.toNormalform(true, false));
} else if (tagname.equalsIgnoreCase("body")) {
final String c = tagopts.getProperty("class", "");
final String c = tagopts.getProperty("class", EMPTY_STRING);
this.evaluationScores.match(Element.bodyclass, c);
} else if (tagname.equalsIgnoreCase("div")) {
final String id = tagopts.getProperty("id", "");
final String id = tagopts.getProperty("id", EMPTY_STRING);
this.evaluationScores.match(Element.divid, id);
} else if (tagname.equalsIgnoreCase("meta")) {
String name = tagopts.getProperty("name", "");
final String content = tagopts.getProperty("content","");
String name = tagopts.getProperty("name", EMPTY_STRING);
final String content = tagopts.getProperty("content", EMPTY_STRING);
if (name.length() > 0) {
this.metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content));
if (name.equals("generator")) {
this.evaluationScores.match(Element.metagenerator, content);
}
} else {
name = tagopts.getProperty("http-equiv", "");
name = tagopts.getProperty("http-equiv", EMPTY_STRING);
if (name.length() > 0) {
this.metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content));
}
}
} else if (tagname.equalsIgnoreCase("area")) {
final String areatitle = cleanLine(tagopts.getProperty("title",""));
//String alt = tagopts.getProperty("alt","");
final String href = tagopts.getProperty("href", "");
final String areatitle = cleanLine(tagopts.getProperty("title",EMPTY_STRING));
//String alt = tagopts.getProperty("alt",EMPTY_STRING);
final String href = tagopts.getProperty("href", EMPTY_STRING);
tagopts.put("nme", areatitle);
if (href.length() > 0) mergeAnchors(absolutePath(href), tagopts);
} else if (tagname.equalsIgnoreCase("link")) {
final String href = tagopts.getProperty("href", "");
final String href = tagopts.getProperty("href", EMPTY_STRING);
final MultiProtocolURI newLink = absolutePath(href);
if (newLink != null) {
final String rel = tagopts.getProperty("rel", "");
final String linktitle = tagopts.getProperty("title", "");
final String type = tagopts.getProperty("type", "");
final String rel = tagopts.getProperty("rel", EMPTY_STRING);
final String linktitle = tagopts.getProperty("title", EMPTY_STRING);
final String type = tagopts.getProperty("type", EMPTY_STRING);
if (rel.equalsIgnoreCase("shortcut icon")) {
final ImageEntry ie = new ImageEntry(newLink, linktitle, -1, -1, -1);
@ -394,11 +395,11 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
}
} else if(tagname.equalsIgnoreCase("embed")) {
mergeAnchors(absolutePath(tagopts.getProperty("src", "")), tagopts /* with property "name" */);
mergeAnchors(absolutePath(tagopts.getProperty("src", EMPTY_STRING)), tagopts /* with property "name" */);
} else if(tagname.equalsIgnoreCase("param")) {
final String name = tagopts.getProperty("name", "");
final String name = tagopts.getProperty("name", EMPTY_STRING);
if (name.equalsIgnoreCase("movie")) {
mergeAnchors(absolutePath(tagopts.getProperty("value", "")), tagopts /* with property "name" */);
mergeAnchors(absolutePath(tagopts.getProperty("value", EMPTY_STRING)), tagopts /* with property "name" */);
}
}
@ -409,12 +410,12 @@ public class ContentScraper extends AbstractScraper implements Scraper {
public void scrapeTag1(final String tagname, final Properties tagopts, final char[] text) {
// System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + UTF8.String(text));
if (tagname.equalsIgnoreCase("a") && text.length < 2048) {
final String href = tagopts.getProperty("href", "");
final String href = tagopts.getProperty("href", EMPTY_STRING);
MultiProtocolURI url;
if ((href.length() > 0) && ((url = absolutePath(href)) != null)) {
final String f = url.getFile();
final int p = f.lastIndexOf('.');
final String type = (p < 0) ? "" : f.substring(p + 1);
final String type = (p < 0) ? EMPTY_STRING : f.substring(p + 1);
if (type.equals("png") || type.equals("gif") || type.equals("jpg") || type.equals("jpeg") || type.equals("tiff") || type.equals("tif")) {
// special handling of such urls: put them to the image urls
final ImageEntry ie = new ImageEntry(url, recursiveParse(text), -1, -1, -1);
@ -461,12 +462,12 @@ public class ContentScraper extends AbstractScraper implements Scraper {
h = recursiveParse(text);
if (h.length() > 0) this.li.add(h);
} else if (tagname.equalsIgnoreCase("iframe")) {
final MultiProtocolURI src = absolutePath(tagopts.getProperty("src", ""));
final MultiProtocolURI src = absolutePath(tagopts.getProperty("src", EMPTY_STRING));
mergeAnchors(src, tagopts /* with property "name" */);
this.iframes.add(src);
this.evaluationScores.match(Element.iframepath, src.toNormalform(true, false));
} else if (tagname.equalsIgnoreCase("script")) {
final String src = tagopts.getProperty("src", "");
final String src = tagopts.getProperty("src", EMPTY_STRING);
if (src.length() > 0) {
this.script.add(absolutePath(src));
this.evaluationScores.match(Element.scriptpath, src);
@ -507,7 +508,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
private final static String cleanLine(final String s) {
if (!MemoryControl.request(s.length() * 2, false)) return "";
if (!MemoryControl.request(s.length() * 2, false)) return EMPTY_STRING;
final StringBuilder sb = new StringBuilder(s.length());
char l = ' ';
char c;
@ -683,27 +684,27 @@ public class ContentScraper extends AbstractScraper implements Scraper {
public String getDescription() {
String s = this.metas.get("description");
if (s == null) s = this.metas.get("dc.description");
if (s == null) return "";
if (s == null) return EMPTY_STRING;
return s;
}
public String getContentType() {
final String s = this.metas.get("content-type");
if (s == null) return "";
if (s == null) return EMPTY_STRING;
return s;
}
public String getAuthor() {
String s = this.metas.get("author");
if (s == null) s = this.metas.get("dc.creator");
if (s == null) return "";
if (s == null) return EMPTY_STRING;
return s;
}
public String getPublisher() {
String s = this.metas.get("copyright");
if (s == null) s = this.metas.get("dc.publisher");
if (s == null) return "";
if (s == null) return EMPTY_STRING;
return s;
}
@ -732,7 +733,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
public String[] getKeywords() {
String s = this.metas.get("keywords");
if (s == null) s = this.metas.get("dc.description");
if (s == null) s = "";
if (s == null) s = EMPTY_STRING;
if (s.length() == 0) {
return MultiProtocolURI.splitpattern.split(getTitle().toLowerCase());
}
@ -756,13 +757,13 @@ public class ContentScraper extends AbstractScraper implements Scraper {
public String getRefreshPath() {
String s = this.metas.get("refresh");
if (s == null) return "";
if (s == null) return EMPTY_STRING;
final int pos = s.indexOf(';');
if (pos < 0) return "";
if (pos < 0) return EMPTY_STRING;
s = s.substring(pos + 1);
if (s.toLowerCase().startsWith("url=")) return s.substring(4).trim();
return "";
return EMPTY_STRING;
}
// parse location

@ -81,11 +81,22 @@ public final class TransformerWriter extends Writer {
final Scraper scraper,
final Transformer transformer,
final boolean passbyIfBinarySuspect
) {
this(outStream, charSet, scraper, transformer, passbyIfBinarySuspect, 1024);
}
public TransformerWriter(
final OutputStream outStream,
final Charset charSet,
final Scraper scraper,
final Transformer transformer,
final boolean passbyIfBinarySuspect,
final int initialBufferSize
) {
this.outStream = outStream;
this.scraper = scraper;
this.transformer = transformer;
this.buffer = new CharBuffer(1024);
this.buffer = new CharBuffer(initialBufferSize);
this.filterTag = null;
this.filterOpts = null;
this.filterCont = null;
@ -540,6 +551,7 @@ public final class TransformerWriter extends Writer {
final char[] filtered = filterSentence(this.buffer.getChars(), quotechar);
if (this.out != null) this.out.write(filtered);
}
this.buffer.close();
this.buffer = null;
}
final char[] finalized = filterFinalize(quotechar);
@ -550,6 +562,7 @@ public final class TransformerWriter extends Writer {
}
this.filterTag = null;
this.filterOpts = null;
if (this.filterCont != null) this.filterCont.close();
this.filterCont = null;
// if (scraper != null) {scraper.close(); scraper = null;}
// if (transformer != null) {transformer.close(); transformer = null;}

@ -196,7 +196,7 @@ public class htmlParser extends AbstractParser implements Parser {
// parsing the content
final ContentScraper scraper = new ContentScraper(location);
final TransformerWriter writer = new TransformerWriter(null,null,scraper,null,false);
final TransformerWriter writer = new TransformerWriter(null,null,scraper,null,false, sourceStream.available());
try {
FileUtils.copy(sourceStream, writer, c);
} catch (final IOException e) {

@ -114,17 +114,20 @@ public class odtParser extends AbstractParser implements Parser {
if (entryName.equals("content.xml")) {
// create a writer for output
writer = new CharBuffer();
// extract data
final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry);
final SAXParser saxParser = saxParserFactory.newSAXParser();
saxParser.parse(zipFileEntryStream, new ODContentHandler(writer));
// close readers and writers
zipFileEntryStream.close();
writer.close();
writer = new CharBuffer((int)zipEntry.getSize());
try {
// extract data
final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry);
try {
final SAXParser saxParser = saxParserFactory.newSAXParser();
saxParser.parse(zipFileEntryStream, new ODContentHandler(writer));
} finally {
// close readers and writers
zipFileEntryStream.close();
}
} finally {
writer.close();
}
} else if (entryName.equals("meta.xml")) {
// meta.xml contains metadata about the document
final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry);

@ -100,17 +100,21 @@ public class ooxmlParser extends AbstractParser implements Parser {
|| entryName.startsWith("xl/worksheets/sheet")) {
// create a writer for output
writer = new CharBuffer();
// extract data
final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry);
final SAXParser saxParser = saxParserFactory.newSAXParser();
saxParser.parse(zipFileEntryStream, new ODContentHandler(writer));
// close readers and writers
zipFileEntryStream.close();
writer.close();
writer = new CharBuffer((int)zipEntry.getSize());
try {
// extract data
final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry);
try {
final SAXParser saxParser = saxParserFactory.newSAXParser();
saxParser.parse(zipFileEntryStream, new ODContentHandler(writer));
// close readers and writers
} finally {
zipFileEntryStream.close();
}
} finally {
writer.close();
}
} else if (entryName.equals("docProps/core.xml")) {
// meta.xml contains metadata about the document
final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry);

@ -898,8 +898,7 @@ public class ArrayStack implements BLOB {
if (!i1.hasNext()) {
if (i2.hasNext()) {
HeapWriter.delete(f1);
if (f2.renameTo(newFile))
return newFile;
if (f2.renameTo(newFile)) return newFile;
return f2;
}
HeapWriter.delete(f1);
@ -907,8 +906,7 @@ public class ArrayStack implements BLOB {
return null;
} else if (!i2.hasNext()) {
HeapWriter.delete(f2);
if (f1.renameTo(newFile))
return newFile;
if (f1.renameTo(newFile)) return newFile;
return f1;
}
assert i1.hasNext();

@ -48,6 +48,8 @@ import net.yacy.kelondro.util.kelondroException;
public class RowCollection implements Sortable<Row.Entry>, Iterable<Row.Entry>, Cloneable {
private static final byte[] EMPTY_CACHE = new byte[0];
public static final long growfactorLarge100 = 140L;
public static final long growfactorSmall100 = 120L;
private static final int isortlimit = 20;
@ -77,7 +79,7 @@ public class RowCollection implements Sortable<Row.Entry>, Iterable<Row.Entry>,
this.rowdef = rowdef;
this.sortBound = 0;
this.lastTimeWrote = System.currentTimeMillis();
this.chunkcache = new byte[0];
this.chunkcache = EMPTY_CACHE;
this.chunkcount = 0;
}

@ -122,9 +122,9 @@ public final class CharBuffer extends Writer {
return length;
}
private void grow() {
int newsize = buffer.length * 2 + 1;
if (newsize < 32) newsize = 32;
private void grow(int minSize) {
int newsize = buffer.length + 1024;
if (newsize < minSize) newsize = minSize+1;
char[] tmp = new char[newsize];
System.arraycopy(buffer, offset, tmp, 0, length);
buffer = tmp;
@ -136,7 +136,7 @@ public final class CharBuffer extends Writer {
}
public void write(final char b) {
if (offset + length + 1 > buffer.length) grow();
if (offset + length + 1 > buffer.length) grow(offset + length + 1);
buffer[offset + length++] = b;
}
@ -145,7 +145,7 @@ public final class CharBuffer extends Writer {
}
public void write(final char[] bb, final int of, final int le) {
while (offset + length + le > buffer.length) grow();
if (offset + length + le > buffer.length) grow(offset + length + le);
System.arraycopy(bb, of, buffer, offset + length, le);
length += le;
}
@ -476,7 +476,7 @@ public final class CharBuffer extends Writer {
}
/**
 * Drops the reference to the internal character array by setting
 * {@code buffer} to {@code null}.
 * NOTE(review): the write methods of this class read {@code buffer.length}, so
 * writing after close() would presumably fail with a NullPointerException -
 * confirm that no caller reuses the buffer after closing it.
 *
 * @throws IOException declared by the signature; nothing in this body throws it
 */
public void close() throws IOException {
buffer = null; // assist with garbage collection
}
public void flush() throws IOException {

Loading…
Cancel
Save