diff --git a/.classpath b/.classpath
index 5ba60f07e..4e29a2801 100644
--- a/.classpath
+++ b/.classpath
@@ -1,17 +1,6 @@
-
-
-
-
-
-
-
-
-
-
-
diff --git a/build.properties b/build.properties
index 25ea97548..6ec092905 100644
--- a/build.properties
+++ b/build.properties
@@ -7,7 +7,7 @@ releaseVersion=1.0
 stdReleaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
 sourceReleaseFile=yacy_src_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
 releaseFileParentDir=yacy
-releaseNr=$Revision: 8134 $
+releaseNr=$Revision: 8135 $
 privateKeyFile=private.key
 
 # defining some file/directory access rights
diff --git a/source/de/anomic/data/ymark/YMarkAutoTagger.java b/source/de/anomic/data/ymark/YMarkAutoTagger.java
index c62d660aa..1176836a8 100644
--- a/source/de/anomic/data/ymark/YMarkAutoTagger.java
+++ b/source/de/anomic/data/ymark/YMarkAutoTagger.java
@@ -27,6 +27,9 @@ import de.anomic.crawler.retrieval.Response;
 
 public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandler {
 
+    private static final String EMPTY_STRING = new String();
+
+    public final static String SPACE = " ";
     public final static String POISON = "";
     public final static HashSet stopwords = new HashSet(Arrays.asList(".", "!", "?", "nbsp", "uuml", "ouml", "auml", "amp", "quot", "laquo", "raquo",
@@ -83,19 +86,22 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
         final TreeSet topwords = new TreeSet();
         StringBuilder token;
 
-        if(document != null) {
+        if(document == null) {
+            return EMPTY_STRING;
+        }
 
-            //get words from document
-            final Map words = new Condenser(document, true, true, LibraryProvider.dymLib).words();
-
-            // generate potential tags from document title, description and subject
-            final int bufferSize = document.dc_title().length() + document.dc_description().length() + document.dc_subject(' ').length() + 32;
-            final StringBuilder buffer = new StringBuilder(bufferSize);
-            final StringBuilder pwords = new StringBuilder(1000);
-            buffer.append(document.dc_title().toLowerCase());
-            buffer.append(document.dc_description().toLowerCase());
-            buffer.append(document.dc_subject(' ').toLowerCase());
-            final Enumeration tokens = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(buffer.toString())), LibraryProvider.dymLib);
+        //get words from document
+        final Map words = new Condenser(document, true, true, LibraryProvider.dymLib).words();
+
+        // generate potential tags from document title, description and subject
+        final int bufferSize = document.dc_title().length() + document.dc_description().length() + document.dc_subject(' ').length() + 32;
+        final StringBuilder buffer = new StringBuilder(bufferSize);
+        final StringBuilder pwords = new StringBuilder(1000);
+        buffer.append(document.dc_title().toLowerCase());
+        buffer.append(document.dc_description().toLowerCase());
+        buffer.append(document.dc_subject(' ').toLowerCase());
+        final WordTokenizer tokens = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(buffer.toString())), LibraryProvider.dymLib);
+        try {
             int score = 0;
 
             // get phrases
@@ -163,44 +169,49 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
                 return document.getFileExtension();
             }
             return clean;
+        } finally {
+            tokens.close();
         }
-        return new String();
     }
 
     private static TreeMap getPhrases(final Document document, final int size) {
         final TreeMap phrases = new TreeMap();
         final StringBuilder phrase = new StringBuilder(128);
-        final Enumeration tokens = new WordTokenizer(document.getText(), LibraryProvider.dymLib);
-        StringBuilder token;
-        int count = 0;
-
-        // loop through text
-        while(tokens.hasMoreElements()) {
-
-            token = tokens.nextElement();
-            if(stopwords.contains(token.toString()) || isDigitSpace(token.toString()))
-                continue;
+        final WordTokenizer tokens = new WordTokenizer(document.getText(), LibraryProvider.dymLib);
+        try {
+            StringBuilder token;
+            int count = 0;
 
-            // if we have a full phrase, delete the first token
-            count++;
-            if(count > size)
-                phrase.delete(0, phrase.indexOf(SPACE)+1);
+            // loop through text
+            while(tokens.hasMoreElements()) {
+
+                token = tokens.nextElement();
+                if(stopwords.contains(token.toString()) || isDigitSpace(token.toString()))
+                    continue;
+
+                // if we have a full phrase, delete the first token
+                count++;
+                if(count > size)
+                    phrase.delete(0, phrase.indexOf(SPACE)+1);
+
+                // append new token
+                if(phrase.length() > 1)
+                    phrase.append(SPACE);
+                phrase.append(token);
+
+                if(count >= size) { // make sure we really have a phrase
+                    if(phrases.containsKey(phrase.toString())) {
+                        phrases.get(phrase.toString()).inc();
+                    } else {
+                        phrases.put(phrase.toString(), new YMarkTag(phrase.toString()));
+                    }
+                }
+            }
 
-            // append new token
-            if(phrase.length() > 1)
-                phrase.append(SPACE);
-            phrase.append(token);
-
-            if(count >= size) { // make sure we really have a phrase
-                if(phrases.containsKey(phrase.toString())) {
-                    phrases.get(phrase.toString()).inc();
-                } else {
-                    phrases.put(phrase.toString(), new YMarkTag(phrase.toString()));
-                }
-            }
+            return phrases;
+        } finally {
+            tokens.close();
         }
-
-        return phrases;
     }
 
     public static String autoTag(final String url, final LoaderDispatcher loader, final int max, final TreeMap tags) {
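
Note on the getTags()/getPhrases() change above: the tokenizer is now held by its concrete WordTokenizer type and released in a finally block, so early returns and exceptions no longer leak the underlying stream. A minimal, self-contained sketch of that usage shape; StreamBackedTokenizer is a made-up stand-in for WordTokenizer and is not part of this patch:

    import java.io.ByteArrayInputStream;
    import java.io.Closeable;
    import java.io.InputStream;
    import java.util.Enumeration;
    import java.util.NoSuchElementException;
    import java.util.Scanner;

    // Hypothetical stand-in for WordTokenizer: wraps a stream and must be closed by the caller.
    final class StreamBackedTokenizer implements Enumeration<String>, Closeable {
        private final Scanner scanner;
        StreamBackedTokenizer(final InputStream in) { this.scanner = new Scanner(in, "UTF-8"); }
        public boolean hasMoreElements() { return this.scanner.hasNext(); }
        public String nextElement() {
            if (!this.scanner.hasNext()) throw new NoSuchElementException();
            return this.scanner.next();
        }
        public void close() { this.scanner.close(); } // releases the wrapped stream
    }

    public class TokenizerUsage {
        public static int countTokens(final byte[] utf8Text) {
            final StreamBackedTokenizer tokens =
                    new StreamBackedTokenizer(new ByteArrayInputStream(utf8Text));
            try {                         // same shape as the patched getTags()/getPhrases()
                int count = 0;
                while (tokens.hasMoreElements()) {
                    tokens.nextElement();
                    count++;
                }
                return count;             // returning from inside try is fine ...
            } finally {
                tokens.close();           // ... the finally clause still releases the stream
            }
        }

        public static void main(final String[] args) {
            System.out.println(countTokens("three word phrase".getBytes()));
        }
    }
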
diff --git a/source/net/yacy/document/Condenser.java b/source/net/yacy/document/Condenser.java
index 424ba0845..7fbe98c49 100644
--- a/source/net/yacy/document/Condenser.java
+++ b/source/net/yacy/document/Condenser.java
@@ -235,21 +235,24 @@ public final class Condenser {
         if (text == null) return;
         String word;
         Word wprop;
-        WordTokenizer wordenum;
-        wordenum = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(text)), meaningLib);
-        int pip = 0;
-        while (wordenum.hasMoreElements()) {
-            word = (wordenum.nextElement().toString()).toLowerCase(Locale.ENGLISH);
-            if (useForLanguageIdentification) this.languageIdentificator.add(word);
-            if (word.length() < 2) continue;
-            wprop = this.words.get(word);
-            if (wprop == null) wprop = new Word(0, pip, phrase);
-            if (wprop.flags == null) wprop.flags = flagstemplate.clone();
-            wprop.flags.set(flagpos, true);
-            this.words.put(word, wprop);
-            pip++;
-            this.RESULT_NUMB_WORDS++;
-            this.RESULT_DIFF_WORDS++;
+        WordTokenizer wordenum = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(text)), meaningLib);
+        try {
+            int pip = 0;
+            while (wordenum.hasMoreElements()) {
+                word = (wordenum.nextElement().toString()).toLowerCase(Locale.ENGLISH);
+                if (useForLanguageIdentification) this.languageIdentificator.add(word);
+                if (word.length() < 2) continue;
+                wprop = this.words.get(word);
+                if (wprop == null) wprop = new Word(0, pip, phrase);
+                if (wprop.flags == null) wprop.flags = flagstemplate.clone();
+                wprop.flags.set(flagpos, true);
+                this.words.put(word, wprop);
+                pip++;
+                this.RESULT_NUMB_WORDS++;
+                this.RESULT_DIFF_WORDS++;
+            }
+        } finally {
+            wordenum.close();
         }
     }
 
@@ -296,45 +299,49 @@ public final class Condenser {
 
         // read source
         final WordTokenizer wordenum = new WordTokenizer(is, meaningLib);
-        while (wordenum.hasMoreElements()) {
-            word = wordenum.nextElement().toString().toLowerCase(Locale.ENGLISH);
-            if (this.languageIdentificator != null) this.languageIdentificator.add(word);
-            if (word.length() < wordminsize) continue;
-
-            // distinguish punctuation and words
-            wordlen = word.length();
-            if (wordlen == 1 && SentenceReader.punctuation(word.charAt(0))) {
-                // store sentence
-                currsentwords.clear();
-                wordInSentenceCounter = 1;
-            } else {
-                // check index.of detection
-                if (last_last && comb_indexof && word.equals("modified")) {
-                    this.RESULT_FLAGS.set(flag_cat_indexof, true);
-                    wordenum.pre(true); // parse lines as they come with CRLF
-                }
-                if (last_index && (wordminsize > 2 || word.equals("of"))) comb_indexof = true;
-                last_last = word.equals("last");
-                last_index = word.equals("index");
-
-                // store word
-                allwordcounter++;
-                currsentwords.add(word);
-                wsp = this.words.get(word);
-                if (wsp != null) {
-                    // word already exists
-                    wordHandle = wsp.posInText;
-                    wsp.inc();
-                } else {
-                    // word does not yet exist, create new word entry
-                    wordHandle = wordHandleCount++;
-                    wsp = new Word(wordHandle, wordInSentenceCounter, sentences.size() + 100);
-                    wsp.flags = this.RESULT_FLAGS.clone();
-                    this.words.put(word, wsp);
-                }
-                // we now have the unique handle of the word, put it into the sentence:
-                wordInSentenceCounter++;
-            }
+        try {
+            while (wordenum.hasMoreElements()) {
+                word = wordenum.nextElement().toString().toLowerCase(Locale.ENGLISH);
+                if (this.languageIdentificator != null) this.languageIdentificator.add(word);
+                if (word.length() < wordminsize) continue;
+
+                // distinguish punctuation and words
+                wordlen = word.length();
+                if (wordlen == 1 && SentenceReader.punctuation(word.charAt(0))) {
+                    // store sentence
+                    currsentwords.clear();
+                    wordInSentenceCounter = 1;
+                } else {
+                    // check index.of detection
+                    if (last_last && comb_indexof && word.equals("modified")) {
+                        this.RESULT_FLAGS.set(flag_cat_indexof, true);
+                        wordenum.pre(true); // parse lines as they come with CRLF
+                    }
+                    if (last_index && (wordminsize > 2 || word.equals("of"))) comb_indexof = true;
+                    last_last = word.equals("last");
+                    last_index = word.equals("index");
+
+                    // store word
+                    allwordcounter++;
+                    currsentwords.add(word);
+                    wsp = this.words.get(word);
+                    if (wsp != null) {
+                        // word already exists
+                        wordHandle = wsp.posInText;
+                        wsp.inc();
+                    } else {
+                        // word does not yet exist, create new word entry
+                        wordHandle = wordHandleCount++;
+                        wsp = new Word(wordHandle, wordInSentenceCounter, sentences.size() + 100);
+                        wsp.flags = this.RESULT_FLAGS.clone();
+                        this.words.put(word, wsp);
+                    }
+                    // we now have the unique handle of the word, put it into the sentence:
+                    wordInSentenceCounter++;
+                }
+            }
+        } finally {
+            wordenum.close();
         }
 
         if (pseudostemming) {
diff --git a/source/net/yacy/document/SentenceReader.java b/source/net/yacy/document/SentenceReader.java
index fd7112a37..8f8a2fb37 100644
--- a/source/net/yacy/document/SentenceReader.java
+++ b/source/net/yacy/document/SentenceReader.java
@@ -141,4 +141,12 @@ public class SentenceReader implements Iterator {
     public void remove() {
         throw new UnsupportedOperationException();
     }
+
+    public void close() {
+        try {
+            raf.close();
+        } catch(IOException ioe) {
+            // Ignore IO Exceptions
+        }
+    }
 }
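
SentenceReader.close() above deliberately swallows the IOException thrown by the underlying reader. If several classes need the same behaviour, the idiom can be factored into a small helper; the sketch below assumes such a helper (the CloseQuietly name is invented here, it is not part of the patch):

    import java.io.Closeable;
    import java.io.IOException;
    import java.io.StringReader;

    public final class CloseQuietly {
        private CloseQuietly() {}

        // Close a resource and deliberately ignore the IOException, as SentenceReader.close() does above.
        public static void close(final Closeable resource) {
            if (resource == null) return;
            try {
                resource.close();
            } catch (final IOException ioe) {
                // Ignore IO Exceptions on close
            }
        }

        public static void main(final String[] args) {
            close(new StringReader("demo")); // Reader implements Closeable
            close(null);                     // null is tolerated
        }
    }
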
diff --git a/source/net/yacy/document/TextParser.java b/source/net/yacy/document/TextParser.java
index c7fa214cc..cd5cd2ca7 100644
--- a/source/net/yacy/document/TextParser.java
+++ b/source/net/yacy/document/TextParser.java
@@ -280,14 +280,21 @@ public final class TextParser {
         final HashMap failedParser = new HashMap();
         if (MemoryControl.request(sourceArray.length * 6, false)) {
             for (final Parser parser: parsers) {
+                ByteArrayInputStream bis = new ByteArrayInputStream(sourceArray);
                 try {
-                    docs = parser.parse(location, mimeType, documentCharset, new ByteArrayInputStream(sourceArray));
+                    docs = parser.parse(location, mimeType, documentCharset, bis);
                 } catch (final Parser.Failure e) {
                     failedParser.put(parser, e);
                     //log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e);
                 } catch (final Exception e) {
                     failedParser.put(parser, new Parser.Failure(e.getMessage(), location));
                     //log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e);
+                } finally {
+                    try {
+                        bis.close();
+                    } catch(IOException ioe) {
+                        // Ignore.
+                    }
                 }
                 if (docs != null) break;
             }
diff --git a/source/net/yacy/document/WordTokenizer.java b/source/net/yacy/document/WordTokenizer.java
index 0ca592ade..664c4e8cc 100644
--- a/source/net/yacy/document/WordTokenizer.java
+++ b/source/net/yacy/document/WordTokenizer.java
@@ -79,6 +79,10 @@ public class WordTokenizer implements Enumeration {
         if (this.meaningLib != null) WordCache.learn(r);
         return r;
     }
+
+    public void close() {
+        e.close();
+    }
 
     private static class unsievedWordsEnum implements Enumeration {
     // returns an enumeration of StringBuilder Objects
@@ -145,6 +149,9 @@
             return r;
         }
 
+        public void close() {
+            e.close();
+        }
     }
 
     public static StringBuilder trim(final StringBuilder sb) {
@@ -172,23 +179,27 @@
      */
     public static SortedMap hashSentence(final String sentence, final WordCache meaningLib) {
         final SortedMap map = new TreeMap(Base64Order.enhancedCoder);
-        final Enumeration words = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(sentence)), meaningLib);
-        int pos = 0;
-        StringBuilder word;
-        byte[] hash;
-        Integer oldpos;
-        while (words.hasMoreElements()) {
-            word = words.nextElement();
-            hash = Word.word2hash(word);
-
-            // don't overwrite old values, that leads to too far word distances
-            oldpos = map.put(hash, LargeNumberCache.valueOf(pos));
-            if (oldpos != null) {
-                map.put(hash, oldpos);
-            }
-
-            pos += word.length() + 1;
+        final WordTokenizer words = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(sentence)), meaningLib);
+        try {
+            int pos = 0;
+            StringBuilder word;
+            byte[] hash;
+            Integer oldpos;
+            while (words.hasMoreElements()) {
+                word = words.nextElement();
+                hash = Word.word2hash(word);
+
+                // don't overwrite old values, that leads to too far word distances
+                oldpos = map.put(hash, LargeNumberCache.valueOf(pos));
+                if (oldpos != null) {
+                    map.put(hash, oldpos);
+                }
+
+                pos += word.length() + 1;
+            }
+            return map;
+        } finally {
+            words.close();
         }
-        return map;
     }
 }
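
The TextParser change above gives every parser attempt its own ByteArrayInputStream over the shared byte[] and closes it in a finally block, even though close() is a no-op for that stream class. A self-contained sketch of the same try-each-parser loop; MiniParser is a hypothetical, much smaller contract than YaCy's real Parser interface:

    import java.io.ByteArrayInputStream;
    import java.io.IOException;
    import java.io.InputStream;
    import java.util.LinkedHashMap;
    import java.util.List;
    import java.util.Map;

    public class FirstSuccessfulParse {
        // Hypothetical minimal parser contract, only for this sketch.
        interface MiniParser {
            String parse(InputStream source) throws Exception;
        }

        // Try each parser against its own stream over the shared byte[]; remember failures per parser.
        static String parseWithFallback(final byte[] sourceArray, final List<MiniParser> parsers,
                                        final Map<MiniParser, Exception> failed) {
            for (final MiniParser parser : parsers) {
                final ByteArrayInputStream bis = new ByteArrayInputStream(sourceArray);
                try {
                    return parser.parse(bis);          // first parser that succeeds wins
                } catch (final Exception e) {
                    failed.put(parser, e);             // remember why this parser was rejected
                } finally {
                    try {
                        bis.close();                   // a no-op for ByteArrayInputStream, kept for symmetry
                    } catch (final IOException ioe) {
                        // Ignore.
                    }
                }
            }
            return null;                               // nothing could parse the content
        }

        public static void main(final String[] args) {
            final Map<MiniParser, Exception> failed = new LinkedHashMap<MiniParser, Exception>();
            final List<MiniParser> parsers = List.of(
                    in -> { throw new Exception("not XML"); },
                    in -> new String(in.readAllBytes()));
            final String result = parseWithFallback("plain text".getBytes(), parsers, failed);
            System.out.println(result + " / failed attempts: " + failed.size());
        }
    }
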
diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java
index 2931bec56..caf7bdd18 100644
--- a/source/net/yacy/document/parser/html/ContentScraper.java
+++ b/source/net/yacy/document/parser/html/ContentScraper.java
@@ -58,7 +58,8 @@ import net.yacy.kelondro.util.MemoryControl;
 
 public class ContentScraper extends AbstractScraper implements Scraper {
-
+    private static final String EMPTY_STRING = new String();
+
     private final char degree = '\u00B0';
 
     private final char[] minuteCharsHTML = "'".toCharArray();
@@ -159,7 +160,7 @@
         this.iframes = new HashSet();
         this.metas = new HashMap();
         this.script = new HashSet();
-        this.title = "";
+        this.title = EMPTY_STRING;
         this.headlines = new ArrayList[6];
         for (int i = 0; i < this.headlines.length; i++) this.headlines[i] = new ArrayList();
         this.bold = new ClusteredScoreMap();
@@ -318,14 +319,14 @@ public class ContentScraper extends AbstractScraper implements Scraper {
 
     public void scrapeTag0(final String tagname, final Properties tagopts) {
         if (tagname.equalsIgnoreCase("img")) {
-            final String src = tagopts.getProperty("src", "");
+            final String src = tagopts.getProperty("src", EMPTY_STRING);
             try {
                 final int width = Integer.parseInt(tagopts.getProperty("width", "-1"));
                 final int height = Integer.parseInt(tagopts.getProperty("height", "-1"));
                 if (src.length() > 0) {
                     final MultiProtocolURI url = absolutePath(src);
                     if (url != null) {
-                        final ImageEntry ie = new ImageEntry(url, tagopts.getProperty("alt", ""), width, height, -1);
+                        final ImageEntry ie = new ImageEntry(url, tagopts.getProperty("alt", EMPTY_STRING), width, height, -1);
                         addImage(this.images, ie);
                     }
                 }
@@ -333,47 +334,47 @@ public class ContentScraper extends AbstractScraper implements Scraper {
             this.evaluationScores.match(Element.imgpath, src);
         } else if(tagname.equalsIgnoreCase("base")) {
             try {
-                this.root = new MultiProtocolURI(tagopts.getProperty("href", ""));
+                this.root = new MultiProtocolURI(tagopts.getProperty("href", EMPTY_STRING));
             } catch (final MalformedURLException e) {}
         } else if (tagname.equalsIgnoreCase("frame")) {
-            final MultiProtocolURI src = absolutePath(tagopts.getProperty("src", ""));
+            final MultiProtocolURI src = absolutePath(tagopts.getProperty("src", EMPTY_STRING));
             mergeAnchors(src, tagopts /* with property "name" */);
             this.frames.add(src);
             this.evaluationScores.match(Element.framepath, src.toNormalform(true, false));
         } else if (tagname.equalsIgnoreCase("body")) {
-            final String c = tagopts.getProperty("class", "");
+            final String c = tagopts.getProperty("class", EMPTY_STRING);
            this.evaluationScores.match(Element.bodyclass, c);
         } else if (tagname.equalsIgnoreCase("div")) {
-            final String id = tagopts.getProperty("id", "");
+            final String id = tagopts.getProperty("id", EMPTY_STRING);
             this.evaluationScores.match(Element.divid, id);
         } else if (tagname.equalsIgnoreCase("meta")) {
-            String name = tagopts.getProperty("name", "");
-            final String content = tagopts.getProperty("content","");
+            String name = tagopts.getProperty("name", EMPTY_STRING);
+            final String content = tagopts.getProperty("content", EMPTY_STRING);
             if (name.length() > 0) {
                 this.metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content));
                 if (name.equals("generator")) {
                     this.evaluationScores.match(Element.metagenerator, content);
                 }
             } else {
-                name = tagopts.getProperty("http-equiv", "");
+                name = tagopts.getProperty("http-equiv", EMPTY_STRING);
                 if (name.length() > 0) {
                     this.metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content));
                 }
             }
         } else if (tagname.equalsIgnoreCase("area")) {
-            final String areatitle = cleanLine(tagopts.getProperty("title",""));
-            //String alt = tagopts.getProperty("alt","");
-            final String href = tagopts.getProperty("href", "");
+            final String areatitle = cleanLine(tagopts.getProperty("title",EMPTY_STRING));
+            //String alt = tagopts.getProperty("alt",EMPTY_STRING);
+            final String href = tagopts.getProperty("href", EMPTY_STRING);
             tagopts.put("nme", areatitle);
             if (href.length() > 0) mergeAnchors(absolutePath(href), tagopts);
         } else if (tagname.equalsIgnoreCase("link")) {
-            final String href = tagopts.getProperty("href", "");
+            final String href = tagopts.getProperty("href", EMPTY_STRING);
             final MultiProtocolURI newLink = absolutePath(href);
 
             if (newLink != null) {
-                final String rel = tagopts.getProperty("rel", "");
-                final String linktitle = tagopts.getProperty("title", "");
-                final String type = tagopts.getProperty("type", "");
+                final String rel = tagopts.getProperty("rel", EMPTY_STRING);
+                final String linktitle = tagopts.getProperty("title", EMPTY_STRING);
+                final String type = tagopts.getProperty("type", EMPTY_STRING);
 
                 if (rel.equalsIgnoreCase("shortcut icon")) {
                     final ImageEntry ie = new ImageEntry(newLink, linktitle, -1, -1, -1);
@@ -394,11 +395,11 @@ public class ContentScraper extends AbstractScraper implements Scraper {
                 }
             }
         } else if(tagname.equalsIgnoreCase("embed")) {
-            mergeAnchors(absolutePath(tagopts.getProperty("src", "")), tagopts /* with property "name" */);
+            mergeAnchors(absolutePath(tagopts.getProperty("src", EMPTY_STRING)), tagopts /* with property "name" */);
         } else if(tagname.equalsIgnoreCase("param")) {
-            final String name = tagopts.getProperty("name", "");
+            final String name = tagopts.getProperty("name", EMPTY_STRING);
             if (name.equalsIgnoreCase("movie")) {
-                mergeAnchors(absolutePath(tagopts.getProperty("value", "")), tagopts /* with property "name" */);
+                mergeAnchors(absolutePath(tagopts.getProperty("value", EMPTY_STRING)), tagopts /* with property "name" */);
             }
         }
 
@@ -409,12 +410,12 @@ public class ContentScraper extends AbstractScraper implements Scraper {
     public void scrapeTag1(final String tagname, final Properties tagopts, final char[] text) {
         // System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + UTF8.String(text));
         if (tagname.equalsIgnoreCase("a") && text.length < 2048) {
-            final String href = tagopts.getProperty("href", "");
+            final String href = tagopts.getProperty("href", EMPTY_STRING);
             MultiProtocolURI url;
             if ((href.length() > 0) && ((url = absolutePath(href)) != null)) {
                 final String f = url.getFile();
                 final int p = f.lastIndexOf('.');
-                final String type = (p < 0) ? "" : f.substring(p + 1);
+                final String type = (p < 0) ? EMPTY_STRING : f.substring(p + 1);
                 if (type.equals("png") || type.equals("gif") || type.equals("jpg") || type.equals("jpeg") || type.equals("tiff") || type.equals("tif")) {
                     // special handling of such urls: put them to the image urls
                     final ImageEntry ie = new ImageEntry(url, recursiveParse(text), -1, -1, -1);
@@ -461,12 +462,12 @@ public class ContentScraper extends AbstractScraper implements Scraper {
             h = recursiveParse(text);
             if (h.length() > 0) this.li.add(h);
         } else if (tagname.equalsIgnoreCase("iframe")) {
-            final MultiProtocolURI src = absolutePath(tagopts.getProperty("src", ""));
+            final MultiProtocolURI src = absolutePath(tagopts.getProperty("src", EMPTY_STRING));
             mergeAnchors(src, tagopts /* with property "name" */);
             this.iframes.add(src);
             this.evaluationScores.match(Element.iframepath, src.toNormalform(true, false));
         } else if (tagname.equalsIgnoreCase("script")) {
-            final String src = tagopts.getProperty("src", "");
+            final String src = tagopts.getProperty("src", EMPTY_STRING);
             if (src.length() > 0) {
                 this.script.add(absolutePath(src));
                 this.evaluationScores.match(Element.scriptpath, src);
@@ -507,7 +508,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
     }
 
     private final static String cleanLine(final String s) {
-        if (!MemoryControl.request(s.length() * 2, false)) return "";
+        if (!MemoryControl.request(s.length() * 2, false)) return EMPTY_STRING;
         final StringBuilder sb = new StringBuilder(s.length());
         char l = ' ';
         char c;
@@ -683,27 +684,27 @@ public class ContentScraper extends AbstractScraper implements Scraper {
     public String getDescription() {
         String s = this.metas.get("description");
         if (s == null) s = this.metas.get("dc.description");
-        if (s == null) return "";
+        if (s == null) return EMPTY_STRING;
         return s;
     }
 
     public String getContentType() {
         final String s = this.metas.get("content-type");
-        if (s == null) return "";
+        if (s == null) return EMPTY_STRING;
         return s;
     }
 
     public String getAuthor() {
         String s = this.metas.get("author");
         if (s == null) s = this.metas.get("dc.creator");
-        if (s == null) return "";
+        if (s == null) return EMPTY_STRING;
         return s;
     }
 
     public String getPublisher() {
         String s = this.metas.get("copyright");
         if (s == null) s = this.metas.get("dc.publisher");
-        if (s == null) return "";
+        if (s == null) return EMPTY_STRING;
         return s;
     }
@@ -732,7 +733,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
     public String[] getKeywords() {
         String s = this.metas.get("keywords");
         if (s == null) s = this.metas.get("dc.description");
-        if (s == null) s = "";
+        if (s == null) s = EMPTY_STRING;
         if (s.length() == 0) {
             return MultiProtocolURI.splitpattern.split(getTitle().toLowerCase());
         }
@@ -756,13 +757,13 @@ public class ContentScraper extends AbstractScraper implements Scraper {
 
     public String getRefreshPath() {
         String s = this.metas.get("refresh");
-        if (s == null) return "";
+        if (s == null) return EMPTY_STRING;
 
         final int pos = s.indexOf(';');
-        if (pos < 0) return "";
+        if (pos < 0) return EMPTY_STRING;
         s = s.substring(pos + 1);
         if (s.toLowerCase().startsWith("url=")) return s.substring(4).trim();
-        return "";
+        return EMPTY_STRING;
     }
 
     // parse location
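
ContentScraper now routes every default value through one EMPTY_STRING constant instead of repeating "" literals. String literals are interned by the JVM anyway, so the practical effect is mostly a single named default object per class; a small sketch of how such a shared default behaves with Properties.getProperty:

    import java.util.Properties;

    public class SharedDefaultDemo {
        // One shared default instance, mirroring ContentScraper.EMPTY_STRING above.
        private static final String EMPTY_STRING = new String();

        public static void main(final String[] args) {
            final Properties tagopts = new Properties();
            tagopts.setProperty("src", "logo.png");

            final String src = tagopts.getProperty("src", EMPTY_STRING);   // present -> "logo.png"
            final String alt = tagopts.getProperty("alt", EMPTY_STRING);   // absent  -> the shared default

            System.out.println(src + " / alt empty: " + alt.isEmpty());
            // Note: "" literals are interned by the JVM anyway; the named constant mainly
            // documents the intent and keeps one default object per class.
            System.out.println(alt == EMPTY_STRING);                       // true: same instance
        }
    }
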
diff --git a/source/net/yacy/document/parser/html/TransformerWriter.java b/source/net/yacy/document/parser/html/TransformerWriter.java
index 8a6446a6a..ccebeed9a 100644
--- a/source/net/yacy/document/parser/html/TransformerWriter.java
+++ b/source/net/yacy/document/parser/html/TransformerWriter.java
@@ -81,11 +81,22 @@ public final class TransformerWriter extends Writer {
             final Scraper scraper,
             final Transformer transformer,
             final boolean passbyIfBinarySuspect
+    ) {
+        this(outStream, charSet, scraper, transformer, passbyIfBinarySuspect, 1024);
+    }
+
+    public TransformerWriter(
+            final OutputStream outStream,
+            final Charset charSet,
+            final Scraper scraper,
+            final Transformer transformer,
+            final boolean passbyIfBinarySuspect,
+            final int initialBufferSize
     ) {
         this.outStream = outStream;
         this.scraper = scraper;
         this.transformer = transformer;
-        this.buffer = new CharBuffer(1024);
+        this.buffer = new CharBuffer(initialBufferSize);
         this.filterTag = null;
         this.filterOpts = null;
         this.filterCont = null;
@@ -540,6 +551,7 @@ public final class TransformerWriter extends Writer {
                 final char[] filtered = filterSentence(this.buffer.getChars(), quotechar);
                 if (this.out != null) this.out.write(filtered);
             }
+            this.buffer.close();
             this.buffer = null;
         }
         final char[] finalized = filterFinalize(quotechar);
@@ -550,6 +562,7 @@
         }
         this.filterTag = null;
         this.filterOpts = null;
+        if (this.filterCont != null) this.filterCont.close();
         this.filterCont = null;
         // if (scraper != null) {scraper.close(); scraper = null;}
         // if (transformer != null) {transformer.close(); transformer = null;}
diff --git a/source/net/yacy/document/parser/htmlParser.java b/source/net/yacy/document/parser/htmlParser.java
index 69c9f078d..0f9b839f0 100644
--- a/source/net/yacy/document/parser/htmlParser.java
+++ b/source/net/yacy/document/parser/htmlParser.java
@@ -196,7 +196,7 @@ public class htmlParser extends AbstractParser implements Parser {
 
         // parsing the content
         final ContentScraper scraper = new ContentScraper(location);
-        final TransformerWriter writer = new TransformerWriter(null,null,scraper,null,false);
+        final TransformerWriter writer = new TransformerWriter(null,null,scraper,null,false, sourceStream.available());
         try {
             FileUtils.copy(sourceStream, writer, c);
         } catch (final IOException e) {
diff --git a/source/net/yacy/document/parser/odtParser.java b/source/net/yacy/document/parser/odtParser.java
index 403c4328f..b0b45d0e2 100644
--- a/source/net/yacy/document/parser/odtParser.java
+++ b/source/net/yacy/document/parser/odtParser.java
@@ -114,17 +114,20 @@ public class odtParser extends AbstractParser implements Parser {
                     if (entryName.equals("content.xml")) {
 
                         // create a writer for output
-                        writer = new CharBuffer();
-
-                        // extract data
-                        final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry);
-                        final SAXParser saxParser = saxParserFactory.newSAXParser();
-                        saxParser.parse(zipFileEntryStream, new ODContentHandler(writer));
-
-                        // close readers and writers
-                        zipFileEntryStream.close();
-                        writer.close();
-
+                        writer = new CharBuffer((int)zipEntry.getSize());
+                        try {
+                            // extract data
+                            final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry);
+                            try {
+                                final SAXParser saxParser = saxParserFactory.newSAXParser();
+                                saxParser.parse(zipFileEntryStream, new ODContentHandler(writer));
+                            } finally {
+                                // close readers and writers
+                                zipFileEntryStream.close();
+                            }
+                        } finally {
+                            writer.close();
+                        }
                     } else if (entryName.equals("meta.xml")) {
                         // meta.xml contains metadata about the document
                         final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry);
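
TransformerWriter gains an initialBufferSize parameter and htmlParser seeds it with sourceStream.available(), trading a little up-front memory for fewer buffer reallocations. available() is only an estimate (it can be 0 for network streams), so it should be treated as a sizing hint, never a correctness requirement. A small sketch of that idea, assuming nothing beyond the standard library:

    import java.io.ByteArrayInputStream;
    import java.io.IOException;
    import java.io.InputStream;

    public class PresizedBufferDemo {
        // Pick an initial capacity from the stream if it can tell us, otherwise fall back.
        // InputStream.available() is a hint only; a wrong value costs performance, not correctness.
        static StringBuilder bufferFor(final InputStream source, final int fallbackSize) throws IOException {
            final int hint = source.available();
            return new StringBuilder(hint > 0 ? hint : fallbackSize);
        }

        public static void main(final String[] args) throws IOException {
            final byte[] page = "<html><body>tiny page</body></html>".getBytes("UTF-8");
            final InputStream in = new ByteArrayInputStream(page);
            final StringBuilder buffer = bufferFor(in, 1024);   // ByteArrayInputStream reports its full length
            System.out.println("initial capacity: " + buffer.capacity());
        }
    }
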
diff --git a/source/net/yacy/document/parser/ooxmlParser.java b/source/net/yacy/document/parser/ooxmlParser.java
index a5de7f8e4..195b3f7f7 100644
--- a/source/net/yacy/document/parser/ooxmlParser.java
+++ b/source/net/yacy/document/parser/ooxmlParser.java
@@ -100,17 +100,21 @@ public class ooxmlParser extends AbstractParser implements Parser {
                     || entryName.startsWith("xl/worksheets/sheet")) {
 
                         // create a writer for output
-                        writer = new CharBuffer();
-
-                        // extract data
-                        final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry);
-                        final SAXParser saxParser = saxParserFactory.newSAXParser();
-                        saxParser.parse(zipFileEntryStream, new ODContentHandler(writer));
-
-                        // close readers and writers
-                        zipFileEntryStream.close();
-                        writer.close();
-
+                        writer = new CharBuffer((int)zipEntry.getSize());
+                        try {
+                            // extract data
+                            final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry);
+                            try {
+                                final SAXParser saxParser = saxParserFactory.newSAXParser();
+                                saxParser.parse(zipFileEntryStream, new ODContentHandler(writer));
+
+                                // close readers and writers
+                            } finally {
+                                zipFileEntryStream.close();
+                            }
+                        } finally {
+                            writer.close();
+                        }
                     } else if (entryName.equals("docProps/core.xml")) {
                         // meta.xml contains metadata about the document
                         final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry);
diff --git a/source/net/yacy/kelondro/blob/ArrayStack.java b/source/net/yacy/kelondro/blob/ArrayStack.java
index c1c24433e..3504edbea 100755
--- a/source/net/yacy/kelondro/blob/ArrayStack.java
+++ b/source/net/yacy/kelondro/blob/ArrayStack.java
@@ -898,8 +898,7 @@ public class ArrayStack implements BLOB {
         if (!i1.hasNext()) {
             if (i2.hasNext()) {
                 HeapWriter.delete(f1);
-                if (f2.renameTo(newFile))
-                    return newFile;
+                if (f2.renameTo(newFile)) return newFile;
                 return f2;
             }
             HeapWriter.delete(f1);
@@ -907,8 +906,7 @@ public class ArrayStack implements BLOB {
             return null;
         } else if (!i2.hasNext()) {
             HeapWriter.delete(f2);
-            if (f1.renameTo(newFile))
-                return newFile;
+            if (f1.renameTo(newFile)) return newFile;
             return f1;
         }
         assert i1.hasNext();
diff --git a/source/net/yacy/kelondro/index/RowCollection.java b/source/net/yacy/kelondro/index/RowCollection.java
index 61103ab01..0d8df7609 100644
--- a/source/net/yacy/kelondro/index/RowCollection.java
+++ b/source/net/yacy/kelondro/index/RowCollection.java
@@ -48,6 +48,8 @@ import net.yacy.kelondro.util.kelondroException;
 
 public class RowCollection implements Sortable, Iterable, Cloneable {
 
+    private static final byte[] EMPTY_CACHE = new byte[0];
+
     public static final long growfactorLarge100 = 140L;
     public static final long growfactorSmall100 = 120L;
     private static final int isortlimit = 20;
@@ -77,7 +79,7 @@ public class RowCollection implements Sortable, Iterable,
         this.rowdef = rowdef;
         this.sortBound = 0;
         this.lastTimeWrote = System.currentTimeMillis();
-        this.chunkcache = new byte[0];
+        this.chunkcache = EMPTY_CACHE;
         this.chunkcount = 0;
     }
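
RowCollection replaces the per-instance new byte[0] with one shared EMPTY_CACHE constant; a zero-length array is effectively immutable, so a single instance can back every empty collection. A minimal sketch of the idiom (the class and field names below are illustrative, only EMPTY_CACHE mirrors the patch):

    public class EmptyArraySingletonDemo {
        // One shared zero-length array serves every empty instance, as RowCollection.EMPTY_CACHE does above.
        private static final byte[] EMPTY_CACHE = new byte[0];

        private byte[] chunkcache = EMPTY_CACHE;   // no per-instance allocation while empty

        void clear() {
            this.chunkcache = EMPTY_CACHE;         // dropping data never allocates either
        }

        public static void main(final String[] args) {
            final EmptyArraySingletonDemo a = new EmptyArraySingletonDemo();
            final EmptyArraySingletonDemo b = new EmptyArraySingletonDemo();
            b.clear();
            System.out.println(a.chunkcache == b.chunkcache);   // true: both share the constant
        }
    }
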
diff --git a/source/net/yacy/kelondro/io/CharBuffer.java b/source/net/yacy/kelondro/io/CharBuffer.java
index 919368968..494d56006 100644
--- a/source/net/yacy/kelondro/io/CharBuffer.java
+++ b/source/net/yacy/kelondro/io/CharBuffer.java
@@ -122,9 +122,9 @@ public final class CharBuffer extends Writer {
         return length;
     }
 
-    private void grow() {
-        int newsize = buffer.length * 2 + 1;
-        if (newsize < 32) newsize = 32;
+    private void grow(int minSize) {
+        int newsize = buffer.length + 1024;
+        if (newsize < minSize) newsize = minSize+1;
         char[] tmp = new char[newsize];
         System.arraycopy(buffer, offset, tmp, 0, length);
         buffer = tmp;
@@ -136,7 +136,7 @@ public final class CharBuffer extends Writer {
     }
 
     public void write(final char b) {
-        if (offset + length + 1 > buffer.length) grow();
+        if (offset + length + 1 > buffer.length) grow(offset + length + 1);
         buffer[offset + length++] = b;
     }
 
@@ -145,7 +145,7 @@ public final class CharBuffer extends Writer {
     }
 
     public void write(final char[] bb, final int of, final int le) {
-        while (offset + length + le > buffer.length) grow();
+        if (offset + length + le > buffer.length) grow(offset + length + le);
         System.arraycopy(bb, of, buffer, offset + length, le);
         length += le;
     }
@@ -476,7 +476,7 @@ public final class CharBuffer extends Writer {
     }
 
     public void close() throws IOException {
-        // TODO Auto-generated method stub
+        buffer = null; // assist with garbage collection
     }
 
     public void flush() throws IOException {
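
CharBuffer.grow() previously doubled the buffer; grow(minSize) now grows in fixed 1 KB steps but never below what the pending write needs, and write(char[],int,int) calls it once instead of looping. A self-contained sketch of that growth policy (the LinearGrowthBuffer class is illustrative, not YaCy code):

    import java.util.Arrays;

    public class LinearGrowthBuffer {
        private char[] buffer = new char[0];
        private int length = 0;

        // Grow in fixed 1 KB steps, but never below what the pending write needs,
        // matching the patched CharBuffer.grow(minSize) policy.
        private void grow(final int minSize) {
            int newsize = this.buffer.length + 1024;
            if (newsize < minSize) newsize = minSize + 1;
            this.buffer = Arrays.copyOf(this.buffer, newsize);
        }

        public void write(final char[] chunk) {
            if (this.length + chunk.length > this.buffer.length) grow(this.length + chunk.length);
            System.arraycopy(chunk, 0, this.buffer, this.length, chunk.length);
            this.length += chunk.length;
        }

        public static void main(final String[] args) {
            final LinearGrowthBuffer b = new LinearGrowthBuffer();
            b.write("hello ".toCharArray());
            b.write(new char[4096]);                       // larger than one growth step
            System.out.println(b.buffer.length + " >= " + b.length);
        }
    }
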