From 4540174fe04f45fcd0dc4fd7e1ee43b3ce94ae86 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Thu, 2 Feb 2012 07:37:00 +0100 Subject: [PATCH] memory hacks --- source/de/anomic/data/BookmarkHelper.java | 2 +- .../anomic/http/server/HTTPDFileHandler.java | 10 +-- .../document/parser/html/ContentScraper.java | 17 +++-- .../parser/html/ContentTransformer.java | 7 +-- .../parser/html/ScraperInputStream.java | 63 ++++++++++--------- .../parser/html/TransformerWriter.java | 49 +++------------ .../net/yacy/document/parser/pdfParser.java | 10 +-- .../kelondro/data/meta/URIMetadataRow.java | 2 +- source/net/yacy/kelondro/io/CharBuffer.java | 25 ++++---- 9 files changed, 80 insertions(+), 105 deletions(-) diff --git a/source/de/anomic/data/BookmarkHelper.java b/source/de/anomic/data/BookmarkHelper.java index 6119978d2..3fa6d1fcd 100644 --- a/source/de/anomic/data/BookmarkHelper.java +++ b/source/de/anomic/data/BookmarkHelper.java @@ -143,7 +143,7 @@ public class BookmarkHelper { //load the links final ContentScraper scraper = new ContentScraper(baseURL); //OutputStream os = new htmlFilterOutputStream(null, scraper, null, false); - final Writer writer= new TransformerWriter(null,null,scraper, null, false); + final Writer writer = new TransformerWriter(null, null, scraper, null, false); FileUtils.copy(input,writer); writer.close(); links = scraper.getAnchors(); diff --git a/source/de/anomic/http/server/HTTPDFileHandler.java b/source/de/anomic/http/server/HTTPDFileHandler.java index 0ece1e9b0..5ec61e83b 100644 --- a/source/de/anomic/http/server/HTTPDFileHandler.java +++ b/source/de/anomic/http/server/HTTPDFileHandler.java @@ -1039,18 +1039,18 @@ public final class HTTPDFileHandler { if (mimeType.startsWith("text")) { // every text-file distributed by yacy is UTF-8 - if(!path.startsWith("/repository")) { + if (!path.startsWith("/repository")) { mimeType = mimeType + "; charset=UTF-8"; } else { // detect charset of html-files - if((path.endsWith("html") || path.endsWith("htm"))) { + if ((path.endsWith("html") || path.endsWith("htm"))) { // save position fis.mark(1000); // scrape document to look up charset - final ScraperInputStream htmlFilter = new ScraperInputStream(fis,"UTF-8",new DigestURI("http://localhost"),null,false); + final ScraperInputStream htmlFilter = new ScraperInputStream(fis, "UTF-8", new DigestURI("http://localhost"), null, false); final String charset = htmlParser.patchCharsetEncoding(htmlFilter.detectCharset()); - if(charset != null) - mimeType = mimeType + "; charset="+charset; + htmlFilter.close(); + if (charset != null) mimeType = mimeType + "; charset="+charset; // reset position fis.reset(); } diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index 07b22f6ab..a0a90e223 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -485,17 +485,24 @@ public class ContentScraper extends AbstractScraper implements Scraper { final TransformerWriter writer = new TransformerWriter(null, null, scraper, null, false); try { FileUtils.copy(new CharArrayReader(inlineHtml), writer); - writer.close(); } catch (final IOException e) { Log.logException(e); return cleanLine(super.stripAll(inlineHtml)); + } finally { + scraper.close(); + try { + writer.close(); + } catch (IOException e) { + } } for (final Map.Entry entry: scraper.getAnchors().entrySet()) { mergeAnchors(entry.getKey(), entry.getValue()); } this.images.putAll(scraper.images); - return cleanLine(super.stripAll(scraper.content.getChars())); + String line = cleanLine(super.stripAll(scraper.content.getChars())); + scraper.close(); + return line; } private final static String cleanLine(final String s) { @@ -885,14 +892,14 @@ public class ContentScraper extends AbstractScraper implements Scraper { // scrape document to look up charset final ScraperInputStream htmlFilter = new ScraperInputStream(new ByteArrayInputStream(page),"UTF-8", new MultiProtocolURI("http://localhost"),null,false); String charset = htmlParser.patchCharsetEncoding(htmlFilter.detectCharset()); - if(charset == null) - charset = Charset.defaultCharset().toString(); + htmlFilter.close(); + if (charset == null) charset = Charset.defaultCharset().toString(); // scrape content final ContentScraper scraper = new ContentScraper(new MultiProtocolURI("http://localhost")); final Writer writer = new TransformerWriter(null, null, scraper, null, false); FileUtils.copy(new ByteArrayInputStream(page), writer, Charset.forName(charset)); - + writer.close(); return scraper; } diff --git a/source/net/yacy/document/parser/html/ContentTransformer.java b/source/net/yacy/document/parser/html/ContentTransformer.java index c6d97bea4..ce4679676 100644 --- a/source/net/yacy/document/parser/html/ContentTransformer.java +++ b/source/net/yacy/document/parser/html/ContentTransformer.java @@ -34,7 +34,6 @@ import java.util.TreeSet; import net.yacy.cora.document.ASCII; import net.yacy.kelondro.io.CharBuffer; -import net.yacy.kelondro.logging.Log; public class ContentTransformer extends AbstractTransformer implements Transformer { @@ -90,11 +89,7 @@ public class ContentTransformer extends AbstractTransformer implements Transform } bb.append(" "); final char[] result = bb.getChars(); - try { - bb.close(); - } catch (IOException e) { - Log.logException(e); - } + bb.close(); return result; } diff --git a/source/net/yacy/document/parser/html/ScraperInputStream.java b/source/net/yacy/document/parser/html/ScraperInputStream.java index 6cdc6086a..8c3fa454d 100644 --- a/source/net/yacy/document/parser/html/ScraperInputStream.java +++ b/source/net/yacy/document/parser/html/ScraperInputStream.java @@ -9,7 +9,7 @@ // $LastChangedBy$ // // LICENSE -// +// // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 2 of the License, or @@ -39,11 +39,11 @@ import net.yacy.cora.document.MultiProtocolURI; public class ScraperInputStream extends InputStream implements ScraperListener { - + private static final int MODE_PRESCAN = 0; private static final int MODE_PRESCAN_FINISHED = 1; private int mode = 1; - + private static final long preBufferSize = 4096; private long preRead = 0; private final BufferedInputStream bufferedIn; @@ -51,10 +51,10 @@ public class ScraperInputStream extends InputStream implements ScraperListener { private String detectedCharset; private boolean charsetChanged = false; private boolean endOfHead = false; - + private Reader reader; private Writer writer; - + public ScraperInputStream( final InputStream inStream, final String inputStreamCharset, @@ -65,10 +65,10 @@ public class ScraperInputStream extends InputStream implements ScraperListener { // create a input stream for buffereing this.bufferedIn = new BufferedInputStream(inStream, (int) preBufferSize); this.bufferedIn.mark((int) preBufferSize); - + final ContentScraper scraper = new ContentScraper(rooturl); scraper.registerHtmlFilterEventListener(this); - + try { this.reader = (inputStreamCharset == null) ? new InputStreamReader(this) : new InputStreamReader(this,inputStreamCharset); } catch (UnsupportedEncodingException e) { @@ -78,17 +78,17 @@ public class ScraperInputStream extends InputStream implements ScraperListener { // how is that possible? this.reader = new InputStreamReader(this); } - } + } this.writer = new TransformerWriter(null,null,scraper,transformer,passbyIfBinarySuspect); } private static String extractCharsetFromMimetypeHeader(final String mimeType) { if (mimeType == null) return null; - + final String[] parts = mimeType.split(";"); if (parts == null || parts.length <= 1) return null; - - for (int i=1; i < parts.length; i++) { + + for (int i=1; i < parts.length; i++) { final String param = parts[i].trim(); if (param.startsWith("charset=")) { String charset = param.substring("charset=".length()).trim(); @@ -97,13 +97,14 @@ public class ScraperInputStream extends InputStream implements ScraperListener { return charset.trim(); } } - - return null; + + return null; } + @Override public void scrapeTag0(final String tagname, final Properties tagopts) { if (tagname == null || tagname.length() == 0) return; - + if (tagname.equalsIgnoreCase("meta")) { if (tagopts.containsKey("http-equiv")) { final String value = tagopts.getProperty("http-equiv"); @@ -113,7 +114,7 @@ public class ScraperInputStream extends InputStream implements ScraperListener { this.detectedCharset = extractCharsetFromMimetypeHeader(contentType); if (this.detectedCharset != null && this.detectedCharset.length() > 0) { this.charsetChanged = true; - } else if (tagopts.containsKey("charset")) { + } else if (tagopts.containsKey("charset")) { // sometimes the charset property is configured as extra attribut. try it ... this.detectedCharset = tagopts.getProperty("charset"); this.charsetChanged = true; @@ -123,48 +124,54 @@ public class ScraperInputStream extends InputStream implements ScraperListener { } } + @Override public void scrapeTag1(final String tagname, final Properties tagopts, final char[] text) { if (tagname == null || tagname.length() == 0) return; - + if (tagname.equalsIgnoreCase("head")) { this.endOfHead = true; } } - + public String detectCharset() throws IOException { - this.mode = MODE_PRESCAN; - + this.mode = MODE_PRESCAN; + // loop until we have detected the header element or the charset data int c; while ((c = this.reader.read())!= -1) { this.writer.write(c); if (this.charsetChanged) break; // thats enough } - + // free writer - this.writer = null; - // don't close writer here, otherwise it will shutdown our source stream + this.writer = null; + // don't close writer here, otherwise it will shutdown our source stream // reset the buffer if not already done if (this.mode != MODE_PRESCAN_FINISHED) { this.mode++; this.bufferedIn.reset(); } - + // return scanning result return (this.charsetChanged) ? this.detectedCharset : null; } + @Override public int read() throws IOException { // mode 0 is called from within the detectCharset function - if (this.mode == MODE_PRESCAN) { + if (this.mode == MODE_PRESCAN) { if (this.endOfHead || this.charsetChanged || this.preRead >= preBufferSize - 1) { - return -1; + return -1; } - this.preRead++; - } + this.preRead++; + } return this.bufferedIn.read(); } - + @Override + public void close() throws IOException { + if (this.writer != null) this.writer.close(); + } + } diff --git a/source/net/yacy/document/parser/html/TransformerWriter.java b/source/net/yacy/document/parser/html/TransformerWriter.java index e6dfe9c75..eff602c55 100644 --- a/source/net/yacy/document/parser/html/TransformerWriter.java +++ b/source/net/yacy/document/parser/html/TransformerWriter.java @@ -127,11 +127,7 @@ public final class TransformerWriter extends Writer { } bb.append('>'); final char[] result = bb.getChars(); - try { - bb.close(); - } catch (final IOException e) { - Log.logException(e); - } + bb.close(); return result; } @@ -147,11 +143,7 @@ public final class TransformerWriter extends Writer { bb.append(text); bb.append('<').append('/').append(tagname).append('>'); final char[] result = bb.getChars(); - try { - bb.close(); - } catch (final IOException e) { - Log.logException(e); - } + bb.close(); return result; } @@ -165,11 +157,7 @@ public final class TransformerWriter extends Writer { } bb.append('>'); final char[] result = bb.getChars(); - try { - bb.close(); - } catch (final IOException e) { - Log.logException(e); - } + bb.close(); return result; } @@ -178,11 +166,7 @@ public final class TransformerWriter extends Writer { final CharBuffer cb = new CharBuffer(ContentScraper.MAX_DOCSIZE, gt0, gt0.length + text.length + tagname.length() + 3); cb.append(text).append('<').append('/').append(tagname).append('>'); final char[] result = cb.getChars(); - try { - cb.close(); - } catch (final IOException e) { - Log.logException(e); - } + cb.close(); return result; } @@ -202,11 +186,7 @@ public final class TransformerWriter extends Writer { result = bb.getChars(1); else result = bb.getChars(); - try { - bb.close(); - } catch (final IOException ex) { - Log.logException(ex); - } + bb.close(); return result; } @@ -227,12 +207,7 @@ public final class TransformerWriter extends Writer { // this single tag is collected at once here final CharBuffer charBuffer = new CharBuffer(ContentScraper.MAX_DOCSIZE, content); this.scraper.scrapeTag0(tag, charBuffer.propParser()); - try { - charBuffer.close(); - } catch (final IOException e) { - // TODO Auto-generated catch block - Log.logException(e); - } + charBuffer.close(); } if ((this.transformer != null) && (this.transformer.isTag0(tag))) { // this single tag is collected at once here @@ -240,11 +215,7 @@ public final class TransformerWriter extends Writer { try { return this.transformer.transformTag0(tag, scb.propParser(), quotechar); } finally { - try { - scb.close(); - } catch (final IOException e) { - Log.logException(e); - } + scb.close(); } } else if (((this.scraper != null) && (this.scraper.isTag1(tag))) || ((this.transformer != null) && (this.transformer.isTag1(tag)))) { @@ -252,11 +223,7 @@ public final class TransformerWriter extends Writer { this.filterTag = tag; final CharBuffer scb = new CharBuffer(ContentScraper.MAX_DOCSIZE, content); this.filterOpts = scb.propParser(); - try { - scb.close(); - } catch (final IOException e) { - Log.logException(e); - } + scb.close(); if (this.filterCont == null) this.filterCont = new CharBuffer(ContentScraper.MAX_DOCSIZE, Math.max(100, content.length)); else this.filterCont.reset(); return new char[0]; } else { diff --git a/source/net/yacy/document/parser/pdfParser.java b/source/net/yacy/document/parser/pdfParser.java index e8b01edc3..3d5f93410 100644 --- a/source/net/yacy/document/parser/pdfParser.java +++ b/source/net/yacy/document/parser/pdfParser.java @@ -144,14 +144,13 @@ public class pdfParser extends AbstractParser implements Parser { try { writer.append(stripper.getText(pdfDoc)); } catch (final Throwable e) {} - } - }; + } + }; t.start(); t.join(3000); if (t.isAlive()) t.interrupt(); pdfDoc.close(); - contentBytes = writer.getBytes(); // get final text before closing writer - writer.close(); + contentBytes = writer.getBytes(); // get final text before closing writer } catch (final IOException e) { // close the writer if (writer != null) try { writer.close(); } catch (final Exception ex) {} @@ -166,6 +165,7 @@ public class pdfParser extends AbstractParser implements Parser { //throw new Parser.Failure(e.getMessage(), location); } finally { try {pdfDoc.close();} catch (final IOException e) {} + writer.close(); } String[] docKeywords = null; @@ -175,7 +175,7 @@ public class pdfParser extends AbstractParser implements Parser { if (docTitle == null) { docTitle = docSubject; } - + // clear resources in pdfbox. they say that is resolved but it's not. see: // https://issues.apache.org/jira/browse/PDFBOX-313 // https://issues.apache.org/jira/browse/PDFBOX-351 diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataRow.java b/source/net/yacy/kelondro/data/meta/URIMetadataRow.java index be805dbeb..dbef4fe98 100644 --- a/source/net/yacy/kelondro/data/meta/URIMetadataRow.java +++ b/source/net/yacy/kelondro/data/meta/URIMetadataRow.java @@ -189,7 +189,7 @@ public class URIMetadataRow implements URIMetadata { final String dc_publisher, final float lat, final float lon) { - final CharBuffer s = new CharBuffer(20000, 360); + final CharBuffer s = new CharBuffer(3600, 360); s.append(url.toNormalform(false, true)).appendLF(); s.append(dc_title).appendLF(); if (dc_creator.length() > 80) s.append(dc_creator, 0, 80); else s.append(dc_creator); diff --git a/source/net/yacy/kelondro/io/CharBuffer.java b/source/net/yacy/kelondro/io/CharBuffer.java index f072c0315..e00f290f6 100644 --- a/source/net/yacy/kelondro/io/CharBuffer.java +++ b/source/net/yacy/kelondro/io/CharBuffer.java @@ -130,7 +130,7 @@ public final class CharBuffer extends Writer { } private void grow(int minSize) { - int newsize = 2 * Math.max(this.buffer.length, minSize); + int newsize = 12 * Math.max(this.buffer.length, minSize) / 10; // grow by 20% char[] tmp = new char[newsize]; System.arraycopy(this.buffer, this.offset, tmp, 0, this.length); this.buffer = tmp; @@ -478,15 +478,12 @@ public final class CharBuffer extends Writer { this.offset = 0; } - public void reset(final int newSize) { - this.resize(newSize); - this.reset(); - } - - public void resize(final int newSize) { - if(newSize < 0) throw new IllegalArgumentException("Illegal array size: " + newSize); - final char[] v = new char[newSize]; - System.arraycopy(this.buffer,0,v,0,newSize > this.buffer.length ? this.buffer.length : newSize); + /** + * call trimToSize() whenever a CharBuffer is not extended any more and is kept to store the content permanently + */ + public void trimToSize() { + final char[] v = new char[this.length]; + System.arraycopy(this.buffer, this.offset, v, 0, this.length); this.buffer = v; } @@ -497,13 +494,15 @@ public final class CharBuffer extends Writer { } @Override - public void close() throws IOException { + public void close() { + this.length = 0; + this.offset = 0; this.buffer = null; // assist with garbage collection } @Override - public void flush() throws IOException { - // TODO Auto-generated method stub + public void flush() { + trimToSize(); } } \ No newline at end of file