From 4540174fe04f45fcd0dc4fd7e1ee43b3ce94ae86 Mon Sep 17 00:00:00 2001
From: Michael Peter Christen <mc@yacy.net>
Date: Thu, 2 Feb 2012 07:37:00 +0100
Subject: [PATCH] memory hacks

---
 source/de/anomic/data/BookmarkHelper.java     |  2 +-
 .../anomic/http/server/HTTPDFileHandler.java  | 10 +--
 .../document/parser/html/ContentScraper.java  | 17 +++--
 .../parser/html/ContentTransformer.java       |  7 +--
 .../parser/html/ScraperInputStream.java       | 63 ++++++++++---------
 .../parser/html/TransformerWriter.java        | 49 +++------------
 .../net/yacy/document/parser/pdfParser.java   | 10 +--
 .../kelondro/data/meta/URIMetadataRow.java    |  2 +-
 source/net/yacy/kelondro/io/CharBuffer.java   | 25 ++++----
 9 files changed, 80 insertions(+), 105 deletions(-)

diff --git a/source/de/anomic/data/BookmarkHelper.java b/source/de/anomic/data/BookmarkHelper.java
index 6119978d2..3fa6d1fcd 100644
--- a/source/de/anomic/data/BookmarkHelper.java
+++ b/source/de/anomic/data/BookmarkHelper.java
@@ -143,7 +143,7 @@ public class BookmarkHelper {
             //load the links
             final ContentScraper scraper = new ContentScraper(baseURL);
             //OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
-            final Writer writer= new TransformerWriter(null,null,scraper, null, false);
+            final Writer writer = new TransformerWriter(null, null, scraper, null, false);
             FileUtils.copy(input,writer);
             writer.close();
             links = scraper.getAnchors();
diff --git a/source/de/anomic/http/server/HTTPDFileHandler.java b/source/de/anomic/http/server/HTTPDFileHandler.java
index 0ece1e9b0..5ec61e83b 100644
--- a/source/de/anomic/http/server/HTTPDFileHandler.java
+++ b/source/de/anomic/http/server/HTTPDFileHandler.java
@@ -1039,18 +1039,18 @@ public final class HTTPDFileHandler {
 
                     if (mimeType.startsWith("text")) {
                         // every text-file distributed by yacy is UTF-8
-                        if(!path.startsWith("/repository")) {
+                        if (!path.startsWith("/repository")) {
                             mimeType = mimeType + "; charset=UTF-8";
                         } else {
                             // detect charset of html-files
-                            if((path.endsWith("html") || path.endsWith("htm"))) {
+                            if ((path.endsWith("html") || path.endsWith("htm"))) {
                                 // save position
                                 fis.mark(1000);
                                 // scrape document to look up charset
-                                final ScraperInputStream htmlFilter = new ScraperInputStream(fis,"UTF-8",new DigestURI("http://localhost"),null,false);
+                                final ScraperInputStream htmlFilter = new ScraperInputStream(fis, "UTF-8", new DigestURI("http://localhost"), null, false);
                                 final String charset = htmlParser.patchCharsetEncoding(htmlFilter.detectCharset());
-                                if(charset != null)
-                                    mimeType = mimeType + "; charset="+charset;
+                                htmlFilter.close();
+                                if (charset != null) mimeType = mimeType + "; charset="+charset;
                                 // reset position
                                 fis.reset();
                             }
diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java
index 07b22f6ab..a0a90e223 100644
--- a/source/net/yacy/document/parser/html/ContentScraper.java
+++ b/source/net/yacy/document/parser/html/ContentScraper.java
@@ -485,17 +485,24 @@ public class ContentScraper extends AbstractScraper implements Scraper {
         final TransformerWriter writer = new TransformerWriter(null, null, scraper, null, false);
         try {
             FileUtils.copy(new CharArrayReader(inlineHtml), writer);
-            writer.close();
         } catch (final IOException e) {
             Log.logException(e);
             return cleanLine(super.stripAll(inlineHtml));
+        } finally {
+            try { // the scraper itself stays open here: its anchors, images and content are still read below
+                writer.close();
+            } catch (final IOException e) {
+                Log.logException(e); // closing is best effort, but a failure is logged instead of silently dropped
+            }
         }
         for (final Map.Entry<MultiProtocolURI, Properties> entry: scraper.getAnchors().entrySet()) {
             mergeAnchors(entry.getKey(), entry.getValue());
         }
         this.images.putAll(scraper.images);
 
-        return cleanLine(super.stripAll(scraper.content.getChars()));
+        String line = cleanLine(super.stripAll(scraper.content.getChars()));
+        scraper.close();
+        return line;
     }
 
     private final static String cleanLine(final String s) {
@@ -885,14 +892,14 @@ public class ContentScraper extends AbstractScraper implements Scraper {
         // scrape document to look up charset
         final ScraperInputStream htmlFilter = new ScraperInputStream(new ByteArrayInputStream(page),"UTF-8", new MultiProtocolURI("http://localhost"),null,false);
         String charset = htmlParser.patchCharsetEncoding(htmlFilter.detectCharset());
-        if(charset == null)
-               charset = Charset.defaultCharset().toString();
+        htmlFilter.close();
+        if (charset == null) charset = Charset.defaultCharset().toString();
 
         // scrape content
         final ContentScraper scraper = new ContentScraper(new MultiProtocolURI("http://localhost"));
         final Writer writer = new TransformerWriter(null, null, scraper, null, false);
         FileUtils.copy(new ByteArrayInputStream(page), writer, Charset.forName(charset));
-
+        writer.close();
         return scraper;
     }
 
diff --git a/source/net/yacy/document/parser/html/ContentTransformer.java b/source/net/yacy/document/parser/html/ContentTransformer.java
index c6d97bea4..ce4679676 100644
--- a/source/net/yacy/document/parser/html/ContentTransformer.java
+++ b/source/net/yacy/document/parser/html/ContentTransformer.java
@@ -34,7 +34,6 @@ import java.util.TreeSet;
 
 import net.yacy.cora.document.ASCII;
 import net.yacy.kelondro.io.CharBuffer;
-import net.yacy.kelondro.logging.Log;
 
 public class ContentTransformer extends AbstractTransformer implements Transformer {
 
@@ -90,11 +89,7 @@ public class ContentTransformer extends AbstractTransformer implements Transform
             }
             bb.append("</FONT> ");
             final char[] result = bb.getChars();
-            try {
-				bb.close();
-			} catch (IOException e) {
-			    Log.logException(e);
-			}
+            bb.close();
             return result;
     }
 
diff --git a/source/net/yacy/document/parser/html/ScraperInputStream.java b/source/net/yacy/document/parser/html/ScraperInputStream.java
index 6cdc6086a..8c3fa454d 100644
--- a/source/net/yacy/document/parser/html/ScraperInputStream.java
+++ b/source/net/yacy/document/parser/html/ScraperInputStream.java
@@ -9,7 +9,7 @@
 // $LastChangedBy$
 //
 // LICENSE
-// 
+//
 // This program is free software; you can redistribute it and/or modify
 // it under the terms of the GNU General Public License as published by
 // the Free Software Foundation; either version 2 of the License, or
@@ -39,11 +39,11 @@ import net.yacy.cora.document.MultiProtocolURI;
 
 
 public class ScraperInputStream extends InputStream implements ScraperListener {
-    
+
     private static final int MODE_PRESCAN = 0;
     private static final int MODE_PRESCAN_FINISHED = 1;
     private int mode = 1;
-    
+
     private static final long preBufferSize = 4096;
     private long preRead = 0;
     private final BufferedInputStream bufferedIn;
@@ -51,10 +51,10 @@ public class ScraperInputStream extends InputStream implements ScraperListener {
     private String detectedCharset;
     private boolean charsetChanged = false;
     private boolean endOfHead = false;
-    
+
     private Reader reader;
     private Writer writer;
-    
+
     public ScraperInputStream(
             final InputStream inStream,
             final String inputStreamCharset,
@@ -65,10 +65,10 @@ public class ScraperInputStream extends InputStream implements ScraperListener {
         // create a input stream for buffereing
         this.bufferedIn = new BufferedInputStream(inStream, (int) preBufferSize);
         this.bufferedIn.mark((int) preBufferSize);
-        
+
         final ContentScraper scraper = new ContentScraper(rooturl);
         scraper.registerHtmlFilterEventListener(this);
-        
+
         try {
 	    this.reader = (inputStreamCharset == null) ? new InputStreamReader(this) : new InputStreamReader(this,inputStreamCharset);
 	} catch (UnsupportedEncodingException e) {
@@ -78,17 +78,17 @@ public class ScraperInputStream extends InputStream implements ScraperListener {
 		// how is that possible?
 		this.reader = new InputStreamReader(this);
 	    }
-	} 
+	}
         this.writer = new TransformerWriter(null,null,scraper,transformer,passbyIfBinarySuspect);
     }
 
     private static String extractCharsetFromMimetypeHeader(final String mimeType) {
         if (mimeType == null) return null;
-        
+
         final String[] parts = mimeType.split(";");
         if (parts == null || parts.length <= 1) return null;
-        
-        for (int i=1; i < parts.length; i++) {    
+
+        for (int i=1; i < parts.length; i++) {
             final String param = parts[i].trim();
             if (param.startsWith("charset=")) {
                 String charset = param.substring("charset=".length()).trim();
@@ -97,13 +97,14 @@ public class ScraperInputStream extends InputStream implements ScraperListener {
                 return charset.trim();
             }
         }
-        
-        return null;            
+
+        return null;
     }
 
+    @Override
     public void scrapeTag0(final String tagname, final Properties tagopts) {
         if (tagname == null || tagname.length() == 0) return;
-        
+
         if (tagname.equalsIgnoreCase("meta")) {
             if (tagopts.containsKey("http-equiv")) {
                 final String value = tagopts.getProperty("http-equiv");
@@ -113,7 +114,7 @@ public class ScraperInputStream extends InputStream implements ScraperListener {
                     this.detectedCharset = extractCharsetFromMimetypeHeader(contentType);
                     if (this.detectedCharset != null && this.detectedCharset.length() > 0) {
                         this.charsetChanged = true;
-                    } else if (tagopts.containsKey("charset")) { 
+                    } else if (tagopts.containsKey("charset")) {
                         // sometimes the charset property is configured as extra attribut. try it ...
                         this.detectedCharset = tagopts.getProperty("charset");
                         this.charsetChanged = true;
@@ -123,48 +124,54 @@ public class ScraperInputStream extends InputStream implements ScraperListener {
         }
     }
 
+    @Override
     public void scrapeTag1(final String tagname, final Properties tagopts, final char[] text) {
         if (tagname == null || tagname.length() == 0) return;
-        
+
         if (tagname.equalsIgnoreCase("head")) {
             this.endOfHead = true;
         }
     }
-    
+
     public String detectCharset() throws IOException {
-        this.mode = MODE_PRESCAN; 
-        
+        this.mode = MODE_PRESCAN;
+
         // loop until we have detected the header element or the charset data
         int c;
         while ((c = this.reader.read())!= -1) {
             this.writer.write(c);
             if (this.charsetChanged) break; // thats enough
         }
-        
+
         // free writer
-        this.writer = null;        
-        // don't close writer here, otherwise it will shutdown our source stream 
+        this.writer = null;
+        // don't close writer here, otherwise it will shutdown our source stream
 
         // reset the buffer if not already done
         if (this.mode != MODE_PRESCAN_FINISHED) {
             this.mode++;
             this.bufferedIn.reset();
         }
-        
+
         // return scanning result
         return (this.charsetChanged) ? this.detectedCharset : null;
     }
 
+    @Override
     public int read() throws IOException {
         // mode 0 is called from within the detectCharset function
-        if (this.mode == MODE_PRESCAN) {      
+        if (this.mode == MODE_PRESCAN) {
             if (this.endOfHead || this.charsetChanged || this.preRead >= preBufferSize - 1) {
-                return -1;            
+                return -1;
             }
-            this.preRead++;            
-        }        
+            this.preRead++;
+        }
         return this.bufferedIn.read();
     }
 
-    
+    @Override
+    public void close() throws IOException {
+        if (this.writer != null) this.writer.close();
+    }
+
 }
diff --git a/source/net/yacy/document/parser/html/TransformerWriter.java b/source/net/yacy/document/parser/html/TransformerWriter.java
index e6dfe9c75..eff602c55 100644
--- a/source/net/yacy/document/parser/html/TransformerWriter.java
+++ b/source/net/yacy/document/parser/html/TransformerWriter.java
@@ -127,11 +127,7 @@ public final class TransformerWriter extends Writer {
             }
             bb.append('>');
             final char[] result = bb.getChars();
-            try {
-                bb.close();
-            } catch (final IOException e) {
-                Log.logException(e);
-            }
+            bb.close();
             return result;
     }
 
@@ -147,11 +143,7 @@ public final class TransformerWriter extends Writer {
             bb.append(text);
             bb.append('<').append('/').append(tagname).append('>');
             final char[] result = bb.getChars();
-            try {
-                bb.close();
-            } catch (final IOException e) {
-                Log.logException(e);
-            }
+            bb.close();
             return result;
     }
 
@@ -165,11 +157,7 @@ public final class TransformerWriter extends Writer {
             }
             bb.append('>');
             final char[] result = bb.getChars();
-            try {
-                bb.close();
-            } catch (final IOException e) {
-                Log.logException(e);
-            }
+            bb.close();
             return result;
     }
 
@@ -178,11 +166,7 @@ public final class TransformerWriter extends Writer {
             final CharBuffer cb = new CharBuffer(ContentScraper.MAX_DOCSIZE, gt0, gt0.length + text.length + tagname.length() + 3);
             cb.append(text).append('<').append('/').append(tagname).append('>');
             final char[] result = cb.getChars();
-            try {
-                cb.close();
-            } catch (final IOException e) {
-                Log.logException(e);
-            }
+            cb.close();
             return result;
     }
 
@@ -202,11 +186,7 @@ public final class TransformerWriter extends Writer {
                 result = bb.getChars(1);
             else
                 result = bb.getChars();
-            try {
-                bb.close();
-            } catch (final IOException ex) {
-                Log.logException(ex);
-            }
+            bb.close();
             return result;
     }
 
@@ -227,12 +207,7 @@ public final class TransformerWriter extends Writer {
                     // this single tag is collected at once here
                     final CharBuffer charBuffer = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
                     this.scraper.scrapeTag0(tag, charBuffer.propParser());
-                    try {
-                        charBuffer.close();
-                    } catch (final IOException e) {
-                        // TODO Auto-generated catch block
-                        Log.logException(e);
-                    }
+                    charBuffer.close();
                 }
                 if ((this.transformer != null) && (this.transformer.isTag0(tag))) {
                     // this single tag is collected at once here
@@ -240,11 +215,7 @@ public final class TransformerWriter extends Writer {
                     try {
                         return this.transformer.transformTag0(tag, scb.propParser(), quotechar);
                     } finally {
-                        try {
-                            scb.close();
-                        } catch (final IOException e) {
-                            Log.logException(e);
-                        }
+                        scb.close();
                     }
                 } else if (((this.scraper != null) && (this.scraper.isTag1(tag))) ||
                            ((this.transformer != null) && (this.transformer.isTag1(tag)))) {
@@ -252,11 +223,7 @@ public final class TransformerWriter extends Writer {
                     this.filterTag = tag;
                     final CharBuffer scb = new CharBuffer(ContentScraper.MAX_DOCSIZE, content);
                     this.filterOpts = scb.propParser();
-                    try {
-                        scb.close();
-                    } catch (final IOException e) {
-                        Log.logException(e);
-                    }
+                    scb.close();
                     if (this.filterCont == null) this.filterCont = new CharBuffer(ContentScraper.MAX_DOCSIZE, Math.max(100, content.length)); else this.filterCont.reset();
                     return new char[0];
                 } else {
diff --git a/source/net/yacy/document/parser/pdfParser.java b/source/net/yacy/document/parser/pdfParser.java
index e8b01edc3..3d5f93410 100644
--- a/source/net/yacy/document/parser/pdfParser.java
+++ b/source/net/yacy/document/parser/pdfParser.java
@@ -144,14 +144,13 @@ public class pdfParser extends AbstractParser implements Parser {
                     try {
                         writer.append(stripper.getText(pdfDoc));
                     } catch (final Throwable e) {}
-                } 
-            }; 
+                }
+            };
             t.start();
             t.join(3000);
             if (t.isAlive()) t.interrupt();
             pdfDoc.close();
-            contentBytes = writer.getBytes(); // get final text before closing writer
-            writer.close();
+            contentBytes = writer.getBytes(); // get final text before closing writer
         } catch (final IOException e) {
             // close the writer
             if (writer != null) try { writer.close(); } catch (final Exception ex) {}
@@ -166,6 +165,7 @@ public class pdfParser extends AbstractParser implements Parser {
             //throw new Parser.Failure(e.getMessage(), location);
         } finally {
             try {pdfDoc.close();} catch (final IOException e) {}
+            if (writer != null) writer.close(); // the catch block above null-checks writer too; guard so an NPE cannot mask the original failure
         }
 
         String[] docKeywords = null;
@@ -175,7 +175,7 @@ public class pdfParser extends AbstractParser implements Parser {
         if (docTitle == null) {
             docTitle = docSubject;
         }
-        
+
         // clear resources in pdfbox. they say that is resolved but it's not. see:
         // https://issues.apache.org/jira/browse/PDFBOX-313
         // https://issues.apache.org/jira/browse/PDFBOX-351
diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataRow.java b/source/net/yacy/kelondro/data/meta/URIMetadataRow.java
index be805dbeb..dbef4fe98 100644
--- a/source/net/yacy/kelondro/data/meta/URIMetadataRow.java
+++ b/source/net/yacy/kelondro/data/meta/URIMetadataRow.java
@@ -189,7 +189,7 @@ public class URIMetadataRow implements URIMetadata {
             final String dc_publisher,
             final float lat,
             final float lon) {
-        final CharBuffer s = new CharBuffer(20000, 360);
+        final CharBuffer s = new CharBuffer(3600, 360);
         s.append(url.toNormalform(false, true)).appendLF();
         s.append(dc_title).appendLF();
         if (dc_creator.length() > 80) s.append(dc_creator, 0, 80); else s.append(dc_creator);
diff --git a/source/net/yacy/kelondro/io/CharBuffer.java b/source/net/yacy/kelondro/io/CharBuffer.java
index f072c0315..e00f290f6 100644
--- a/source/net/yacy/kelondro/io/CharBuffer.java
+++ b/source/net/yacy/kelondro/io/CharBuffer.java
@@ -130,7 +130,7 @@ public final class CharBuffer extends Writer {
     }
 
     private void grow(int minSize) {
-        int newsize = 2 * Math.max(this.buffer.length, minSize);
+        int newsize = (int) Math.min(Integer.MAX_VALUE, 12L * Math.max(this.buffer.length, minSize) / 10); // grow by 20%; computed in long so 12 * size cannot overflow int
         char[] tmp = new char[newsize];
         System.arraycopy(this.buffer, this.offset, tmp, 0, this.length);
         this.buffer = tmp;
@@ -478,15 +478,12 @@ public final class CharBuffer extends Writer {
         this.offset = 0;
     }
 
-    public void reset(final int newSize) {
-        this.resize(newSize);
-        this.reset();
-    }
-
-    public void resize(final int newSize) {
-        if(newSize < 0) throw new IllegalArgumentException("Illegal array size: " + newSize);
-        final char[] v = new char[newSize];
-        System.arraycopy(this.buffer,0,v,0,newSize > this.buffer.length ? this.buffer.length : newSize);
+    /** Shrinks the backing array to exactly the stored content; call trimToSize() whenever a CharBuffer is not extended any more and is kept to store the content permanently. */
+    public void trimToSize() {
+        if (this.buffer == null || (this.offset == 0 && this.buffer.length == this.length)) return; // closed (buffer released), or already exactly sized
+        final char[] v = new char[this.length];
+        System.arraycopy(this.buffer, this.offset, v, 0, this.length);
+        this.offset = 0; // the content now starts at index 0 of v, which replaces the buffer on the next line
         this.buffer = v;
     }
 
@@ -497,13 +494,15 @@ public final class CharBuffer extends Writer {
     }
 
     @Override
-    public void close() throws IOException {
+    public void close() {
+        this.length = 0;
+        this.offset = 0;
     	this.buffer = null; // assist with garbage collection
     }
 
     @Override
-    public void flush() throws IOException {
-        // TODO Auto-generated method stub
+    public void flush() {
+        trimToSize();
     }
 
 }
\ No newline at end of file