diff --git a/source/de/anomic/http/client/Client.java b/source/de/anomic/http/client/Client.java index 25c3eb63e..d51c72751 100644 --- a/source/de/anomic/http/client/Client.java +++ b/source/de/anomic/http/client/Client.java @@ -133,6 +133,7 @@ public class Client { conManager.getParams().setMaxConnectionsPerHost(localHostConfiguration, 100); localHostConfiguration.setHost("127.0.0.1"); conManager.getParams().setMaxConnectionsPerHost(localHostConfiguration, 100); + conManager.getParams().setReceiveBufferSize(16 * 1024 * 1024); // set this high to avoid storage in temporary files // only one retry apacheHttpClient.getParams().setParameter(HttpMethodParams.RETRY_HANDLER, diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java index 025cdab51..e884c71cd 100644 --- a/source/net/yacy/document/Document.java +++ b/source/net/yacy/document/Document.java @@ -47,7 +47,6 @@ import java.util.TreeSet; import net.yacy.document.parser.html.ContentScraper; import net.yacy.document.parser.html.ImageEntry; import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.io.CachedFileOutputStream; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.DateFormatter; import net.yacy.kelondro.util.FileUtils; @@ -63,7 +62,7 @@ public class Document { private final StringBuilder creator; // author or copyright private final List sections; // if present: more titles/headlines appearing in the document private final StringBuilder description; // an abstract, if present: short content description - private Object text; // the clear text, all that is visible + private Object text; // the clear text, all that is visible private final Map anchors; // all links embedded as clickeable entities (anchor tags) private final HashMap images; // all visible pictures in document // the anchors and images - Maps are URL-to-EntityDescription mappings. @@ -104,12 +103,9 @@ public class Document { this.languages = languages; this.indexingDenied = indexingDenied; - if (text == null) try { - this.text = new CachedFileOutputStream(Idiom.MAX_KEEP_IN_MEMORY_SIZE); - } catch (final IOException e) { - Log.logException(e); - this.text = new StringBuilder(); - } else { + if (text == null) + this.text = new ByteArrayOutputStream(); + else { this.text = text; } } @@ -234,9 +230,9 @@ dc_rights if (this.text instanceof File) { this.textStream = new BufferedInputStream(new FileInputStream((File)this.text)); } else if (this.text instanceof byte[]) { - this.textStream = new ByteArrayInputStream((byte[])this.text); - } else if (this.text instanceof CachedFileOutputStream) { - return ((CachedFileOutputStream)this.text).getContent(); + this.textStream = new ByteArrayInputStream((byte[]) this.text); + } else if (this.text instanceof ByteArrayOutputStream) { + this.textStream = new ByteArrayInputStream(((ByteArrayOutputStream) this.text).toByteArray()); } return this.textStream; } catch (final Exception e) { @@ -253,12 +249,8 @@ dc_rights return FileUtils.read((File)this.text); } else if (this.text instanceof byte[]) { return (byte[])this.text; - } else if (this.text instanceof CachedFileOutputStream) { - final CachedFileOutputStream ffbaos = (CachedFileOutputStream)this.text; - if (ffbaos.isFallback()) { - return FileUtils.read(ffbaos.getContent()); - } - return ffbaos.getContentBAOS(); + } else if (this.text instanceof ByteArrayOutputStream) { + return ((ByteArrayOutputStream) this.text).toByteArray(); } } catch (final Exception e) { Log.logException(e); @@ -268,10 +260,10 @@ dc_rights public long getTextLength() { if (this.text == null) return 0; - if (this.text instanceof File) return ((File)this.text).length(); - else if (this.text instanceof byte[]) return ((byte[])this.text).length; - else if (this.text instanceof CachedFileOutputStream) { - return ((CachedFileOutputStream)this.text).getLength(); + if (this.text instanceof File) return ((File) this.text).length(); + else if (this.text instanceof byte[]) return ((byte[]) this.text).length; + else if (this.text instanceof ByteArrayOutputStream) { + return ((ByteArrayOutputStream)this.text).size(); } return -1; @@ -506,11 +498,10 @@ dc_rights if (this.description.length() > 0) this.description.append('\n'); this.description.append(doc.dc_description()); - if (!(this.text instanceof CachedFileOutputStream)) { - this.text = new CachedFileOutputStream(Idiom.MAX_KEEP_IN_MEMORY_SIZE); - FileUtils.copy(getText(), (CachedFileOutputStream)this.text); + if (!(this.text instanceof ByteArrayOutputStream)) { + this.text = new ByteArrayOutputStream(); } - FileUtils.copy(doc.getText(), (CachedFileOutputStream)this.text); + FileUtils.copy(doc.getText(), (ByteArrayOutputStream) this.text); anchors.putAll(doc.getAnchors()); ContentScraper.addAllImages(images, doc.getImages()); diff --git a/source/net/yacy/document/Idiom.java b/source/net/yacy/document/Idiom.java index 22afd1e34..6f8fc886b 100644 --- a/source/net/yacy/document/Idiom.java +++ b/source/net/yacy/document/Idiom.java @@ -39,10 +39,7 @@ import net.yacy.kelondro.data.meta.DigestURI; * @version $LastChangedRevision$ / $LastChangedDate$ */ public interface Idiom { - - public static long MAX_KEEP_IN_MEMORY_SIZE = 5 * 1024 * 1024; - /** * Parsing a document available as byte array * @param location the origin of the document diff --git a/source/net/yacy/document/parser/odtParser.java b/source/net/yacy/document/parser/odtParser.java index cc3741b02..a2ce7e2b1 100644 --- a/source/net/yacy/document/parser/odtParser.java +++ b/source/net/yacy/document/parser/odtParser.java @@ -28,9 +28,7 @@ package net.yacy.document.parser; import java.io.File; -import java.io.FileOutputStream; import java.io.InputStream; -import java.io.OutputStreamWriter; import java.io.Writer; import java.util.Enumeration; import java.util.HashSet; @@ -136,15 +134,9 @@ public class odtParser extends AbstractParser implements Idiom { // content.xml contains the document content in xml format if (entryName.equals("content.xml")) { - final long contentSize = zipEntry.getSize(); - // creating a writer for output - if ((contentSize == -1) || (contentSize > Idiom.MAX_KEEP_IN_MEMORY_SIZE)) { - writerFile = File.createTempFile("odtParser",".prt"); - writer = new OutputStreamWriter(new FileOutputStream(writerFile),"UTF-8"); - } else { - writer = new CharBuffer(); - } + // create a writer for output + writer = new CharBuffer(); // extract data final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry); diff --git a/source/net/yacy/document/parser/ooxmlParser.java b/source/net/yacy/document/parser/ooxmlParser.java index 27c51fa1f..3df4cf818 100644 --- a/source/net/yacy/document/parser/ooxmlParser.java +++ b/source/net/yacy/document/parser/ooxmlParser.java @@ -28,9 +28,7 @@ package net.yacy.document.parser; import java.io.File; -import java.io.FileOutputStream; import java.io.InputStream; -import java.io.OutputStreamWriter; import java.io.Writer; import java.util.Enumeration; import java.util.HashSet; @@ -122,15 +120,9 @@ public class ooxmlParser extends AbstractParser implements Idiom { if (entryName.equals("word/document.xml") || entryName.startsWith("ppt/slides/slide") || entryName.startsWith("xl/worksheets/sheet")) { - final long contentSize = zipEntry.getSize(); - // creating a writer for output - if ((contentSize == -1) || (contentSize > Idiom.MAX_KEEP_IN_MEMORY_SIZE)) { - writerFile = File.createTempFile("ooxmlParser",".prt"); - writer = new OutputStreamWriter(new FileOutputStream(writerFile),"UTF-8"); - } else { - writer = new CharBuffer(); - } + // create a writer for output + writer = new CharBuffer(); // extract data final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry); diff --git a/source/net/yacy/document/parser/pdfParser.java b/source/net/yacy/document/parser/pdfParser.java index 9c87c35b0..b74830ab6 100644 --- a/source/net/yacy/document/parser/pdfParser.java +++ b/source/net/yacy/document/parser/pdfParser.java @@ -28,10 +28,8 @@ package net.yacy.document.parser; import java.io.File; -import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; -import java.io.OutputStreamWriter; import java.io.UnsupportedEncodingException; import java.io.Writer; import java.util.HashSet; @@ -136,13 +134,8 @@ public class pdfParser extends AbstractParser implements Idiom { Writer writer = null; File writerFile = null; try { - // creating a writer for output - if ((this.contentLength == -1) || (this.contentLength > Idiom.MAX_KEEP_IN_MEMORY_SIZE)) { - writerFile = File.createTempFile("pdfParser",".prt"); - writer = new OutputStreamWriter(new FileOutputStream(writerFile),"UTF-8"); - } else { - writer = new CharBuffer(); - } + // create a writer for output + writer = new CharBuffer(); final PDFTextStripper stripper = new PDFTextStripper(); stripper.writeText(theDocument, writer); // may throw a NPE theDocument.close(); diff --git a/source/net/yacy/document/parser/sevenzipParser.java b/source/net/yacy/document/parser/sevenzipParser.java index 864c760b5..d9f80511b 100644 --- a/source/net/yacy/document/parser/sevenzipParser.java +++ b/source/net/yacy/document/parser/sevenzipParser.java @@ -28,6 +28,7 @@ package net.yacy.document.parser; import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; import java.io.File; import java.io.IOException; import java.io.InputStream; @@ -41,7 +42,6 @@ import net.yacy.document.Idiom; import net.yacy.document.TextParser; import net.yacy.document.ParserException; import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.io.CachedFileOutputStream; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.util.FileUtils; @@ -69,8 +69,7 @@ public class sevenzipParser extends AbstractParser implements Idiom { super("7zip Archive Parser"); } - public Document parse(final DigestURI location, final String mimeType, final String charset, - final IInStream source, final long maxRamSize) throws ParserException, InterruptedException { + public Document parse(final DigestURI location, final String mimeType, final String charset, final IInStream source) throws ParserException, InterruptedException { final Document doc = new Document(location, mimeType, charset, null, null, null, null, null, null, (Object)null, null, null, false); Handler archive; super.theLogger.logFine("opening 7zip archive..."); @@ -81,7 +80,7 @@ public class sevenzipParser extends AbstractParser implements Idiom { } checkInterruption(); final SZParserExtractCallback aec = new SZParserExtractCallback(super.theLogger, archive, - maxRamSize, doc, location.getFile()); + doc, location.getFile()); super.theLogger.logFine("processing archive contents..."); try { archive.Extract(null, -1, 0, aec); @@ -102,14 +101,14 @@ public class sevenzipParser extends AbstractParser implements Idiom { @Override public Document parse(final DigestURI location, final String mimeType, final String charset, final byte[] source) throws ParserException, InterruptedException { - return parse(location, mimeType, charset, new ByteArrayIInStream(source), Idiom.MAX_KEEP_IN_MEMORY_SIZE - source.length); + return parse(location, mimeType, charset, new ByteArrayIInStream(source)); } @Override public Document parse(final DigestURI location, final String mimeType, final String charset, final File sourceFile) throws ParserException, InterruptedException { try { - return parse(location, mimeType, charset, new MyRandomAccessFile(sourceFile, "r"), Idiom.MAX_KEEP_IN_MEMORY_SIZE); + return parse(location, mimeType, charset, new MyRandomAccessFile(sourceFile, "r")); } catch (final IOException e) { throw new ParserException("error processing 7zip archive: " + e.getMessage(), location); } @@ -118,12 +117,9 @@ public class sevenzipParser extends AbstractParser implements Idiom { public Document parse(final DigestURI location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException { try { - final CachedFileOutputStream cfos = new CachedFileOutputStream(Idiom.MAX_KEEP_IN_MEMORY_SIZE); + final ByteArrayOutputStream cfos = new ByteArrayOutputStream(); FileUtils.copy(source, cfos); - if (cfos.isFallback()) { - return parse(location, mimeType, charset, cfos.getContentFile()); - } - return parse(location, mimeType, charset, cfos.getContentBAOS()); + return parse(location, mimeType, charset, new ByteArrayInputStream(cfos.toByteArray())); } catch (final IOException e) { throw new ParserException("error processing 7zip archive: " + e.getMessage(), location); } @@ -143,16 +139,14 @@ public class sevenzipParser extends AbstractParser implements Idiom { public static class SZParserExtractCallback extends ArchiveExtractCallback { private final Log log; - private final long maxRamSize; - private CachedFileOutputStream cfos = null; + private ByteArrayOutputStream cfos = null; private final Document doc; private final String prefix; public SZParserExtractCallback(final Log logger, final IInArchive handler, - final long maxRamSize, final Document doc, final String prefix) { + final Document doc, final String prefix) { super.Init(handler); this.log = logger; - this.maxRamSize = maxRamSize; this.doc = doc; this.prefix = prefix; } @@ -197,11 +191,7 @@ public class sevenzipParser extends AbstractParser implements Idiom { // below for reversion of the effects final DigestURI url = DigestURI.newURL(doc.dc_source(), this.prefix + "/" + super.filePath); final String mime = TextParser.mimeOf(super.filePath.substring(super.filePath.lastIndexOf('.') + 1)); - if (this.cfos.isFallback()) { - theDoc = TextParser.parseSource(url, mime, null, this.cfos.getContentFile()); - } else { - theDoc = TextParser.parseSource(url, mime, null, this.cfos.getContentBAOS()); - } + theDoc = TextParser.parseSource(url, mime, null, this.cfos.toByteArray()); this.doc.addSubDocument(theDoc); } @@ -227,8 +217,7 @@ public class sevenzipParser extends AbstractParser implements Idiom { ex.initCause(e); throw ex; } - this.cfos = (item.isDirectory()) ? null - : new CachedFileOutputStream(this.maxRamSize, null, true, item.getSize()); + this.cfos = (item.isDirectory()) ? null : new ByteArrayOutputStream(); return this.cfos; } diff --git a/source/net/yacy/document/parser/tarParser.java b/source/net/yacy/document/parser/tarParser.java index a0f3793f1..024225463 100644 --- a/source/net/yacy/document/parser/tarParser.java +++ b/source/net/yacy/document/parser/tarParser.java @@ -27,9 +27,7 @@ package net.yacy.document.parser; -import java.io.BufferedOutputStream; import java.io.File; -import java.io.FileOutputStream; import java.io.InputStream; import java.io.OutputStream; import java.util.Arrays; @@ -90,12 +88,7 @@ public class tarParser extends AbstractParser implements Idiom { File outputFile = null; Document subDoc = null; try { - if ((this.contentLength == -1) || (this.contentLength > Idiom.MAX_KEEP_IN_MEMORY_SIZE)) { - outputFile = File.createTempFile("zipParser",".prt"); - docText = new BufferedOutputStream(new FileOutputStream(outputFile)); - } else { - docText = new ByteBuffer(); - } + docText = new ByteBuffer(); /* * If the mimeType was not reported correcly by the webserve we diff --git a/source/net/yacy/document/parser/zipParser.java b/source/net/yacy/document/parser/zipParser.java index f06beca6a..650afe9c3 100644 --- a/source/net/yacy/document/parser/zipParser.java +++ b/source/net/yacy/document/parser/zipParser.java @@ -27,9 +27,7 @@ package net.yacy.document.parser; -import java.io.BufferedOutputStream; import java.io.File; -import java.io.FileOutputStream; import java.io.InputStream; import java.io.OutputStream; import java.util.Arrays; @@ -91,12 +89,7 @@ public class zipParser extends AbstractParser implements Idiom { File outputFile = null; Document subDoc = null; try { - if ((this.contentLength == -1) || (this.contentLength > Idiom.MAX_KEEP_IN_MEMORY_SIZE)) { - outputFile = File.createTempFile("zipParser",".prt"); - docText = new BufferedOutputStream(new FileOutputStream(outputFile)); - } else { - docText = new ByteBuffer(); - } + docText = new ByteBuffer(); final StringBuilder docKeywords = new StringBuilder(); final StringBuilder docLongTitle = new StringBuilder(); diff --git a/source/net/yacy/kelondro/io/CachedFileOutputStream.java b/source/net/yacy/kelondro/io/CachedFileOutputStream.java deleted file mode 100644 index c61f93f8c..000000000 --- a/source/net/yacy/kelondro/io/CachedFileOutputStream.java +++ /dev/null @@ -1,157 +0,0 @@ -// FileFallbackByteArrayOutputStream.java -// ------------------------------------- -// part of YACY -// (C) by Michael Peter Christen; mc@yacy.net -// first published on http://www.anomic.de -// Frankfurt, Germany, 2004 -// -// This file ist contributed by Franz Brausze -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - -package net.yacy.kelondro.io; - -import java.io.BufferedInputStream; -import java.io.BufferedOutputStream; -import java.io.ByteArrayInputStream; -import java.io.ByteArrayOutputStream; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; - -import net.yacy.kelondro.util.FileUtils; - - -public class CachedFileOutputStream extends ByteArrayOutputStream { - - protected File fallbackFile; - protected long fallbackSize; - protected boolean buffered; - - protected long size = 0; - protected boolean isFallback = false; - protected OutputStream fallback = null; - - public CachedFileOutputStream(final long fallbackSize) throws IOException { - this(fallbackSize, null, true, 32); - } - - public CachedFileOutputStream(final long fallbackSize, final File fallback, final boolean buffered) - throws IOException { - this(fallbackSize, fallback, buffered, 32); - } - - public CachedFileOutputStream(final long fallbackSize, final File fallback, final boolean buffered, - final long size) throws IOException { - this.fallbackSize = fallbackSize; - this.fallbackFile = (fallback == null) ? File.createTempFile( - CachedFileOutputStream.class.getName(), - Long.toString(System.currentTimeMillis())) : fallback; - this.buffered = buffered; - checkFallback(size); - } - - public CachedFileOutputStream(final long fallbackSize, final File fallback, final boolean buffered, - final byte[] data) throws IOException { - this(fallbackSize, fallback, buffered, 0); - super.buf = data; - super.count = data.length; - checkFallback(this.size = data.length); - } - - protected boolean checkFallback(final long size) { - if (size > this.fallbackSize) try { - fallback(); - return true; - } catch (final IOException e) { - throw new RuntimeException("error falling back to file", e); - } - return false; - } - - public void fallback() throws IOException { - if (this.isFallback) return; - this.isFallback = true; - if (!this.fallbackFile.exists()) { - this.fallbackFile.createNewFile(); - } else if (this.fallbackFile.isDirectory()) { - throw new IOException("cannot write on a directory"); - } - final OutputStream os = new FileOutputStream(this.fallbackFile); - this.fallback = (this.buffered) ? new BufferedOutputStream(os) : os; - FileUtils.copy(new ByteArrayInputStream(super.buf), this.fallback); - super.buf = new byte[0]; - super.count = 0; - super.reset(); - } - - public boolean isFallback() { - return this.isFallback; - } - - public synchronized void write(final int b) { - if (checkFallback(++this.size)) try { - this.fallback.write(b); - } catch (final IOException e) { - throw new RuntimeException("error writing to fallback", e); - } else { - super.write(b); - } - } - - public synchronized void write(final byte[] b, final int off, final int len) { - if (checkFallback(this.size += len)) try { - this.fallback.write(b, off, len); - } catch (final IOException e) { - throw new RuntimeException("error writing to fallback", e); - } else { - super.write(b, off, len); - } - } - - public void close() throws IOException { - if (this.fallback != null) - this.fallback.close(); - super.close(); - } - - public InputStream getContent() throws IOException { - close(); - if (this.isFallback) { - final InputStream is = new FileInputStream(this.fallbackFile); - return (this.buffered) ? new BufferedInputStream(is) : is; - } - return new ByteArrayInputStream(this.buf); - } - - public byte[] getContentBAOS() { - if (this.isFallback) - throw new RuntimeException("underlying ByteArrayOutputStream not available, already fell back to file"); - return super.buf; - } - - public File getContentFile() { - if (!this.isFallback) - throw new RuntimeException("haven't fallen back yet, fallback file has no content"); - return this.fallbackFile; - } - - public long getLength() { - return this.size; - } -}