removed usage of temporary files: causes too much IO

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6813 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 15 years ago
parent 48b9371735
commit f204076d25

@ -133,6 +133,7 @@ public class Client {
conManager.getParams().setMaxConnectionsPerHost(localHostConfiguration, 100); conManager.getParams().setMaxConnectionsPerHost(localHostConfiguration, 100);
localHostConfiguration.setHost("127.0.0.1"); localHostConfiguration.setHost("127.0.0.1");
conManager.getParams().setMaxConnectionsPerHost(localHostConfiguration, 100); conManager.getParams().setMaxConnectionsPerHost(localHostConfiguration, 100);
conManager.getParams().setReceiveBufferSize(16 * 1024 * 1024); // set this high to avoid storage in temporary files
// only one retry // only one retry
apacheHttpClient.getParams().setParameter(HttpMethodParams.RETRY_HANDLER, apacheHttpClient.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,

@ -47,7 +47,6 @@ import java.util.TreeSet;
import net.yacy.document.parser.html.ContentScraper; import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.ImageEntry; import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.io.CachedFileOutputStream;
import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.DateFormatter; import net.yacy.kelondro.util.DateFormatter;
import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.FileUtils;
@ -63,7 +62,7 @@ public class Document {
private final StringBuilder creator; // author or copyright private final StringBuilder creator; // author or copyright
private final List<String> sections; // if present: more titles/headlines appearing in the document private final List<String> sections; // if present: more titles/headlines appearing in the document
private final StringBuilder description; // an abstract, if present: short content description private final StringBuilder description; // an abstract, if present: short content description
private Object text; // the clear text, all that is visible private Object text; // the clear text, all that is visible
private final Map<DigestURI, String> anchors; // all links embedded as clickeable entities (anchor tags) private final Map<DigestURI, String> anchors; // all links embedded as clickeable entities (anchor tags)
private final HashMap<String, ImageEntry> images; // all visible pictures in document private final HashMap<String, ImageEntry> images; // all visible pictures in document
// the anchors and images - Maps are URL-to-EntityDescription mappings. // the anchors and images - Maps are URL-to-EntityDescription mappings.
@ -104,12 +103,9 @@ public class Document {
this.languages = languages; this.languages = languages;
this.indexingDenied = indexingDenied; this.indexingDenied = indexingDenied;
if (text == null) try { if (text == null)
this.text = new CachedFileOutputStream(Idiom.MAX_KEEP_IN_MEMORY_SIZE); this.text = new ByteArrayOutputStream();
} catch (final IOException e) { else {
Log.logException(e);
this.text = new StringBuilder();
} else {
this.text = text; this.text = text;
} }
} }
@ -234,9 +230,9 @@ dc_rights
if (this.text instanceof File) { if (this.text instanceof File) {
this.textStream = new BufferedInputStream(new FileInputStream((File)this.text)); this.textStream = new BufferedInputStream(new FileInputStream((File)this.text));
} else if (this.text instanceof byte[]) { } else if (this.text instanceof byte[]) {
this.textStream = new ByteArrayInputStream((byte[])this.text); this.textStream = new ByteArrayInputStream((byte[]) this.text);
} else if (this.text instanceof CachedFileOutputStream) { } else if (this.text instanceof ByteArrayOutputStream) {
return ((CachedFileOutputStream)this.text).getContent(); this.textStream = new ByteArrayInputStream(((ByteArrayOutputStream) this.text).toByteArray());
} }
return this.textStream; return this.textStream;
} catch (final Exception e) { } catch (final Exception e) {
@ -253,12 +249,8 @@ dc_rights
return FileUtils.read((File)this.text); return FileUtils.read((File)this.text);
} else if (this.text instanceof byte[]) { } else if (this.text instanceof byte[]) {
return (byte[])this.text; return (byte[])this.text;
} else if (this.text instanceof CachedFileOutputStream) { } else if (this.text instanceof ByteArrayOutputStream) {
final CachedFileOutputStream ffbaos = (CachedFileOutputStream)this.text; return ((ByteArrayOutputStream) this.text).toByteArray();
if (ffbaos.isFallback()) {
return FileUtils.read(ffbaos.getContent());
}
return ffbaos.getContentBAOS();
} }
} catch (final Exception e) { } catch (final Exception e) {
Log.logException(e); Log.logException(e);
@ -268,10 +260,10 @@ dc_rights
public long getTextLength() { public long getTextLength() {
if (this.text == null) return 0; if (this.text == null) return 0;
if (this.text instanceof File) return ((File)this.text).length(); if (this.text instanceof File) return ((File) this.text).length();
else if (this.text instanceof byte[]) return ((byte[])this.text).length; else if (this.text instanceof byte[]) return ((byte[]) this.text).length;
else if (this.text instanceof CachedFileOutputStream) { else if (this.text instanceof ByteArrayOutputStream) {
return ((CachedFileOutputStream)this.text).getLength(); return ((ByteArrayOutputStream)this.text).size();
} }
return -1; return -1;
@ -506,11 +498,10 @@ dc_rights
if (this.description.length() > 0) this.description.append('\n'); if (this.description.length() > 0) this.description.append('\n');
this.description.append(doc.dc_description()); this.description.append(doc.dc_description());
if (!(this.text instanceof CachedFileOutputStream)) { if (!(this.text instanceof ByteArrayOutputStream)) {
this.text = new CachedFileOutputStream(Idiom.MAX_KEEP_IN_MEMORY_SIZE); this.text = new ByteArrayOutputStream();
FileUtils.copy(getText(), (CachedFileOutputStream)this.text);
} }
FileUtils.copy(doc.getText(), (CachedFileOutputStream)this.text); FileUtils.copy(doc.getText(), (ByteArrayOutputStream) this.text);
anchors.putAll(doc.getAnchors()); anchors.putAll(doc.getAnchors());
ContentScraper.addAllImages(images, doc.getImages()); ContentScraper.addAllImages(images, doc.getImages());

@ -39,10 +39,7 @@ import net.yacy.kelondro.data.meta.DigestURI;
* @version $LastChangedRevision$ / $LastChangedDate$ * @version $LastChangedRevision$ / $LastChangedDate$
*/ */
public interface Idiom { public interface Idiom {
public static long MAX_KEEP_IN_MEMORY_SIZE = 5 * 1024 * 1024;
/** /**
* Parsing a document available as byte array * Parsing a document available as byte array
* @param location the origin of the document * @param location the origin of the document

@ -28,9 +28,7 @@
package net.yacy.document.parser; package net.yacy.document.parser;
import java.io.File; import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream; import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.Writer; import java.io.Writer;
import java.util.Enumeration; import java.util.Enumeration;
import java.util.HashSet; import java.util.HashSet;
@ -136,15 +134,9 @@ public class odtParser extends AbstractParser implements Idiom {
// content.xml contains the document content in xml format // content.xml contains the document content in xml format
if (entryName.equals("content.xml")) { if (entryName.equals("content.xml")) {
final long contentSize = zipEntry.getSize();
// creating a writer for output // create a writer for output
if ((contentSize == -1) || (contentSize > Idiom.MAX_KEEP_IN_MEMORY_SIZE)) { writer = new CharBuffer();
writerFile = File.createTempFile("odtParser",".prt");
writer = new OutputStreamWriter(new FileOutputStream(writerFile),"UTF-8");
} else {
writer = new CharBuffer();
}
// extract data // extract data
final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry); final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry);

@ -28,9 +28,7 @@
package net.yacy.document.parser; package net.yacy.document.parser;
import java.io.File; import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream; import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.Writer; import java.io.Writer;
import java.util.Enumeration; import java.util.Enumeration;
import java.util.HashSet; import java.util.HashSet;
@ -122,15 +120,9 @@ public class ooxmlParser extends AbstractParser implements Idiom {
if (entryName.equals("word/document.xml") if (entryName.equals("word/document.xml")
|| entryName.startsWith("ppt/slides/slide") || entryName.startsWith("ppt/slides/slide")
|| entryName.startsWith("xl/worksheets/sheet")) { || entryName.startsWith("xl/worksheets/sheet")) {
final long contentSize = zipEntry.getSize();
// creating a writer for output // create a writer for output
if ((contentSize == -1) || (contentSize > Idiom.MAX_KEEP_IN_MEMORY_SIZE)) { writer = new CharBuffer();
writerFile = File.createTempFile("ooxmlParser",".prt");
writer = new OutputStreamWriter(new FileOutputStream(writerFile),"UTF-8");
} else {
writer = new CharBuffer();
}
// extract data // extract data
final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry); final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry);

@ -28,10 +28,8 @@
package net.yacy.document.parser; package net.yacy.document.parser;
import java.io.File; import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException; import java.io.UnsupportedEncodingException;
import java.io.Writer; import java.io.Writer;
import java.util.HashSet; import java.util.HashSet;
@ -136,13 +134,8 @@ public class pdfParser extends AbstractParser implements Idiom {
Writer writer = null; Writer writer = null;
File writerFile = null; File writerFile = null;
try { try {
// creating a writer for output // create a writer for output
if ((this.contentLength == -1) || (this.contentLength > Idiom.MAX_KEEP_IN_MEMORY_SIZE)) { writer = new CharBuffer();
writerFile = File.createTempFile("pdfParser",".prt");
writer = new OutputStreamWriter(new FileOutputStream(writerFile),"UTF-8");
} else {
writer = new CharBuffer();
}
final PDFTextStripper stripper = new PDFTextStripper(); final PDFTextStripper stripper = new PDFTextStripper();
stripper.writeText(theDocument, writer); // may throw a NPE stripper.writeText(theDocument, writer); // may throw a NPE
theDocument.close(); theDocument.close();

@ -28,6 +28,7 @@
package net.yacy.document.parser; package net.yacy.document.parser;
import java.io.ByteArrayInputStream; import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
@ -41,7 +42,6 @@ import net.yacy.document.Idiom;
import net.yacy.document.TextParser; import net.yacy.document.TextParser;
import net.yacy.document.ParserException; import net.yacy.document.ParserException;
import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.io.CachedFileOutputStream;
import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.FileUtils;
@ -69,8 +69,7 @@ public class sevenzipParser extends AbstractParser implements Idiom {
super("7zip Archive Parser"); super("7zip Archive Parser");
} }
public Document parse(final DigestURI location, final String mimeType, final String charset, public Document parse(final DigestURI location, final String mimeType, final String charset, final IInStream source) throws ParserException, InterruptedException {
final IInStream source, final long maxRamSize) throws ParserException, InterruptedException {
final Document doc = new Document(location, mimeType, charset, null, null, null, null, null, null, (Object)null, null, null, false); final Document doc = new Document(location, mimeType, charset, null, null, null, null, null, null, (Object)null, null, null, false);
Handler archive; Handler archive;
super.theLogger.logFine("opening 7zip archive..."); super.theLogger.logFine("opening 7zip archive...");
@ -81,7 +80,7 @@ public class sevenzipParser extends AbstractParser implements Idiom {
} }
checkInterruption(); checkInterruption();
final SZParserExtractCallback aec = new SZParserExtractCallback(super.theLogger, archive, final SZParserExtractCallback aec = new SZParserExtractCallback(super.theLogger, archive,
maxRamSize, doc, location.getFile()); doc, location.getFile());
super.theLogger.logFine("processing archive contents..."); super.theLogger.logFine("processing archive contents...");
try { try {
archive.Extract(null, -1, 0, aec); archive.Extract(null, -1, 0, aec);
@ -102,14 +101,14 @@ public class sevenzipParser extends AbstractParser implements Idiom {
@Override @Override
public Document parse(final DigestURI location, final String mimeType, final String charset, public Document parse(final DigestURI location, final String mimeType, final String charset,
final byte[] source) throws ParserException, InterruptedException { final byte[] source) throws ParserException, InterruptedException {
return parse(location, mimeType, charset, new ByteArrayIInStream(source), Idiom.MAX_KEEP_IN_MEMORY_SIZE - source.length); return parse(location, mimeType, charset, new ByteArrayIInStream(source));
} }
@Override @Override
public Document parse(final DigestURI location, final String mimeType, final String charset, public Document parse(final DigestURI location, final String mimeType, final String charset,
final File sourceFile) throws ParserException, InterruptedException { final File sourceFile) throws ParserException, InterruptedException {
try { try {
return parse(location, mimeType, charset, new MyRandomAccessFile(sourceFile, "r"), Idiom.MAX_KEEP_IN_MEMORY_SIZE); return parse(location, mimeType, charset, new MyRandomAccessFile(sourceFile, "r"));
} catch (final IOException e) { } catch (final IOException e) {
throw new ParserException("error processing 7zip archive: " + e.getMessage(), location); throw new ParserException("error processing 7zip archive: " + e.getMessage(), location);
} }
@ -118,12 +117,9 @@ public class sevenzipParser extends AbstractParser implements Idiom {
public Document parse(final DigestURI location, final String mimeType, final String charset, public Document parse(final DigestURI location, final String mimeType, final String charset,
final InputStream source) throws ParserException, InterruptedException { final InputStream source) throws ParserException, InterruptedException {
try { try {
final CachedFileOutputStream cfos = new CachedFileOutputStream(Idiom.MAX_KEEP_IN_MEMORY_SIZE); final ByteArrayOutputStream cfos = new ByteArrayOutputStream();
FileUtils.copy(source, cfos); FileUtils.copy(source, cfos);
if (cfos.isFallback()) { return parse(location, mimeType, charset, new ByteArrayInputStream(cfos.toByteArray()));
return parse(location, mimeType, charset, cfos.getContentFile());
}
return parse(location, mimeType, charset, cfos.getContentBAOS());
} catch (final IOException e) { } catch (final IOException e) {
throw new ParserException("error processing 7zip archive: " + e.getMessage(), location); throw new ParserException("error processing 7zip archive: " + e.getMessage(), location);
} }
@ -143,16 +139,14 @@ public class sevenzipParser extends AbstractParser implements Idiom {
public static class SZParserExtractCallback extends ArchiveExtractCallback { public static class SZParserExtractCallback extends ArchiveExtractCallback {
private final Log log; private final Log log;
private final long maxRamSize; private ByteArrayOutputStream cfos = null;
private CachedFileOutputStream cfos = null;
private final Document doc; private final Document doc;
private final String prefix; private final String prefix;
public SZParserExtractCallback(final Log logger, final IInArchive handler, public SZParserExtractCallback(final Log logger, final IInArchive handler,
final long maxRamSize, final Document doc, final String prefix) { final Document doc, final String prefix) {
super.Init(handler); super.Init(handler);
this.log = logger; this.log = logger;
this.maxRamSize = maxRamSize;
this.doc = doc; this.doc = doc;
this.prefix = prefix; this.prefix = prefix;
} }
@ -197,11 +191,7 @@ public class sevenzipParser extends AbstractParser implements Idiom {
// below for reversion of the effects // below for reversion of the effects
final DigestURI url = DigestURI.newURL(doc.dc_source(), this.prefix + "/" + super.filePath); final DigestURI url = DigestURI.newURL(doc.dc_source(), this.prefix + "/" + super.filePath);
final String mime = TextParser.mimeOf(super.filePath.substring(super.filePath.lastIndexOf('.') + 1)); final String mime = TextParser.mimeOf(super.filePath.substring(super.filePath.lastIndexOf('.') + 1));
if (this.cfos.isFallback()) { theDoc = TextParser.parseSource(url, mime, null, this.cfos.toByteArray());
theDoc = TextParser.parseSource(url, mime, null, this.cfos.getContentFile());
} else {
theDoc = TextParser.parseSource(url, mime, null, this.cfos.getContentBAOS());
}
this.doc.addSubDocument(theDoc); this.doc.addSubDocument(theDoc);
} }
@ -227,8 +217,7 @@ public class sevenzipParser extends AbstractParser implements Idiom {
ex.initCause(e); ex.initCause(e);
throw ex; throw ex;
} }
this.cfos = (item.isDirectory()) ? null this.cfos = (item.isDirectory()) ? null : new ByteArrayOutputStream();
: new CachedFileOutputStream(this.maxRamSize, null, true, item.getSize());
return this.cfos; return this.cfos;
} }

@ -27,9 +27,7 @@
package net.yacy.document.parser; package net.yacy.document.parser;
import java.io.BufferedOutputStream;
import java.io.File; import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream; import java.io.InputStream;
import java.io.OutputStream; import java.io.OutputStream;
import java.util.Arrays; import java.util.Arrays;
@ -90,12 +88,7 @@ public class tarParser extends AbstractParser implements Idiom {
File outputFile = null; File outputFile = null;
Document subDoc = null; Document subDoc = null;
try { try {
if ((this.contentLength == -1) || (this.contentLength > Idiom.MAX_KEEP_IN_MEMORY_SIZE)) { docText = new ByteBuffer();
outputFile = File.createTempFile("zipParser",".prt");
docText = new BufferedOutputStream(new FileOutputStream(outputFile));
} else {
docText = new ByteBuffer();
}
/* /*
* If the mimeType was not reported correcly by the webserve we * If the mimeType was not reported correcly by the webserve we

@ -27,9 +27,7 @@
package net.yacy.document.parser; package net.yacy.document.parser;
import java.io.BufferedOutputStream;
import java.io.File; import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream; import java.io.InputStream;
import java.io.OutputStream; import java.io.OutputStream;
import java.util.Arrays; import java.util.Arrays;
@ -91,12 +89,7 @@ public class zipParser extends AbstractParser implements Idiom {
File outputFile = null; File outputFile = null;
Document subDoc = null; Document subDoc = null;
try { try {
if ((this.contentLength == -1) || (this.contentLength > Idiom.MAX_KEEP_IN_MEMORY_SIZE)) { docText = new ByteBuffer();
outputFile = File.createTempFile("zipParser",".prt");
docText = new BufferedOutputStream(new FileOutputStream(outputFile));
} else {
docText = new ByteBuffer();
}
final StringBuilder docKeywords = new StringBuilder(); final StringBuilder docKeywords = new StringBuilder();
final StringBuilder docLongTitle = new StringBuilder(); final StringBuilder docLongTitle = new StringBuilder();

@ -1,157 +0,0 @@
// CachedFileOutputStream.java
// -------------------------------------
// part of YACY
// (C) by Michael Peter Christen; mc@yacy.net
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004
//
// This file was contributed by Franz Brausze
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.kelondro.io;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import net.yacy.kelondro.util.FileUtils;
public class CachedFileOutputStream extends ByteArrayOutputStream {
protected File fallbackFile;
protected long fallbackSize;
protected boolean buffered;
protected long size = 0;
protected boolean isFallback = false;
protected OutputStream fallback = null;
public CachedFileOutputStream(final long fallbackSize) throws IOException {
this(fallbackSize, null, true, 32);
}
public CachedFileOutputStream(final long fallbackSize, final File fallback, final boolean buffered)
throws IOException {
this(fallbackSize, fallback, buffered, 32);
}
public CachedFileOutputStream(final long fallbackSize, final File fallback, final boolean buffered,
final long size) throws IOException {
this.fallbackSize = fallbackSize;
this.fallbackFile = (fallback == null) ? File.createTempFile(
CachedFileOutputStream.class.getName(),
Long.toString(System.currentTimeMillis())) : fallback;
this.buffered = buffered;
checkFallback(size);
}
public CachedFileOutputStream(final long fallbackSize, final File fallback, final boolean buffered,
final byte[] data) throws IOException {
this(fallbackSize, fallback, buffered, 0);
super.buf = data;
super.count = data.length;
checkFallback(this.size = data.length);
}
protected boolean checkFallback(final long size) {
if (size > this.fallbackSize) try {
fallback();
return true;
} catch (final IOException e) {
throw new RuntimeException("error falling back to file", e);
}
return false;
}
public void fallback() throws IOException {
if (this.isFallback) return;
this.isFallback = true;
if (!this.fallbackFile.exists()) {
this.fallbackFile.createNewFile();
} else if (this.fallbackFile.isDirectory()) {
throw new IOException("cannot write on a directory");
}
final OutputStream os = new FileOutputStream(this.fallbackFile);
this.fallback = (this.buffered) ? new BufferedOutputStream(os) : os;
FileUtils.copy(new ByteArrayInputStream(super.buf), this.fallback);
super.buf = new byte[0];
super.count = 0;
super.reset();
}
public boolean isFallback() {
return this.isFallback;
}
public synchronized void write(final int b) {
if (checkFallback(++this.size)) try {
this.fallback.write(b);
} catch (final IOException e) {
throw new RuntimeException("error writing to fallback", e);
} else {
super.write(b);
}
}
public synchronized void write(final byte[] b, final int off, final int len) {
if (checkFallback(this.size += len)) try {
this.fallback.write(b, off, len);
} catch (final IOException e) {
throw new RuntimeException("error writing to fallback", e);
} else {
super.write(b, off, len);
}
}
public void close() throws IOException {
if (this.fallback != null)
this.fallback.close();
super.close();
}
public InputStream getContent() throws IOException {
close();
if (this.isFallback) {
final InputStream is = new FileInputStream(this.fallbackFile);
return (this.buffered) ? new BufferedInputStream(is) : is;
}
return new ByteArrayInputStream(this.buf);
}
public byte[] getContentBAOS() {
if (this.isFallback)
throw new RuntimeException("underlying ByteArrayOutputStream not available, already fell back to file");
return super.buf;
}
public File getContentFile() {
if (!this.isFallback)
throw new RuntimeException("haven't fallen back yet, fallback file has no content");
return this.fallbackFile;
}
public long getLength() {
return this.size;
}
}
Loading…
Cancel
Save