removed usage of temporary files: causes too much IO

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6813 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 15 years ago
parent 48b9371735
commit f204076d25

@ -133,6 +133,7 @@ public class Client {
conManager.getParams().setMaxConnectionsPerHost(localHostConfiguration, 100);
localHostConfiguration.setHost("127.0.0.1");
conManager.getParams().setMaxConnectionsPerHost(localHostConfiguration, 100);
conManager.getParams().setReceiveBufferSize(16 * 1024 * 1024); // set this high to avoid storage in temporary files
// only one retry
apacheHttpClient.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,

@ -47,7 +47,6 @@ import java.util.TreeSet;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.io.CachedFileOutputStream;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.DateFormatter;
import net.yacy.kelondro.util.FileUtils;
@ -63,7 +62,7 @@ public class Document {
private final StringBuilder creator; // author or copyright
private final List<String> sections; // if present: more titles/headlines appearing in the document
private final StringBuilder description; // an abstract, if present: short content description
private Object text; // the clear text, all that is visible
private Object text; // the clear text, all that is visible
private final Map<DigestURI, String> anchors; // all links embedded as clickeable entities (anchor tags)
private final HashMap<String, ImageEntry> images; // all visible pictures in document
// the anchors and images - Maps are URL-to-EntityDescription mappings.
@ -104,12 +103,9 @@ public class Document {
this.languages = languages;
this.indexingDenied = indexingDenied;
if (text == null) try {
this.text = new CachedFileOutputStream(Idiom.MAX_KEEP_IN_MEMORY_SIZE);
} catch (final IOException e) {
Log.logException(e);
this.text = new StringBuilder();
} else {
if (text == null)
this.text = new ByteArrayOutputStream();
else {
this.text = text;
}
}
@ -234,9 +230,9 @@ dc_rights
if (this.text instanceof File) {
this.textStream = new BufferedInputStream(new FileInputStream((File)this.text));
} else if (this.text instanceof byte[]) {
this.textStream = new ByteArrayInputStream((byte[])this.text);
} else if (this.text instanceof CachedFileOutputStream) {
return ((CachedFileOutputStream)this.text).getContent();
this.textStream = new ByteArrayInputStream((byte[]) this.text);
} else if (this.text instanceof ByteArrayOutputStream) {
this.textStream = new ByteArrayInputStream(((ByteArrayOutputStream) this.text).toByteArray());
}
return this.textStream;
} catch (final Exception e) {
@ -253,12 +249,8 @@ dc_rights
return FileUtils.read((File)this.text);
} else if (this.text instanceof byte[]) {
return (byte[])this.text;
} else if (this.text instanceof CachedFileOutputStream) {
final CachedFileOutputStream ffbaos = (CachedFileOutputStream)this.text;
if (ffbaos.isFallback()) {
return FileUtils.read(ffbaos.getContent());
}
return ffbaos.getContentBAOS();
} else if (this.text instanceof ByteArrayOutputStream) {
return ((ByteArrayOutputStream) this.text).toByteArray();
}
} catch (final Exception e) {
Log.logException(e);
@ -268,10 +260,10 @@ dc_rights
public long getTextLength() {
if (this.text == null) return 0;
if (this.text instanceof File) return ((File)this.text).length();
else if (this.text instanceof byte[]) return ((byte[])this.text).length;
else if (this.text instanceof CachedFileOutputStream) {
return ((CachedFileOutputStream)this.text).getLength();
if (this.text instanceof File) return ((File) this.text).length();
else if (this.text instanceof byte[]) return ((byte[]) this.text).length;
else if (this.text instanceof ByteArrayOutputStream) {
return ((ByteArrayOutputStream)this.text).size();
}
return -1;
@ -506,11 +498,10 @@ dc_rights
if (this.description.length() > 0) this.description.append('\n');
this.description.append(doc.dc_description());
if (!(this.text instanceof CachedFileOutputStream)) {
this.text = new CachedFileOutputStream(Idiom.MAX_KEEP_IN_MEMORY_SIZE);
FileUtils.copy(getText(), (CachedFileOutputStream)this.text);
if (!(this.text instanceof ByteArrayOutputStream)) {
this.text = new ByteArrayOutputStream();
}
FileUtils.copy(doc.getText(), (CachedFileOutputStream)this.text);
FileUtils.copy(doc.getText(), (ByteArrayOutputStream) this.text);
anchors.putAll(doc.getAnchors());
ContentScraper.addAllImages(images, doc.getImages());

@ -39,10 +39,7 @@ import net.yacy.kelondro.data.meta.DigestURI;
* @version $LastChangedRevision$ / $LastChangedDate$
*/
public interface Idiom {
public static long MAX_KEEP_IN_MEMORY_SIZE = 5 * 1024 * 1024;
/**
* Parsing a document available as byte array
* @param location the origin of the document

@ -28,9 +28,7 @@
package net.yacy.document.parser;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.Enumeration;
import java.util.HashSet;
@ -136,15 +134,9 @@ public class odtParser extends AbstractParser implements Idiom {
// content.xml contains the document content in xml format
if (entryName.equals("content.xml")) {
final long contentSize = zipEntry.getSize();
// creating a writer for output
if ((contentSize == -1) || (contentSize > Idiom.MAX_KEEP_IN_MEMORY_SIZE)) {
writerFile = File.createTempFile("odtParser",".prt");
writer = new OutputStreamWriter(new FileOutputStream(writerFile),"UTF-8");
} else {
writer = new CharBuffer();
}
// create a writer for output
writer = new CharBuffer();
// extract data
final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry);

@ -28,9 +28,7 @@
package net.yacy.document.parser;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.Enumeration;
import java.util.HashSet;
@ -122,15 +120,9 @@ public class ooxmlParser extends AbstractParser implements Idiom {
if (entryName.equals("word/document.xml")
|| entryName.startsWith("ppt/slides/slide")
|| entryName.startsWith("xl/worksheets/sheet")) {
final long contentSize = zipEntry.getSize();
// creating a writer for output
if ((contentSize == -1) || (contentSize > Idiom.MAX_KEEP_IN_MEMORY_SIZE)) {
writerFile = File.createTempFile("ooxmlParser",".prt");
writer = new OutputStreamWriter(new FileOutputStream(writerFile),"UTF-8");
} else {
writer = new CharBuffer();
}
// create a writer for output
writer = new CharBuffer();
// extract data
final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry);

@ -28,10 +28,8 @@
package net.yacy.document.parser;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.util.HashSet;
@ -136,13 +134,8 @@ public class pdfParser extends AbstractParser implements Idiom {
Writer writer = null;
File writerFile = null;
try {
// creating a writer for output
if ((this.contentLength == -1) || (this.contentLength > Idiom.MAX_KEEP_IN_MEMORY_SIZE)) {
writerFile = File.createTempFile("pdfParser",".prt");
writer = new OutputStreamWriter(new FileOutputStream(writerFile),"UTF-8");
} else {
writer = new CharBuffer();
}
// create a writer for output
writer = new CharBuffer();
final PDFTextStripper stripper = new PDFTextStripper();
stripper.writeText(theDocument, writer); // may throw a NPE
theDocument.close();

@ -28,6 +28,7 @@
package net.yacy.document.parser;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
@ -41,7 +42,6 @@ import net.yacy.document.Idiom;
import net.yacy.document.TextParser;
import net.yacy.document.ParserException;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.io.CachedFileOutputStream;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils;
@ -69,8 +69,7 @@ public class sevenzipParser extends AbstractParser implements Idiom {
super("7zip Archive Parser");
}
public Document parse(final DigestURI location, final String mimeType, final String charset,
final IInStream source, final long maxRamSize) throws ParserException, InterruptedException {
public Document parse(final DigestURI location, final String mimeType, final String charset, final IInStream source) throws ParserException, InterruptedException {
final Document doc = new Document(location, mimeType, charset, null, null, null, null, null, null, (Object)null, null, null, false);
Handler archive;
super.theLogger.logFine("opening 7zip archive...");
@ -81,7 +80,7 @@ public class sevenzipParser extends AbstractParser implements Idiom {
}
checkInterruption();
final SZParserExtractCallback aec = new SZParserExtractCallback(super.theLogger, archive,
maxRamSize, doc, location.getFile());
doc, location.getFile());
super.theLogger.logFine("processing archive contents...");
try {
archive.Extract(null, -1, 0, aec);
@ -102,14 +101,14 @@ public class sevenzipParser extends AbstractParser implements Idiom {
@Override
public Document parse(final DigestURI location, final String mimeType, final String charset,
final byte[] source) throws ParserException, InterruptedException {
return parse(location, mimeType, charset, new ByteArrayIInStream(source), Idiom.MAX_KEEP_IN_MEMORY_SIZE - source.length);
return parse(location, mimeType, charset, new ByteArrayIInStream(source));
}
@Override
public Document parse(final DigestURI location, final String mimeType, final String charset,
final File sourceFile) throws ParserException, InterruptedException {
try {
return parse(location, mimeType, charset, new MyRandomAccessFile(sourceFile, "r"), Idiom.MAX_KEEP_IN_MEMORY_SIZE);
return parse(location, mimeType, charset, new MyRandomAccessFile(sourceFile, "r"));
} catch (final IOException e) {
throw new ParserException("error processing 7zip archive: " + e.getMessage(), location);
}
@ -118,12 +117,9 @@ public class sevenzipParser extends AbstractParser implements Idiom {
public Document parse(final DigestURI location, final String mimeType, final String charset,
final InputStream source) throws ParserException, InterruptedException {
try {
final CachedFileOutputStream cfos = new CachedFileOutputStream(Idiom.MAX_KEEP_IN_MEMORY_SIZE);
final ByteArrayOutputStream cfos = new ByteArrayOutputStream();
FileUtils.copy(source, cfos);
if (cfos.isFallback()) {
return parse(location, mimeType, charset, cfos.getContentFile());
}
return parse(location, mimeType, charset, cfos.getContentBAOS());
return parse(location, mimeType, charset, new ByteArrayInputStream(cfos.toByteArray()));
} catch (final IOException e) {
throw new ParserException("error processing 7zip archive: " + e.getMessage(), location);
}
@ -143,16 +139,14 @@ public class sevenzipParser extends AbstractParser implements Idiom {
public static class SZParserExtractCallback extends ArchiveExtractCallback {
private final Log log;
private final long maxRamSize;
private CachedFileOutputStream cfos = null;
private ByteArrayOutputStream cfos = null;
private final Document doc;
private final String prefix;
public SZParserExtractCallback(final Log logger, final IInArchive handler,
final long maxRamSize, final Document doc, final String prefix) {
final Document doc, final String prefix) {
super.Init(handler);
this.log = logger;
this.maxRamSize = maxRamSize;
this.doc = doc;
this.prefix = prefix;
}
@ -197,11 +191,7 @@ public class sevenzipParser extends AbstractParser implements Idiom {
// below for reversion of the effects
final DigestURI url = DigestURI.newURL(doc.dc_source(), this.prefix + "/" + super.filePath);
final String mime = TextParser.mimeOf(super.filePath.substring(super.filePath.lastIndexOf('.') + 1));
if (this.cfos.isFallback()) {
theDoc = TextParser.parseSource(url, mime, null, this.cfos.getContentFile());
} else {
theDoc = TextParser.parseSource(url, mime, null, this.cfos.getContentBAOS());
}
theDoc = TextParser.parseSource(url, mime, null, this.cfos.toByteArray());
this.doc.addSubDocument(theDoc);
}
@ -227,8 +217,7 @@ public class sevenzipParser extends AbstractParser implements Idiom {
ex.initCause(e);
throw ex;
}
this.cfos = (item.isDirectory()) ? null
: new CachedFileOutputStream(this.maxRamSize, null, true, item.getSize());
this.cfos = (item.isDirectory()) ? null : new ByteArrayOutputStream();
return this.cfos;
}

@ -27,9 +27,7 @@
package net.yacy.document.parser;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.Arrays;
@ -90,12 +88,7 @@ public class tarParser extends AbstractParser implements Idiom {
File outputFile = null;
Document subDoc = null;
try {
if ((this.contentLength == -1) || (this.contentLength > Idiom.MAX_KEEP_IN_MEMORY_SIZE)) {
outputFile = File.createTempFile("zipParser",".prt");
docText = new BufferedOutputStream(new FileOutputStream(outputFile));
} else {
docText = new ByteBuffer();
}
docText = new ByteBuffer();
/*
* If the mimeType was not reported correctly by the webserver we

@ -27,9 +27,7 @@
package net.yacy.document.parser;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.Arrays;
@ -91,12 +89,7 @@ public class zipParser extends AbstractParser implements Idiom {
File outputFile = null;
Document subDoc = null;
try {
if ((this.contentLength == -1) || (this.contentLength > Idiom.MAX_KEEP_IN_MEMORY_SIZE)) {
outputFile = File.createTempFile("zipParser",".prt");
docText = new BufferedOutputStream(new FileOutputStream(outputFile));
} else {
docText = new ByteBuffer();
}
docText = new ByteBuffer();
final StringBuilder docKeywords = new StringBuilder();
final StringBuilder docLongTitle = new StringBuilder();

@ -1,157 +0,0 @@
// CachedFileOutputStream.java
// -------------------------------------
// part of YACY
// (C) by Michael Peter Christen; mc@yacy.net
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004
//
// This file was contributed by Franz Brausze
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.kelondro.io;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import net.yacy.kelondro.util.FileUtils;
/**
 * A {@link ByteArrayOutputStream} that keeps its content in memory until the
 * number of bytes written exceeds {@code fallbackSize}; from then on all data
 * (including what was already buffered in memory) is written to a file.
 *
 * <p>Use {@link #isFallback()} to find out where the content lives, then read it
 * with {@link #getContentBAOS()} (memory) or {@link #getContentFile()} (file), or
 * use {@link #getContent()} which works in both cases.
 *
 * <p>If no fallback file is passed to a constructor, a temporary file is created
 * only at the moment the stream actually falls back, so purely in-memory use
 * causes no file system activity.
 */
public class CachedFileOutputStream extends ByteArrayOutputStream {

    /** Target file for spilled content; {@code null} until a temp file is needed if none was supplied. */
    protected File fallbackFile;
    /** Maximum number of bytes kept in memory before switching to the file. */
    protected long fallbackSize;
    /** Whether file access is wrapped in buffered streams. */
    protected boolean buffered;
    /** Total number of bytes written so far, regardless of where they are stored. */
    protected long size = 0;
    /** {@code true} once the content has been moved to {@link #fallbackFile}. */
    protected boolean isFallback = false;
    /** Stream writing to {@link #fallbackFile}; {@code null} while in memory. */
    protected OutputStream fallback = null;

    /**
     * Creates a buffered stream with a temporary fallback file and an expected
     * content size of 32 bytes. Note that a {@code fallbackSize} below 32
     * therefore makes the stream fall back immediately.
     *
     * @param fallbackSize maximum number of bytes to keep in memory
     * @throws IOException declared for compatibility with existing callers
     */
    public CachedFileOutputStream(final long fallbackSize) throws IOException {
        this(fallbackSize, null, true, 32);
    }

    /**
     * Creates a stream with an expected content size of 32 bytes.
     *
     * @param fallbackSize maximum number of bytes to keep in memory
     * @param fallback     file to spill to, or {@code null} to use a temporary file
     * @param buffered     whether file access should be buffered
     * @throws IOException declared for compatibility with existing callers
     */
    public CachedFileOutputStream(final long fallbackSize, final File fallback, final boolean buffered)
            throws IOException {
        this(fallbackSize, fallback, buffered, 32);
    }

    /**
     * Creates a stream for content of a known or estimated size.
     *
     * @param fallbackSize maximum number of bytes to keep in memory
     * @param fallback     file to spill to, or {@code null} to use a temporary file
     * @param buffered     whether file access should be buffered
     * @param size         expected content size; if it exceeds {@code fallbackSize}
     *                     the stream writes to the file right from the start
     * @throws IOException declared for compatibility with existing callers
     */
    public CachedFileOutputStream(final long fallbackSize, final File fallback, final boolean buffered,
            final long size) throws IOException {
        this.fallbackSize = fallbackSize;
        // the temp file (if needed) is created lazily in fallback() so that
        // streams which never spill do not leave empty files behind
        this.fallbackFile = fallback;
        this.buffered = buffered;
        checkFallback(size);
    }

    /**
     * Creates a stream pre-filled with {@code data}. The array is adopted as the
     * internal buffer (not copied) unless its length already exceeds
     * {@code fallbackSize}, in which case it is written to the file.
     *
     * @param fallbackSize maximum number of bytes to keep in memory
     * @param fallback     file to spill to, or {@code null} to use a temporary file
     * @param buffered     whether file access should be buffered
     * @param data         initial content, must not be {@code null}
     * @throws IOException declared for compatibility with existing callers
     */
    public CachedFileOutputStream(final long fallbackSize, final File fallback, final boolean buffered,
            final byte[] data) throws IOException {
        this(fallbackSize, fallback, buffered, 0);
        super.buf = data;
        super.count = data.length;
        checkFallback(this.size = data.length);
    }

    /**
     * Switches to the fallback file if {@code size} exceeds the memory limit.
     *
     * @param size the size to compare against {@link #fallbackSize}
     * @return {@code true} if the stream is (now) writing to the file
     * @throws RuntimeException if switching to the file fails
     */
    protected boolean checkFallback(final long size) {
        if (size <= this.fallbackSize) {
            return false;
        }
        try {
            fallback();
        } catch (final IOException e) {
            throw new RuntimeException("error falling back to file", e);
        }
        return true;
    }

    /**
     * Moves the content written so far into the fallback file and redirects all
     * further writes there. Does nothing if the stream already fell back.
     *
     * @throws IOException if the file cannot be created or written, or if the
     *                     configured fallback file is a directory
     */
    public void fallback() throws IOException {
        if (this.isFallback) {
            return;
        }
        if (this.fallbackFile == null) {
            this.fallbackFile = File.createTempFile(
                    CachedFileOutputStream.class.getName(),
                    Long.toString(System.currentTimeMillis()));
        } else if (!this.fallbackFile.exists()) {
            this.fallbackFile.createNewFile();
        } else if (this.fallbackFile.isDirectory()) {
            throw new IOException("cannot write on a directory");
        }
        final OutputStream os = new FileOutputStream(this.fallbackFile);
        this.fallback = (this.buffered) ? new BufferedOutputStream(os) : os;
        // copy only the bytes actually written; buf may have unused capacity
        this.fallback.write(super.buf, 0, super.count);
        super.reset();
        super.buf = new byte[0];
        // mark as fallen back only after the file stream is usable, so a failed
        // attempt does not leave writers with a null fallback stream
        this.isFallback = true;
    }

    /**
     * @return {@code true} if the content is stored in the fallback file
     */
    public boolean isFallback() {
        return this.isFallback;
    }

    /**
     * Writes a single byte to memory or, once the limit is exceeded, to the file.
     *
     * @throws RuntimeException if falling back or writing to the file fails
     */
    @Override
    public synchronized void write(final int b) {
        if (checkFallback(++this.size)) {
            try {
                this.fallback.write(b);
            } catch (final IOException e) {
                throw new RuntimeException("error writing to fallback", e);
            }
        } else {
            super.write(b);
        }
    }

    /**
     * Writes {@code len} bytes to memory or, once the limit is exceeded, to the file.
     *
     * @throws RuntimeException if falling back or writing to the file fails
     */
    @Override
    public synchronized void write(final byte[] b, final int off, final int len) {
        if (checkFallback(this.size += len)) {
            try {
                this.fallback.write(b, off, len);
            } catch (final IOException e) {
                throw new RuntimeException("error writing to fallback", e);
            }
        } else {
            super.write(b, off, len);
        }
    }

    /**
     * Closes the fallback file stream, if any.
     */
    @Override
    public void close() throws IOException {
        if (this.fallback != null) {
            this.fallback.close();
        }
        super.close();
    }

    /**
     * Closes this stream and returns a stream over everything written to it.
     *
     * @return an input stream reading from the file or from memory
     * @throws IOException if closing this stream or opening the file fails
     */
    public InputStream getContent() throws IOException {
        close();
        if (this.isFallback) {
            final InputStream is = new FileInputStream(this.fallbackFile);
            return (this.buffered) ? new BufferedInputStream(is) : is;
        }
        // restrict to the valid region; buf may be larger than count
        return new ByteArrayInputStream(this.buf, 0, this.count);
    }

    /**
     * Returns the in-memory content, trimmed to the number of bytes written.
     *
     * @return a copy of the bytes written so far
     * @throws RuntimeException if the stream already fell back to the file
     */
    public byte[] getContentBAOS() {
        if (this.isFallback) {
            throw new RuntimeException("underlying ByteArrayOutputStream not available, already fell back to file");
        }
        return toByteArray();
    }

    /**
     * Returns the fallback file after flushing pending output to it.
     *
     * @return the file holding the content
     * @throws RuntimeException if the stream did not fall back yet or flushing fails
     */
    public File getContentFile() {
        if (!this.isFallback) {
            throw new RuntimeException("haven't fallen back yet, fallback file has no content");
        }
        try {
            // make sure buffered bytes reach the file before a caller reads it
            this.fallback.flush();
        } catch (final IOException e) {
            throw new RuntimeException("error writing to fallback", e);
        }
        return this.fallbackFile;
    }

    /**
     * @return the total number of bytes written to this stream
     */
    public long getLength() {
        return this.size;
    }
}
Loading…
Cancel
Save