*) Better handling of large files during parsing

Extracted text of files that are larger than 5MB is stored in a temp file instead of keeping it in memory
*) plasmaParserDocument.java: getText now returns an InputStream instead of a byte array
*) plasmaParserDocument.java: new function getTextBytes returns the parsed content as byte array
   Attention: the caller of this function has to ensure that enough memory is available to do this 
   to avoid OutOfMemory Exceptions
*) httpd.java: better error handling if the soapHandler is not installed
*) pdfParser.java: 
   - better handling of documents with exotic charsets
   - better handling of large documents
   - better error logging of encrypted documents
*) rtfParser.java: Bugfix for UTF-8 support
*) tarParser.java: better handling of large documents
*) zipParser.java: better handling of large documents
*) plasmaCrawlEURL.java: new errorcode for encrypted documents
*) plasmaParserDocument.java: the extracted text can now be passed
   to this object as byte array or temp file   

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2679 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
theli 19 years ago
parent 8b2ceddb91
commit cd5f349666

@ -216,7 +216,7 @@ public class ViewFile {
resMime = document.getMimeType();
if (viewMode.equals("parsed")) {
String content = new String(document.getText());
String content = new String(document.getTextBytes());
content = wikiCode.replaceHTML(content); //added by Marc Nause
content = content.replaceAll("\n","<br>")
.replaceAll("\t","&nbsp;&nbsp;&nbsp;&nbsp;");

@ -406,24 +406,24 @@ public final class httpd implements serverHandler {
/*
* Handling SOAP Requests here ...
*/
if (this.prop.containsKey(httpHeader.CONNECTION_PROP_PATH) && this.prop.getProperty(httpHeader.CONNECTION_PROP_PATH).startsWith("/soap")) {
if (soapHandler == null) {
if (this.prop.containsKey(httpHeader.CONNECTION_PROP_PATH) && this.prop.getProperty(httpHeader.CONNECTION_PROP_PATH).startsWith("/soap/")) {
if (this.soapHandler == null) {
try {
Class soapHandlerClass = Class.forName("de.anomic.soap.httpdSoapHandler");
Constructor classConstructor = soapHandlerClass.getConstructor( new Class[] { serverSwitch.class } );
soapHandler = (httpdHandler) classConstructor.newInstance(new Object[] { switchboard });
this.soapHandler = (httpdHandler) classConstructor.newInstance(new Object[] { switchboard });
} catch (Exception e) {
sendRespondHeader(this.prop,this.session.out,httpVersion,503,null);
sendRespondError(this.prop,this.session.out,4,503,null,"SOAP Extension not installed",e);
return serverCore.TERMINATE_CONNECTION;
} catch (NoClassDefFoundError e) {
sendRespondError(this.prop,this.session.out,4,503,null,"SOAP Extension not installed",null);
sendRespondError(this.prop,this.session.out,4,503,null,"SOAP Extension not installed",e);
return serverCore.TERMINATE_CONNECTION;
} catch (Error e) {
sendRespondHeader(this.prop,this.session.out,httpVersion,503,null);
sendRespondError(this.prop,this.session.out,4,503,null,"SOAP Extension not installed",e);
return serverCore.TERMINATE_CONNECTION;
}
}
soapHandler.doGet(this.prop, header, this.session.out);
this.soapHandler.doGet(this.prop, header, this.session.out);
/*
* Handling HTTP requests here ...
@ -541,7 +541,7 @@ public final class httpd implements serverHandler {
// we now know the HTTP version. depending on that, we read the header
httpHeader header;
String httpVersion = prop.getProperty("HTTP", "HTTP/0.9");
String httpVersion = this.prop.getProperty("HTTP", "HTTP/0.9");
if (httpVersion.equals("HTTP/0.9")) header = new httpHeader(reverseMappingCache);
else header = httpHeader.readHeader(this.prop,this.session);
@ -559,8 +559,8 @@ public final class httpd implements serverHandler {
/*
* Handling SOAP Requests here ...
*/
if (this.prop.containsKey("PATH") && this.prop.getProperty("PATH").startsWith("/soap")) {
if (soapHandler == null) {
if (this.prop.containsKey("PATH") && this.prop.getProperty("PATH").startsWith("/soap/")) {
if (this.soapHandler == null) {
try {
// creating the soap handler class by name
Class soapHandlerClass = Class.forName("de.anomic.soap.httpdSoapHandler");
@ -569,19 +569,19 @@ public final class httpd implements serverHandler {
Constructor soapHandlerConstructor = soapHandlerClass.getConstructor( new Class[] { serverSwitch.class } );
// creating the new object
soapHandler = (httpdHandler)soapHandlerConstructor.newInstance( new Object[] { switchboard } );
this.soapHandler = (httpdHandler)soapHandlerConstructor.newInstance( new Object[] { switchboard } );
} catch (Exception e) {
sendRespondHeader(this.prop,this.session.out,httpVersion,503,null);
sendRespondError(this.prop,this.session.out,4,503,null,"SOAP Extension not installed",e);
return serverCore.TERMINATE_CONNECTION;
} catch (NoClassDefFoundError e) {
sendRespondError(this.prop,this.session.out,4,503,"SOAP Extension not installed","SOAP Extension not installed",null);
sendRespondError(this.prop,this.session.out,4,503,null,"SOAP Extension not installed",e);
return serverCore.TERMINATE_CONNECTION;
} catch (Error e) {
sendRespondHeader(this.prop,this.session.out,httpVersion,503,null);
sendRespondError(this.prop,this.session.out,4,503,null,"SOAP Extension not installed",e);
return serverCore.TERMINATE_CONNECTION;
}
}
soapHandler.doPost(prop, header, this.session.out, this.session.in);
this.soapHandler.doPost(this.prop, header, this.session.out, this.session.in);
/*
* Handling normal HTTP requests here ...
*/
@ -981,7 +981,7 @@ public final class httpd implements serverHandler {
int httpStatusCode,
String httpStatusText,
String detailedErrorMsg,
Exception stackTrace
Throwable stackTrace
) throws IOException {
sendRespondError(
conProp,
@ -1004,7 +1004,7 @@ public final class httpd implements serverHandler {
String httpStatusText,
File detailedErrorMsgFile,
serverObjects detailedErrorMsgValues,
Exception stackTrace
Throwable stackTrace
) throws IOException {
sendRespondError(
conProp,
@ -1029,7 +1029,7 @@ public final class httpd implements serverHandler {
String detailedErrorMsgText,
Object detailedErrorMsgFile,
serverObjects detailedErrorMsgValues,
Exception stackTrace,
Throwable stackTrace,
httpHeader header
) throws IOException {

@ -64,7 +64,7 @@ import de.anomic.server.logging.serverLog;
* @version $LastChangedRevision$ / $LastChangedDate$
*/
public abstract class AbstractParser implements Parser{
/**
* a list of library names that are needed by this parser
*/
@ -86,6 +86,12 @@ public abstract class AbstractParser implements Parser{
*/
protected String parserName = this.getClass().getName();
/**
* The source file file size in bytes if the source document was passed
* in as file
*/
protected long fileSize = -1;
/**
* The Constructor of this class.
*/
@ -178,6 +184,9 @@ public abstract class AbstractParser implements Parser{
) throws ParserException, InterruptedException {
BufferedInputStream contentInputStream = null;
try {
// getting the file size of the document
this.fileSize = sourceFile.length();
// create a stream from the file
contentInputStream = new BufferedInputStream(new FileInputStream(sourceFile));

@ -60,6 +60,9 @@ import de.anomic.server.logging.serverLog;
*/
public interface Parser {
public static long MAX_KEEP_IN_MEMORY_SIZE = 5 * 1024 * 1024;
/**
* Parsing a document available as byte array
* @param location the origin of the document

@ -43,8 +43,11 @@
package de.anomic.plasma.parser.pdf;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.Hashtable;
import org.pdfbox.pdfparser.PDFParser;
@ -53,11 +56,12 @@ import org.pdfbox.pdmodel.PDDocumentInformation;
import org.pdfbox.util.PDFTextStripper;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlEURL;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.parser.AbstractParser;
import de.anomic.plasma.parser.Parser;
import de.anomic.plasma.parser.ParserException;
import de.anomic.server.serverByteBuffer;
import de.anomic.server.serverCharBuffer;
public class pdfParser extends AbstractParser implements Parser {
@ -87,9 +91,9 @@ public class pdfParser extends AbstractParser implements Parser {
public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
PDDocument theDocument = null;
OutputStreamWriter writer = null;
Writer writer = null;
File writerFile = null;
try {
// reducing thread priority
Thread.currentThread().setPriority(Thread.MIN_PRIORITY);
@ -114,6 +118,10 @@ public class pdfParser extends AbstractParser implements Parser {
PDFTextStripper stripper = new PDFTextStripper();
theDocument = parser.getPDDocument();
if (theDocument.isEncrypted()) {
throw new ParserException("Document is encrypted",location,plasmaCrawlEURL.DENIED_DOCUMENT_ENCRYPTED);
}
// extracting some metadata
PDDocumentInformation theDocInfo = theDocument.getDocumentInformation();
if (theDocInfo != null) {
@ -121,42 +129,54 @@ public class pdfParser extends AbstractParser implements Parser {
docSubject = theDocInfo.getSubject();
//docAuthor = theDocInfo.getAuthor();
docKeywordStr = theDocInfo.getKeywords();
}
serverByteBuffer out = new serverByteBuffer();
writer = new OutputStreamWriter( out );
stripper.writeText(theDocument, writer );
writer.close(); writer = null;
theDocument.close(); theDocument = null;
}
byte[] contents = out.toByteArray();
out.close();
out = null;
if ((docTitle == null) || (docTitle.length() == 0)) {
docTitle = ((contents.length > 80)? new String(contents, 0, 80, "UTF-8"):new String(contents, "UTF-8")).
replaceAll("\r\n"," ").
replaceAll("\n"," ").
replaceAll("\r"," ").
replaceAll("\t"," ");
// creating a writer for output
if ((this.fileSize != -1) && (this.fileSize > Parser.MAX_KEEP_IN_MEMORY_SIZE)) {
writerFile = File.createTempFile("pdfParser",".tmp");
writer = new OutputStreamWriter(new FileOutputStream(writerFile),"UTF-8");
} else {
writer = new serverCharBuffer();
}
stripper.writeText(theDocument, writer );
theDocument.close(); theDocument = null;
writer.close();
String[] docKeywords = null;
if (docKeywordStr != null) docKeywords = docKeywordStr.split(" |,");
plasmaParserDocument theDoc = new plasmaParserDocument(
location,
mimeType,
"UTF-8",
docKeywords,
docSubject,
docTitle,
null,
null,
contents,
null,
null);
plasmaParserDocument theDoc = null;
if (writer instanceof serverCharBuffer) {
byte[] contentBytes = ((serverCharBuffer)writer).toString().getBytes("UTF-8");
theDoc = new plasmaParserDocument(
location,
mimeType,
"UTF-8",
docKeywords,
docSubject,
docTitle,
null,
null,
contentBytes,
null,
null);
} else {
theDoc = new plasmaParserDocument(
location,
mimeType,
"UTF-8",
docKeywords,
docSubject,
docTitle,
null,
null,
writerFile,
null,
null);
}
return theDoc;
}
@ -164,6 +184,12 @@ public class pdfParser extends AbstractParser implements Parser {
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof ParserException) throw (ParserException) e;
// close the writer
if (writer != null) try { writer.close(); } catch (Exception ex) {/* ignore this */}
// delete the file
if (writerFile != null) try { writerFile.delete(); } catch (Exception ex) {/* ignore this */}
throw new ParserException("Unexpected error while parsing pdf file. " + e.getMessage(),location);
} finally {
if (theDocument != null) try { theDocument.close(); } catch (Exception e) {/* ignore this */}
@ -173,8 +199,7 @@ public class pdfParser extends AbstractParser implements Parser {
}
public void reset() {
// Nothing todo here at the moment
this.fileSize = -1;
}
}

@ -80,8 +80,7 @@ implements Parser {
this.parserName = "Rich Text Format Parser";
}
public plasmaParserDocument parse(URL location, String mimeType, String charset,
InputStream source) throws ParserException, InterruptedException {
public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
try {
@ -105,7 +104,7 @@ implements Parser {
null,
null,
null,
bodyText.getBytes(),
bodyText.getBytes("UTF-8"),
null,
null);

@ -43,8 +43,11 @@
package de.anomic.plasma.parser.tar;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Hashtable;
@ -96,7 +99,18 @@ public class tarParser extends AbstractParser implements Parser {
public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
long docTextLength = 0;
OutputStream docText = null;
File outputFile = null;
plasmaParserDocument subDoc = null;
try {
if ((this.fileSize != -1) && (this.fileSize > Parser.MAX_KEEP_IN_MEMORY_SIZE)) {
outputFile = File.createTempFile("zipParser",".tmp");
docText = new BufferedOutputStream(new FileOutputStream(outputFile));
} else {
docText = new serverByteBuffer();
}
// creating a new parser class to parse the unzipped content
plasmaParser theParser = new plasmaParser();
@ -116,7 +130,7 @@ public class tarParser extends AbstractParser implements Parser {
StringBuffer docLongTitle = new StringBuffer();
LinkedList docSections = new LinkedList();
StringBuffer docAbstrct = new StringBuffer();
serverByteBuffer docText = new serverByteBuffer();
Map docAnchors = new HashMap();
TreeSet docImages = new TreeSet();
@ -141,55 +155,58 @@ public class tarParser extends AbstractParser implements Parser {
String entryMime = plasmaParser.getMimeTypeByFileExt(entryExt);
// getting the entry content
plasmaParserDocument theDoc = null;
File tempFile = null;
File subDocTempFile = null;
try {
// create the temp file
tempFile = createTempFile(entryName);
subDocTempFile = createTempFile(entryName);
// copy the data into the file
serverFileUtils.copy(tin,tempFile,entry.getSize());
serverFileUtils.copy(tin,subDocTempFile,entry.getSize());
// check for interruption
checkInterruption();
// parsing the content
theDoc = theParser.parseSource(new URL(location,"#" + entryName),entryMime,null,tempFile);
subDoc = theParser.parseSource(new URL(location,"#" + entryName),entryMime,null,subDocTempFile);
} catch (ParserException e) {
this.theLogger.logInfo("Unable to parse tar file entry '" + entryName + "'. " + e.getMessage());
} finally {
if (tempFile != null) try {tempFile.delete(); } catch(Exception ex){/* ignore this */}
if (subDocTempFile != null) try {subDocTempFile.delete(); } catch(Exception ex){/* ignore this */}
}
if (theDoc == null) continue;
if (subDoc == null) continue;
// merging all documents together
if (docKeywords.length() > 0) docKeywords.append(",");
docKeywords.append(theDoc.getKeywords(','));
docKeywords.append(subDoc.getKeywords(','));
if (docLongTitle.length() > 0) docLongTitle.append("\n");
docLongTitle.append(theDoc.getMainLongTitle());
docLongTitle.append(subDoc.getMainLongTitle());
if (docShortTitle.length() > 0) docShortTitle.append("\n");
docShortTitle.append(theDoc.getMainShortTitle());
docShortTitle.append(subDoc.getMainShortTitle());
docSections.addAll(Arrays.asList(theDoc.getSectionTitles()));
docSections.addAll(Arrays.asList(subDoc.getSectionTitles()));
if (docAbstrct.length() > 0) docAbstrct.append("\n");
docAbstrct.append(theDoc.getAbstract());
docAbstrct.append(subDoc.getAbstract());
if (docText.length() > 0) docText.append("\n");
docText.append(theDoc.getText());
if (subDoc.getTextLength() > 0) {
if (docTextLength > 0) docText.write('\n');
docTextLength += serverFileUtils.copy(subDoc.getText(), docText);
}
docAnchors.putAll(subDoc.getAnchors());
docImages.addAll(subDoc.getImages());
docAnchors.putAll(theDoc.getAnchors());
docImages.addAll(theDoc.getImages());
// release subdocument
subDoc.close();
subDoc = null;
}
/* (URL location, String mimeType,
String keywords, String shortTitle, String longTitle,
String[] sections, String abstrct,
byte[] text, Map anchors, Map images)
*/
return new plasmaParserDocument(
plasmaParserDocument result = null;
if (docText instanceof serverByteBuffer) {
result = new plasmaParserDocument(
location,
mimeType,
null,
@ -198,13 +215,37 @@ public class tarParser extends AbstractParser implements Parser {
docLongTitle.toString(),
(String[])docSections.toArray(new String[docSections.size()]),
docAbstrct.toString(),
docText.toByteArray(),
((serverByteBuffer)docText).toByteArray(),
docAnchors,
docImages);
} else {
result = new plasmaParserDocument(
location,
mimeType,
null,
docKeywords.toString().split(" |,"),
docShortTitle.toString(),
docLongTitle.toString(),
(String[])docSections.toArray(new String[docSections.size()]),
docAbstrct.toString(),
outputFile,
docAnchors,
docImages);
}
return result;
} catch (Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof ParserException) throw (ParserException) e;
if (subDoc != null) subDoc.close();
// close the writer
if (docText != null) try { docText.close(); } catch (Exception ex) {/* ignore this */}
// delete the file
if (outputFile != null) try { outputFile.delete(); } catch (Exception ex) {/* ignore this */}
throw new ParserException("Unexpected error while parsing tar resource. " + e.getMessage(),location);
}
}

@ -43,8 +43,11 @@
package de.anomic.plasma.parser.zip;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Hashtable;
@ -94,13 +97,23 @@ public class zipParser extends AbstractParser implements Parser {
public plasmaParserDocument parse(URL location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
long docTextLength = 0;
OutputStream docText = null;
File outputFile = null;
plasmaParserDocument subDoc = null;
try {
if ((this.fileSize != -1) && (this.fileSize > Parser.MAX_KEEP_IN_MEMORY_SIZE)) {
outputFile = File.createTempFile("zipParser",".tmp");
docText = new BufferedOutputStream(new FileOutputStream(outputFile));
} else {
docText = new serverByteBuffer();
}
StringBuffer docKeywords = new StringBuffer();
StringBuffer docShortTitle = new StringBuffer();
StringBuffer docLongTitle = new StringBuffer();
LinkedList docSections = new LinkedList();
StringBuffer docAbstrct = new StringBuffer();
serverByteBuffer docText = new serverByteBuffer();
Map docAnchors = new HashMap();
TreeSet docImages = new TreeSet();
@ -128,48 +141,56 @@ public class zipParser extends AbstractParser implements Parser {
String entryMime = plasmaParser.getMimeTypeByFileExt(entryExt);
// parsing the content
plasmaParserDocument theDoc = null;
File tempFile = null;
File subDocTempFile = null;
try {
// create the temp file
tempFile = createTempFile(entryName);
subDocTempFile = createTempFile(entryName);
// copy the data into the file
serverFileUtils.copy(zippedContent,tempFile,entry.getSize());
serverFileUtils.copy(zippedContent,subDocTempFile,entry.getSize());
// parsing the zip file entry
theDoc = theParser.parseSource(new URL(location,"#" + entryName),entryMime,null, tempFile);
subDoc = theParser.parseSource(new URL(location,"#" + entryName),entryMime,null, subDocTempFile);
} catch (ParserException e) {
this.theLogger.logInfo("Unable to parse zip file entry '" + entryName + "'. " + e.getMessage());
} finally {
if (tempFile != null) try {tempFile.delete(); } catch(Exception ex){/* ignore this */}
if (subDocTempFile != null) try {subDocTempFile.delete(); } catch(Exception ex){/* ignore this */}
}
if (theDoc == null) continue;
if (subDoc == null) continue;
// merging all documents together
if (docKeywords.length() > 0) docKeywords.append(",");
docKeywords.append(theDoc.getKeywords(','));
docKeywords.append(subDoc.getKeywords(','));
if (docLongTitle.length() > 0) docLongTitle.append("\n");
docLongTitle.append(theDoc.getMainLongTitle());
docLongTitle.append(subDoc.getMainLongTitle());
if (docShortTitle.length() > 0) docShortTitle.append("\n");
docShortTitle.append(theDoc.getMainShortTitle());
docShortTitle.append(subDoc.getMainShortTitle());
docSections.addAll(Arrays.asList(theDoc.getSectionTitles()));
docSections.addAll(Arrays.asList(subDoc.getSectionTitles()));
if (docAbstrct.length() > 0) docAbstrct.append("\n");
docAbstrct.append(theDoc.getAbstract());
docAbstrct.append(subDoc.getAbstract());
if (docText.length() > 0) docText.append("\n");
docText.append(theDoc.getText());
if (subDoc.getTextLength() > 0) {
if (docTextLength > 0) docText.write('\n');
docTextLength += serverFileUtils.copy(subDoc.getText(), docText);
}
docAnchors.putAll(theDoc.getAnchors());
docImages.addAll(theDoc.getImages());
docAnchors.putAll(subDoc.getAnchors());
docImages.addAll(subDoc.getImages());
// release subdocument
subDoc.close();
subDoc = null;
}
return new plasmaParserDocument(
plasmaParserDocument result = null;
if (docText instanceof serverByteBuffer) {
result = new plasmaParserDocument(
location,
mimeType,
null,
@ -178,13 +199,37 @@ public class zipParser extends AbstractParser implements Parser {
docLongTitle.toString(),
(String[])docSections.toArray(new String[docSections.size()]),
docAbstrct.toString(),
docText.toByteArray(),
((serverByteBuffer)docText).toByteArray(),
docAnchors,
docImages);
} else {
result = new plasmaParserDocument(
location,
mimeType,
null,
docKeywords.toString().split(" |,"),
docShortTitle.toString(),
docLongTitle.toString(),
(String[])docSections.toArray(new String[docSections.size()]),
docAbstrct.toString(),
outputFile,
docAnchors,
docImages);
}
return result;
} catch (Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof ParserException) throw (ParserException) e;
if (subDoc != null) subDoc.close();
// close the writer
if (docText != null) try { docText.close(); } catch (Exception ex) {/* ignore this */}
// delete the file
if (outputFile != null) try { outputFile.delete(); } catch (Exception ex) {/* ignore this */}
throw new ParserException("Unexpected error while parsing zip resource. " + e.getClass().getName() + ": "+ e.getMessage(),location);
}
}

@ -117,6 +117,7 @@ public class plasmaCrawlEURL extends indexURL {
// Parser errors
public static final String DENIED_PARSER_ERROR = "denied_(parser_error)";
public static final String DENIED_DOCUMENT_ENCRYPTED = "denied_(document_encrypted)";
public static final String DENIED_NOT_PARSEABLE_NO_CONTENT = "denied_(not_parseabel_no_content)";
// indexing errors

@ -515,7 +515,7 @@ public final class plasmaParser {
// testing if the resource is not empty
if (!(sourceFile.exists() && sourceFile.canRead() && sourceFile.length() > 0)) {
String errorMsg = "No resource content available.";
String errorMsg = sourceFile.exists() ? "Empty resource file." : "No resource content available.";
this.theLogger.logInfo("Unable to parse '" + location + "'. " + errorMsg);
throw new ParserException(errorMsg,location,plasmaCrawlEURL.DENIED_NOT_PARSEABLE_NO_CONTENT);
}

@ -42,8 +42,14 @@
package de.anomic.plasma;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.net.MalformedURLException;
import de.anomic.server.serverFileUtils;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
@ -62,7 +68,7 @@ public class plasmaParserDocument {
String longTitle; // the real title of the document, commonly h1-tags
String[] sections; // if present: more titles/headlines appearing in the document
String abstrct; // an abstract, if present: short content description
byte[] text; // the clear text, all that is visible
private Object text; // the clear text, all that is visible
Map anchors; // all links embedded as clickeable entities (anchor tags)
TreeSet images; // all visible pictures in document
// the anchors and images - Maps are URL-to-EntityDescription mappings.
@ -95,6 +101,29 @@ public class plasmaParserDocument {
this.condenser = null;
this.resorted = false;
}
public plasmaParserDocument(URL location, String mimeType, String charset,
String[] keywords, String shortTitle, String longTitle,
String[] sections, String abstrct,
File text, Map anchors, TreeSet images) {
this.location = location;
this.mimeType = (mimeType==null)?"application/octet-stream":mimeType;
this.charset = charset;
this.keywords = (keywords==null) ? new String[0] : keywords;
this.shortTitle = (shortTitle==null)?"":shortTitle;
this.longTitle = (longTitle==null)?"":longTitle;
this.sections = (sections==null)?new String[0]:sections;
this.abstrct = (abstrct==null)?"":abstrct;
this.text = text;
if (text != null) text.deleteOnExit();
this.anchors = (anchors==null)?new HashMap(0):anchors;
this.images = (images==null)?new TreeSet():images;
this.hyperlinks = null;
this.medialinks = null;
this.emaillinks = null;
this.condenser = null;
this.resorted = false;
}
public String getMimeType() {
return this.mimeType;
@ -103,7 +132,7 @@ public class plasmaParserDocument {
/**
* @return the supposed charset of this document or <code>null</code> if unknown
*/
public String getCharset() {
public String getSourceCharset() {
return this.charset;
}
@ -123,13 +152,41 @@ public class plasmaParserDocument {
if (abstrct != null) return abstrct; else return getMainLongTitle();
}
public byte[] getText() {
// returns only the clear (visible) text (not the source data)
return text;
public InputStream getText() {
try {
if (this.text == null) return null;
if (this.text instanceof File) return new BufferedInputStream(new FileInputStream((File)this.text));
else if (this.text instanceof byte[]) return new ByteArrayInputStream((byte[])this.text);
} catch (Exception e) {
e.printStackTrace();
}
return null;
}
public byte[] getTextBytes() {
try {
if (this.text == null) return new byte[0];
if (this.text instanceof File) return serverFileUtils.read((File)this.text);
else if (this.text instanceof byte[]) return (byte[])this.text;
} catch (Exception e) {
e.printStackTrace();
}
return new byte[0];
}
public long getTextLength() {
if (this.text == null) return 0;
if (this.text instanceof File) return ((File)this.text).length();
else if (this.text instanceof byte[]) return ((byte[])this.text).length;
return -1;
}
public plasmaCondenser getCondenser() {
if (condenser == null) condenser = new plasmaCondenser(new ByteArrayInputStream(getText()), 0, 0);
if (condenser == null) condenser = new plasmaCondenser(getText(), 0, 0);
return condenser;
}
@ -262,4 +319,16 @@ public class plasmaParserDocument {
this.resorted = true;
}
public void close() {
// delete the temp file
if ((this.text != null) && (this.text instanceof File)) {
try { ((File)this.text).delete(); } catch (Exception e) {/* ignore this */}
}
}
protected void finalize() throws Throwable {
this.close();
super.finalize();
}
}

@ -1416,6 +1416,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
private void processResourceStack(plasmaSwitchboardQueue.Entry entry) throws InterruptedException {
plasmaParserDocument document = null;
try {
// work off one stack entry with a fresh resource
long stackStartTime = 0, stackEndTime = 0,
@ -1456,7 +1457,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
/* =========================================================================
* PARSE CONTENT
* ========================================================================= */
plasmaParserDocument document = null;
parsingStartTime = System.currentTimeMillis();
try {
@ -1527,7 +1527,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
checkInterruption();
log.logFine("Condensing for '" + entry.normalizedURLString() + "'");
plasmaCondenser condenser = new plasmaCondenser(new ByteArrayInputStream(document.getText()));
plasmaCondenser condenser = new plasmaCondenser(document.getText());
// generate citation reference
Integer[] ioLinks = generateCitationReference(entry.urlHash(), docDate, document, condenser);
@ -1700,8 +1700,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
log.logInfo("*Indexed " + words + " words in URL " + entry.url() +
" [" + entry.urlHash() + "]" +
"\n\tDescription: " + docDescription +
"\n\tMimeType: " + document.getMimeType() + " | Charset: " + document.getCharset() + " | " +
"Size: " + document.text.length + " bytes | " +
"\n\tMimeType: " + document.getMimeType() + " | Charset: " + document.getSourceCharset() + " | " +
"Size: " + document.getTextLength() + " bytes | " +
"Anchors: " + ((document.anchors==null)?0:document.anchors.size()) +
"\n\tStackingTime: " + (stackEndTime-stackStartTime) + " ms | " +
"ParsingTime: " + (parsingEndTime-parsingStartTime) + " ms | " +
@ -1744,6 +1744,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
yacyClient.crawlReceipt(initiatorPeer, "crawl", "rejected", noIndexReason, null, "");
}
}
document.close();
document = null;
} catch (Exception e) {
this.log.logSevere("Unexpected exception while parsing/indexing URL ",e);
@ -1772,6 +1773,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
cacheManager.deleteFile(entry.url());
}
entry = null;
if (document != null) try { document.close(); } catch (Exception e) { /* ignore this */ }
}
}
@ -1807,7 +1810,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
kelondroBase64Order.enhancedCoder.encodeLongSmart(GCount, 2) + // count of links to global resources
kelondroBase64Order.enhancedCoder.encodeLongSmart(document.getImages().size(), 2) + // count of Images in document
kelondroBase64Order.enhancedCoder.encodeLongSmart(0, 2) + // count of links to other documents
kelondroBase64Order.enhancedCoder.encodeLongSmart(document.getText().length, 3) + // length of plain text in bytes
kelondroBase64Order.enhancedCoder.encodeLongSmart(document.getTextLength(), 3) + // length of plain text in bytes
kelondroBase64Order.enhancedCoder.encodeLongSmart(condenser.RESULT_NUMB_WORDS, 3) + // count of all appearing words
kelondroBase64Order.enhancedCoder.encodeLongSmart(condenser.RESULT_SIMI_WORDS, 3) + // count of all unique words
kelondroBase64Order.enhancedCoder.encodeLongSmart(0, 1); // Flags (update, popularity, attention, vote)
@ -2173,7 +2176,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
try {
// get set of words
// Set words = plasmaCondenser.getWords(getText(getResource(url, fetchOnline)));
Iterator witer = plasmaCondenser.getWords(snippetCache.parseDocument(url, snippetCache.getResource(url, fetchOnline, 10000)).getText());
Iterator witer = plasmaCondenser.getWords(snippetCache.parseDocument(url, snippetCache.getResource(url, fetchOnline, 10000)).getTextBytes());
// delete all word references
int count = removeReferences(urlhash, witer);
// finally delete the url entry itself

Loading…
Cancel
Save