*) odtParser: better handling of large files

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2702 6c8d7289-2bf4-0310-a012-ef5d649a1542
19 years ago · 1586d57187
parent f17ce28b6d
commit 1586d57187
1 changed files with 66 additions and 33 deletions
--- a/source/de/anomic/plasma/parser/odt/odtParser.java
+++ b/source/de/anomic/plasma/parser/odt/odtParser.java
@ -45,7 +45,10 @@ package de.anomic.plasma.parser.odt;

 import java.io.ByteArrayInputStream;
 import java.io.File;
+import java.io.FileOutputStream;
 import java.io.InputStream;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
 import java.util.Enumeration;
 import java.util.Hashtable;
 import java.util.zip.ZipEntry;
@ -61,6 +64,7 @@ import de.anomic.plasma.plasmaParserDocument;
 import de.anomic.plasma.parser.AbstractParser;
 import de.anomic.plasma.parser.Parser;
 import de.anomic.plasma.parser.ParserException;
+import de.anomic.server.serverCharBuffer;
 import de.anomic.server.serverFileUtils;
 import de.anomic.server.logging.serverLog;

@ -93,8 +97,9 @@ public class odtParser extends AbstractParser implements Parser {
    
    public plasmaParserDocument parse(URL location, String mimeType, String charset, File dest) throws ParserException, InterruptedException {
        
+        Writer writer = null;
+        File writerFile = null;
        try {          
-            byte[] docContent     = null;
            String docDescription = null;
            String docKeywordStr    = null;
            String docShortTitle  = null;
@ -115,12 +120,27 @@ public class odtParser extends AbstractParser implements Parser {
                
                // content.xml contains the document content in xml format
                if (entryName.equals("content.xml")) {
+                    long contentSize = zipEntry.getSize();
+                    
+                    // creating a writer for output
+                    if ((contentSize == -1) || (contentSize > Parser.MAX_KEEP_IN_MEMORY_SIZE)) {
+                        writerFile = File.createTempFile("odtParser",".tmp");
+                        writer = new OutputStreamWriter(new FileOutputStream(writerFile),"UTF-8");
+                    } else {
+                        writer = new serverCharBuffer(); 
+                    }                    
+                    
+                    // extract data
                    InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry);
                    OpenDocumentTextInputStream odStream = new OpenDocumentTextInputStream(zipFileEntryStream);
-                    docContent = serverFileUtils.read(odStream); 
+                    serverFileUtils.copy(odStream, writer, "UTF-8");
+                
+                    // close readers and writers
+                    odStream.close();
+                    writer.close();
                    
-                // meta.xml contains metadata about the document
                } else if (entryName.equals("meta.xml")) {
+                    //  meta.xml contains metadata about the document
                    InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry);
                    ODFMetaFileAnalyzer metaAnalyzer = new ODFMetaFileAnalyzer();
                    OpenDocumentMetadata metaData = metaAnalyzer.analyzeMetaData(zipFileEntryStream);
@ -128,24 +148,13 @@ public class odtParser extends AbstractParser implements Parser {
                    docKeywordStr    = metaData.getKeyword();
                    docShortTitle  = metaData.getTitle();
                    docLongTitle   = metaData.getSubject();
+                }
+            }
            
             // if there is no title available we generate one
            if (docLongTitle == null) {
                if (docShortTitle != null) {
                    docLongTitle = docShortTitle;
-                        } else if (docContent != null && docContent.length <= 80) {
-                            docLongTitle = new String(docContent, "UTF-8");
-                        } else {
-                            byte[] title = new byte[80];
-                            System.arraycopy(docContent, 0, title, 0, 80);
-                            docLongTitle = new String(title, "UTF-8");
-                        }
-                        docLongTitle.
-                        replaceAll("\r\n"," ").
-                        replaceAll("\n"," ").
-                        replaceAll("\r"," ").
-                        replaceAll("\t"," ");
-                    }
                } 
            }            
         
@ -154,7 +163,23 @@ public class odtParser extends AbstractParser implements Parser {
            if (docKeywordStr != null) docKeywords = docKeywordStr.split(" |,");
            
            // create the parser document
-            return new plasmaParserDocument(
+            plasmaParserDocument theDoc = null;
+            if (writer instanceof serverCharBuffer) {
+                byte[] contentBytes = ((serverCharBuffer)writer).toString().getBytes("UTF-8");
+                theDoc = new plasmaParserDocument(
+                        location,
+                        mimeType,
+                        "UTF-8",
+                        docKeywords,
+                        docShortTitle, 
+                        docLongTitle,
+                        null,
+                        docDescription,
+                        contentBytes,
+                        null,
+                        null);
+            } else {
+                theDoc = new plasmaParserDocument(
                        location,
                        mimeType,
                        "UTF-8",
@ -163,13 +188,21 @@ public class odtParser extends AbstractParser implements Parser {
                        docLongTitle,
                        null,
                        docDescription,
-                    docContent,
+                        writerFile,
                        null,
                        null);
+            }
+            return theDoc;
        } catch (Exception e) {            
            if (e instanceof InterruptedException) throw (InterruptedException) e;
            if (e instanceof ParserException) throw (ParserException) e;
            
+            // close the writer
+            if (writer != null) try { writer.close(); } catch (Exception ex) {/* ignore this */}
+            
+            // delete the file
+            if (writerFile != null) try { writerFile.delete(); } catch (Exception ex)  {/* ignore this */}            
+            
            throw new ParserException("Unexpected error while parsing odt file. " + e.getMessage(),location); 
        }
    }