getting the trash out

12 years ago · 5344a1c5f7
parent 709e9b8ce7
commit 5344a1c5f7
3 changed files with 50 additions and 8 deletions
--- a/source/net/yacy/document/parser/pdfParser.java
+++ b/source/net/yacy/document/parser/pdfParser.java
@ -40,7 +40,18 @@ import org.apache.pdfbox.pdmodel.PDDocumentInformation;
 import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
 import org.apache.pdfbox.pdmodel.encryption.BadSecurityHandlerException;
 import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
+import org.apache.pdfbox.pdmodel.font.PDCIDFont;
+import org.apache.pdfbox.pdmodel.font.PDCIDFontType0Font;
+import org.apache.pdfbox.pdmodel.font.PDCIDFontType2Font;
 import org.apache.pdfbox.pdmodel.font.PDFont;
+import org.apache.pdfbox.pdmodel.font.PDMMType1Font;
+import org.apache.pdfbox.pdmodel.font.PDSimpleFont;
+import org.apache.pdfbox.pdmodel.font.PDTrueTypeFont;
+import org.apache.pdfbox.pdmodel.font.PDType0Font;
+import org.apache.pdfbox.pdmodel.font.PDType1AfmPfbFont;
+import org.apache.pdfbox.pdmodel.font.PDType1CFont;
+import org.apache.pdfbox.pdmodel.font.PDType1Font;
+import org.apache.pdfbox.pdmodel.font.PDType3Font;
 import org.apache.pdfbox.util.PDFTextStripper;

 import net.yacy.cora.document.MultiProtocolURI;
@ -75,12 +86,12 @@ public class pdfParser extends AbstractParser implements Parser {
            throw new Parser.Failure("Not enough Memory available for pdf parser: " + MemoryControl.available(), location);

        // create a pdf parser
-        final PDDocument pdfDoc;
+        PDDocument pdfDoc;
        //final PDFParser pdfParser;
        try {
-            Thread.currentThread().setPriority(Thread.MIN_PRIORITY);
+            Thread.currentThread().setPriority(Thread.MIN_PRIORITY); // the pdfparser is a big pain
            pdfDoc = PDDocument.load(source);
-            //pdfParser = new PDFParser(source);
+            //PDFParser pdfParser = new PDFParser(source);
            //pdfParser.parse();
            //pdfDoc = pdfParser.getPDDocument();
        } catch (final IOException e) {
@ -108,7 +119,7 @@ public class pdfParser extends AbstractParser implements Parser {
        }

        // extracting some metadata
-        final PDDocumentInformation info = pdfDoc.getDocumentInformation();
+        PDDocumentInformation info = pdfDoc.getDocumentInformation();
        String docTitle = null, docSubject = null, docAuthor = null, docPublisher = null, docKeywordStr = null;
        if (info != null) {
            docTitle = info.getTitle();
@ -122,6 +133,7 @@ public class pdfParser extends AbstractParser implements Parser {
            // info.getCreationDate());
            // info.getModificationDate();
        }
+        info = null;

        if (docTitle == null || docTitle.isEmpty()) {
            docTitle = MultiProtocolURI.unescape(location.getFileName());
@ -139,12 +151,13 @@ public class pdfParser extends AbstractParser implements Parser {
            stripper.setStartPage(4); // continue with page 4 (terminated, resulting in no text)
            stripper.setEndPage(Integer.MAX_VALUE); // set to default
            // we start the pdf parsing in a separate thread to ensure that it can be terminated
+            final PDDocument pdfDocC = pdfDoc;
            final Thread t = new Thread() {
                @Override
                public void run() {
                    Thread.currentThread().setName("pdfParser.getText:" + location);
                    try {
-                        writer.append(stripper.getText(pdfDoc));
+                        writer.append(stripper.getText(pdfDocC));
                    } catch (final Throwable e) {}
                }
            };
@ -181,8 +194,9 @@ public class pdfParser extends AbstractParser implements Parser {
        // COSStream, COSString, COSName, COSDocument, COSInteger[], COSNull
        // the great number of these objects can easily be seen in Java Visual VM
        // we try to get this shit out of the memory here by forced clear calls, hope the best the rubbish gets out.
-        COSName.clearResources();
-        PDFont.clearResources();
+        pdfDoc = null;
+        clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes();
+        
        return new Document[]{new Document(
                location,
                mimeType,
@ -203,6 +217,26 @@ public class pdfParser extends AbstractParser implements Parser {
                false)};
    }

+    @SuppressWarnings("static-access")
+    public static void clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes() {
+        // thank you very much, PDFParser hackers, this font cache will occupy >80MB RAM for a single pdf and then stays forever
+        // AND I DO NOT EVEN NEED A FONT HERE TO PARSE THE TEXT!
+        // Don't be so ignorant, just google once "PDFParser OutOfMemoryError" to feel the pain.
+        PDFont.clearResources();
+        COSName.clearResources();
+        PDType1Font.clearResources();
+        PDTrueTypeFont.clearResources();
+        PDType0Font.clearResources();
+        PDType1AfmPfbFont.clearResources();
+        PDType3Font.clearResources();
+        PDType1CFont.clearResources();
+        PDCIDFont.clearResources();
+        PDCIDFontType0Font.clearResources();
+        PDCIDFontType2Font.clearResources();
+        PDMMType1Font.clearResources();
+        PDSimpleFont.clearResources();
+    }
+    
    /**
     * test
     * @param args
--- a/source/net/yacy/kelondro/data/word/Word.java
+++ b/source/net/yacy/kelondro/data/word/Word.java
@ -29,6 +29,7 @@ package net.yacy.kelondro.data.word;
 import java.util.Collection;
 import java.util.Locale;

+import net.yacy.cora.document.WordCache;
 import net.yacy.cora.order.Base64Order;
 import net.yacy.cora.order.Digest;
 import net.yacy.cora.storage.ARC;
@ -86,6 +87,10 @@ public class Word {
        this.flags = null;
    }

+    public static void clearCache() {
+        hashCache.clear();
+    }
+    
    public void inc() {
        this.count++;
    }
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@ -81,6 +81,7 @@ import java.util.zip.GZIPOutputStream;
 import java.util.zip.ZipEntry;
 import java.util.zip.ZipInputStream;

+import org.apache.pdfbox.cos.COSName;
 import org.apache.pdfbox.pdmodel.font.PDFont;
 import org.apache.solr.common.SolrDocument;
 import org.apache.solr.common.SolrInputDocument;
@ -151,6 +152,7 @@ import net.yacy.document.content.DCEntry;
 import net.yacy.document.content.SurrogateReader;
 import net.yacy.document.importer.OAIListFriendsLoader;
 import net.yacy.document.parser.audioTagParser;
+import net.yacy.document.parser.pdfParser;
 import net.yacy.document.parser.html.Evaluation;
 import net.yacy.gui.Tray;
 import net.yacy.kelondro.blob.Tables;
@ -2035,10 +2037,11 @@ public final class Switchboard extends serverSwitch {
        
        try {
            // flush caches in used libraries
-            PDFont.clearResources(); // eats up megabytes, see http://markmail.org/thread/quk5odee4hbsauhu
+            pdfParser.clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes(); // eats up megabytes, see http://markmail.org/thread/quk5odee4hbsauhu
            
            // clear caches
            if (WordCache.sizeCommonWords() > 1000) WordCache.clearCommonWords();
+            Word.clearCache();
            Domains.clear();
            
            // clean up image stack