From 54fbe166ba41dac90f571e4072ee227d1ef23973 Mon Sep 17 00:00:00 2001 From: luccioman Date: Thu, 16 Aug 2018 18:23:42 +0200 Subject: [PATCH] Updated pdf cache clear steps consistently with current pdfbox version - Removed calls to clearResources functions that no longer exist (on the PDFont class and its children) since the upgrade to pdfbox 2.n.n - Removed hacky usage of protected internal ClassLoader function. This removes the warnings displayed when running with JDK9 or JDK10 : [java] WARNING: Illegal reflective access by net.yacy.document.parser.pdfParser$ResourceCleaner (file:) to method java.lang.ClassLoader.findLoadedClass(java.lang.String) [java] WARNING: Please consider reporting this to the maintainers of net.yacy.document.parser.pdfParser$ResourceCleaner [java] WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations [java] WARNING: All illegal access operations will be denied in a future release Crawling thousands of pdf documents from various sources after the modifications were applied revealed no new memory leak related to pdfbox (measurements done with JVisualVM). 
--- .../net/yacy/document/parser/pdfParser.java | 93 ++++++------------- source/net/yacy/search/Switchboard.java | 2 +- 2 files changed, 31 insertions(+), 64 deletions(-) diff --git a/source/net/yacy/document/parser/pdfParser.java b/source/net/yacy/document/parser/pdfParser.java index ef3587646..bd3c3adfb 100644 --- a/source/net/yacy/document/parser/pdfParser.java +++ b/source/net/yacy/document/parser/pdfParser.java @@ -32,7 +32,6 @@ import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; -import java.lang.reflect.Method; import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Collection; @@ -40,6 +39,7 @@ import java.util.Date; import java.util.HashSet; import java.util.List; +import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.io.MemoryUsageSetting; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocumentInformation; @@ -81,10 +81,6 @@ public class pdfParser extends AbstractParser implements Parser { this.SUPPORTED_MIME_TYPES.add("text/x-pdf"); } - static { - clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes(); // must be called here to get that into the class loader; it will block other threads otherwise; - } - @Override public Document[] parse( final DigestURL location, @@ -249,18 +245,9 @@ public class pdfParser extends AbstractParser implements Parser { try {pdfDoc.close();} catch (final Throwable e) {} } - // clear resources in pdfbox. they say that is resolved but it's not. 
see: - // https://issues.apache.org/jira/browse/PDFBOX-313 - // https://issues.apache.org/jira/browse/PDFBOX-351 - // https://issues.apache.org/jira/browse/PDFBOX-441 - // the pdfbox still generates enormeous number of object allocations and don't delete these - // the following Object are statically stored and never flushed: - // COSFloat, COSArray, COSInteger, COSObjectKey, COSObject, COSDictionary, - // COSStream, COSString, COSName, COSDocument, COSInteger[], COSNull - // the great number of these objects can easily be seen in Java Visual VM - // we try to get this shit out of the memory here by forced clear calls, hope the best the rubbish gets out. + // clear cached resources in pdfbox. pdfDoc = null; - clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes(); + clearPdfBoxCaches(); return result; } @@ -295,55 +282,35 @@ public class pdfParser extends AbstractParser implements Parser { return linkCollections; } - public static void clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes() { - // thank you very much, PDFParser hackers, this font cache will occupy >80MB RAM for a single pdf and then stays forever - // AND I DO NOT EVEN NEED A FONT HERE TO PARSE THE TEXT! - // Don't be so ignorant, just google once "PDFParser OutOfMemoryError" to feel the pain. 
- ResourceCleaner cl = new ResourceCleaner(); - cl.clearClassResources("org.apache.pdfbox.cos.COSName"); - cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDFont"); - cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDType1Font"); - cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDTrueTypeFont"); - cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDType0Font"); - cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDType1AfmPfbFont"); - cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDType3Font"); - cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDType1CFont"); - cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDCIDFont"); - cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDCIDFontType0Font"); - cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDCIDFontType2Font"); - cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDMMType1Font"); - cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDSimpleFont"); + /** + * Clean up cache resources allocated by PDFBox that would otherwise not be released. + */ + public static void clearPdfBoxCaches() { + /* + * Prior to pdfbox 2.0.0, the font cache occupied > 80MB RAM for a single pdf and + * then stayed forever (detected in YaCy with pdfbox version 1.2.1). The + * situation is now far better, but one (unnecessary?) cache structure in + * the COSName class still needs to be explicitly cleared. + */ + + // History of related issues: + // http://markmail.org/thread/quk5odee4hbsauhu + // https://issues.apache.org/jira/browse/PDFBOX-313 + // https://issues.apache.org/jira/browse/PDFBOX-351 + // https://issues.apache.org/jira/browse/PDFBOX-441 + // https://issues.apache.org/jira/browse/PDFBOX-2200 + // https://issues.apache.org/jira/browse/PDFBOX-2149 + + COSName.clearResources(); + + /* + * Prior to PDFBox 2.0.0, the clearResources() function had to be called on the + * org.apache.pdfbox.pdmodel.font.PDFont class and its children. 
After version + * 2.0.0, there is no longer such a function in the PDFont class, as the font cache is + * handled differently and hopefully more properly. + */ } - @SuppressWarnings({ "unchecked", "rawtypes" }) - private static class ResourceCleaner { - Method findLoadedClass; - private ClassLoader sys; - public ResourceCleaner() { - try { - this.findLoadedClass = ClassLoader.class.getDeclaredMethod("findLoadedClass", new Class[] { String.class }); - this.findLoadedClass.setAccessible(true); - this.sys = ClassLoader.getSystemClassLoader(); - } catch (Throwable e) { - e.printStackTrace(); - this.findLoadedClass = null; - this.sys = null; - } - } - public void clearClassResources(String name) { - if (this.findLoadedClass == null) return; - try { - Object pdfparserpainclass = this.findLoadedClass.invoke(this.sys, name); - if (pdfparserpainclass != null) { - Method clearResources = ((Class) pdfparserpainclass).getDeclaredMethod("clearResources", new Class[] {}); - if (clearResources != null) clearResources.invoke(null); - } - } catch (Throwable e) { - //e.printStackTrace(); - } - } - } - /** * test * @param args diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 93b4e4b7c..19b6223cf 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -2454,7 +2454,7 @@ public final class Switchboard extends serverSwitch { public static void clearCaches() { // flush caches in used libraries - pdfParser.clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes(); // eats up megabytes, see http://markmail.org/thread/quk5odee4hbsauhu + pdfParser.clearPdfBoxCaches(); // clear caches if (WordCache.sizeCommonWords() > 1000) WordCache.clearCommonWords();