Updated PDF cache clearing steps to be consistent with the current pdfbox version

- Removed calls to clearResources functions (on the PDFont class and its
children) that no longer exist since the upgrade to pdfbox 2.n.n
- Removed the hacky reflective use of the protected
ClassLoader.findLoadedClass method. This removes the following warnings
displayed when running with JDK 9 or JDK 10:

     [java] WARNING: Illegal reflective access by
net.yacy.document.parser.pdfParser$ResourceCleaner (file:<path>) to
method java.lang.ClassLoader.findLoadedClass(java.lang.String)
     [java] WARNING: Please consider reporting this to the maintainers
of net.yacy.document.parser.pdfParser$ResourceCleaner
     [java] WARNING: Use --illegal-access=warn to enable warnings of
further illegal reflective access operations
     [java] WARNING: All illegal access operations will be denied in a
future release

Crawling thousands of PDF documents from various sources after applying
these modifications revealed no new memory leak related to pdfbox
(measurements done with JVisualVM).
pull/220/head
luccioman 7 years ago
parent 685122363d
commit 54fbe166ba

@ -32,7 +32,6 @@ import java.io.FileInputStream;
import java.io.FileNotFoundException; import java.io.FileNotFoundException;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.lang.reflect.Method;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collection; import java.util.Collection;
@ -40,6 +39,7 @@ import java.util.Date;
import java.util.HashSet; import java.util.HashSet;
import java.util.List; import java.util.List;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.io.MemoryUsageSetting; import org.apache.pdfbox.io.MemoryUsageSetting;
import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation; import org.apache.pdfbox.pdmodel.PDDocumentInformation;
@ -81,10 +81,6 @@ public class pdfParser extends AbstractParser implements Parser {
this.SUPPORTED_MIME_TYPES.add("text/x-pdf"); this.SUPPORTED_MIME_TYPES.add("text/x-pdf");
} }
static {
clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes(); // must be called here to get that into the class loader; it will block other threads otherwise;
}
@Override @Override
public Document[] parse( public Document[] parse(
final DigestURL location, final DigestURL location,
@ -249,18 +245,9 @@ public class pdfParser extends AbstractParser implements Parser {
try {pdfDoc.close();} catch (final Throwable e) {} try {pdfDoc.close();} catch (final Throwable e) {}
} }
// clear resources in pdfbox. they say that is resolved but it's not. see: // clear cached resources in pdfbox.
// https://issues.apache.org/jira/browse/PDFBOX-313
// https://issues.apache.org/jira/browse/PDFBOX-351
// https://issues.apache.org/jira/browse/PDFBOX-441
// the pdfbox still generates enormeous number of object allocations and don't delete these
// the following Object are statically stored and never flushed:
// COSFloat, COSArray, COSInteger, COSObjectKey, COSObject, COSDictionary,
// COSStream, COSString, COSName, COSDocument, COSInteger[], COSNull
// the great number of these objects can easily be seen in Java Visual VM
// we try to get this shit out of the memory here by forced clear calls, hope the best the rubbish gets out.
pdfDoc = null; pdfDoc = null;
clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes(); clearPdfBoxCaches();
return result; return result;
} }
@ -295,55 +282,35 @@ public class pdfParser extends AbstractParser implements Parser {
return linkCollections; return linkCollections;
} }
public static void clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes() { /**
// thank you very much, PDFParser hackers, this font cache will occupy >80MB RAM for a single pdf and then stays forever * Clean up cache resources allocated by PDFBox that would otherwise not be released.
// AND I DO NOT EVEN NEED A FONT HERE TO PARSE THE TEXT! */
// Don't be so ignorant, just google once "PDFParser OutOfMemoryError" to feel the pain. public static void clearPdfBoxCaches() {
ResourceCleaner cl = new ResourceCleaner(); /*
cl.clearClassResources("org.apache.pdfbox.cos.COSName"); * Prior to pdfbox 2.0.0 font cache occupied > 80MB RAM for a single pdf and
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDFont"); * then stayed forever (detected in YaCy with pdfbox version 1.2.1). The
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDType1Font"); * situation is now from far better, but one (unnecessary?) cache structure in
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDTrueTypeFont"); * the COSName class still needs to be explicitely cleared.
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDType0Font"); */
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDType1AfmPfbFont");
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDType3Font"); // History of related issues :
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDType1CFont"); // http://markmail.org/thread/quk5odee4hbsauhu
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDCIDFont"); // https://issues.apache.org/jira/browse/PDFBOX-313
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDCIDFontType0Font"); // https://issues.apache.org/jira/browse/PDFBOX-351
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDCIDFontType2Font"); // https://issues.apache.org/jira/browse/PDFBOX-441
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDMMType1Font"); // https://issues.apache.org/jira/browse/PDFBOX-2200
cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDSimpleFont"); // https://issues.apache.org/jira/browse/PDFBOX-2149
COSName.clearResources();
/*
* Prior to PDFBox 2.0.0, clearResources() function had to be called on the
* org.apache.pdfbox.pdmodel.font.PDFont class and its children. After version
* 2.0.0, there is no more such a function in PDFont class as font cache is
* handled differently and hopefully more properly.
*/
} }
@SuppressWarnings({ "unchecked", "rawtypes" })
private static class ResourceCleaner {
Method findLoadedClass;
private ClassLoader sys;
public ResourceCleaner() {
try {
this.findLoadedClass = ClassLoader.class.getDeclaredMethod("findLoadedClass", new Class[] { String.class });
this.findLoadedClass.setAccessible(true);
this.sys = ClassLoader.getSystemClassLoader();
} catch (Throwable e) {
e.printStackTrace();
this.findLoadedClass = null;
this.sys = null;
}
}
public void clearClassResources(String name) {
if (this.findLoadedClass == null) return;
try {
Object pdfparserpainclass = this.findLoadedClass.invoke(this.sys, name);
if (pdfparserpainclass != null) {
Method clearResources = ((Class) pdfparserpainclass).getDeclaredMethod("clearResources", new Class[] {});
if (clearResources != null) clearResources.invoke(null);
}
} catch (Throwable e) {
//e.printStackTrace();
}
}
}
/** /**
* test * test
* @param args * @param args

@ -2454,7 +2454,7 @@ public final class Switchboard extends serverSwitch {
public static void clearCaches() { public static void clearCaches() {
// flush caches in used libraries // flush caches in used libraries
pdfParser.clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes(); // eats up megabytes, see http://markmail.org/thread/quk5odee4hbsauhu pdfParser.clearPdfBoxCaches();
// clear caches // clear caches
if (WordCache.sizeCommonWords() > 1000) WordCache.clearCommonWords(); if (WordCache.sizeCommonWords() > 1000) WordCache.clearCommonWords();

Loading…
Cancel
Save