upgraded pdfbox to 3.0.0

1 year ago · 5ba5fb5d23
parent c10944bd4a
commit 5ba5fb5d23
3 changed files with 46 additions and 43 deletions
--- a/ivy.xml
+++ b/ivy.xml
@ -55,7 +55,7 @@
      <dependency org="org.apache.lucene" name="lucene-queryparser" rev="8.11.2" conf="compile->master"/>
      <dependency org="org.apache.lucene" name="lucene-spatial-extras" rev="8.11.2" conf="compile->master"/>
      <dependency org="org.apache.lucene" name="lucene-suggest" rev="8.11.2"/>
-      <dependency org="org.apache.pdfbox" name="pdfbox" rev="2.0.29"  />
+      <dependency org="org.apache.pdfbox" name="pdfbox" rev="3.0.0"  />
      <dependency org="org.apache.poi" name="poi" rev="3.17"  />
      <dependency org="org.apache.poi" name="poi-scratchpad" rev="3.17"  />
      <dependency org="org.apache.solr" name="solr-core" rev="8.11.2" conf="compile->master"/>
--- a/source/net/yacy/cora/util/Html2Image.java
+++ b/source/net/yacy/cora/util/Html2Image.java
@ -43,6 +43,7 @@ import javax.swing.text.html.HTMLDocument;
 import javax.swing.text.html.HTMLEditorKit;
 import javax.swing.text.html.ImageView;

+import org.apache.pdfbox.Loader;
 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.rendering.ImageType;
 import org.apache.pdfbox.rendering.PDFRenderer;
@ -211,7 +212,7 @@ public class Html2Image {
     *                    call termination. Beyond this limit the process is killed.
     * @return true when the destination file was successfully written
     */
-    public static boolean writeWkhtmltopdf(String url, String proxy, String userAgent, final String acceptLanguage, final File destination, final long maxSeconds) {
+    public static boolean writeWkhtmltopdf(final String url, final String proxy, final String userAgent, final String acceptLanguage, final File destination, final long maxSeconds) {
        boolean success = false;
        for (final boolean ignoreErrors: new boolean[]{false, true}) {
            success = writeWkhtmltopdfInternal(url, proxy, destination, userAgent, acceptLanguage, ignoreErrors, maxSeconds);
@ -352,7 +353,7 @@ public class Html2Image {

        // convert pdf to jpg using internal pdfbox capability
        if (convertCmd == null) {
-            try (final PDDocument pdoc = PDDocument.load(pdf);) {
+            try (final PDDocument pdoc = Loader.loadPDF(pdf);) {

                final BufferedImage bi = new PDFRenderer(pdoc).renderImageWithDPI(0, density, ImageType.RGB);

@ -432,7 +433,7 @@ public class Html2Image {
     * @param size
     * @throws IOException
     */
-    public static void writeSwingImage(String url, Dimension size, File destination) throws IOException {
+    public static void writeSwingImage(final String url, final Dimension size, final File destination) throws IOException {

        // set up a pane for rendering
        final JEditorPane htmlPane = new JEditorPane();
@ -453,7 +454,7 @@ public class Html2Image {
            public ViewFactory getViewFactory() {
                return new HTMLFactory() {
                    @Override
-                    public View create(Element elem) {
+                    public View create(final Element elem) {
                        final View view = super.create(elem);
                        if (view instanceof ImageView) {
                            ((ImageView) view).setLoadsSynchronously(true);
@ -467,7 +468,7 @@ public class Html2Image {
        htmlPane.setContentType("text/html");
        htmlPane.addPropertyChangeListener(new PropertyChangeListener() {
            @Override
-            public void propertyChange(PropertyChangeEvent evt) {
+            public void propertyChange(final PropertyChangeEvent evt) {
            }
        });

@ -501,7 +502,7 @@ public class Html2Image {
     * 	</li>
     * </ol>
     */
-    public static void main(String[] args) {
+    public static void main(final String[] args) {
        final String usageMessage = "Usage : java " + Html2Image.class.getName()
                + " <url> <target-file[.pdf|.jpg|.png]> [wkhtmltopdf|swing]";
        int exitStatus = 0;
--- a/source/net/yacy/document/parser/pdfParser.java
+++ b/source/net/yacy/document/parser/pdfParser.java
@ -39,8 +39,10 @@ import java.util.Date;
 import java.util.HashSet;
 import java.util.List;

+import org.apache.pdfbox.Loader;
 import org.apache.pdfbox.cos.COSName;
-import org.apache.pdfbox.io.MemoryUsageSetting;
+import org.apache.pdfbox.io.RandomAccessRead;
+import org.apache.pdfbox.io.RandomAccessReadBuffer;
 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.pdmodel.PDDocumentInformation;
 import org.apache.pdfbox.pdmodel.PDPage;
@ -69,7 +71,7 @@ public class pdfParser extends AbstractParser implements Parser {

    public static boolean individualPages = false;
    public static String individualPagePropertyname = "page";
-    
+
    public pdfParser() {
        super("Acrobat Portable Document Parser");
        this.SUPPORTED_EXTENSIONS.add("pdf");
@ -86,7 +88,7 @@ public class pdfParser extends AbstractParser implements Parser {
            final DigestURL location,
            final String mimeType,
            final String charset,
-            final VocabularyScraper scraper, 
+            final VocabularyScraper scraper,
            final int timezoneOffset,
            final InputStream source) throws Parser.Failure, InterruptedException {

@ -98,8 +100,8 @@ public class pdfParser extends AbstractParser implements Parser {
        PDDocument pdfDoc;
        try {
            Thread.currentThread().setPriority(Thread.MIN_PRIORITY); // the pdfparser is a big pain
-            MemoryUsageSetting mus = MemoryUsageSetting.setupMixed(200*1024*1024);
-            pdfDoc = PDDocument.load(source, mus);
+            final RandomAccessRead readBuffer = new RandomAccessReadBuffer(source);
+            pdfDoc = Loader.loadPDF(readBuffer);
        } catch (final IOException e) {
            throw new Parser.Failure(e.getMessage(), location);
        } finally {
@ -141,34 +143,34 @@ public class pdfParser extends AbstractParser implements Parser {
        if (docKeywordStr != null) {
            docKeywords = docKeywordStr.split(" |,");
        }
-        
+
        Document[] result = null;
        try {
            // get the links
        	final List<Collection<AnchorURL>> pdflinks = extractPdfLinks(pdfDoc);
-            
+
            // get the fulltext (either per document or for each page)
            final PDFTextStripper stripper = new PDFTextStripper(/*StandardCharsets.UTF_8.name()*/);

            if (individualPages) {
                // this is a hack which stores individual pages of the source pdf into individual index documents
                // the new documents will get a virtual link with a post argument page=X appended to the original url
-                
+
                // collect text
-                int pagecount = pdfDoc.getNumberOfPages();
-                String[] pages = new String[pagecount];
+                final int pagecount = pdfDoc.getNumberOfPages();
+                final String[] pages = new String[pagecount];
                for (int page = 1; page <= pagecount; page++) {
                    stripper.setStartPage(page);
                    stripper.setEndPage(page);
                    pages[page - 1] = stripper.getText(pdfDoc);
                    //System.out.println("PAGE " + page + ": " + pages[page - 1]);
                }
-                
+
                // create individual documents for each page
                assert pages.length == pdflinks.size() : "pages.length = " + pages.length + ", pdflinks.length = " + pdflinks.size();
                result = new Document[Math.min(pages.length, pdflinks.size())];
-                String loc = location.toNormalform(true);
-                for (int page = 0; page < result.length; page++) {                    
+                final String loc = location.toNormalform(true);
+                for (int page = 0; page < result.length; page++) {
                    result[page] = new Document(
                            new AnchorURL(loc + (loc.indexOf('?') > 0 ? '&' : '?') + individualPagePropertyname + '=' + (page + 1)), // these are virtual new pages; we cannot combine them with '#' as that would be removed when computing the urlhash
                            mimeType,
@ -216,9 +218,9 @@ public class pdfParser extends AbstractParser implements Parser {
                    contentBytes = writer.getBytes(); // get final text before closing writer
                    writer.close(); // free writer resources
                }
-                
-                Collection<AnchorURL> pdflinksCombined = new HashSet<AnchorURL>();
-                for (Collection<AnchorURL> pdflinksx: pdflinks) if (pdflinksx != null) pdflinksCombined.addAll(pdflinksx);
+
+                final Collection<AnchorURL> pdflinksCombined = new HashSet<>();
+                for (final Collection<AnchorURL> pdflinksx: pdflinks) if (pdflinksx != null) pdflinksCombined.addAll(pdflinksx);
                result = new Document[]{new Document(
                        location,
                        mimeType,
@ -238,7 +240,7 @@ public class pdfParser extends AbstractParser implements Parser {
                        null,
                        false,
                        docDate)};
-            }         
+            }
        } catch (final Throwable e) {
            //throw new Parser.Failure(e.getMessage(), location);
        } finally {
@ -248,7 +250,7 @@ public class pdfParser extends AbstractParser implements Parser {
        // clear cached resources in pdfbox.
        pdfDoc = null;
        clearPdfBoxCaches();
-        
+
        return result;
    }

@ -258,25 +260,25 @@ public class pdfParser extends AbstractParser implements Parser {
     * @return all detected links
     */
    private List<Collection<AnchorURL>> extractPdfLinks(final PDDocument pdf) {
-        List<Collection<AnchorURL>> linkCollections = new ArrayList<>(pdf.getNumberOfPages());
-        for (PDPage page : pdf.getPages()) {
-            final Collection<AnchorURL> pdflinks = new ArrayList<AnchorURL>();
+        final List<Collection<AnchorURL>> linkCollections = new ArrayList<>(pdf.getNumberOfPages());
+        for (final PDPage page : pdf.getPages()) {
+            final Collection<AnchorURL> pdflinks = new ArrayList<>();
            try {
-                List<PDAnnotation> annotations = page.getAnnotations();
+                final List<PDAnnotation> annotations = page.getAnnotations();
                if (annotations != null) {
-                    for (PDAnnotation pdfannotation : annotations) {
+                    for (final PDAnnotation pdfannotation : annotations) {
                        if (pdfannotation instanceof PDAnnotationLink) {
-                            PDAction link = ((PDAnnotationLink)pdfannotation).getAction();
+                            final PDAction link = ((PDAnnotationLink)pdfannotation).getAction();
                            if (link != null && link instanceof PDActionURI) {
-                                PDActionURI pdflinkuri = (PDActionURI) link;
-                                String uristr = pdflinkuri.getURI();
-                                AnchorURL url = new AnchorURL(uristr);
+                                final PDActionURI pdflinkuri = (PDActionURI) link;
+                                final String uristr = pdflinkuri.getURI();
+                                final AnchorURL url = new AnchorURL(uristr);
                                pdflinks.add(url);
                            }
                        }
                    }
                }
-            } catch (IOException ex) {}
+            } catch (final IOException ex) {}
            linkCollections.add(pdflinks);
        }
        return linkCollections;
@ -292,17 +294,17 @@ public class pdfParser extends AbstractParser implements Parser {
 		 * situation is now far better, but one (unnecessary?) cache structure in
 		 * the COSName class still needs to be explicitly cleared.
 		 */
-    	
+
 		// History of related issues :
    	// http://markmail.org/thread/quk5odee4hbsauhu
-		// https://issues.apache.org/jira/browse/PDFBOX-313 
+		// https://issues.apache.org/jira/browse/PDFBOX-313
 		// https://issues.apache.org/jira/browse/PDFBOX-351
 		// https://issues.apache.org/jira/browse/PDFBOX-441
    	// https://issues.apache.org/jira/browse/PDFBOX-2200
    	// https://issues.apache.org/jira/browse/PDFBOX-2149
-    	
+
        COSName.clearResources();
-        
+
 		/*
 		 * Prior to PDFBox 2.0.0, clearResources() function had to be called on the
 		 * org.apache.pdfbox.pdmodel.font.PDFont class and its children. After version
@ -327,7 +329,7 @@ public class pdfParser extends AbstractParser implements Parser {
                // parse
                final AbstractParser parser = new pdfParser();
                Document document = null;
-                FileInputStream inStream = null; 
+                FileInputStream inStream = null;
                try {
                	inStream = new FileInputStream(pdfFile);
                    document = Document.mergeDocuments(null, "application/pdf", parser.parse(null, "application/pdf", null, new VocabularyScraper(), 0, inStream));
@ -345,7 +347,7 @@ public class pdfParser extends AbstractParser implements Parser {
                	if(inStream != null) {
                		try {
                			inStream.close();
-                		} catch(IOException e) {
+                		} catch(final IOException e) {
                			System.err.println("Could not close input stream on file " + pdfFile);
                		}
                	}
@ -359,7 +361,7 @@ public class pdfParser extends AbstractParser implements Parser {
                    System.out.println("\t!!!Parsing without result!!!");
                } else {
                    System.out.println("\tParsed text with " + document.getTextLength() + " chars of text and " + document.getAnchors().size() + " anchors");
-                    InputStream textStream = document.getTextStream();
+                    final InputStream textStream = document.getTextStream();
                    try {
                        // write file
                        FileUtils.copy(textStream, new File("parsedPdf.txt"));
@ -372,7 +374,7 @@ public class pdfParser extends AbstractParser implements Parser {
                        		/* textStream can be a FileInputStream : we must close it to ensure releasing system resource */
                        		textStream.close();
                        	}
-						} catch (IOException e) {
+						} catch (final IOException e) {
 							ConcurrentLog.warn("PDFPARSER", "Could not close text input stream");
 						}
                    }