From 5ba5fb5d23c98992fc0b47279ea1e4a14cc434c6 Mon Sep 17 00:00:00 2001
From: Michael Peter Christen
Date: Fri, 27 Oct 2023 12:05:24 +0200
Subject: [PATCH] upgraded pdfbox to 3.0.0

---
 ivy.xml                                     |  2 +-
 source/net/yacy/cora/util/Html2Image.java   | 13 ++--
 .../net/yacy/document/parser/pdfParser.java | 74 ++++++++++---------
 3 files changed, 46 insertions(+), 43 deletions(-)

diff --git a/ivy.xml b/ivy.xml
index 26fe7504d..c5f13b585 100644
--- a/ivy.xml
+++ b/ivy.xml
@@ -55,7 +55,7 @@
-
+
diff --git a/source/net/yacy/cora/util/Html2Image.java b/source/net/yacy/cora/util/Html2Image.java
index 10bea0e2b..e10bca89e 100644
--- a/source/net/yacy/cora/util/Html2Image.java
+++ b/source/net/yacy/cora/util/Html2Image.java
@@ -43,6 +43,7 @@ import javax.swing.text.html.HTMLDocument;
 import javax.swing.text.html.HTMLEditorKit;
 import javax.swing.text.html.ImageView;
 
+import org.apache.pdfbox.Loader;
 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.rendering.ImageType;
 import org.apache.pdfbox.rendering.PDFRenderer;
@@ -211,7 +212,7 @@ public class Html2Image {
      * call termination. Beyond this limit the process is killed.
      * @return true when the destination file was successfully written
      */
-    public static boolean writeWkhtmltopdf(String url, String proxy, String userAgent, final String acceptLanguage, final File destination, final long maxSeconds) {
+    public static boolean writeWkhtmltopdf(final String url, final String proxy, final String userAgent, final String acceptLanguage, final File destination, final long maxSeconds) {
         boolean success = false;
         for (final boolean ignoreErrors: new boolean[]{false, true}) {
             success = writeWkhtmltopdfInternal(url, proxy, destination, userAgent, acceptLanguage, ignoreErrors, maxSeconds);
@@ -352,7 +353,7 @@ public class Html2Image {
 
         // convert pdf to jpg using internal pdfbox capability
         if (convertCmd == null) {
-            try (final PDDocument pdoc = PDDocument.load(pdf);) {
+            try (final PDDocument pdoc = Loader.loadPDF(pdf);) {
                 final BufferedImage bi = new PDFRenderer(pdoc).renderImageWithDPI(0, density, ImageType.RGB);
@@ -432,7 +433,7 @@ public class Html2Image {
      * @param size
      * @throws IOException
      */
-    public static void writeSwingImage(String url, Dimension size, File destination) throws IOException {
+    public static void writeSwingImage(final String url, final Dimension size, final File destination) throws IOException {
 
         // set up a pane for rendering
         final JEditorPane htmlPane = new JEditorPane();
@@ -453,7 +454,7 @@ public class Html2Image {
             public ViewFactory getViewFactory() {
                 return new HTMLFactory() {
                     @Override
-                    public View create(Element elem) {
+                    public View create(final Element elem) {
                         final View view = super.create(elem);
                         if (view instanceof ImageView) {
                             ((ImageView) view).setLoadsSynchronously(true);
@@ -467,7 +468,7 @@ public class Html2Image {
         htmlPane.setContentType("text/html");
         htmlPane.addPropertyChangeListener(new PropertyChangeListener() {
             @Override
-            public void propertyChange(PropertyChangeEvent evt) {
+            public void propertyChange(final PropertyChangeEvent evt) {
 
             }
         });
@@ -501,7 +502,7 @@ public class Html2Image {
      *
      *
      */
-    public static void main(String[] args) {
+    public static void main(final String[] args) {
         final String usageMessage = "Usage : java " + Html2Image.class.getName() + " [wkhtmltopdf|swing]";
         int exitStatus = 0;
diff --git a/source/net/yacy/document/parser/pdfParser.java b/source/net/yacy/document/parser/pdfParser.java
index bd3c3adfb..0ad6b2248 100644
--- a/source/net/yacy/document/parser/pdfParser.java
+++ b/source/net/yacy/document/parser/pdfParser.java
@@ -39,8 +39,10 @@ import java.util.Date;
 import java.util.HashSet;
 import java.util.List;
 
+import org.apache.pdfbox.Loader;
 import org.apache.pdfbox.cos.COSName;
-import org.apache.pdfbox.io.MemoryUsageSetting;
+import org.apache.pdfbox.io.RandomAccessRead;
+import org.apache.pdfbox.io.RandomAccessReadBuffer;
 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.pdmodel.PDDocumentInformation;
 import org.apache.pdfbox.pdmodel.PDPage;
@@ -69,7 +71,7 @@ public class pdfParser extends AbstractParser implements Parser {
 
     public static boolean individualPages = false;
     public static String individualPagePropertyname = "page";
-    
+
     public pdfParser() {
         super("Acrobat Portable Document Parser");
         this.SUPPORTED_EXTENSIONS.add("pdf");
@@ -86,7 +88,7 @@ public class pdfParser extends AbstractParser implements Parser {
             final DigestURL location,
             final String mimeType,
             final String charset,
-            final VocabularyScraper scraper, 
+            final VocabularyScraper scraper,
             final int timezoneOffset,
             final InputStream source) throws Parser.Failure, InterruptedException {
@@ -98,8 +100,8 @@ public class pdfParser extends AbstractParser implements Parser {
         PDDocument pdfDoc;
         try {
             Thread.currentThread().setPriority(Thread.MIN_PRIORITY); // the pdfparser is a big pain
-            MemoryUsageSetting mus = MemoryUsageSetting.setupMixed(200*1024*1024);
-            pdfDoc = PDDocument.load(source, mus);
+            final RandomAccessRead readBuffer = new RandomAccessReadBuffer(source);
+            pdfDoc = Loader.loadPDF(readBuffer);
         } catch (final IOException e) {
             throw new Parser.Failure(e.getMessage(), location);
         } finally {
@@ -141,34 +143,34 @@ public class pdfParser extends AbstractParser implements Parser {
         if (docKeywordStr != null) {
             docKeywords = docKeywordStr.split(" |,");
         }
-        
+
         Document[] result = null;
         try {
             // get the links
             final List<Collection<AnchorURL>> pdflinks = extractPdfLinks(pdfDoc);
-            
+
             // get the fulltext (either per document or for each page)
             final PDFTextStripper stripper = new PDFTextStripper(/*StandardCharsets.UTF_8.name()*/);
             if (individualPages) {
                 // this is a hack which stores individual pages of the source pdf into individual index documents
                 // the new documents will get a virtual link with a post argument page=X appended to the original url
-                
+
                 // collect text
-                int pagecount = pdfDoc.getNumberOfPages();
-                String[] pages = new String[pagecount];
+                final int pagecount = pdfDoc.getNumberOfPages();
+                final String[] pages = new String[pagecount];
                 for (int page = 1; page <= pagecount; page++) {
                     stripper.setStartPage(page);
                     stripper.setEndPage(page);
                     pages[page - 1] = stripper.getText(pdfDoc);
                     //System.out.println("PAGE " + page + ": " + pages[page - 1]);
                 }
-                
+
                 // create individual documents for each page
                 assert pages.length == pdflinks.size() : "pages.length = " + pages.length + ", pdflinks.length = " + pdflinks.size();
                 result = new Document[Math.min(pages.length, pdflinks.size())];
-                String loc = location.toNormalform(true);
-                for (int page = 0; page < result.length; page++) {
+                final String loc = location.toNormalform(true);
+                for (int page = 0; page < result.length; page++) {
                     result[page] = new Document(
                            new AnchorURL(loc + (loc.indexOf('?') > 0 ? '&' : '?') + individualPagePropertyname + '=' + (page + 1)), // these are virtual new pages; we cannot combine them with '#' as that would be removed when computing the urlhash
                            mimeType,
@@ -216,9 +218,9 @@ public class pdfParser extends AbstractParser implements Parser {
                     contentBytes = writer.getBytes(); // get final text before closing writer
                     writer.close(); // free writer resources
                 }
-                
-                Collection<AnchorURL> pdflinksCombined = new HashSet<AnchorURL>();
-                for (Collection<AnchorURL> pdflinksx: pdflinks) if (pdflinksx != null) pdflinksCombined.addAll(pdflinksx);
+
+                final Collection<AnchorURL> pdflinksCombined = new HashSet<>();
+                for (final Collection<AnchorURL> pdflinksx: pdflinks) if (pdflinksx != null) pdflinksCombined.addAll(pdflinksx);
                 result = new Document[]{new Document(
                         location,
                         mimeType,
@@ -238,7 +240,7 @@ public class pdfParser extends AbstractParser implements Parser {
                         null,
                         false,
                         docDate)};
-            }            
+            }
         } catch (final Throwable e) {
             //throw new Parser.Failure(e.getMessage(), location);
         } finally {
@@ -248,7 +250,7 @@ public class pdfParser extends AbstractParser implements Parser {
         // clear cached resources in pdfbox.
         pdfDoc = null;
         clearPdfBoxCaches();
-        
+
         return result;
     }
@@ -258,25 +260,25 @@ public class pdfParser extends AbstractParser implements Parser {
      * @return all detected links
      */
     private List<Collection<AnchorURL>> extractPdfLinks(final PDDocument pdf) {
-        List<Collection<AnchorURL>> linkCollections = new ArrayList<>(pdf.getNumberOfPages());
-        for (PDPage page : pdf.getPages()) {
-            final Collection<AnchorURL> pdflinks = new ArrayList<AnchorURL>();
+        final List<Collection<AnchorURL>> linkCollections = new ArrayList<>(pdf.getNumberOfPages());
+        for (final PDPage page : pdf.getPages()) {
+            final Collection<AnchorURL> pdflinks = new ArrayList<>();
             try {
-                List<PDAnnotation> annotations = page.getAnnotations();
+                final List<PDAnnotation> annotations = page.getAnnotations();
                 if (annotations != null) {
-                    for (PDAnnotation pdfannotation : annotations) {
+                    for (final PDAnnotation pdfannotation : annotations) {
                         if (pdfannotation instanceof PDAnnotationLink) {
-                            PDAction link = ((PDAnnotationLink)pdfannotation).getAction();
+                            final PDAction link = ((PDAnnotationLink)pdfannotation).getAction();
                             if (link != null && link instanceof PDActionURI) {
-                                PDActionURI pdflinkuri = (PDActionURI) link;
-                                String uristr = pdflinkuri.getURI();
-                                AnchorURL url = new AnchorURL(uristr);
+                                final PDActionURI pdflinkuri = (PDActionURI) link;
+                                final String uristr = pdflinkuri.getURI();
+                                final AnchorURL url = new AnchorURL(uristr);
                                 pdflinks.add(url);
                             }
                         }
                     }
                 }
-            } catch (IOException ex) {}
+            } catch (final IOException ex) {}
             linkCollections.add(pdflinks);
         }
         return linkCollections;
     }
@@ -292,17 +294,17 @@ public class pdfParser extends AbstractParser implements Parser {
         * situation is now from far better, but one (unnecessary?) cache structure in
         * the COSName class still needs to be explicitely cleared.
         */
-        
+
        // History of related issues :
        // http://markmail.org/thread/quk5odee4hbsauhu
-       // https://issues.apache.org/jira/browse/PDFBOX-313 
+       // https://issues.apache.org/jira/browse/PDFBOX-313
        // https://issues.apache.org/jira/browse/PDFBOX-351
        // https://issues.apache.org/jira/browse/PDFBOX-441
        // https://issues.apache.org/jira/browse/PDFBOX-2200
        // https://issues.apache.org/jira/browse/PDFBOX-2149
-        
+
        COSName.clearResources();
-        
+
        /*
         * Prior to PDFBox 2.0.0, clearResources() function had to be called on the
         * org.apache.pdfbox.pdmodel.font.PDFont class and its children. After version
@@ -327,7 +329,7 @@ public class pdfParser extends AbstractParser implements Parser {
         // parse
         final AbstractParser parser = new pdfParser();
         Document document = null;
-        FileInputStream inStream = null;        
+        FileInputStream inStream = null;
         try {
             inStream = new FileInputStream(pdfFile);
             document = Document.mergeDocuments(null, "application/pdf", parser.parse(null, "application/pdf", null, new VocabularyScraper(), 0, inStream));
@@ -345,7 +347,7 @@ public class pdfParser extends AbstractParser implements Parser {
             if(inStream != null) {
                 try {
                     inStream.close();
-                } catch(IOException e) {
+                } catch(final IOException e) {
                     System.err.println("Could not close input stream on file " + pdfFile);
                 }
             }
@@ -359,7 +361,7 @@ public class pdfParser extends AbstractParser implements Parser {
             System.out.println("\t!!!Parsing without result!!!");
         } else {
             System.out.println("\tParsed text with " + document.getTextLength() + " chars of text and " + document.getAnchors().size() + " anchors");
-            InputStream textStream = document.getTextStream();
+            final InputStream textStream = document.getTextStream();
             try {
                 // write file
                 FileUtils.copy(textStream, new File("parsedPdf.txt"));
@@ -372,7 +374,7 @@ public class pdfParser extends AbstractParser implements Parser {
                     /* textStream can be a FileInputStream : we must close it to ensure releasing system resource */
                     textStream.close();
                 }
-            } catch (IOException e) {
+            } catch (final IOException e) {
                 ConcurrentLog.warn("PDFPARSER", "Could not close text input stream");
             }
         }
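
Note for reviewers: the core of this patch is the PDFBox 2.x to 3.x loading migration. `PDDocument.load(InputStream, MemoryUsageSetting)` is no longer available in 3.0.0, so the InputStream is wrapped in a `RandomAccessReadBuffer` and handed to `Loader.loadPDF(...)`; the 200 MB mixed memory setting from the 2.x call is simply dropped here, since 3.x manages stream caching through a different mechanism. The sketch below is a minimal, self-contained illustration of that pattern outside the YaCy code base; the class name, file path, and text-extraction step are illustrative only, not part of this patch.

```java
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import org.apache.pdfbox.Loader;
import org.apache.pdfbox.io.RandomAccessRead;
import org.apache.pdfbox.io.RandomAccessReadBuffer;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;

/** Illustrative sketch of the PDFBox 2.x -> 3.x loading migration; not part of this patch. */
public class Pdfbox3LoadSketch {

    public static void main(final String[] args) throws IOException {
        final File pdfFile = new File(args[0]); // path to any PDF, e.g. "sample.pdf"

        // PDFBox 2.x (removed in 3.0.0):
        //   pdfDoc = PDDocument.load(source, MemoryUsageSetting.setupMixed(200*1024*1024));
        // PDFBox 3.x: wrap the InputStream in a RandomAccessRead and load it via Loader.
        try (final InputStream source = new FileInputStream(pdfFile);
             final RandomAccessRead readBuffer = new RandomAccessReadBuffer(source);
             final PDDocument pdfDoc = Loader.loadPDF(readBuffer)) {

            // Text extraction itself is unchanged between the two major versions.
            final PDFTextStripper stripper = new PDFTextStripper();
            System.out.println(stripper.getText(pdfDoc));
        }
    }
}
```

The File-based overload works the same way, which is why Html2Image only needs `PDDocument.load(pdf)` replaced by `Loader.loadPDF(pdf)` with no other changes.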