upgraded pdfbox to 3.0.0

pull/607/head
Michael Peter Christen 1 year ago
parent c10944bd4a
commit 5ba5fb5d23

@ -55,7 +55,7 @@
<dependency org="org.apache.lucene" name="lucene-queryparser" rev="8.11.2" conf="compile->master"/> <dependency org="org.apache.lucene" name="lucene-queryparser" rev="8.11.2" conf="compile->master"/>
<dependency org="org.apache.lucene" name="lucene-spatial-extras" rev="8.11.2" conf="compile->master"/> <dependency org="org.apache.lucene" name="lucene-spatial-extras" rev="8.11.2" conf="compile->master"/>
<dependency org="org.apache.lucene" name="lucene-suggest" rev="8.11.2"/> <dependency org="org.apache.lucene" name="lucene-suggest" rev="8.11.2"/>
<dependency org="org.apache.pdfbox" name="pdfbox" rev="2.0.29" /> <dependency org="org.apache.pdfbox" name="pdfbox" rev="3.0.0" />
<dependency org="org.apache.poi" name="poi" rev="3.17" /> <dependency org="org.apache.poi" name="poi" rev="3.17" />
<dependency org="org.apache.poi" name="poi-scratchpad" rev="3.17" /> <dependency org="org.apache.poi" name="poi-scratchpad" rev="3.17" />
<dependency org="org.apache.solr" name="solr-core" rev="8.11.2" conf="compile->master"/> <dependency org="org.apache.solr" name="solr-core" rev="8.11.2" conf="compile->master"/>

@ -43,6 +43,7 @@ import javax.swing.text.html.HTMLDocument;
import javax.swing.text.html.HTMLEditorKit; import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.ImageView; import javax.swing.text.html.ImageView;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.ImageType; import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.rendering.PDFRenderer; import org.apache.pdfbox.rendering.PDFRenderer;
@ -211,7 +212,7 @@ public class Html2Image {
* call termination. Beyond this limit the process is killed. * call termination. Beyond this limit the process is killed.
* @return true when the destination file was successfully written * @return true when the destination file was successfully written
*/ */
public static boolean writeWkhtmltopdf(String url, String proxy, String userAgent, final String acceptLanguage, final File destination, final long maxSeconds) { public static boolean writeWkhtmltopdf(final String url, final String proxy, final String userAgent, final String acceptLanguage, final File destination, final long maxSeconds) {
boolean success = false; boolean success = false;
for (final boolean ignoreErrors: new boolean[]{false, true}) { for (final boolean ignoreErrors: new boolean[]{false, true}) {
success = writeWkhtmltopdfInternal(url, proxy, destination, userAgent, acceptLanguage, ignoreErrors, maxSeconds); success = writeWkhtmltopdfInternal(url, proxy, destination, userAgent, acceptLanguage, ignoreErrors, maxSeconds);
@ -352,7 +353,7 @@ public class Html2Image {
// convert pdf to jpg using internal pdfbox capability // convert pdf to jpg using internal pdfbox capability
if (convertCmd == null) { if (convertCmd == null) {
try (final PDDocument pdoc = PDDocument.load(pdf);) { try (final PDDocument pdoc = Loader.loadPDF(pdf);) {
final BufferedImage bi = new PDFRenderer(pdoc).renderImageWithDPI(0, density, ImageType.RGB); final BufferedImage bi = new PDFRenderer(pdoc).renderImageWithDPI(0, density, ImageType.RGB);
@ -432,7 +433,7 @@ public class Html2Image {
* @param size * @param size
* @throws IOException * @throws IOException
*/ */
public static void writeSwingImage(String url, Dimension size, File destination) throws IOException { public static void writeSwingImage(final String url, final Dimension size, final File destination) throws IOException {
// set up a pane for rendering // set up a pane for rendering
final JEditorPane htmlPane = new JEditorPane(); final JEditorPane htmlPane = new JEditorPane();
@ -453,7 +454,7 @@ public class Html2Image {
public ViewFactory getViewFactory() { public ViewFactory getViewFactory() {
return new HTMLFactory() { return new HTMLFactory() {
@Override @Override
public View create(Element elem) { public View create(final Element elem) {
final View view = super.create(elem); final View view = super.create(elem);
if (view instanceof ImageView) { if (view instanceof ImageView) {
((ImageView) view).setLoadsSynchronously(true); ((ImageView) view).setLoadsSynchronously(true);
@ -467,7 +468,7 @@ public class Html2Image {
htmlPane.setContentType("text/html"); htmlPane.setContentType("text/html");
htmlPane.addPropertyChangeListener(new PropertyChangeListener() { htmlPane.addPropertyChangeListener(new PropertyChangeListener() {
@Override @Override
public void propertyChange(PropertyChangeEvent evt) { public void propertyChange(final PropertyChangeEvent evt) {
} }
}); });
@ -501,7 +502,7 @@ public class Html2Image {
* </li> * </li>
* </ol> * </ol>
*/ */
public static void main(String[] args) { public static void main(final String[] args) {
final String usageMessage = "Usage : java " + Html2Image.class.getName() final String usageMessage = "Usage : java " + Html2Image.class.getName()
+ " <url> <target-file[.pdf|.jpg|.png]> [wkhtmltopdf|swing]"; + " <url> <target-file[.pdf|.jpg|.png]> [wkhtmltopdf|swing]";
int exitStatus = 0; int exitStatus = 0;

@ -39,8 +39,10 @@ import java.util.Date;
import java.util.HashSet; import java.util.HashSet;
import java.util.List; import java.util.List;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.io.MemoryUsageSetting; import org.apache.pdfbox.io.RandomAccessRead;
import org.apache.pdfbox.io.RandomAccessReadBuffer;
import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation; import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDPage;
@ -69,7 +71,7 @@ public class pdfParser extends AbstractParser implements Parser {
public static boolean individualPages = false; public static boolean individualPages = false;
public static String individualPagePropertyname = "page"; public static String individualPagePropertyname = "page";
public pdfParser() { public pdfParser() {
super("Acrobat Portable Document Parser"); super("Acrobat Portable Document Parser");
this.SUPPORTED_EXTENSIONS.add("pdf"); this.SUPPORTED_EXTENSIONS.add("pdf");
@ -86,7 +88,7 @@ public class pdfParser extends AbstractParser implements Parser {
final DigestURL location, final DigestURL location,
final String mimeType, final String mimeType,
final String charset, final String charset,
final VocabularyScraper scraper, final VocabularyScraper scraper,
final int timezoneOffset, final int timezoneOffset,
final InputStream source) throws Parser.Failure, InterruptedException { final InputStream source) throws Parser.Failure, InterruptedException {
@ -98,8 +100,8 @@ public class pdfParser extends AbstractParser implements Parser {
PDDocument pdfDoc; PDDocument pdfDoc;
try { try {
Thread.currentThread().setPriority(Thread.MIN_PRIORITY); // the pdfparser is a big pain Thread.currentThread().setPriority(Thread.MIN_PRIORITY); // the pdfparser is a big pain
MemoryUsageSetting mus = MemoryUsageSetting.setupMixed(200*1024*1024); final RandomAccessRead readBuffer = new RandomAccessReadBuffer(source);
pdfDoc = PDDocument.load(source, mus); pdfDoc = Loader.loadPDF(readBuffer);
} catch (final IOException e) { } catch (final IOException e) {
throw new Parser.Failure(e.getMessage(), location); throw new Parser.Failure(e.getMessage(), location);
} finally { } finally {
@ -141,34 +143,34 @@ public class pdfParser extends AbstractParser implements Parser {
if (docKeywordStr != null) { if (docKeywordStr != null) {
docKeywords = docKeywordStr.split(" |,"); docKeywords = docKeywordStr.split(" |,");
} }
Document[] result = null; Document[] result = null;
try { try {
// get the links // get the links
final List<Collection<AnchorURL>> pdflinks = extractPdfLinks(pdfDoc); final List<Collection<AnchorURL>> pdflinks = extractPdfLinks(pdfDoc);
// get the fulltext (either per document or for each page) // get the fulltext (either per document or for each page)
final PDFTextStripper stripper = new PDFTextStripper(/*StandardCharsets.UTF_8.name()*/); final PDFTextStripper stripper = new PDFTextStripper(/*StandardCharsets.UTF_8.name()*/);
if (individualPages) { if (individualPages) {
// this is a hack which stores individual pages of the source pdf into individual index documents // this is a hack which stores individual pages of the source pdf into individual index documents
// the new documents will get a virtual link with a post argument page=X appended to the original url // the new documents will get a virtual link with a post argument page=X appended to the original url
// collect text // collect text
int pagecount = pdfDoc.getNumberOfPages(); final int pagecount = pdfDoc.getNumberOfPages();
String[] pages = new String[pagecount]; final String[] pages = new String[pagecount];
for (int page = 1; page <= pagecount; page++) { for (int page = 1; page <= pagecount; page++) {
stripper.setStartPage(page); stripper.setStartPage(page);
stripper.setEndPage(page); stripper.setEndPage(page);
pages[page - 1] = stripper.getText(pdfDoc); pages[page - 1] = stripper.getText(pdfDoc);
//System.out.println("PAGE " + page + ": " + pages[page - 1]); //System.out.println("PAGE " + page + ": " + pages[page - 1]);
} }
// create individual documents for each page // create individual documents for each page
assert pages.length == pdflinks.size() : "pages.length = " + pages.length + ", pdflinks.length = " + pdflinks.size(); assert pages.length == pdflinks.size() : "pages.length = " + pages.length + ", pdflinks.length = " + pdflinks.size();
result = new Document[Math.min(pages.length, pdflinks.size())]; result = new Document[Math.min(pages.length, pdflinks.size())];
String loc = location.toNormalform(true); final String loc = location.toNormalform(true);
for (int page = 0; page < result.length; page++) { for (int page = 0; page < result.length; page++) {
result[page] = new Document( result[page] = new Document(
new AnchorURL(loc + (loc.indexOf('?') > 0 ? '&' : '?') + individualPagePropertyname + '=' + (page + 1)), // these are virtual new pages; we cannot combine them with '#' as that would be removed when computing the urlhash new AnchorURL(loc + (loc.indexOf('?') > 0 ? '&' : '?') + individualPagePropertyname + '=' + (page + 1)), // these are virtual new pages; we cannot combine them with '#' as that would be removed when computing the urlhash
mimeType, mimeType,
@ -216,9 +218,9 @@ public class pdfParser extends AbstractParser implements Parser {
contentBytes = writer.getBytes(); // get final text before closing writer contentBytes = writer.getBytes(); // get final text before closing writer
writer.close(); // free writer resources writer.close(); // free writer resources
} }
Collection<AnchorURL> pdflinksCombined = new HashSet<AnchorURL>(); final Collection<AnchorURL> pdflinksCombined = new HashSet<>();
for (Collection<AnchorURL> pdflinksx: pdflinks) if (pdflinksx != null) pdflinksCombined.addAll(pdflinksx); for (final Collection<AnchorURL> pdflinksx: pdflinks) if (pdflinksx != null) pdflinksCombined.addAll(pdflinksx);
result = new Document[]{new Document( result = new Document[]{new Document(
location, location,
mimeType, mimeType,
@ -238,7 +240,7 @@ public class pdfParser extends AbstractParser implements Parser {
null, null,
false, false,
docDate)}; docDate)};
} }
} catch (final Throwable e) { } catch (final Throwable e) {
//throw new Parser.Failure(e.getMessage(), location); //throw new Parser.Failure(e.getMessage(), location);
} finally { } finally {
@ -248,7 +250,7 @@ public class pdfParser extends AbstractParser implements Parser {
// clear cached resources in pdfbox. // clear cached resources in pdfbox.
pdfDoc = null; pdfDoc = null;
clearPdfBoxCaches(); clearPdfBoxCaches();
return result; return result;
} }
@ -258,25 +260,25 @@ public class pdfParser extends AbstractParser implements Parser {
* @return all detected links * @return all detected links
*/ */
private List<Collection<AnchorURL>> extractPdfLinks(final PDDocument pdf) { private List<Collection<AnchorURL>> extractPdfLinks(final PDDocument pdf) {
List<Collection<AnchorURL>> linkCollections = new ArrayList<>(pdf.getNumberOfPages()); final List<Collection<AnchorURL>> linkCollections = new ArrayList<>(pdf.getNumberOfPages());
for (PDPage page : pdf.getPages()) { for (final PDPage page : pdf.getPages()) {
final Collection<AnchorURL> pdflinks = new ArrayList<AnchorURL>(); final Collection<AnchorURL> pdflinks = new ArrayList<>();
try { try {
List<PDAnnotation> annotations = page.getAnnotations(); final List<PDAnnotation> annotations = page.getAnnotations();
if (annotations != null) { if (annotations != null) {
for (PDAnnotation pdfannotation : annotations) { for (final PDAnnotation pdfannotation : annotations) {
if (pdfannotation instanceof PDAnnotationLink) { if (pdfannotation instanceof PDAnnotationLink) {
PDAction link = ((PDAnnotationLink)pdfannotation).getAction(); final PDAction link = ((PDAnnotationLink)pdfannotation).getAction();
if (link != null && link instanceof PDActionURI) { if (link != null && link instanceof PDActionURI) {
PDActionURI pdflinkuri = (PDActionURI) link; final PDActionURI pdflinkuri = (PDActionURI) link;
String uristr = pdflinkuri.getURI(); final String uristr = pdflinkuri.getURI();
AnchorURL url = new AnchorURL(uristr); final AnchorURL url = new AnchorURL(uristr);
pdflinks.add(url); pdflinks.add(url);
} }
} }
} }
} }
} catch (IOException ex) {} } catch (final IOException ex) {}
linkCollections.add(pdflinks); linkCollections.add(pdflinks);
} }
return linkCollections; return linkCollections;
@ -292,17 +294,17 @@ public class pdfParser extends AbstractParser implements Parser {
* situation is now far better, but one (unnecessary?) cache structure in * situation is now far better, but one (unnecessary?) cache structure in
* the COSName class still needs to be explicitly cleared. * the COSName class still needs to be explicitly cleared.
*/ */
// History of related issues : // History of related issues :
// http://markmail.org/thread/quk5odee4hbsauhu // http://markmail.org/thread/quk5odee4hbsauhu
// https://issues.apache.org/jira/browse/PDFBOX-313 // https://issues.apache.org/jira/browse/PDFBOX-313
// https://issues.apache.org/jira/browse/PDFBOX-351 // https://issues.apache.org/jira/browse/PDFBOX-351
// https://issues.apache.org/jira/browse/PDFBOX-441 // https://issues.apache.org/jira/browse/PDFBOX-441
// https://issues.apache.org/jira/browse/PDFBOX-2200 // https://issues.apache.org/jira/browse/PDFBOX-2200
// https://issues.apache.org/jira/browse/PDFBOX-2149 // https://issues.apache.org/jira/browse/PDFBOX-2149
COSName.clearResources(); COSName.clearResources();
/* /*
* Prior to PDFBox 2.0.0, clearResources() function had to be called on the * Prior to PDFBox 2.0.0, clearResources() function had to be called on the
* org.apache.pdfbox.pdmodel.font.PDFont class and its children. After version * org.apache.pdfbox.pdmodel.font.PDFont class and its children. After version
@ -327,7 +329,7 @@ public class pdfParser extends AbstractParser implements Parser {
// parse // parse
final AbstractParser parser = new pdfParser(); final AbstractParser parser = new pdfParser();
Document document = null; Document document = null;
FileInputStream inStream = null; FileInputStream inStream = null;
try { try {
inStream = new FileInputStream(pdfFile); inStream = new FileInputStream(pdfFile);
document = Document.mergeDocuments(null, "application/pdf", parser.parse(null, "application/pdf", null, new VocabularyScraper(), 0, inStream)); document = Document.mergeDocuments(null, "application/pdf", parser.parse(null, "application/pdf", null, new VocabularyScraper(), 0, inStream));
@ -345,7 +347,7 @@ public class pdfParser extends AbstractParser implements Parser {
if(inStream != null) { if(inStream != null) {
try { try {
inStream.close(); inStream.close();
} catch(IOException e) { } catch(final IOException e) {
System.err.println("Could not close input stream on file " + pdfFile); System.err.println("Could not close input stream on file " + pdfFile);
} }
} }
@ -359,7 +361,7 @@ public class pdfParser extends AbstractParser implements Parser {
System.out.println("\t!!!Parsing without result!!!"); System.out.println("\t!!!Parsing without result!!!");
} else { } else {
System.out.println("\tParsed text with " + document.getTextLength() + " chars of text and " + document.getAnchors().size() + " anchors"); System.out.println("\tParsed text with " + document.getTextLength() + " chars of text and " + document.getAnchors().size() + " anchors");
InputStream textStream = document.getTextStream(); final InputStream textStream = document.getTextStream();
try { try {
// write file // write file
FileUtils.copy(textStream, new File("parsedPdf.txt")); FileUtils.copy(textStream, new File("parsedPdf.txt"));
@ -372,7 +374,7 @@ public class pdfParser extends AbstractParser implements Parser {
/* textStream can be a FileInputStream : we must close it to ensure system resources are released */ /* textStream can be a FileInputStream : we must close it to ensure system resources are released */
textStream.close(); textStream.close();
} }
} catch (IOException e) { } catch (final IOException e) {
ConcurrentLog.warn("PDFPARSER", "Could not close text input stream"); ConcurrentLog.warn("PDFPARSER", "Could not close text input stream");
} }
} }

Loading…
Cancel
Save