From de1c1c16db6f98fdf9f14860934f99938bc7b962 Mon Sep 17 00:00:00 2001 From: reger Date: Thu, 9 Mar 2017 22:56:33 +0100 Subject: [PATCH] Improve pdf text extraction resource handling. For short pdfs (<= 3 pages) use the already extracted content; only for long pdfs (> 3 pages) reassign the content and close the internal writer (to free its buffers directly) --- source/net/yacy/document/parser/pdfParser.java | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/source/net/yacy/document/parser/pdfParser.java b/source/net/yacy/document/parser/pdfParser.java index 863e9112e..bc225a7ed 100644 --- a/source/net/yacy/document/parser/pdfParser.java +++ b/source/net/yacy/document/parser/pdfParser.java @@ -63,6 +63,7 @@ import net.yacy.document.VocabularyScraper; import net.yacy.kelondro.io.CharBuffer; import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.MemoryControl; +import org.apache.pdfbox.pdmodel.PDPageTree; public class pdfParser extends AbstractParser implements Parser { @@ -219,9 +220,10 @@ public class pdfParser extends AbstractParser implements Parser { t.start(); t.join(3000); // pdfbox likes to forget to terminate ... (quite often) if (t.isAlive()) t.interrupt(); + contentBytes = writer.getBytes(); // get final text before closing writer + writer.close(); // free writer resources } - contentBytes = writer.getBytes(); // get final text before closing writer - + Collection pdflinksCombined = new HashSet(); for (Collection pdflinksx: pdflinks) if (pdflinksx != null) pdflinksCombined.addAll(pdflinksx); result = new Document[]{new Document( @@ -245,7 +247,6 @@ public class pdfParser extends AbstractParser implements Parser { docDate)}; } } catch (final Throwable e) { - //close the writer (in finally) //throw new Parser.Failure(e.getMessage(), location); } finally { try {pdfDoc.close();} catch (final Throwable e) {}