Improve PDF text extraction resource handling.

For short PDFs (<= 3 pages), use the already extracted content;
only for long PDFs (> 3 pages), reassign the content and close the internal writer (to free its buffers directly).
pull/114/head
reger 8 years ago
parent 52c9d0c858
commit de1c1c16db

@ -63,6 +63,7 @@ import net.yacy.document.VocabularyScraper;
import net.yacy.kelondro.io.CharBuffer; import net.yacy.kelondro.io.CharBuffer;
import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.MemoryControl; import net.yacy.kelondro.util.MemoryControl;
import org.apache.pdfbox.pdmodel.PDPageTree;
public class pdfParser extends AbstractParser implements Parser { public class pdfParser extends AbstractParser implements Parser {
@ -219,8 +220,9 @@ public class pdfParser extends AbstractParser implements Parser {
t.start(); t.start();
t.join(3000); // pdfbox likes to forget to terminate ... (quite often) t.join(3000); // pdfbox likes to forget to terminate ... (quite often)
if (t.isAlive()) t.interrupt(); if (t.isAlive()) t.interrupt();
contentBytes = writer.getBytes(); // get final text before closing writer
writer.close(); // free writer resources
} }
contentBytes = writer.getBytes(); // get final text before closing writer
Collection<AnchorURL> pdflinksCombined = new HashSet<AnchorURL>(); Collection<AnchorURL> pdflinksCombined = new HashSet<AnchorURL>();
for (Collection<AnchorURL> pdflinksx: pdflinks) if (pdflinksx != null) pdflinksCombined.addAll(pdflinksx); for (Collection<AnchorURL> pdflinksx: pdflinks) if (pdflinksx != null) pdflinksCombined.addAll(pdflinksx);
@ -245,7 +247,6 @@ public class pdfParser extends AbstractParser implements Parser {
docDate)}; docDate)};
} }
} catch (final Throwable e) { } catch (final Throwable e) {
//close the writer (in finally)
//throw new Parser.Failure(e.getMessage(), location); //throw new Parser.Failure(e.getMessage(), location);
} finally { } finally {
try {pdfDoc.close();} catch (final Throwable e) {} try {pdfDoc.close();} catch (final Throwable e) {}

Loading…
Cancel
Save