optimize pdfParser

skip starting the reader thread if all content has already been read
11 years ago · 3b559e7846
parent 09f73b790f
commit 3b559e7846
1 changed file with 18 additions and 17 deletions
--- a/source/net/yacy/document/parser/pdfParser.java
+++ b/source/net/yacy/document/parser/pdfParser.java
@@ -143,23 +143,24 @@ public class pdfParser extends AbstractParser implements Parser {
            writer.append(stripper.getText(pdfDoc));
            contentBytes = writer.getBytes(); // remember text in case of interrupting thread

-            stripper.setStartPage(4); // continue with page 4 (terminated, resulting in no text)
-            stripper.setEndPage(Integer.MAX_VALUE); // set to default
-            // we start the pdf parsing in a separate thread to ensure that it can be terminated
-            final PDDocument pdfDocC = pdfDoc;
-            final Thread t = new Thread() {
-                @Override
-                public void run() {
-                    Thread.currentThread().setName("pdfParser.getText:" + location);
-                    try {
-                        writer.append(stripper.getText(pdfDocC));
-                    } catch (final Throwable e) {}
-                }
-            };
-            t.start();
-            t.join(3000);
-            if (t.isAlive()) t.interrupt();
-            pdfDoc.close();
+            if (pdfDoc.getNumberOfPages() > 3) { // spare creating/starting thread if all pages read
+                stripper.setStartPage(4); // continue with page 4 (terminated, resulting in no text)
+                stripper.setEndPage(Integer.MAX_VALUE); // set to default
+                // we start the pdf parsing in a separate thread to ensure that it can be terminated
+                final PDDocument pdfDocC = pdfDoc;
+                final Thread t = new Thread() {
+                    @Override
+                    public void run() {
+                        Thread.currentThread().setName("pdfParser.getText:" + location);
+                        try {
+                            writer.append(stripper.getText(pdfDocC));
+                        } catch (final Throwable e) {}
+                    }
+                };
+                t.start();
+                t.join(3000);
+                if (t.isAlive()) t.interrupt();
+            }
            contentBytes = writer.getBytes(); // get final text before closing writer
        } catch (final Throwable e) {
            // close the writer