diff --git a/source/net/yacy/document/parser/pdfParser.java b/source/net/yacy/document/parser/pdfParser.java index af2d0e200..1613eb8bf 100644 --- a/source/net/yacy/document/parser/pdfParser.java +++ b/source/net/yacy/document/parser/pdfParser.java @@ -33,14 +33,22 @@ import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.lang.reflect.Method; +import java.util.ArrayList; +import java.util.Collection; import java.util.Date; +import java.util.List; import org.apache.pdfbox.exceptions.CryptographyException; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocumentInformation; +import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.encryption.AccessPermission; import org.apache.pdfbox.pdmodel.encryption.BadSecurityHandlerException; import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial; +import org.apache.pdfbox.pdmodel.interactive.action.type.PDAction; +import org.apache.pdfbox.pdmodel.interactive.action.type.PDActionURI; +import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation; +import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink; import org.apache.pdfbox.util.PDFTextStripper; import net.yacy.cora.document.id.AnchorURL; @@ -135,9 +143,10 @@ public class pdfParser extends AbstractParser implements Parser { } final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE); byte[] contentBytes = new byte[0]; + Collection pdflinks = null; try { // create a writer for output - final PDFTextStripper stripper = new PDFTextStripper(); + final PDFTextStripper stripper = new PDFTextStripper("UTF-8"); stripper.setEndPage(3); // get first 3 pages (always) writer.append(stripper.getText(pdfDoc)); @@ -162,10 +171,9 @@ public class pdfParser extends AbstractParser implements Parser { if (t.isAlive()) t.interrupt(); } contentBytes = writer.getBytes(); // get final text before closing writer + pdflinks = extractPdfLinks(pdfDoc); } 
catch (final Throwable e) {
-            // close the writer
-            if (writer != null) try { writer.close(); } catch (final Exception ex) {}
-            try {pdfDoc.close();} catch (final Throwable ee) {}
+            // close the writer (in finally)
             //throw new Parser.Failure(e.getMessage(), location);
         } finally {
             try {pdfDoc.close();} catch (final Throwable e) {}
@@ -207,13 +215,58 @@ public class pdfParser extends AbstractParser implements Parser {
                 null,
                 0.0f, 0.0f,
                 contentBytes,
-                null,
+                (pdflinks == null || pdflinks.isEmpty()) ? null : pdflinks,
                 null,
                 null,
                 false,
                 docDate)};
     }
-
+
+    /**
+     * Extracts the targets of clickable URI links from a pdf.
+     * Only link annotations that carry a URI action are collected. A page whose
+     * annotations cannot be read, or a single link that cannot be converted into
+     * an AnchorURL, is skipped so that the remaining links are still returned.
+     *
+     * @param pdf the document to parse
+     * @return all detected links; never null, empty if no link was found
+     */
+    private Collection<AnchorURL> extractPdfLinks(final PDDocument pdf) {
+        final Collection<AnchorURL> pdflinks = new ArrayList<AnchorURL>();
+        @SuppressWarnings("unchecked") // presumably getAllPages() returns a raw List - confirm against the PDFBox version in use
+        final List<PDPage> allPages = pdf.getDocumentCatalog().getAllPages();
+        for (final PDPage page : allPages) {
+            List<PDAnnotation> annotations;
+            try {
+                annotations = page.getAnnotations();
+            } catch (final IOException ignored) {
+                // best effort: a page whose annotations cannot be read contributes no links
+                continue;
+            }
+            if (annotations == null) {
+                continue;
+            }
+            for (final PDAnnotation pdfannotation : annotations) {
+                if (!(pdfannotation instanceof PDAnnotationLink)) {
+                    continue;
+                }
+                try {
+                    final PDAction link = ((PDAnnotationLink) pdfannotation).getAction();
+                    if (link instanceof PDActionURI) { // instanceof is false for null
+                        final String uristr = ((PDActionURI) link).getURI();
+                        if (uristr != null) {
+                            pdflinks.add(new AnchorURL(uristr));
+                        }
+                    }
+                } catch (final IOException ignored) {
+                    // skip only this link; the other annotations of the page are still examined
+                    // NOTE(review): assumes AnchorURL(String) reports a malformed URI via an IOException subtype - confirm
+                }
+            }
+        }
+        return pdflinks;
+    }
+
     public static void clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes() {
         // thank you very much, PDFParser hackers, this font cache will occupy >80MB RAM for a single pdf and then stays forever
         // AND I DO NOT EVEN NEED A FONT HERE TO PARSE THE TEXT!
diff --git a/test/net/yacy/document/parser/pdfParserTest.java b/test/net/yacy/document/parser/pdfParserTest.java
new file mode 100644
index 000000000..e2c419035
--- /dev/null
+++ b/test/net/yacy/document/parser/pdfParserTest.java
@@ -0,0 +1,56 @@
+package net.yacy.document.parser;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStream;
+import java.util.Collection;
+import static org.junit.Assert.assertEquals;
+import net.yacy.cora.document.id.AnchorURL;
+import net.yacy.document.Document;
+import org.junit.Test;
+
+public class pdfParserTest {
+
+    /**
+     * Test extraction of links in parse method, of class pdfParser.
+     * Parses test/parsertest/umlaute_linux.pdf and expects exactly one anchor
+     * in the first returned document.
+     */
+    @Test
+    public void testParse() throws Exception {
+        System.out.println("pdfParser.parse");
+
+        final String testFiles = "umlaute_linux.pdf";
+        final String mimetype = "application/pdf";
+        final String charset = null;
+
+        //final String resulttxt = "In München steht ein Hofbräuhaus. Dort gibt es Bier aus Maßkrügen.";
+        final String filename = "test/parsertest/" + testFiles;
+        final File file = new File(filename);
+
+        final AnchorURL url = new AnchorURL("http://localhost/" + filename);
+        System.out.println("parse file: " + filename);
+
+        final pdfParser p = new pdfParser();
+        Document[] docs;
+        // close the stream even if parsing fails, so the test does not leak a file handle
+        final InputStream source = new FileInputStream(file);
+        try {
+            docs = p.parse(url, mimetype, charset, source);
+        } finally {
+            source.close();
+        }
+
+        final Document doc = docs[0];
+        final Collection<AnchorURL> links = doc.getAnchors();
+        final int ilinks = links.size();
+        assertEquals("number of links in pdf", 1, ilinks);
+
+        System.out.println("number of links detected = " + ilinks);
+        for (final AnchorURL aurl : links) {
+            System.out.println(" found: " + aurl.toString());
+        }
+
+    }
+
+}
diff --git a/test/parsertest/umlaute_linux.pdf b/test/parsertest/umlaute_linux.pdf
index 7e6925ee4..65fe8487b 100755
Binary files a/test/parsertest/umlaute_linux.pdf and b/test/parsertest/umlaute_linux.pdf differ