allow url parameter in worktable apicall

Allow url=wwwl?param=a&param=b (with '?' and '&' encoded). Fixes http://mantis.tokeek.de/view.php?id=100. Also fixes double adding of '&' in MultiProtocolURL.escape().
11 years ago · aa2e15d846
parent b31db00010
commit aa2e15d846
3 changed files with 90 additions and 6 deletions
--- a/source/net/yacy/document/parser/pdfParser.java
+++ b/source/net/yacy/document/parser/pdfParser.java
@ -33,14 +33,22 @@ import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.InputStream;
 import java.lang.reflect.Method;
+import java.util.ArrayList;
+import java.util.Collection;
 import java.util.Date;
+import java.util.List;

 import org.apache.pdfbox.exceptions.CryptographyException;
 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.pdmodel.PDDocumentInformation;
+import org.apache.pdfbox.pdmodel.PDPage;
 import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
 import org.apache.pdfbox.pdmodel.encryption.BadSecurityHandlerException;
 import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
+import org.apache.pdfbox.pdmodel.interactive.action.type.PDAction;
+import org.apache.pdfbox.pdmodel.interactive.action.type.PDActionURI;
+import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
+import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;
 import org.apache.pdfbox.util.PDFTextStripper;

 import net.yacy.cora.document.id.AnchorURL;
@ -135,9 +143,10 @@ public class pdfParser extends AbstractParser implements Parser {
        }
        final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE);
        byte[] contentBytes = new byte[0];
+        Collection<AnchorURL> pdflinks = null;
        try {
            // create a writer for output
-            final PDFTextStripper  stripper = new PDFTextStripper();
+            final PDFTextStripper  stripper = new PDFTextStripper("UTF-8");

            stripper.setEndPage(3); // get first 3 pages (always)
            writer.append(stripper.getText(pdfDoc));
@ -162,10 +171,9 @@ public class pdfParser extends AbstractParser implements Parser {
                if (t.isAlive()) t.interrupt();
            }
            contentBytes = writer.getBytes(); // get final text before closing writer
+            pdflinks = extractPdfLinks(pdfDoc);
        } catch (final Throwable e) {
-            // close the writer
-            if (writer != null) try { writer.close(); } catch (final Exception ex) {}
-            try {pdfDoc.close();} catch (final Throwable ee) {}
+            // close the writer (in finally)
            //throw new Parser.Failure(e.getMessage(), location);
        } finally {
            try {pdfDoc.close();} catch (final Throwable e) {}
@ -207,13 +215,43 @@ public class pdfParser extends AbstractParser implements Parser {
                null,
                0.0f, 0.0f,
                contentBytes,
-                null,
+                (pdflinks == null || pdflinks.isEmpty()) ? null : pdflinks,
                null,
                null,
                false,
                docDate)};
    }
-    
+
+    /**
+     * Extracts the targets of clickable URI link annotations from a pdf.
+     * <p>
+     * Failures are isolated as narrowly as possible: a page whose annotations
+     * cannot be read is skipped, and a link whose URI is missing or is rejected
+     * by {@link AnchorURL} is dropped, without discarding the remaining links
+     * of the same page.
+     *
+     * @param pdf the document to parse
+     * @return all detected links; never null, empty if no link was found
+     */
+    private Collection<AnchorURL> extractPdfLinks(final PDDocument pdf) {
+        final Collection<AnchorURL> pdflinks = new ArrayList<AnchorURL>();
+        @SuppressWarnings("unchecked") // page list is converted to List<PDPage> without a checked cast
+        final List<PDPage> allPages = pdf.getDocumentCatalog().getAllPages();
+        for (final PDPage page : allPages) {
+            final List<PDAnnotation> annotations;
+            try {
+                annotations = page.getAnnotations();
+            } catch (final IOException ex) {
+                // annotations of this page are unreadable; other pages may still yield links
+                continue;
+            }
+            if (annotations == null) continue;
+
+            for (final PDAnnotation pdfannotation : annotations) {
+                if (!(pdfannotation instanceof PDAnnotationLink)) continue;
+                final PDAction link = ((PDAnnotationLink) pdfannotation).getAction();
+                if (!(link instanceof PDActionURI)) continue; // instanceof is false for null as well
+                final String uristr = ((PDActionURI) link).getURI();
+                if (uristr == null) continue; // URI action without a target string
+                try {
+                    pdflinks.add(new AnchorURL(uristr));
+                } catch (final IOException ex) {
+                    // malformed link target (MalformedURLException is an IOException):
+                    // drop only this link and keep scanning the rest of the page
+                }
+            }
+        }
+        return pdflinks;
+    }
+
    public static void clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes() {
        // thank you very much, PDFParser hackers, this font cache will occupy >80MB RAM for a single pdf and then stays forever
        // AND I DO NOT EVEN NEED A FONT HERE TO PARSE THE TEXT!
--- a/test/net/yacy/document/parser/pdfParserTest.java
+++ b/test/net/yacy/document/parser/pdfParserTest.java
@ -0,0 +1,46 @@
+package net.yacy.document.parser;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.util.Collection;
+import static junit.framework.TestCase.assertEquals;
+import net.yacy.cora.document.id.AnchorURL;
+import net.yacy.document.Document;
+import org.junit.Test;
+
+public class pdfParserTest {
+
+    /**
+     * Test extraction of links in parse method, of class pdfParser.
+     * <p>
+     * Parses the fixture test/parsertest/umlaute_linux.pdf and expects exactly
+     * one anchor in the first returned document. The input stream is closed
+     * in a finally block so the file handle is released even if parsing fails.
+     *
+     * @throws Exception if the fixture cannot be opened or parsing fails
+     */
+    @Test
+    public void testParse() throws Exception {
+        System.out.println("pdfParser.parse");
+
+        final String testFiles = "umlaute_linux.pdf";
+        final String mimetype = "application/pdf";
+        final String charset = null;
+
+        //final String resulttxt = "In München steht ein Hofbräuhaus. Dort gibt es Bier aus Maßkrügen.";
+        final String filename = "test/parsertest/" + testFiles;
+        final File file = new File(filename);
+
+        final AnchorURL url = new AnchorURL("http://localhost/" + filename);
+        System.out.println("parse file: " + filename);
+
+        final pdfParser p = new pdfParser();
+        final Document[] docs;
+        final FileInputStream in = new FileInputStream(file);
+        try {
+            docs = p.parse(url, mimetype, charset, in);
+        } finally {
+            // release the file handle regardless of the parse outcome
+            in.close();
+        }
+
+        // fetch the anchors once and reuse them for the assertion and the listing
+        final Collection<AnchorURL> links = docs[0].getAnchors();
+        final int ilinks = links.size();
+        assertEquals("number of links in pdf", 1, ilinks);
+
+        System.out.println("number of links detected = " + ilinks);
+        for (final AnchorURL aurl : links) {
+            System.out.println("   found: " + aurl.toString());
+        }
+
+    }
+
+}
--- a/test/parsertest/umlaute_linux.pdf
+++ b/test/parsertest/umlaute_linux.pdf