allow url parameter in worktable apicall

Allow url=wwwl?param=a&param=b (with '?' and '&' encoded). Fixes: http://mantis.tokeek.de/view.php?id=100. Also fixes the double adding of '&' in MultiProtocolURL.escape().
10 years ago · aa2e15d846
parent b31db00010
commit aa2e15d846
3 changed files with 90 additions and 6 deletions
--- a/source/net/yacy/document/parser/pdfParser.java
+++ b/source/net/yacy/document/parser/pdfParser.java
@ -33,14 +33,22 @@ import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.InputStream;
 import java.lang.reflect.Method;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Date;
 import java.util.List;
 import org.apache.pdfbox.exceptions.CryptographyException;
 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.pdmodel.PDDocumentInformation;
 import org.apache.pdfbox.pdmodel.PDPage;
 import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
 import org.apache.pdfbox.pdmodel.encryption.BadSecurityHandlerException;
 import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
 import org.apache.pdfbox.pdmodel.interactive.action.type.PDAction;
 import org.apache.pdfbox.pdmodel.interactive.action.type.PDActionURI;
 import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
 import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;
 import org.apache.pdfbox.util.PDFTextStripper;
 import net.yacy.cora.document.id.AnchorURL;
@ -135,9 +143,10 @@ public class pdfParser extends AbstractParser implements Parser {
        }
        final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE);
        byte[] contentBytes = new byte[0];
        Collection<AnchorURL> pdflinks = null;
        try {
            // create a writer for output
-            final PDFTextStripper  stripper = new PDFTextStripper();
+            final PDFTextStripper  stripper = new PDFTextStripper("UTF-8");
            stripper.setEndPage(3); // get first 3 pages (always)
            writer.append(stripper.getText(pdfDoc));
@ -162,10 +171,9 @@ public class pdfParser extends AbstractParser implements Parser {
                if (t.isAlive()) t.interrupt();
            }
            contentBytes = writer.getBytes(); // get final text before closing writer
            pdflinks = extractPdfLinks(pdfDoc);
        } catch (final Throwable e) {
-            // close the writer
+            // close the writer (in finally)
            if (writer != null) try { writer.close(); } catch (final Exception ex) {}
            try {pdfDoc.close();} catch (final Throwable ee) {}
            //throw new Parser.Failure(e.getMessage(), location);
        } finally {
            try {pdfDoc.close();} catch (final Throwable e) {}
@ -207,13 +215,43 @@ public class pdfParser extends AbstractParser implements Parser {
                null,
                0.0f, 0.0f,
                contentBytes,
-                null,
+                (pdflinks == null || pdflinks.isEmpty()) ? null : pdflinks,
                null,
                null,
                false,
                docDate)};
    }
-    
+
    /**
     * Extracts the clickable links (URI actions of link annotations) from all
     * pages of a pdf document.
     * <p>
     * Extraction is best effort: a page whose annotations cannot be read is
     * skipped, and a link whose URI is missing or cannot be parsed as a URL is
     * skipped individually, so one broken entry does not discard the remaining
     * links of the same page.
     *
     * @param pdf the document to parse
     * @return all detected links; never null, empty if no link was found
     */
    private Collection<AnchorURL> extractPdfLinks(final PDDocument pdf) {
        final Collection<AnchorURL> pdflinks = new ArrayList<AnchorURL>();
        @SuppressWarnings("unchecked")
        final List<PDPage> allPages = pdf.getDocumentCatalog().getAllPages();
        for (final PDPage page : allPages) {
            final List<PDAnnotation> annotations;
            try {
                annotations = page.getAnnotations();
            } catch (final IOException ex) {
                // annotations of this page are unreadable: ignore the page, keep going with the next one
                continue;
            }
            if (annotations == null) continue;

            for (final PDAnnotation pdfannotation : annotations) {
                if (!(pdfannotation instanceof PDAnnotationLink)) continue;

                final PDAction link = ((PDAnnotationLink) pdfannotation).getAction();
                if (!(link instanceof PDActionURI)) continue; // also covers link == null

                final String uristr = ((PDActionURI) link).getURI();
                if (uristr == null) continue;

                try {
                    pdflinks.add(new AnchorURL(uristr));
                } catch (final IOException ex) {
                    // malformed url (MalformedURLException is an IOException):
                    // drop only this link, the other links of the page are still collected
                }
            }
        }
        return pdflinks;
    }
    public static void clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes() {
        // thank you very much, PDFParser hackers, this font cache will occupy >80MB RAM for a single pdf and then stays forever
        // AND I DO NOT EVEN NEED A FONT HERE TO PARSE THE TEXT!
--- a/test/net/yacy/document/parser/pdfParserTest.java
+++ b/test/net/yacy/document/parser/pdfParserTest.java
@ -0,0 +1,46 @@
 package net.yacy.document.parser;
 import java.io.File;
 import java.io.FileInputStream;
 import java.util.Collection;
 import static junit.framework.TestCase.assertEquals;
 import net.yacy.cora.document.id.AnchorURL;
 import net.yacy.document.Document;
 import org.junit.Test;
 public class pdfParserTest {

    /**
     * Test extraction of links in parse method, of class pdfParser.
     * <p>
     * Parses the test document and checks that exactly one link is reported
     * as anchor of the first resulting document.
     *
     * @throws Exception when the test file cannot be read or parsing fails
     */
    @Test
    public void testParse() throws Exception {
        System.out.println("pdfParser.parse");

        final String testFiles = "umlaute_linux.pdf";
        final String mimetype = "application/pdf";
        final String charset = null;
        //final String resulttxt = "In München steht ein Hofbräuhaus. Dort gibt es Bier aus Maßkrügen.";
        final String filename = "test/parsertest/" + testFiles;
        final File file = new File(filename);

        final AnchorURL url = new AnchorURL("http://localhost/" + filename);
        System.out.println("parse file: " + filename);

        final pdfParser p = new pdfParser();
        final Document[] docs;
        // make sure the file handle is released even if parsing throws
        final FileInputStream in = new FileInputStream(file);
        try {
            docs = p.parse(url, mimetype, charset, in);
        } finally {
            in.close();
        }

        // fail with a clear message instead of an index/null error further down
        assertEquals("parser returned a document", true, docs != null && docs.length > 0);

        final Document doc = docs[0];
        final Collection<AnchorURL> links = doc.getAnchors();
        final int ilinks = links.size();
        assertEquals("number of links in pdf", 1, ilinks);

        System.out.println("number of links detected = " + ilinks);
        for (final AnchorURL aurl : links) {
            System.out.println("   found: " + aurl.toString());
        }
    }
 }
--- a/test/parsertest/umlaute_linux.pdf
+++ b/test/parsertest/umlaute_linux.pdf