allow url parameter in worktable apicall

allow url=wwwl?param=a&param=b (with ?, & encoded)
fix:  http://mantis.tokeek.de/view.php?id=100

fix double adding of  '&' in MultiProtocolURL.escape()
pull/1/head
reger 10 years ago
parent b31db00010
commit aa2e15d846

@ -33,14 +33,22 @@ import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.lang.reflect.Method;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.List;
import org.apache.pdfbox.exceptions.CryptographyException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.pdmodel.encryption.BadSecurityHandlerException;
import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
import org.apache.pdfbox.pdmodel.interactive.action.type.PDAction;
import org.apache.pdfbox.pdmodel.interactive.action.type.PDActionURI;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;
import org.apache.pdfbox.util.PDFTextStripper;
import net.yacy.cora.document.id.AnchorURL;
@ -135,9 +143,10 @@ public class pdfParser extends AbstractParser implements Parser {
}
final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE);
byte[] contentBytes = new byte[0];
Collection<AnchorURL> pdflinks = null;
try {
// create a writer for output
final PDFTextStripper stripper = new PDFTextStripper();
final PDFTextStripper stripper = new PDFTextStripper("UTF-8");
stripper.setEndPage(3); // get first 3 pages (always)
writer.append(stripper.getText(pdfDoc));
@ -162,10 +171,9 @@ public class pdfParser extends AbstractParser implements Parser {
if (t.isAlive()) t.interrupt();
}
contentBytes = writer.getBytes(); // get final text before closing writer
pdflinks = extractPdfLinks(pdfDoc);
} catch (final Throwable e) {
// close the writer
if (writer != null) try { writer.close(); } catch (final Exception ex) {}
try {pdfDoc.close();} catch (final Throwable ee) {}
// close the writer (in finally)
//throw new Parser.Failure(e.getMessage(), location);
} finally {
try {pdfDoc.close();} catch (final Throwable e) {}
@ -207,13 +215,43 @@ public class pdfParser extends AbstractParser implements Parser {
null,
0.0f, 0.0f,
contentBytes,
null,
(pdflinks == null || pdflinks.isEmpty()) ? null : pdflinks,
null,
null,
false,
docDate)};
}
/**
* extract clickable links from pdf
* @param pdf the document to parse
* @return all detected links
*/
private Collection<AnchorURL> extractPdfLinks(final PDDocument pdf) {
final Collection<AnchorURL> pdflinks = new ArrayList<AnchorURL>();
@SuppressWarnings("unchecked")
List<PDPage> allPages = pdf.getDocumentCatalog().getAllPages();
for (PDPage page : allPages) {
try {
List<PDAnnotation> annotations = page.getAnnotations();
if (annotations != null) {
for (PDAnnotation pdfannotation : annotations) {
if (pdfannotation instanceof PDAnnotationLink) {
PDAction link = ((PDAnnotationLink)pdfannotation).getAction();
if (link != null && link instanceof PDActionURI) {
PDActionURI pdflinkuri = (PDActionURI) link;
String uristr = pdflinkuri.getURI();
AnchorURL url = new AnchorURL(uristr);
pdflinks.add(url);
}
}
}
}
} catch (IOException ex) {}
}
return pdflinks;
}
public static void clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes() {
// thank you very much, PDFParser hackers, this font cache will occupy >80MB RAM for a single pdf and then stays forever
// AND I DO NOT EVEN NEED A FONT HERE TO PARSE THE TEXT!

@ -0,0 +1,46 @@
package net.yacy.document.parser;
import java.io.File;
import java.io.FileInputStream;
import java.util.Collection;
import static junit.framework.TestCase.assertEquals;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.document.Document;
import org.junit.Test;
public class pdfParserTest {
/**
* Test extraction of links in parse method, of class pdfParser.
*/
@Test
public void testParse() throws Exception {
System.out.println("pdfParser.parse");
final String testFiles = "umlaute_linux.pdf";
final String mimetype = "application/pdf";
final String charset = null;
//final String resulttxt = "In München steht ein Hofbräuhaus. Dort gibt es Bier aus Maßkrügen.";
final String filename = "test/parsertest/" + testFiles;
final File file = new File(filename);
final AnchorURL url = new AnchorURL("http://localhost/" + filename);
System.out.println("parse file: " + filename);
pdfParser p = new pdfParser();
final Document[] docs = p.parse(url, mimetype, charset, new FileInputStream(file));
Document doc = docs[0];
int ilinks = doc.getAnchors().size();
assertEquals("number of links in pdf", 1, ilinks);
Collection<AnchorURL> links = doc.getAnchors();
System.out.println("number of links detected = " + ilinks);
for (AnchorURL aurl : links) {
System.out.println(" found: " + aurl.toString());
}
}
}

Binary file not shown.
Loading…
Cancel
Save