allow url=wwwl?param=a&param=b (with ?, & encoded) fix: http://mantis.tokeek.de/view.php?id=100 fix double adding of '&' in MultiProtocolURL.escape() pull/1/head
parent
b31db00010
commit
aa2e15d846
@ -0,0 +1,46 @@
|
||||
package net.yacy.document.parser;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.util.Collection;
|
||||
import static junit.framework.TestCase.assertEquals;
|
||||
import net.yacy.cora.document.id.AnchorURL;
|
||||
import net.yacy.document.Document;
|
||||
import org.junit.Test;
|
||||
|
||||
public class pdfParserTest {
|
||||
|
||||
/**
|
||||
* Test extraction of links in parse method, of class pdfParser.
|
||||
*/
|
||||
@Test
|
||||
public void testParse() throws Exception {
|
||||
System.out.println("pdfParser.parse");
|
||||
|
||||
final String testFiles = "umlaute_linux.pdf";
|
||||
final String mimetype = "application/pdf";
|
||||
final String charset = null;
|
||||
|
||||
//final String resulttxt = "In München steht ein Hofbräuhaus. Dort gibt es Bier aus Maßkrügen.";
|
||||
final String filename = "test/parsertest/" + testFiles;
|
||||
final File file = new File(filename);
|
||||
|
||||
final AnchorURL url = new AnchorURL("http://localhost/" + filename);
|
||||
System.out.println("parse file: " + filename);
|
||||
|
||||
pdfParser p = new pdfParser();
|
||||
final Document[] docs = p.parse(url, mimetype, charset, new FileInputStream(file));
|
||||
|
||||
Document doc = docs[0];
|
||||
int ilinks = doc.getAnchors().size();
|
||||
assertEquals("number of links in pdf", 1, ilinks);
|
||||
|
||||
Collection<AnchorURL> links = doc.getAnchors();
|
||||
System.out.println("number of links detected = " + ilinks);
|
||||
for (AnchorURL aurl : links) {
|
||||
System.out.println(" found: " + aurl.toString());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
Binary file not shown.
Loading…
Reference in new issue