allow url=wwwl?param=a&param=b (with ?, & encoded); fix: http://mantis.tokeek.de/view.php?id=100; fix double adding of '&' in MultiProtocolURL.escape() pull/1/head
parent
b31db00010
commit
aa2e15d846
@ -0,0 +1,46 @@
|
|||||||
|
package net.yacy.document.parser;
|
||||||
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.FileInputStream;
|
||||||
|
import java.util.Collection;
|
||||||
|
import static junit.framework.TestCase.assertEquals;
|
||||||
|
import net.yacy.cora.document.id.AnchorURL;
|
||||||
|
import net.yacy.document.Document;
|
||||||
|
import org.junit.Test;
|
||||||
|
|
||||||
|
public class pdfParserTest {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test extraction of links in parse method, of class pdfParser.
|
||||||
|
*/
|
||||||
|
@Test
|
||||||
|
public void testParse() throws Exception {
|
||||||
|
System.out.println("pdfParser.parse");
|
||||||
|
|
||||||
|
final String testFiles = "umlaute_linux.pdf";
|
||||||
|
final String mimetype = "application/pdf";
|
||||||
|
final String charset = null;
|
||||||
|
|
||||||
|
//final String resulttxt = "In München steht ein Hofbräuhaus. Dort gibt es Bier aus Maßkrügen.";
|
||||||
|
final String filename = "test/parsertest/" + testFiles;
|
||||||
|
final File file = new File(filename);
|
||||||
|
|
||||||
|
final AnchorURL url = new AnchorURL("http://localhost/" + filename);
|
||||||
|
System.out.println("parse file: " + filename);
|
||||||
|
|
||||||
|
pdfParser p = new pdfParser();
|
||||||
|
final Document[] docs = p.parse(url, mimetype, charset, new FileInputStream(file));
|
||||||
|
|
||||||
|
Document doc = docs[0];
|
||||||
|
int ilinks = doc.getAnchors().size();
|
||||||
|
assertEquals("number of links in pdf", 1, ilinks);
|
||||||
|
|
||||||
|
Collection<AnchorURL> links = doc.getAnchors();
|
||||||
|
System.out.println("number of links detected = " + ilinks);
|
||||||
|
for (AnchorURL aurl : links) {
|
||||||
|
System.out.println(" found: " + aurl.toString());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
Binary file not shown.
Loading…
Reference in new issue