You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
47 lines
1.5 KiB
47 lines
1.5 KiB
package net.yacy.document.parser;
|
|
|
|
import java.io.File;
|
|
import java.io.FileInputStream;
|
|
import java.util.Collection;
|
|
import static junit.framework.TestCase.assertEquals;
|
|
import net.yacy.cora.document.id.AnchorURL;
|
|
import net.yacy.document.Document;
|
|
import org.junit.Test;
|
|
|
|
public class pdfParserTest {
|
|
|
|
/**
|
|
* Test extraction of links in parse method, of class pdfParser.
|
|
*/
|
|
@Test
|
|
public void testParse() throws Exception {
|
|
System.out.println("pdfParser.parse");
|
|
|
|
final String testFiles = "umlaute_linux.pdf";
|
|
final String mimetype = "application/pdf";
|
|
final String charset = null;
|
|
|
|
//final String resulttxt = "In München steht ein Hofbräuhaus. Dort gibt es Bier aus Maßkrügen.";
|
|
final String filename = "test/parsertest/" + testFiles;
|
|
final File file = new File(filename);
|
|
|
|
final AnchorURL url = new AnchorURL("http://localhost/" + filename);
|
|
System.out.println("parse file: " + filename);
|
|
|
|
pdfParser p = new pdfParser();
|
|
final Document[] docs = p.parse(url, mimetype, charset, null, new FileInputStream(file));
|
|
|
|
Document doc = docs[0];
|
|
int ilinks = doc.getAnchors().size();
|
|
assertEquals("number of links in pdf", 1, ilinks);
|
|
|
|
Collection<AnchorURL> links = doc.getAnchors();
|
|
System.out.println("number of links detected = " + ilinks);
|
|
for (AnchorURL aurl : links) {
|
|
System.out.println(" found: " + aurl.toString());
|
|
}
|
|
|
|
}
|
|
|
|
}
|