package net.yacy.document.parser;
import static net.yacy.document.parser.htmlParser.parseToScraper;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import org.junit.Test;
import junit.framework.TestCase;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.ImageEntry;
public class htmlParserTest extends TestCase {
/**
 * Checks that htmlParser.patchCharsetEncoding() returns the expected charset
 * name for a set of raw charset labels (including null). Expected and actual
 * names are compared after lower-casing with the ROOT locale.
 */
@Test
public void testGetRealCharsetEncoding() {
    /* each row : { raw charset label, expected patched charset name } */
    final String[][] samples = {
        { null, null },
        { "windows1250", "windows-1250" },
        { "windows_1250", "windows-1250" },
        { "ISO-8859-1", StandardCharsets.ISO_8859_1.name() },
        { "ISO8859-1", StandardCharsets.ISO_8859_1.name() },
        { "ISO-88591", StandardCharsets.ISO_8859_1.name() },
        { "ISO88591", StandardCharsets.ISO_8859_1.name() },
        { "iso_8859_1", StandardCharsets.ISO_8859_1.name() },
        { "cp-1252", "windows-1252" },
        { "gb_2312", "gb2312" }, // was: x-EUC-CN
        { "gb_2312-80", "gb2312" }, // was: x-EUC-CN
        { "UTF-8;", StandardCharsets.UTF_8.name() }
    };

    for (final String[] sample : samples) {
        final String rawLabel = sample[0];

        /* desired conversion result, normalized to lower case */
        String expected = null;
        if (sample[1] != null) {
            expected = sample[1].toLowerCase(Locale.ROOT);
        }

        /* actual conversion result, normalized the same way */
        final String patched = htmlParser.patchCharsetEncoding(rawLabel);
        String actual = null;
        if (patched != null) {
            actual = patched.toLowerCase(Locale.ROOT);
        }

        assertEquals(expected, actual);

        /* informative trace only : a null result is reported as unsupported */
        final boolean supported = patched != null && Charset.isSupported(patched);
        final String shownLabel = rawLabel != null ? rawLabel : "null";
        final String shownPatched = patched != null ? patched : "null";
        System.out.println("testGetRealCharsetEncoding: " + shownLabel + " -> " + shownPatched
                + " | Supported: " + supported);
    }
}
/**
 * Test of parse method, of class htmlParser.
 * - test getCharset : a charset must be reported for each test file
 *
 * @throws Parser.Failure
 *             propagated from htmlParser.parse()
 * @throws InterruptedException
 *             propagated from htmlParser.parse()
 * @throws IOException
 *             when a test file could not be opened or closed
 */
@Test
public void testParse() throws Parser.Failure, InterruptedException, IOException {
    System.out.println("htmlParser.parse");

    final String[] testFiles = {
        "umlaute_html_iso.html",
        "umlaute_html_utf8.html",
        "umlaute_html_namedentities.html"};

    final String mimetype = "text/html";
    //final String resulttxt = "In München steht ein Hofbräuhaus. Dort gibt es Bier aus Maßkrügen.";

    for (final String testfile : testFiles) {
        final String filename = "test/parsertest/" + testfile;
        final File file = new File(filename);
        final AnchorURL url = new AnchorURL("http://localhost/" + filename);
        System.out.println("parse file: " + filename);

        final htmlParser p = new htmlParser();
        /*
         * try-with-resources : the stream is always closed, and a failure on close
         * can not hide an exception thrown by the parser or by an assertion
         */
        try (FileInputStream inStream = new FileInputStream(file)) {
            final Document[] docs = p.parse(url, mimetype, null, new VocabularyScraper(), 0, inStream);
            final Document doc = docs[0];
            final String txt = doc.getCharset();
            assertNotNull("get Charset", txt);
            System.out.println("detected charset = " + txt);
        }
    }
}
/**
* Test the htmlParser.parse() method, with no charset information, neither
* provided by HTTP header nor by meta tags or attributes.
*
* @throws Exception
* when an unexpected error occurred
*/
@Test
public void testParseHtmlWithoutCharset() throws Exception {
final AnchorURL url = new AnchorURL("http://localhost/test.html");
final String mimetype = "text/html";
final StringBuilder testHtml = new StringBuilder("
");
/*
* Include some non ASCII characters : once encoded they should make the charset
* detector to detect the exact encoding
*/
testHtml.append("In München steht ein Hofbräuhaus.\n" + "Dort gibt es Bier aus Maßkrügen. ");
testHtml.append("First link");
testHtml.append("Second link");
testHtml.append("Third link");
testHtml.append("
");
final htmlParser parser = new htmlParser();
final Charset[] charsets = new Charset[] { StandardCharsets.UTF_8, StandardCharsets.ISO_8859_1 };
for (final Charset charset : charsets) {
try (InputStream sourceStream = new ByteArrayInputStream(testHtml.toString().getBytes(charset));) {
final Document[] docs = parser.parse(url, mimetype, null, new VocabularyScraper(), 0, sourceStream);
final Document doc = docs[0];
assertEquals(3, doc.getAnchors().size());
assertTrue(doc.getTextString().contains("Maßkrügen"));
assertEquals(charset.toString(), doc.getCharset());
}
}
}
/**
* Test the htmlParser.parse() method, when filtering out div elements on their CSS class.
*
* @throws Exception
* when an unexpected error occurred
*/
@Test
public void testParseHtmlDivClassFilter() throws Exception {
final AnchorURL url = new AnchorURL("http://localhost/test.html");
final String mimetype = "text/html";
final StringBuilder testHtml = new StringBuilder("Test document");
testHtml.append("
Top text");
testHtml.append("Top link");
testHtml.append("
");
testHtml.append("");
final htmlParser parser = new htmlParser();
/* No CSS class filter */
try (InputStream sourceStream = new ByteArrayInputStream(
testHtml.toString().getBytes(StandardCharsets.UTF_8));) {
final Document[] docs = parser.parse(url, mimetype, null, new VocabularyScraper(), 0, sourceStream);
final Document doc = docs[0];
final String parsedDext = doc.getTextString();
/* Check everything has been parsed */
assertEquals(2, doc.getAnchors().size());
assertEquals(1, doc.getImages().size());
assertEquals(1, doc.getLinkedDataTypes().size());
assertTrue(parsedDext.contains("Top"));
assertTrue(parsedDext.contains("Some"));
assertTrue(parsedDext.contains("from"));
assertTrue(parsedDext.contains("paragraph"));
assertTrue(parsedDext.contains("Text-only"));
assertTrue(parsedDext.contains("depth"));
}
/* Filter on CSS classes with no matching elements */
try (InputStream sourceStream = new ByteArrayInputStream(
testHtml.toString().getBytes(StandardCharsets.UTF_8));) {
final Set ignore = new HashSet<>();
ignore.add("opt");
ignore.add("head");
ignore.add("container");
final Document[] docs = parser.parse(url, mimetype, null, new VocabularyScraper(), 0, sourceStream);
final Document doc = docs[0];
final String parsedDext = doc.getTextString();
/* Check everything has been parsed */
assertEquals(2, doc.getAnchors().size());
assertEquals(1, doc.getImages().size());
assertEquals(1, doc.getLinkedDataTypes().size());
assertTrue(parsedDext.contains("Top"));
assertTrue(parsedDext.contains("Some"));
assertTrue(parsedDext.contains("from"));
assertTrue(parsedDext.contains("paragraph"));
assertTrue(parsedDext.contains("Text-only"));
assertTrue(parsedDext.contains("depth"));
}
/* Filter on CSS class with matching elements */
try (InputStream sourceStream = new ByteArrayInputStream(
testHtml.toString().getBytes(StandardCharsets.UTF_8));) {
final Set ignore = new HashSet<>();
ignore.add("optional");
final Document[] docs = parser.parse(url, mimetype, null, ignore, new VocabularyScraper(), 0, sourceStream);
final Document doc = docs[0];
final String parsedDext = doc.getTextString();
/* Check matching blocks have been ignored */
assertEquals(1, doc.getAnchors().size());
assertEquals("http://localhost/top.html", doc.getAnchors().iterator().next().toString());
assertEquals(0, doc.getLinkedDataTypes().size());
assertEquals(0, doc.getImages().size());
assertFalse(parsedDext.contains("Some"));
assertFalse(parsedDext.contains("from"));
assertFalse(parsedDext.contains("depth"));
/* Check non-matching blocks have been normally parsed */
assertTrue(parsedDext.contains("Top"));
assertTrue(parsedDext.contains("Text-only"));
assertTrue(parsedDext.contains("paragraph"));
}
}
/**
 * Test the htmlParser.parseWithLimits() method with test content within bounds.
 * Each test file is parsed with limits of 1000 links and 10000 bytes : the text
 * and the anchor must be retrieved and the document must not be marked as
 * partially parsed.
 *
 * @throws Exception when an unexpected error occurred
 */
@Test
public void testParseWithLimitsUnreached() throws Exception {
    System.out.println("htmlParser.parse");

    final String[] testFiles = {
        "umlaute_html_iso.html",
        "umlaute_html_utf8.html",
        "umlaute_html_namedentities.html"};

    final String mimetype = "text/html";
    //final String resulttxt = "In München steht ein Hofbräuhaus. Dort gibt es Bier aus Maßkrügen.";

    final htmlParser parser = new htmlParser();
    for (final String testfile : testFiles) {
        final String fileName = "test" + File.separator + "parsertest" + File.separator + testfile;
        final File file = new File(fileName);
        /* URL paths always use '/' : do not reuse the platform dependent file name here */
        final AnchorURL url = new AnchorURL("http://localhost/test/parsertest/" + testfile);

        try (FileInputStream inStream = new FileInputStream(file)) {
            final Document[] docs = parser.parseWithLimits(url, mimetype, null, new VocabularyScraper(), 0, inStream, 1000, 10000);

            /* Check the result before reading from it, otherwise a null result ends in a NullPointerException instead of an assertion failure */
            assertNotNull("Parser result must not be null for file " + fileName, docs);
            assertTrue("Parser result must contain at least one document for file " + fileName, docs.length > 0);
            final Document doc = docs[0];

            final String parsedText = doc.getTextString();
            assertNotNull("Parsed text must not be empty for file " + fileName, parsedText);
            assertTrue("Parsed text must contain test word with umlaut char in file " + fileName,
                    parsedText.contains("Maßkrügen"));
            assertEquals("Test anchor must have been parsed for file " + fileName, 1, doc.getAnchors().size());
            assertFalse("Parsed document should not be marked as partially parsed for file " + fileName, doc.isPartiallyParsed());
        }
    }
}
/**
* Test the htmlParser.parseWithLimits() method, with various maxLinks values
* ranging from zero to the exact anchors number contained in the test content.
*
* @throws Exception
* when an unexpected error occurred
*/
@Test
public void testParseWithLimitsOnAnchors() throws Exception {
final AnchorURL url = new AnchorURL("http://localhost/test.html");
final String mimetype = "text/html";
final String charset = StandardCharsets.UTF_8.name();
final StringBuilder testHtml = new StringBuilder("
");
final htmlParser parser = new htmlParser();
for (int maxLinks = 0; maxLinks <= 3; maxLinks++) {
try (InputStream sourceStream = new ByteArrayInputStream(
testHtml.toString().getBytes(StandardCharsets.UTF_8));) {
final Document[] docs = parser.parseWithLimits(url, mimetype, charset, new VocabularyScraper(), 0,
sourceStream, maxLinks, Long.MAX_VALUE);
final Document doc = docs[0];
assertEquals(maxLinks, doc.getAnchors().size());
assertEquals("The parsed document should be marked as partially parsed only when the limit is exceeded",
maxLinks < 3, doc.isPartiallyParsed());
}
}
}
/**
* Test the htmlParser.parseWithLimits() method, with various maxLinks values
* ranging from zero the exact RSS feed links number contained in the test
* content.
*
* @throws Exception
* when an unexpected error occurred
*/
@Test
public void testParseWithLimitsOnRSSFeeds() throws Exception {
final AnchorURL url = new AnchorURL("http://localhost/test.html");
final String mimetype = "text/html";
final String charset = StandardCharsets.UTF_8.name();
final StringBuilder testHtml = new StringBuilder("");
testHtml.append("");
testHtml.append(
"");
testHtml.append(
"");
testHtml.append(
"");
testHtml.append("");
testHtml.append("
HTML test content
");
final htmlParser parser = new htmlParser();
for (int maxLinks = 0; maxLinks <= 3; maxLinks++) {
try (InputStream sourceStream = new ByteArrayInputStream(
testHtml.toString().getBytes(StandardCharsets.UTF_8));) {
final Document[] docs = parser.parseWithLimits(url, mimetype, charset, new VocabularyScraper(), 0,
sourceStream, maxLinks, Long.MAX_VALUE);
final Document doc = docs[0];
assertEquals(maxLinks, doc.getRSS().size());
assertEquals("The parsed document should be marked as partially parsed only when the limit is exceeded",
maxLinks < 3, doc.isPartiallyParsed());
}
}
}
/**
* Test of parseToScraper method, of class htmlParser.
*/
@Test
public void testParseToScraper_4args() throws Exception {
// test link with inline html in text
// expectation to deliver pure text as it is possibly indexed in outboundlinks_anchortext_txt/inboundlinks_anchortext_txt
final AnchorURL url = new AnchorURL("http://localhost/");
final String charset = StandardCharsets.UTF_8.name();
final String testhtml = ""
+ "testtext" // "testtext"
+ " Start" // "Start"
+ "" // "" + image
+ "