|
|
|
@ -5,11 +5,15 @@ import java.io.FileInputStream;
|
|
|
|
|
import java.io.FileNotFoundException;
|
|
|
|
|
import java.net.MalformedURLException;
|
|
|
|
|
import java.nio.charset.Charset;
|
|
|
|
|
import java.util.List;
|
|
|
|
|
import static junit.framework.Assert.assertEquals;
|
|
|
|
|
import static junit.framework.Assert.assertTrue;
|
|
|
|
|
import junit.framework.TestCase;
|
|
|
|
|
import net.yacy.cora.document.id.AnchorURL;
|
|
|
|
|
import net.yacy.document.Document;
|
|
|
|
|
import net.yacy.document.Parser;
|
|
|
|
|
import net.yacy.document.parser.html.ContentScraper;
|
|
|
|
|
import static net.yacy.document.parser.htmlParser.parseToScraper;
|
|
|
|
|
import org.junit.Test;
|
|
|
|
|
|
|
|
|
|
public class htmlParserTest extends TestCase {
|
|
|
|
@ -80,4 +84,35 @@ public class htmlParserTest extends TestCase {
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Test of parseToScraper method, of class htmlParser.
|
|
|
|
|
*/
|
|
|
|
|
@Test
|
|
|
|
|
public void testParseToScraper_4args() throws Exception {
|
|
|
|
|
// test link with inline html in text
|
|
|
|
|
// expectation to deliver pure text as it is possibly indexed in outboundlinks_anchortext_txt/inboundlinks_anchortext_txt
|
|
|
|
|
final AnchorURL url = new AnchorURL("http://localhost/");
|
|
|
|
|
final String mimetype = "text/html";
|
|
|
|
|
final String testhtml = "<html><bod>"
|
|
|
|
|
+ "<a href='x1.html'><span>testtext</span></a>" // "testtext"
|
|
|
|
|
+ "<a href=\"http://localhost/x2.html\"> <i id=\"home-icon\" class=\"img-sprite\"></i>Start</a>" // "Start"
|
|
|
|
|
+ "<a href='x1.html'><span class='button'><img src='pic.gif'/></span></a>" // "" + image
|
|
|
|
|
+ "</body></html>";
|
|
|
|
|
|
|
|
|
|
ContentScraper scraper = parseToScraper(url, mimetype, testhtml, 10);
|
|
|
|
|
List<AnchorURL> anchorlist = scraper.getAnchors();
|
|
|
|
|
|
|
|
|
|
String linktxt = anchorlist.get(0).getTextProperty();
|
|
|
|
|
assertEquals("testtext", linktxt);
|
|
|
|
|
|
|
|
|
|
linktxt = anchorlist.get(1).getTextProperty();
|
|
|
|
|
assertEquals("Start", linktxt);
|
|
|
|
|
|
|
|
|
|
linktxt = anchorlist.get(2).getTextProperty();
|
|
|
|
|
assertEquals("", linktxt);
|
|
|
|
|
|
|
|
|
|
int cnt = scraper.getImages().size();
|
|
|
|
|
assertEquals(1,cnt);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|