diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index 8f9833da4..b4a657672 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -520,7 +520,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { if (rel.length() == 0) rel = "nofollow"; else if (rel.indexOf("nofollow") < 0) rel += ",nofollow"; tag.opts.put("rel", rel); } - tag.opts.put("text", new String(tag.content.getChars())); + tag.opts.put("text", stripAllTags(tag.content.getChars())); // strip any inline html in tag text like " test " tag.opts.put("href", url.toNormalform(true)); // we must assign this because the url may have resolved backpaths and may not be absolute url.setAll(tag.opts); recursiveParse(url, tag.content.getChars()); diff --git a/test/net/yacy/cora/federate/solr/connector/EmbeddedSolrConnectorTest.java b/test/net/yacy/cora/federate/solr/connector/EmbeddedSolrConnectorTest.java index 5f6c0b66f..37b7899c0 100644 --- a/test/net/yacy/cora/federate/solr/connector/EmbeddedSolrConnectorTest.java +++ b/test/net/yacy/cora/federate/solr/connector/EmbeddedSolrConnectorTest.java @@ -5,8 +5,6 @@ import java.io.IOException; import net.yacy.cora.federate.solr.instance.EmbeddedInstance; import net.yacy.search.schema.CollectionSchema; import net.yacy.search.schema.WebgraphSchema; -import org.apache.solr.common.SolrDocumentList; -import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrInputDocument; import org.junit.After; import org.junit.Before; @@ -57,12 +55,8 @@ public class EmbeddedSolrConnectorTest { System.out.println("query solr"); long expResult = 1; - SolrDocumentList result; - try { - result = solr.getDocumentListByQuery(CollectionSchema.text_t.name() + ":tempor", 0, 10,""); - assertEquals(expResult, result.getNumFound()); - } catch (final IOException ex) { - fail("Solr query no result"); - } 
+ long result = solr.getCountByQuery(CollectionSchema.text_t.name() + ":tempor"); + System.out.println("found = " + result + " (expected = 1 )"); + assertEquals(expResult, result); } } diff --git a/test/net/yacy/document/parser/htmlParserTest.java b/test/net/yacy/document/parser/htmlParserTest.java index 7df23d74e..9c0fafd93 100644 --- a/test/net/yacy/document/parser/htmlParserTest.java +++ b/test/net/yacy/document/parser/htmlParserTest.java @@ -5,11 +5,15 @@ import java.io.FileInputStream; import java.io.FileNotFoundException; import java.net.MalformedURLException; import java.nio.charset.Charset; +import java.util.List; +import static junit.framework.Assert.assertEquals; import static junit.framework.Assert.assertTrue; import junit.framework.TestCase; import net.yacy.cora.document.id.AnchorURL; import net.yacy.document.Document; import net.yacy.document.Parser; +import net.yacy.document.parser.html.ContentScraper; +import static net.yacy.document.parser.htmlParser.parseToScraper; import org.junit.Test; public class htmlParserTest extends TestCase { @@ -80,4 +84,35 @@ public class htmlParserTest extends TestCase { } } + + /** + * Test of parseToScraper method, of class htmlParser. 
+     */
+    @Test
+    public void testParseToScraper_4args() throws Exception {
+        // Links whose anchor text contains inline html: the scraper is expected to deliver pure text,
+        // as it is possibly indexed in outboundlinks_anchortext_txt/inboundlinks_anchortext_txt.
+        final AnchorURL url = new AnchorURL("http://localhost/");
+        final String mimetype = "text/html";
+        final String testhtml = "<html><body>"
+                + "<a href='x1.html'><span>testtext</span></a>" // "testtext"
+                + "<a href=\"http://localhost/x2.html\"> <i>Start</i></a>" // "Start"
+                + "<a href='x3.html'><img src=\"http://localhost/test.png\"/></a>" // "" + image
+                + "</body></html>";
+        // NOTE(review): the markup in testhtml was reconstructed from the assertions below - confirm against upstream.
+        final ContentScraper scraper = parseToScraper(url, mimetype, testhtml, 10);
+        final List<AnchorURL> anchorlist = scraper.getAnchors(); // typed list: elements expose getTextProperty()
+        // first anchor: the inline span is expected to be stripped from the link text
+        String linktxt = anchorlist.get(0).getTextProperty();
+        assertEquals("testtext", linktxt);
+        // second anchor: expected text carries neither the inline tag nor the leading blank
+        linktxt = anchorlist.get(1).getTextProperty();
+        assertEquals("Start", linktxt);
+        // third anchor: a link that wraps only an image is expected to have empty text
+        linktxt = anchorlist.get(2).getTextProperty();
+        assertEquals("", linktxt);
+        // the image nested inside the third anchor is still expected among the scraped images
+        final int cnt = scraper.getImages().size();
+        assertEquals(1, cnt);
+    }
 }