exclude html tags in in/outboundlinks_anchortext_txt parsed text

- some outboundlinks_anchortext_txt values in the index contain inline markup, e.g. <span>text</span> or more tags;
remove all tags from the anchor's text property (inline img tags are still parsed)
- added test case for above (to htmlParserTest)
- fix solr test case
pull/1/head
reger 11 years ago
parent 469e0a62f1
commit 86f6975edc

@ -520,7 +520,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if (rel.length() == 0) rel = "nofollow"; else if (rel.indexOf("nofollow") < 0) rel += ",nofollow";
tag.opts.put("rel", rel);
}
tag.opts.put("text", new String(tag.content.getChars()));
tag.opts.put("text", stripAllTags(tag.content.getChars())); // strip any inline html in tag text like "<a ...> <span>test</span> </a>"
tag.opts.put("href", url.toNormalform(true)); // we must assign this because the url may have resolved backpaths and may not be absolute
url.setAll(tag.opts);
recursiveParse(url, tag.content.getChars());

@ -5,8 +5,6 @@ import java.io.IOException;
import net.yacy.cora.federate.solr.instance.EmbeddedInstance;
import net.yacy.search.schema.CollectionSchema;
import net.yacy.search.schema.WebgraphSchema;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.junit.After;
import org.junit.Before;
@ -57,12 +55,8 @@ public class EmbeddedSolrConnectorTest {
System.out.println("query solr");
long expResult = 1;
SolrDocumentList result;
try {
result = solr.getDocumentListByQuery(CollectionSchema.text_t.name() + ":tempor", 0, 10,"");
assertEquals(expResult, result.getNumFound());
} catch (final IOException ex) {
fail("Solr query no result");
}
long result = solr.getCountByQuery(CollectionSchema.text_t.name() + ":tempor");
System.out.println("found = " + result + " (expected = 1 )");
assertEquals(expResult, result);
}
}

@ -5,11 +5,15 @@ import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.net.MalformedURLException;
import java.nio.charset.Charset;
import java.util.List;
import static junit.framework.Assert.assertEquals;
import static junit.framework.Assert.assertTrue;
import junit.framework.TestCase;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.parser.html.ContentScraper;
import static net.yacy.document.parser.htmlParser.parseToScraper;
import org.junit.Test;
public class htmlParserTest extends TestCase {
@ -80,4 +84,35 @@ public class htmlParserTest extends TestCase {
}
}
/**
 * Verifies that the text property of scraped anchors contains no inline markup.
 * <p>
 * A small document with three links is parsed via {@code parseToScraper}; each
 * link wraps its label in inline tags. The test checks the text property of the
 * anchors returned by {@code getAnchors()} and that the image nested inside the
 * third link is reported by {@code getImages()}.
 *
 * @throws Exception if building the test URL or parsing the fixture fails
 */
@Test
public void testParseToScraper_4args() throws Exception {
    // test link with inline html in text
    // expectation to deliver pure text as it is possibly indexed in outboundlinks_anchortext_txt/inboundlinks_anchortext_txt
    final AnchorURL url = new AnchorURL("http://localhost/");
    final String mimetype = "text/html";
    // NOTE(review): "<bod>" looks like a typo for "<body>" (the closing tag is "</body>");
    // left untouched here because changing the fixture may change parser behaviour - confirm.
    final String testhtml = "<html><bod>"
            + "<a href='x1.html'><span>testtext</span></a>" // "testtext"
            + "<a href=\"http://localhost/x2.html\"> <i id=\"home-icon\" class=\"img-sprite\"></i>Start</a>" // "Start"
            + "<a href='x1.html'><span class='button'><img src='pic.gif'/></span></a>" // "" + image
            + "</body></html>";

    final ContentScraper scraper = parseToScraper(url, mimetype, testhtml, 10);
    final List<AnchorURL> anchorlist = scraper.getAnchors();

    // guard the index accesses below so a missing link is reported as a
    // test failure with a readable message instead of an IndexOutOfBoundsException
    assertTrue("expected at least 3 anchors but got " + anchorlist.size(), anchorlist.size() >= 3);

    // <span> wrapper must be stripped
    assertEquals("text of 1st link", "testtext", anchorlist.get(0).getTextProperty());

    // empty <i> element and leading blank must not end up in the text
    assertEquals("text of 2nd link", "Start", anchorlist.get(1).getTextProperty());

    // link consisting only of markup yields an empty text ...
    assertEquals("text of 3rd link", "", anchorlist.get(2).getTextProperty());

    // ... but the inline image is still picked up
    assertEquals("number of images", 1, scraper.getImages().size());
}
}

Loading…
Cancel
Save