exclude html tags from inboundlinks_anchortext_txt/outboundlinks_anchortext_txt parsed text

- Some outboundlinks_anchortext_txt values in the index contain inline markup, e.g. <span>text</span> or further tags; all tags are now removed from the anchor's text property (inline img tags are still parsed). - Added a test case for the above (to htmlParserTest). - Fixed the Solr test case.
11 years ago · 86f6975edc
parent 469e0a62f1
commit 86f6975edc
3 changed files with 39 additions and 10 deletions
--- a/source/net/yacy/document/parser/html/ContentScraper.java
+++ b/source/net/yacy/document/parser/html/ContentScraper.java
@@ -520,7 +520,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
                        if (rel.length() == 0) rel = "nofollow"; else if (rel.indexOf("nofollow") < 0) rel += ",nofollow"; 
                        tag.opts.put("rel", rel);
                    }
-                    tag.opts.put("text", new String(tag.content.getChars()));
+                    tag.opts.put("text", stripAllTags(tag.content.getChars())); // strip any inline html in tag text like  "<a ...> <span>test</span> </a>"
                    tag.opts.put("href", url.toNormalform(true)); // we must assign this because the url may have resolved backpaths and may not be absolute
                    url.setAll(tag.opts);
                    recursiveParse(url, tag.content.getChars());
--- a/test/net/yacy/cora/federate/solr/connector/EmbeddedSolrConnectorTest.java
+++ b/test/net/yacy/cora/federate/solr/connector/EmbeddedSolrConnectorTest.java
@@ -5,8 +5,6 @@ import java.io.IOException;
 import net.yacy.cora.federate.solr.instance.EmbeddedInstance;
 import net.yacy.search.schema.CollectionSchema;
 import net.yacy.search.schema.WebgraphSchema;
-import org.apache.solr.common.SolrDocumentList;
-import org.apache.solr.common.SolrException;
 import org.apache.solr.common.SolrInputDocument;
 import org.junit.After;
 import org.junit.Before;
@@ -57,12 +55,8 @@ public class EmbeddedSolrConnectorTest {

        System.out.println("query solr");
        long expResult = 1;
-        SolrDocumentList result;
-        try {
-            result = solr.getDocumentListByQuery(CollectionSchema.text_t.name() + ":tempor", 0, 10,"");
-            assertEquals(expResult, result.getNumFound());
-        } catch (final IOException ex) {
-            fail("Solr query no result");
-        }
+        long result = solr.getCountByQuery(CollectionSchema.text_t.name() + ":tempor");
+        System.out.println("found = " + result + " (expected = 1 )");
+        assertEquals(expResult, result);
    }
 }
--- a/test/net/yacy/document/parser/htmlParserTest.java
+++ b/test/net/yacy/document/parser/htmlParserTest.java
@@ -5,11 +5,15 @@ import java.io.FileInputStream;
 import java.io.FileNotFoundException;
 import java.net.MalformedURLException;
 import java.nio.charset.Charset;
+import java.util.List;
+import static junit.framework.Assert.assertEquals;
 import static junit.framework.Assert.assertTrue;
 import junit.framework.TestCase;
 import net.yacy.cora.document.id.AnchorURL;
 import net.yacy.document.Document;
 import net.yacy.document.Parser;
+import net.yacy.document.parser.html.ContentScraper;
+import static net.yacy.document.parser.htmlParser.parseToScraper;
 import org.junit.Test;

 public class htmlParserTest extends TestCase {
@@ -80,4 +84,35 @@ public class htmlParserTest extends TestCase {

        }
    }
+
+    /**
+     * Checks that htmlParser.parseToScraper yields tag-free anchor text while an img nested in an anchor is still reported as an image.
+     */
+    @Test
+    public void testParseToScraper_4args() throws Exception {
+        // anchors whose link text contains inline html (span, i, img)
+        // expected: the text property is pure text, as it may be indexed in outboundlinks_anchortext_txt/inboundlinks_anchortext_txt
+        final AnchorURL url = new AnchorURL("http://localhost/");
+        final String mimetype = "text/html";
+        final String testhtml = "<html><body>"
+                + "<a href='x1.html'><span>testtext</span></a>" // expected text: "testtext"
+                + "<a href=\"http://localhost/x2.html\">   <i id=\"home-icon\" class=\"img-sprite\"></i>Start</a>" // expected text: "Start"
+                + "<a href='x1.html'><span class='button'><img src='pic.gif'/></span></a>" // expected text: "" plus one scraped image
+                + "</body></html>";
+
+        ContentScraper scraper = parseToScraper(url, mimetype, testhtml, 10);
+        List<AnchorURL> anchorlist = scraper.getAnchors();
+
+        String linktxt = anchorlist.get(0).getTextProperty();
+        assertEquals("testtext", linktxt);
+
+        linktxt = anchorlist.get(1).getTextProperty();
+        assertEquals("Start", linktxt);
+
+        linktxt = anchorlist.get(2).getTextProperty();
+        assertEquals("", linktxt);
+
+        int cnt = scraper.getImages().size();
+        assertEquals(1,cnt);
+    }
 }