Fix ContentScraper img height/width parsing

Prevent a NumberFormatException on common attribute values such as "100px", and add a test case that covers this.
11 years ago · 0b6db04e40
parent 4e57000a40
commit 0b6db04e40
3 changed files with 13 additions and 6 deletions
--- a/source/net/yacy/document/parser/html/ContentScraper.java
+++ b/source/net/yacy/document/parser/html/ContentScraper.java
@ -31,6 +31,7 @@ import java.io.IOException;
 import java.io.Writer;
 import java.net.MalformedURLException;
 import java.nio.charset.Charset;
+import java.text.NumberFormat;
 import java.text.ParseException;
 import java.util.ArrayList;
 import java.util.Date;
@ -366,13 +367,15 @@ public class ContentScraper extends AbstractScraper implements Scraper {
                if (src.length() > 0) {
                    final AnchorURL url = absolutePath(src);
                    if (url != null) {
-                        final int width = Integer.parseInt(tag.opts.getProperty("width", "-1"));
-                        final int height = Integer.parseInt(tag.opts.getProperty("height", "-1"));
+                        // use Numberformat.parse to allow parse of "550px"
+                        NumberFormat intnum = NumberFormat.getIntegerInstance ();
+                        final int width = intnum.parse(tag.opts.getProperty("width", "-1")).intValue(); // Integer.parseInt fails on "200px"
+                        final int height = intnum.parse(tag.opts.getProperty("height", "-1")).intValue();
                        final ImageEntry ie = new ImageEntry(url, tag.opts.getProperty("alt", EMPTY_STRING), width, height, -1);
                        this.images.add(ie);
                    }
                }
-            } catch (final NumberFormatException e) {}
+            } catch (final ParseException e) {}
            this.evaluationScores.match(Element.imgpath, src);
        } else if(tag.name.equalsIgnoreCase("base")) {
            try {
--- a/test/net/yacy/cora/document/id/DigestURLTest.java
+++ b/test/net/yacy/cora/document/id/DigestURLTest.java
@ -10,7 +10,7 @@ public class DigestURLTest extends TestCase {
    public void testIdentPort() throws MalformedURLException {
        String[][] testStrings = new String[][]{
            new String[]{"http://www.yacy.net:", "http://www.yacy.net/"},
-            new String[]{"http://www.yacy.net:-1", "http://www.yacy.net/"},
+            new String[]{"http://www.yacy.net:80", "http://www.yacy.net/"},
            new String[]{"http://www.yacy.net:/", "http://www.yacy.net/"},
            new String[]{"http://www.yacy.net: /", "http://www.yacy.net/"}
        };
--- a/test/net/yacy/document/parser/htmlParserTest.java
+++ b/test/net/yacy/document/parser/htmlParserTest.java
@ -13,6 +13,7 @@ import net.yacy.cora.document.id.AnchorURL;
 import net.yacy.document.Document;
 import net.yacy.document.Parser;
 import net.yacy.document.parser.html.ContentScraper;
+import net.yacy.document.parser.html.ImageEntry;
 import static net.yacy.document.parser.htmlParser.parseToScraper;
 import org.junit.Test;

@ -94,10 +95,11 @@ public class htmlParserTest extends TestCase {
        // expectation to deliver pure text as it is possibly indexed in outboundlinks_anchortext_txt/inboundlinks_anchortext_txt
        final AnchorURL url = new AnchorURL("http://localhost/");
        final String mimetype = "text/html";
-        final String testhtml = "<html><bod>"
+        final String testhtml = "<html><body>"
                + "<a href='x1.html'><span>testtext</span></a>" // "testtext"
                + "<a href=\"http://localhost/x2.html\">   <i id=\"home-icon\" class=\"img-sprite\"></i>Start</a>" // "Start"
                + "<a href='x1.html'><span class='button'><img src='pic.gif'/></span></a>" // ""  + image
+                + "<figure><img width=\"550px\" title=\"image as exemple\" alt=\"image as exemple\" src=\"./img/my_image.png\"></figrue>" // + img width 550 (+html5 figure)
                + "</body></html>";

        ContentScraper scraper = parseToScraper(url, mimetype, testhtml, 10);
@ -113,6 +115,8 @@ public class htmlParserTest extends TestCase {
        assertEquals("", linktxt);

        int cnt = scraper.getImages().size();
-        assertEquals(1,cnt);
+        assertEquals(2,cnt);
+        ImageEntry img = scraper.getImages().get(1);
+        assertEquals(550,img.width());
    }
 }