Fix ContentScraper image height/width parsing

Prevent a NumberFormatException on common attribute values such as "100px".

- Add this case to the test case.
pull/1/head
reger 11 years ago
parent 4e57000a40
commit 0b6db04e40

@ -31,6 +31,7 @@ import java.io.IOException;
import java.io.Writer;
import java.net.MalformedURLException;
import java.nio.charset.Charset;
import java.text.NumberFormat;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Date;
@ -366,13 +367,15 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if (src.length() > 0) {
final AnchorURL url = absolutePath(src);
if (url != null) {
final int width = Integer.parseInt(tag.opts.getProperty("width", "-1"));
final int height = Integer.parseInt(tag.opts.getProperty("height", "-1"));
// use Numberformat.parse to allow parse of "550px"
NumberFormat intnum = NumberFormat.getIntegerInstance ();
final int width = intnum.parse(tag.opts.getProperty("width", "-1")).intValue(); // Integer.parseInt fails on "200px"
final int height = intnum.parse(tag.opts.getProperty("height", "-1")).intValue();
final ImageEntry ie = new ImageEntry(url, tag.opts.getProperty("alt", EMPTY_STRING), width, height, -1);
this.images.add(ie);
}
}
} catch (final NumberFormatException e) {}
} catch (final ParseException e) {}
this.evaluationScores.match(Element.imgpath, src);
} else if(tag.name.equalsIgnoreCase("base")) {
try {

@ -10,7 +10,7 @@ public class DigestURLTest extends TestCase {
public void testIdentPort() throws MalformedURLException {
String[][] testStrings = new String[][]{
new String[]{"http://www.yacy.net:", "http://www.yacy.net/"},
new String[]{"http://www.yacy.net:-1", "http://www.yacy.net/"},
new String[]{"http://www.yacy.net:80", "http://www.yacy.net/"},
new String[]{"http://www.yacy.net:/", "http://www.yacy.net/"},
new String[]{"http://www.yacy.net: /", "http://www.yacy.net/"}
};

@ -13,6 +13,7 @@ import net.yacy.cora.document.id.AnchorURL;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.ImageEntry;
import static net.yacy.document.parser.htmlParser.parseToScraper;
import org.junit.Test;
@ -94,10 +95,11 @@ public class htmlParserTest extends TestCase {
// expectation to deliver pure text as it is possibly indexed in outboundlinks_anchortext_txt/inboundlinks_anchortext_txt
final AnchorURL url = new AnchorURL("http://localhost/");
final String mimetype = "text/html";
final String testhtml = "<html><bod>"
final String testhtml = "<html><body>"
+ "<a href='x1.html'><span>testtext</span></a>" // "testtext"
+ "<a href=\"http://localhost/x2.html\"> <i id=\"home-icon\" class=\"img-sprite\"></i>Start</a>" // "Start"
+ "<a href='x1.html'><span class='button'><img src='pic.gif'/></span></a>" // "" + image
+ "<figure><img width=\"550px\" title=\"image as exemple\" alt=\"image as exemple\" src=\"./img/my_image.png\"></figrue>" // + img width 550 (+html5 figure)
+ "</body></html>";
ContentScraper scraper = parseToScraper(url, mimetype, testhtml, 10);
@ -113,6 +115,8 @@ public class htmlParserTest extends TestCase {
assertEquals("", linktxt);
int cnt = scraper.getImages().size();
assertEquals(1,cnt);
assertEquals(2,cnt);
ImageEntry img = scraper.getImages().get(1);
assertEquals(550,img.width());
}
}

Loading…
Cancel
Save