diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index 83b75d9ad..602f1c535 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -274,10 +274,10 @@ public class ContentScraper extends AbstractScraper implements Scraper { public void scrapeTag0(final String tagname, final Properties tagopts) { if (tagname.equalsIgnoreCase("img")) { + String src = tagopts.getProperty("src", ""); try { final int width = Integer.parseInt(tagopts.getProperty("width", "-1")); final int height = Integer.parseInt(tagopts.getProperty("height", "-1")); - String src = tagopts.getProperty("src", ""); if (src.length() > 0) { final MultiProtocolURI url = absolutePath(src); if (url != null) { @@ -286,6 +286,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { } } } catch (final NumberFormatException e) {} + Evaluation.match(Element.imgpath, src, this.evaluationScores); } else if(tagname.equalsIgnoreCase("base")) { try { root = new MultiProtocolURI(tagopts.getProperty("href", "")); @@ -293,9 +294,6 @@ public class ContentScraper extends AbstractScraper implements Scraper { } else if (tagname.equalsIgnoreCase("frame")) { anchors.put(absolutePath(tagopts.getProperty("src", "")), tagopts /* with property "name" */); frames.add(absolutePath(tagopts.getProperty("src", ""))); - } else if (tagname.equalsIgnoreCase("iframe")) { - anchors.put(absolutePath(tagopts.getProperty("src", "")), tagopts /* with property "name" */); - iframes.add(absolutePath(tagopts.getProperty("src", ""))); } else if (tagname.equalsIgnoreCase("body")) { String c = tagopts.getProperty("class", ""); Evaluation.match(Element.bodyclass, c, this.evaluationScores); @@ -376,6 +374,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { anchors.put(url, tagopts); } } + Evaluation.match(Element.apath, href, this.evaluationScores); } final String h; if ((tagname.equalsIgnoreCase("h1")) && (text.length < 1024)) { @@ -410,6 +409,11 @@ public class ContentScraper extends AbstractScraper implements Scraper { } else if ((tagname.equalsIgnoreCase("li")) && (text.length < 1024)) { h = recursiveParse(text); if (h.length() > 0) li.add(h); + } else if (tagname.equalsIgnoreCase("iframe")) { + String src = tagopts.getProperty("src", ""); + anchors.put(absolutePath(src), tagopts /* with property "name" */); + iframes.add(absolutePath(src)); + Evaluation.match(Element.iframepath, src, this.evaluationScores); } else if (tagname.equalsIgnoreCase("script")) { String src = tagopts.getProperty("src", ""); if (src.length() > 0) { diff --git a/source/net/yacy/document/parser/html/Evaluation.java b/source/net/yacy/document/parser/html/Evaluation.java index 431b6ec58..8dbe441bb 100644 --- a/source/net/yacy/document/parser/html/Evaluation.java +++ b/source/net/yacy/document/parser/html/Evaluation.java @@ -68,6 +68,9 @@ public class Evaluation { url, scriptpath, scriptcode, + iframepath, + imgpath, + apath, comment; }