Fixed scraper NullPointerException cases on malformed URLs.

8 years ago · 306a82dd71
parent aa55d71cf5
commit 306a82dd71
1 changed file with 12 additions and 8 deletions
--- a/source/net/yacy/document/parser/html/ContentScraper.java
+++ b/source/net/yacy/document/parser/html/ContentScraper.java
@@ -630,17 +630,21 @@ public class ContentScraper extends AbstractScraper implements Scraper {
            final String name = tag.opts.getProperty("name", EMPTY_STRING);
            if (name.equalsIgnoreCase("movie")) {
                AnchorURL url = absolutePath(tag.opts.getProperty("value", EMPTY_STRING));
-                tag.opts.put("value", url.toNormalform(true));
-                url.setAll(tag.opts);
-                this.addAnchor(url);
+                if(url != null) {
+                	tag.opts.put("value", url.toNormalform(true));
+                	url.setAll(tag.opts);
+                	this.addAnchor(url);
+                }
            }
        } else if (tag.name.equalsIgnoreCase("iframe")) {
            final AnchorURL src = absolutePath(tag.opts.getProperty("src", EMPTY_STRING));
-            tag.opts.put("src", src.toNormalform(true));
-            src.setAll(tag.opts);
-            //this.addAnchor(src); // don't add the iframe to the anchors because the webgraph should not contain such links (by definition)
-            this.iframes.add(src);
-            this.evaluationScores.match(Element.iframepath, src.toNormalform(true));
+            if(src != null) {
+            	tag.opts.put("src", src.toNormalform(true));
+            	src.setAll(tag.opts);
+            	// this.addAnchor(src); // don't add the iframe to the anchors because the webgraph should not contain such links (by definition)
+            	this.iframes.add(src);
+            	this.evaluationScores.match(Element.iframepath, src.toNormalform(true));
+            }
        } else if (tag.name.equalsIgnoreCase("html")) {
            final String lang = tag.opts.getProperty("lang", EMPTY_STRING);
            if (!lang.isEmpty()) // fake a language meta to preserv detection from <html lang="xx" />