From 4144c7cc5226675a90a31e62d72b91802a3ef47a Mon Sep 17 00:00:00 2001
From: Michael Peter Christen
Date: Tue, 6 Jan 2015 14:14:25 +0100
Subject: [PATCH] do not write frame links to webgraph

---
 source/net/yacy/document/Document.java                   | 3 ++-
 source/net/yacy/document/parser/html/ContentScraper.java | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java
index 59a641ba9..8d2524957 100644
--- a/source/net/yacy/document/Document.java
+++ b/source/net/yacy/document/Document.java
@@ -62,7 +62,6 @@ import net.yacy.cora.util.ByteBuffer;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.crawler.retrieval.Request;
 import net.yacy.document.parser.html.ContentScraper;
-import net.yacy.document.parser.html.EmbedEntry;
 import net.yacy.document.parser.html.ImageEntry;
 import net.yacy.kelondro.util.FileUtils;
 
@@ -914,6 +913,7 @@ dc_rights
 
     public final static String CANONICAL_MARKER = "canonical";
     public final static String IFRAME_MARKER = "iframe";
+    public final static String FRAME_MARKER = "frame";
     public final static String EMBED_MARKER = "embed";
 
     public static Map getHyperlinks(final Document[] documents, boolean includeNofollow) {
@@ -935,6 +935,7 @@ dc_rights
                 if (canonical != null) {
                     result.put(canonical, CANONICAL_MARKER);
                 }
+                for (AnchorURL u: html.getFrames()) result.put(u, FRAME_MARKER);
                 for (AnchorURL u: html.getIFrames()) result.put(u, IFRAME_MARKER);
                 for (AnchorURL u: html.getEmbeds().keySet()) result.put(u, EMBED_MARKER);
             }
diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java
index 18ffdd6af..02ee617b4 100644
--- a/source/net/yacy/document/parser/html/ContentScraper.java
+++ b/source/net/yacy/document/parser/html/ContentScraper.java
@@ -388,7 +388,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
                 final AnchorURL src = absolutePath(tag.opts.getProperty("src", EMPTY_STRING));
                 tag.opts.put("src", src.toNormalform(true));
                 src.setAll(tag.opts);
-                this.anchors.add(src);
+                //this.anchors.add(src); // don't add the frame to the anchors because the webgraph should not contain such links (by definition)
                 this.frames.add(src);
                 this.evaluationScores.match(Element.framepath, src.toNormalform(true));
             } else if (tag.name.equalsIgnoreCase("body")) {