Do not write iframe and embed links into the webgraph, but still use them for crawling
pull/1/head
Michael Peter Christen 10 years ago
parent 5b810f6d70
commit d2792a43fd

@ -62,6 +62,7 @@ import net.yacy.cora.util.ByteBuffer;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.retrieval.Request;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.EmbedEntry;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.util.FileUtils;
@ -912,6 +913,8 @@ dc_rights
}
public final static String CANONICAL_MARKER = "canonical";
public final static String IFRAME_MARKER = "iframe";
public final static String EMBED_MARKER = "embed";
public static Map<AnchorURL, String> getHyperlinks(final Document[] documents, boolean includeNofollow) {
final Map<AnchorURL, String> result = new HashMap<>();
@ -932,6 +935,8 @@ dc_rights
if (canonical != null) {
result.put(canonical, CANONICAL_MARKER);
}
for (AnchorURL u: html.getIFrames()) result.put(u, IFRAME_MARKER);
for (AnchorURL u: html.getEmbeds().keySet()) result.put(u, EMBED_MARKER);
}
}
return result;

@ -170,9 +170,9 @@ public class ContentScraper extends AbstractScraper implements Scraper {
// class variables: collectors for links
private final List<AnchorURL> anchors;
private final LinkedHashMap<DigestURL, String> rss, css;
private final LinkedHashMap<DigestURL, EmbedEntry> embeds; // urlhash/embed relation
private final LinkedHashMap<AnchorURL, EmbedEntry> embeds; // urlhash/embed relation
private final List<ImageEntry> images;
private final Set<DigestURL> script, frames, iframes;
private final Set<AnchorURL> script, frames, iframes;
private final Map<String, String> metas;
private final Map<String, DigestURL> hreflang, navigation;
private LinkedHashSet<String> titles;
@ -216,13 +216,13 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.css = new SizeLimitedMap<DigestURL, String>(maxLinks);
this.anchors = new ArrayList<AnchorURL>();
this.images = new ArrayList<ImageEntry>();
this.embeds = new SizeLimitedMap<DigestURL, EmbedEntry>(maxLinks);
this.frames = new SizeLimitedSet<DigestURL>(maxLinks);
this.iframes = new SizeLimitedSet<DigestURL>(maxLinks);
this.embeds = new SizeLimitedMap<AnchorURL, EmbedEntry>(maxLinks);
this.frames = new SizeLimitedSet<AnchorURL>(maxLinks);
this.iframes = new SizeLimitedSet<AnchorURL>(maxLinks);
this.metas = new SizeLimitedMap<String, String>(maxLinks);
this.hreflang = new SizeLimitedMap<String, DigestURL>(maxLinks);
this.navigation = new SizeLimitedMap<String, DigestURL>(maxLinks);
this.script = new SizeLimitedSet<DigestURL>(maxLinks);
this.script = new SizeLimitedSet<AnchorURL>(maxLinks);
this.titles = new LinkedHashSet<String>();
this.headlines = (List<String>[]) Array.newInstance(ArrayList.class, 6);
for (int i = 0; i < this.headlines.length; i++) this.headlines[i] = new ArrayList<String>();
@ -478,7 +478,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
final EmbedEntry ie = new EmbedEntry(url, width, height, tag.opts.getProperty("type", EMPTY_STRING), tag.opts.getProperty("pluginspage", EMPTY_STRING));
this.embeds.put(url, ie);
url.setAll(tag.opts);
this.anchors.add(url);
// this.anchors.add(url); // don't add the embed to the anchors because the webgraph should not contain such links (by definition)
}
}
} catch (final NumberFormatException e) {}
@ -494,7 +494,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
final AnchorURL src = absolutePath(tag.opts.getProperty("src", EMPTY_STRING));
tag.opts.put("src", src.toNormalform(true));
src.setAll(tag.opts);
this.anchors.add(src);
//this.anchors.add(src); // don't add the iframe to the anchors because the webgraph should not contain such links (by definition)
this.iframes.add(src);
this.evaluationScores.match(Element.iframepath, src.toNormalform(true));
} else if (tag.name.equalsIgnoreCase("html")) {
@ -766,17 +766,17 @@ public class ContentScraper extends AbstractScraper implements Scraper {
return this.css;
}
public Set<DigestURL> getFrames() {
public Set<AnchorURL> getFrames() {
// returns the set of frame URLs collected by this scraper (a plain set, not a url/name relation)
return this.frames;
}
public Set<DigestURL> getIFrames() {
public Set<AnchorURL> getIFrames() {
// returns the set of iframe source URLs collected by this scraper (a plain set, not a url/name relation)
return this.iframes;
}
public Set<DigestURL> getScript() {
public Set<AnchorURL> getScript() {
return this.script;
}
@ -804,7 +804,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
return this.images;
}
public Map<DigestURL, EmbedEntry> getEmbeds() {
public Map<AnchorURL, EmbedEntry> getEmbeds() {
return this.embeds;
}

@ -732,10 +732,10 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
// Scripts
if (allAttr || contains(CollectionSchema.scripts_sxt)) {
final Set<DigestURL> scriptss = html.getScript();
final Set<AnchorURL> scriptss = html.getScript();
final String[] scripts = new String[scriptss.size()];
c = 0;
for (final DigestURL u: scriptss) {
for (final AnchorURL u: scriptss) {
inboundLinks.remove(u);
outboundLinks.remove(u);
scripts[c++] = u.toNormalform(false);
@ -746,10 +746,10 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
// Frames
if (allAttr || contains(CollectionSchema.frames_sxt)) {
final Set<DigestURL> framess = html.getFrames();
final Set<AnchorURL> framess = html.getFrames();
final String[] frames = new String[framess.size()];
c = 0;
for (final DigestURL u: framess) {
for (final AnchorURL u: framess) {
inboundLinks.remove(u);
outboundLinks.remove(u);
frames[c++] = u.toNormalform(false);
@ -763,10 +763,10 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
// IFrames
if (allAttr || contains(CollectionSchema.iframes_sxt)) {
final Set<DigestURL> iframess = html.getIFrames();
final Set<AnchorURL> iframess = html.getIFrames();
final String[] iframes = new String[iframess.size()];
c = 0;
for (final DigestURL u: iframess) {
for (final AnchorURL u: iframess) {
inboundLinks.remove(u);
outboundLinks.remove(u);
iframes[c++] = u.toNormalform(false);

Loading…
Cancel
Save