fix for image alt attachment to AnchorURLs in html parser.

pull/1/head
Michael Peter Christen 11 years ago
parent 3dcfc717eb
commit 98f45c9032

@ -327,7 +327,7 @@ public class ViewFile {
i += putMediaInfo(prop, wordArray, i, document.getAudiolinks(), "audio", (i % 2 == 0)); i += putMediaInfo(prop, wordArray, i, document.getAudiolinks(), "audio", (i % 2 == 0));
dark = (i % 2 == 0); dark = (i % 2 == 0);
final Map<AnchorURL, ImageEntry> ts = document.getImages(); final Map<DigestURL, ImageEntry> ts = document.getImages();
final Iterator<ImageEntry> tsi = ts.values().iterator(); final Iterator<ImageEntry> tsi = ts.values().iterator();
ImageEntry entry; ImageEntry entry;
while (tsi.hasNext()) { while (tsi.hasNext()) {

@ -28,46 +28,68 @@ public class AnchorURL extends DigestURL {
private static final long serialVersionUID = 1586579902179962086L; private static final long serialVersionUID = 1586579902179962086L;
private String nameProperty, textProperty, relProperty, hrefProperty; // may contain additional url properties, such as given in html a href-links private String nameProperty, relProperty, hrefProperty, textBody; // may contain additional url properties, such as given in html a href-links
private DigestURL imageURL; // in case that the anchor contains an image link, store image url; if there is no image then set this to null
private String imageAlt; // in case that the anchor contains an image link, store the alt property; if there is no image then set this to null
public AnchorURL(final String url) throws MalformedURLException { public AnchorURL(final String url) throws MalformedURLException {
super(url); super(url);
this.textBody = "";
this.nameProperty = ""; this.nameProperty = "";
this.textProperty = "";
this.relProperty = ""; this.relProperty = "";
this.hrefProperty = ""; this.hrefProperty = "";
this.imageURL = null;
this.imageAlt = null;
} }
public AnchorURL(final AnchorURL url) { public AnchorURL(final AnchorURL url) {
super(url, url.hash()); super(url, url.hash());
this.textBody = url.textBody;
this.nameProperty = url.nameProperty; this.nameProperty = url.nameProperty;
this.textProperty = url.textProperty;
this.relProperty = url.relProperty; this.relProperty = url.relProperty;
this.hrefProperty = url.hrefProperty; this.hrefProperty = url.hrefProperty;
this.imageURL = url.imageURL;
this.imageAlt = url.imageAlt;
} }
public AnchorURL(final DigestURL url) { public AnchorURL(final DigestURL url) {
super(url, url.hash()); super(url, url.hash());
this.textBody = "";
this.nameProperty = ""; this.nameProperty = "";
this.textProperty = "";
this.relProperty = ""; this.relProperty = "";
this.hrefProperty = ""; this.hrefProperty = "";
this.imageURL = null;
this.imageAlt = null;
} }
public AnchorURL(final MultiProtocolURL baseURL, final String relPath) throws MalformedURLException { public AnchorURL(final MultiProtocolURL baseURL, final String relPath) throws MalformedURLException {
super(baseURL, relPath); super(baseURL, relPath);
this.textBody = "";
this.nameProperty = ""; this.nameProperty = "";
this.textProperty = "";
this.relProperty = ""; this.relProperty = "";
this.hrefProperty = ""; this.hrefProperty = "";
this.imageURL = null;
this.imageAlt = null;
} }
public AnchorURL(final String protocol, final String host, final int port, final String path) throws MalformedURLException { public AnchorURL(final String protocol, final String host, final int port, final String path) throws MalformedURLException {
super(protocol, host, port, path); super(protocol, host, port, path);
this.textBody = "";
this.nameProperty = ""; this.nameProperty = "";
this.textProperty = "";
this.relProperty = ""; this.relProperty = "";
this.hrefProperty = ""; this.hrefProperty = "";
this.imageURL = null;
this.imageAlt = null;
}
public AnchorURL(final String protocol, final String host, final int port, final String path, final DigestURL imageURL, final String imageAlt) throws MalformedURLException {
super(protocol, host, port, path);
this.textBody = "";
this.nameProperty = "";
this.relProperty = "";
this.hrefProperty = "";
this.imageURL = imageURL;
this.imageAlt = imageAlt;
} }
public static AnchorURL newAnchor(final DigestURL baseURL, String relPath) throws MalformedURLException { public static AnchorURL newAnchor(final DigestURL baseURL, String relPath) throws MalformedURLException {
@ -96,11 +118,11 @@ public class AnchorURL extends DigestURL {
} }
public String getTextProperty() { public String getTextProperty() {
return textProperty; return textBody;
} }
public void setTextProperty(String text) { public void setTextProperty(String text) {
this.textProperty = text; this.textBody = text;
} }
public String getRelProperty() { public String getRelProperty() {
@ -119,9 +141,25 @@ public class AnchorURL extends DigestURL {
this.hrefProperty = href; this.hrefProperty = href;
} }
public DigestURL getImageURL() {
return imageURL;
}
public void setImageURL(DigestURL imageURL) {
this.imageURL = imageURL;
}
public String getImageAlt() {
return imageAlt;
}
public void setImageAlt(String imageAlt) {
this.imageAlt = imageAlt;
}
public void setAll(final Properties tagopts) { public void setAll(final Properties tagopts) {
this.nameProperty = tagopts.getProperty("name", ""); this.nameProperty = tagopts.getProperty("name", "");
this.textProperty = tagopts.getProperty("text", ""); this.textBody = tagopts.getProperty("text", "");
this.relProperty = tagopts.getProperty("rel", ""); this.relProperty = tagopts.getProperty("rel", "");
this.hrefProperty = tagopts.getProperty("href", ""); this.hrefProperty = tagopts.getProperty("href", "");
} }
@ -129,7 +167,7 @@ public class AnchorURL extends DigestURL {
public Properties getAll() { public Properties getAll() {
final Properties tagopts = new Properties(); final Properties tagopts = new Properties();
tagopts.setProperty("name", this.nameProperty); tagopts.setProperty("name", this.nameProperty);
tagopts.setProperty("text", this.textProperty); tagopts.setProperty("text", this.textBody);
tagopts.setProperty("rel", this.relProperty); tagopts.setProperty("rel", this.relProperty);
tagopts.setProperty("href", this.hrefProperty); tagopts.setProperty("href", this.hrefProperty);
return tagopts; return tagopts;
@ -143,7 +181,7 @@ public class AnchorURL extends DigestURL {
return "<a href=\"" + this.toNormalform(false) + "\"" + return "<a href=\"" + this.toNormalform(false) + "\"" +
(this.nameProperty.length() > 0 ? (" name=\"" + this.nameProperty + "\"") : "") + (this.nameProperty.length() > 0 ? (" name=\"" + this.nameProperty + "\"") : "") +
(this.relProperty.length() > 0 ? (" rel=\"" + this.relProperty + "\"") : "") + (this.relProperty.length() > 0 ? (" rel=\"" + this.relProperty + "\"") : "") +
">" + this.textProperty + "</a>"; ">" + this.textBody + "</a>";
} }
@Override @Override

@ -31,7 +31,6 @@ import java.util.Queue;
import java.util.Set; import java.util.Set;
import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.LinkedBlockingQueue;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.storage.SizeLimitedSet; import net.yacy.cora.storage.SizeLimitedSet;
@ -62,7 +61,7 @@ public class ResultImages {
if (MemoryControl.shortStatus()) clearQueues(); if (MemoryControl.shortStatus()) clearQueues();
limitQueues(1000); limitQueues(1000);
final Map<AnchorURL, ImageEntry> images = document.getImages(); final Map<DigestURL, ImageEntry> images = document.getImages();
for (final ImageEntry image: images.values()) { for (final ImageEntry image: images.values()) {
// do a double-check; attention: this can be time-consuming since this possibly needs a DNS-lookup // do a double-check; attention: this can be time-consuming since this possibly needs a DNS-lookup
if (image == null || image.url() == null) continue; if (image == null || image.url() == null) continue;

@ -80,7 +80,7 @@ public class Document {
private Object text; // the clear text, all that is visible private Object text; // the clear text, all that is visible
private final Collection<AnchorURL> anchors; // all links embedded as clickable entities (anchor tags) private final Collection<AnchorURL> anchors; // all links embedded as clickable entities (anchor tags)
private final LinkedHashMap<DigestURL, String> rss; // all embedded rss feeds private final LinkedHashMap<DigestURL, String> rss; // all embedded rss feeds
private final LinkedHashMap<AnchorURL, ImageEntry> images; // all visible pictures in document private final LinkedHashMap<DigestURL, ImageEntry> images; // all visible pictures in document
// the anchors and images - Maps are URL-to-EntityDescription mappings. // the anchors and images - Maps are URL-to-EntityDescription mappings.
// The EntityDescription appear either as visible text in anchors or as alternative // The EntityDescription appear either as visible text in anchors or as alternative
// text in image tags. // text in image tags.
@ -108,7 +108,7 @@ public class Document {
final Object text, final Object text,
final Collection<AnchorURL> anchors, final Collection<AnchorURL> anchors,
final LinkedHashMap<DigestURL, String> rss, final LinkedHashMap<DigestURL, String> rss,
final LinkedHashMap<AnchorURL, ImageEntry> images, final LinkedHashMap<DigestURL, ImageEntry> images,
final boolean indexingDenied, final boolean indexingDenied,
final Date date) { final Date date) {
this.source = location; this.source = location;
@ -132,7 +132,7 @@ public class Document {
} }
this.anchors = (anchors == null) ? new ArrayList<AnchorURL>(0) : anchors; this.anchors = (anchors == null) ? new ArrayList<AnchorURL>(0) : anchors;
this.rss = (rss == null) ? new LinkedHashMap<DigestURL, String>(0) : rss; this.rss = (rss == null) ? new LinkedHashMap<DigestURL, String>(0) : rss;
this.images = (images == null) ? new LinkedHashMap<AnchorURL, ImageEntry>() : images; this.images = (images == null) ? new LinkedHashMap<DigestURL, ImageEntry>() : images;
this.publisher = publisher; this.publisher = publisher;
this.hyperlinks = null; this.hyperlinks = null;
this.audiolinks = null; this.audiolinks = null;
@ -458,7 +458,7 @@ dc_rights
return this.videolinks; return this.videolinks;
} }
public LinkedHashMap<AnchorURL, ImageEntry> getImages() { public LinkedHashMap<DigestURL, ImageEntry> getImages() {
// returns all links embedded as pictures (visible in document) // returns all links embedded as pictures (visible in document)
// this returns a htmlFilterImageEntry collection // this returns a htmlFilterImageEntry collection
if (!this.resorted) resortLinks(); if (!this.resorted) resortLinks();
@ -505,7 +505,7 @@ dc_rights
this.applinks = new LinkedHashMap<AnchorURL, String>(); this.applinks = new LinkedHashMap<AnchorURL, String>();
this.emaillinks = new LinkedHashMap<String, String>(); this.emaillinks = new LinkedHashMap<String, String>();
final Map<AnchorURL, ImageEntry> collectedImages = new HashMap<AnchorURL, ImageEntry>(); // this is a set that is collected now and joined later to the imagelinks final Map<AnchorURL, ImageEntry> collectedImages = new HashMap<AnchorURL, ImageEntry>(); // this is a set that is collected now and joined later to the imagelinks
for (final Map.Entry<AnchorURL, ImageEntry> entry: this.images.entrySet()) { for (final Map.Entry<DigestURL, ImageEntry> entry: this.images.entrySet()) {
if (entry.getKey() != null && entry.getKey().getHost() != null && entry.getKey().getHost().equals(thishost)) this.inboundlinks.put(entry.getKey(), "image"); else this.outboundlinks.put(entry.getKey(), "image"); if (entry.getKey() != null && entry.getKey().getHost() != null && entry.getKey().getHost().equals(thishost)) this.inboundlinks.put(entry.getKey(), "image"); else this.outboundlinks.put(entry.getKey(), "image");
} }
for (final AnchorURL url: this.anchors) { for (final AnchorURL url: this.anchors) {
@ -629,13 +629,14 @@ dc_rights
int pos; int pos;
loop: while (i.hasNext()) loop: while (i.hasNext())
try { try {
url = null;
o = i.next(); o = i.next();
if (o instanceof AnchorURL) if (o instanceof AnchorURL)
url = (AnchorURL) o; url = (AnchorURL) o;
else if (o instanceof String) else if (o instanceof String)
url = new AnchorURL((String) o); url = new AnchorURL((String) o);
else if (o instanceof ImageEntry) else if (o instanceof ImageEntry)
url = ((ImageEntry) o).url(); url = new AnchorURL(((ImageEntry) o).url());
else { else {
assert false; assert false;
continue loop; continue loop;
@ -815,13 +816,13 @@ dc_rights
final StringBuilder authors = new StringBuilder(80); final StringBuilder authors = new StringBuilder(80);
final StringBuilder publishers = new StringBuilder(80); final StringBuilder publishers = new StringBuilder(80);
final StringBuilder subjects = new StringBuilder(80); final StringBuilder subjects = new StringBuilder(80);
final List<String> descriptions = new ArrayList<String>(); final List<String> descriptions = new ArrayList<>();
final Collection<String> titles = new LinkedHashSet<String>(); final Collection<String> titles = new LinkedHashSet<>();
final Collection<String> sectionTitles = new LinkedHashSet<String>(); final Collection<String> sectionTitles = new LinkedHashSet<>();
final List<AnchorURL> anchors = new ArrayList<AnchorURL>(); final List<AnchorURL> anchors = new ArrayList<>();
final LinkedHashMap<DigestURL, String> rss = new LinkedHashMap<DigestURL, String>(); final LinkedHashMap<DigestURL, String> rss = new LinkedHashMap<>();
final LinkedHashMap<AnchorURL, ImageEntry> images = new LinkedHashMap<AnchorURL, ImageEntry>(); final LinkedHashMap<DigestURL, ImageEntry> images = new LinkedHashMap<>();
final Set<String> languages = new HashSet<String>(); final Set<String> languages = new HashSet<>();
double lon = 0.0d, lat = 0.0d; double lon = 0.0d, lat = 0.0d;
boolean indexingDenied = false; boolean indexingDenied = false;
Date date = new Date(); Date date = new Date();

@ -368,7 +368,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
final String src = tag.opts.getProperty("src", EMPTY_STRING); final String src = tag.opts.getProperty("src", EMPTY_STRING);
try { try {
if (src.length() > 0) { if (src.length() > 0) {
final AnchorURL url = absolutePath(src); final DigestURL url = absolutePath(src);
if (url != null) { if (url != null) {
// use Numberformat.parse to allow parse of "550px" // use Numberformat.parse to allow parse of "550px"
NumberFormat intnum = NumberFormat.getIntegerInstance (); NumberFormat intnum = NumberFormat.getIntegerInstance ();
@ -618,10 +618,15 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.anchors.add(entry); this.anchors.add(entry);
} }
String line = cleanLine(CharacterCoding.html2unicode(stripAllTags(scraper.content.getChars()))); String line = cleanLine(CharacterCoding.html2unicode(stripAllTags(scraper.content.getChars())));
StringBuilder altakk = new StringBuilder();
for (ImageEntry ie: scraper.images) { for (ImageEntry ie: scraper.images) {
if (linkurl != null) { if (linkurl != null) {
if (ie.alt() != null) altakk.append(ie.alt().trim()).append(' ');
linkurl.setImageURL(ie.url());
AnchorURL a = new AnchorURL(linkurl); AnchorURL a = new AnchorURL(linkurl);
a.setTextProperty(line); a.setTextProperty(line);
a.setImageAlt(ie.alt());
a.setImageURL(ie.url());
ie.setLinkurl(a); ie.setLinkurl(a);
} }
// this image may have been added recently from the same location (as this is a recursive parse) // this image may have been added recently from the same location (as this is a recursive parse)
@ -631,6 +636,9 @@ public class ContentScraper extends AbstractScraper implements Scraper {
} }
this.images.add(ie); this.images.add(ie);
} }
if (linkurl != null) {
linkurl.setImageAlt(altakk.toString().trim());
}
scraper.close(); scraper.close();
return line; return line;

@ -27,10 +27,11 @@ package net.yacy.document.parser.html;
import java.util.Comparator; import java.util.Comparator;
import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
public class ImageEntry implements Comparable<ImageEntry>, Comparator<ImageEntry> { public class ImageEntry implements Comparable<ImageEntry>, Comparator<ImageEntry> {
private final AnchorURL imageurl; private final DigestURL imageurl;
private AnchorURL linkurl; private AnchorURL linkurl;
private final String alt; private final String alt;
private final int width, height; private final int width, height;
@ -48,7 +49,7 @@ public class ImageEntry implements Comparable<ImageEntry>, Comparator<ImageEntry
* @param fileSize the number of bytes that the image uses on file or -1 if unknown * @param fileSize the number of bytes that the image uses on file or -1 if unknown
*/ */
public ImageEntry( public ImageEntry(
final AnchorURL imageurl, final DigestURL imageurl,
final String alt, final String alt,
final int width, final int width,
final int height, final int height,
@ -62,7 +63,7 @@ public class ImageEntry implements Comparable<ImageEntry>, Comparator<ImageEntry
this.fileSize = fileSize; this.fileSize = fileSize;
} }
public AnchorURL url() { public DigestURL url() {
return this.imageurl; return this.imageurl;
} }

@ -123,7 +123,7 @@ public class htmlParser extends AbstractParser implements Parser {
sections[p++] = headline; sections[p++] = headline;
} }
} }
LinkedHashMap<AnchorURL, ImageEntry> noDoubleImages = new LinkedHashMap<AnchorURL, ImageEntry>(); LinkedHashMap<DigestURL, ImageEntry> noDoubleImages = new LinkedHashMap<>();
for (ImageEntry ie: scraper.getImages()) noDoubleImages.put(ie.url(), ie); for (ImageEntry ie: scraper.getImages()) noDoubleImages.put(ie.url(), ie);
final Document ppd = new Document( final Document ppd = new Document(
location, location,

@ -46,6 +46,7 @@ import java.util.Set;
import javax.imageio.ImageIO; import javax.imageio.ImageIO;
import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.AbstractParser; import net.yacy.document.AbstractParser;
@ -200,10 +201,10 @@ public class genericImageParser extends AbstractParser implements Parser {
final HashSet<String> languages = new HashSet<String>(); final HashSet<String> languages = new HashSet<String>();
final List<AnchorURL> anchors = new ArrayList<AnchorURL>(); final List<AnchorURL> anchors = new ArrayList<AnchorURL>();
final LinkedHashMap<AnchorURL, ImageEntry> images = new LinkedHashMap<AnchorURL, ImageEntry>(); final LinkedHashMap<DigestURL, ImageEntry> images = new LinkedHashMap<>();
// add this image to the map of images // add this image to the map of images
final String infoString = ii.info.toString(); final String infoString = ii.info.toString();
images.put(ii.location, new ImageEntry(location, "", ii.width, ii.height, -1)); images.put(ii.location, new ImageEntry(ii.location, "", ii.width, ii.height, -1));
if (title == null || title.isEmpty()) title = MultiProtocolURL.unescape(filename); if (title == null || title.isEmpty()) title = MultiProtocolURL.unescape(filename);

@ -38,6 +38,7 @@ import net.yacy.cora.document.feed.Hit;
import net.yacy.cora.document.feed.RSSFeed; import net.yacy.cora.document.feed.RSSFeed;
import net.yacy.cora.document.feed.RSSReader; import net.yacy.cora.document.feed.RSSReader;
import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.document.AbstractParser; import net.yacy.document.AbstractParser;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
@ -98,7 +99,7 @@ public class rssParser extends AbstractParser implements Parser {
null, null,
anchors, anchors,
null, null,
new LinkedHashMap<AnchorURL, ImageEntry>(), new LinkedHashMap<DigestURL, ImageEntry>(),
false, false,
item.getPubDate()); item.getPubDate());
docs.add(doc); docs.add(doc);

@ -96,7 +96,7 @@ public class sitemapParser extends AbstractParser implements Parser {
null, null,
null, null,
null, null,
new LinkedHashMap<AnchorURL, ImageEntry>(), new LinkedHashMap<DigestURL, ImageEntry>(),
false, false,
new Date()); new Date());
docs.add(doc); docs.add(doc);

@ -571,7 +571,7 @@ public class Segment {
} }
} }
// media links as well! // media links as well!
for (AnchorURL image: document.getImages().keySet()) urlCitationIndex.add(image.hash(), new CitationReference(url.hash(), loadDate.getTime())); for (DigestURL image: document.getImages().keySet()) urlCitationIndex.add(image.hash(), new CitationReference(url.hash(), loadDate.getTime()));
for (AnchorURL audio: document.getAudiolinks().keySet()) urlCitationIndex.add(audio.hash(), new CitationReference(url.hash(), loadDate.getTime())); for (AnchorURL audio: document.getAudiolinks().keySet()) urlCitationIndex.add(audio.hash(), new CitationReference(url.hash(), loadDate.getTime()));
for (AnchorURL video: document.getVideolinks().keySet()) urlCitationIndex.add(video.hash(), new CitationReference(url.hash(), loadDate.getTime())); for (AnchorURL video: document.getVideolinks().keySet()) urlCitationIndex.add(video.hash(), new CitationReference(url.hash(), loadDate.getTime()));
} catch (Throwable e) { } catch (Throwable e) {

@ -109,15 +109,7 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
for (final AnchorURL target_url: links) { for (final AnchorURL target_url: links) {
SolrInputDocument edge = getEdge( SolrInputDocument edge = getEdge(
subgraph, source, responseHeader, collections, crawldepth_source, processTypes, subgraph, source, responseHeader, collections, crawldepth_source, processTypes,
sourceName, allAttr, generalNofollow, target_order, target_url, null); sourceName, allAttr, generalNofollow, target_order, target_url);
target_order++;
// add the edge to the subgraph
edges.add(edge);
}
for (final ImageEntry image_url: images) {
SolrInputDocument edge = getEdge(
subgraph, source, responseHeader, collections, crawldepth_source, processTypes,
sourceName, allAttr, generalNofollow, target_order, image_url.url(), image_url.alt());
target_order++; target_order++;
// add the edge to the subgraph // add the edge to the subgraph
edges.add(edge); edges.add(edge);
@ -128,7 +120,7 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
public SolrInputDocument getEdge( public SolrInputDocument getEdge(
final Subgraph subgraph, final DigestURL source_url, final ResponseHeader responseHeader, Map<String, Pattern> collections, final Subgraph subgraph, final DigestURL source_url, final ResponseHeader responseHeader, Map<String, Pattern> collections,
int crawldepth_source, final Set<ProcessType> processTypes, final String sourceName, boolean allAttr, boolean generalNofollow, int target_order, int crawldepth_source, final Set<ProcessType> processTypes, final String sourceName, boolean allAttr, boolean generalNofollow, int target_order,
AnchorURL target_url, final String targetImageAlt /*only filled if target is an image, null otherwise*/) { AnchorURL target_url) {
final String name = target_url.getNameProperty(); // the name attribute final String name = target_url.getNameProperty(); // the name attribute
final String text = target_url.getTextProperty(); // the text between the <a></a> tag final String text = target_url.getTextProperty(); // the text between the <a></a> tag
@ -219,10 +211,10 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
if (allAttr || contains(WebgraphSchema.target_linktext_charcount_i)) add(edge, WebgraphSchema.target_linktext_charcount_i, target_url.getTextProperty().length()); if (allAttr || contains(WebgraphSchema.target_linktext_charcount_i)) add(edge, WebgraphSchema.target_linktext_charcount_i, target_url.getTextProperty().length());
if (allAttr || contains(WebgraphSchema.target_linktext_wordcount_i)) add(edge, WebgraphSchema.target_linktext_wordcount_i, target_url.getTextProperty().length() > 0 ? CommonPattern.SPACE.split(target_url.getTextProperty()).length : 0); if (allAttr || contains(WebgraphSchema.target_linktext_wordcount_i)) add(edge, WebgraphSchema.target_linktext_wordcount_i, target_url.getTextProperty().length() > 0 ? CommonPattern.SPACE.split(target_url.getTextProperty()).length : 0);
if (targetImageAlt != null) { if (target_url.getImageAlt() != null) {
if (allAttr || contains(WebgraphSchema.target_alt_t)) add(edge, WebgraphSchema.target_alt_t, targetImageAlt); if (allAttr || contains(WebgraphSchema.target_alt_t)) add(edge, WebgraphSchema.target_alt_t, target_url.getImageAlt());
if (allAttr || contains(WebgraphSchema.target_alt_charcount_i)) add(edge, WebgraphSchema.target_alt_charcount_i, targetImageAlt.length()); if (allAttr || contains(WebgraphSchema.target_alt_charcount_i)) add(edge, WebgraphSchema.target_alt_charcount_i, target_url.getImageAlt().length());
if (allAttr || contains(WebgraphSchema.target_alt_wordcount_i)) add(edge, WebgraphSchema.target_alt_wordcount_i, targetImageAlt.length() > 0 ? CommonPattern.SPACE.split(targetImageAlt).length : 0); if (allAttr || contains(WebgraphSchema.target_alt_wordcount_i)) add(edge, WebgraphSchema.target_alt_wordcount_i, target_url.getImageAlt().length() > 0 ? CommonPattern.SPACE.split(target_url.getImageAlt()).length : 0);
} }
// add the target attributes // add the target attributes

Loading…
Cancel
Save