fix for image alt attachment to AnchorURLs in html parser.

pull/1/head
Michael Peter Christen 11 years ago
parent 3dcfc717eb
commit 98f45c9032

@ -327,7 +327,7 @@ public class ViewFile {
i += putMediaInfo(prop, wordArray, i, document.getAudiolinks(), "audio", (i % 2 == 0));
dark = (i % 2 == 0);
final Map<AnchorURL, ImageEntry> ts = document.getImages();
final Map<DigestURL, ImageEntry> ts = document.getImages();
final Iterator<ImageEntry> tsi = ts.values().iterator();
ImageEntry entry;
while (tsi.hasNext()) {

@ -28,46 +28,68 @@ public class AnchorURL extends DigestURL {
private static final long serialVersionUID = 1586579902179962086L;
private String nameProperty, textProperty, relProperty, hrefProperty; // may contain additional url properties, such as given in html a href-links
private String nameProperty, relProperty, hrefProperty, textBody; // may contain additional url properties, such as given in html a href-links
private DigestURL imageURL; // in case that the anchor contains an image link, store image url; if there is no image then set this to null
private String imageAlt; // in case that the anchor contains an image link, store the alt property; if there is no image then set this to null
public AnchorURL(final String url) throws MalformedURLException {
super(url);
this.textBody = "";
this.nameProperty = "";
this.textProperty = "";
this.relProperty = "";
this.hrefProperty = "";
this.imageURL = null;
this.imageAlt = null;
}
public AnchorURL(final AnchorURL url) {
super(url, url.hash());
this.textBody = url.textBody;
this.nameProperty = url.nameProperty;
this.textProperty = url.textProperty;
this.relProperty = url.relProperty;
this.hrefProperty = url.hrefProperty;
this.imageURL = url.imageURL;
this.imageAlt = url.imageAlt;
}
public AnchorURL(final DigestURL url) {
super(url, url.hash());
this.textBody = "";
this.nameProperty = "";
this.textProperty = "";
this.relProperty = "";
this.hrefProperty = "";
this.imageURL = null;
this.imageAlt = null;
}
public AnchorURL(final MultiProtocolURL baseURL, final String relPath) throws MalformedURLException {
super(baseURL, relPath);
this.textBody = "";
this.nameProperty = "";
this.textProperty = "";
this.relProperty = "";
this.hrefProperty = "";
this.imageURL = null;
this.imageAlt = null;
}
public AnchorURL(final String protocol, final String host, final int port, final String path) throws MalformedURLException {
super(protocol, host, port, path);
this.textBody = "";
this.nameProperty = "";
this.textProperty = "";
this.relProperty = "";
this.hrefProperty = "";
this.imageURL = null;
this.imageAlt = null;
}
/**
 * Creates an anchor URL from protocol, host, port and path and attaches an
 * image reference (image URL plus alt text) to it.
 * The body text and the name, rel and href properties start out as empty strings.
 *
 * @param protocol passed unchanged to the superclass constructor
 * @param host passed unchanged to the superclass constructor
 * @param port passed unchanged to the superclass constructor
 * @param path passed unchanged to the superclass constructor
 * @param imageURL url of the image contained in the anchor; null if the anchor contains no image
 * @param imageAlt alt property of the image contained in the anchor; null if the anchor contains no image
 * @throws MalformedURLException declared here; presumably raised by the superclass constructor for invalid url parts (not visible in this class)
 */
public AnchorURL(final String protocol, final String host, final int port, final String path, final DigestURL imageURL, final String imageAlt) throws MalformedURLException {
super(protocol, host, port, path);
// text and tag attributes are unknown at construction time; they can be filled later with the setters or setAll
this.textBody = "";
this.nameProperty = "";
this.relProperty = "";
this.hrefProperty = "";
// the image reference is stored as given, no copy and no null check
this.imageURL = imageURL;
this.imageAlt = imageAlt;
}
public static AnchorURL newAnchor(final DigestURL baseURL, String relPath) throws MalformedURLException {
@ -96,11 +118,11 @@ public class AnchorURL extends DigestURL {
}
public String getTextProperty() {
return textProperty;
return textBody;
}
public void setTextProperty(String text) {
this.textProperty = text;
this.textBody = text;
}
public String getRelProperty() {
@ -119,9 +141,25 @@ public class AnchorURL extends DigestURL {
this.hrefProperty = href;
}
/**
 * Returns the image url stored for this anchor.
 *
 * @return the url of the image contained in the anchor, or null if no image url was set
 */
public DigestURL getImageURL() {
return imageURL;
}
/**
 * Replaces the image url stored for this anchor.
 *
 * @param imageURL the url of the image contained in the anchor; null if there is no image
 */
public void setImageURL(DigestURL imageURL) {
this.imageURL = imageURL;
}
/**
 * Returns the image alt text stored for this anchor.
 *
 * @return the alt property of the image contained in the anchor, or null if no alt text was set
 */
public String getImageAlt() {
return imageAlt;
}
/**
 * Replaces the image alt text stored for this anchor.
 *
 * @param imageAlt the alt property of the image contained in the anchor; null if there is no image
 */
public void setImageAlt(String imageAlt) {
this.imageAlt = imageAlt;
}
public void setAll(final Properties tagopts) {
this.nameProperty = tagopts.getProperty("name", "");
this.textProperty = tagopts.getProperty("text", "");
this.textBody = tagopts.getProperty("text", "");
this.relProperty = tagopts.getProperty("rel", "");
this.hrefProperty = tagopts.getProperty("href", "");
}
@ -129,7 +167,7 @@ public class AnchorURL extends DigestURL {
public Properties getAll() {
final Properties tagopts = new Properties();
tagopts.setProperty("name", this.nameProperty);
tagopts.setProperty("text", this.textProperty);
tagopts.setProperty("text", this.textBody);
tagopts.setProperty("rel", this.relProperty);
tagopts.setProperty("href", this.hrefProperty);
return tagopts;
@ -143,7 +181,7 @@ public class AnchorURL extends DigestURL {
return "<a href=\"" + this.toNormalform(false) + "\"" +
(this.nameProperty.length() > 0 ? (" name=\"" + this.nameProperty + "\"") : "") +
(this.relProperty.length() > 0 ? (" rel=\"" + this.relProperty + "\"") : "") +
">" + this.textProperty + "</a>";
">" + this.textBody + "</a>";
}
@Override

@ -31,7 +31,6 @@ import java.util.Queue;
import java.util.Set;
import java.util.concurrent.LinkedBlockingQueue;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.storage.SizeLimitedSet;
@ -62,7 +61,7 @@ public class ResultImages {
if (MemoryControl.shortStatus()) clearQueues();
limitQueues(1000);
final Map<AnchorURL, ImageEntry> images = document.getImages();
final Map<DigestURL, ImageEntry> images = document.getImages();
for (final ImageEntry image: images.values()) {
// do a double-check; attention: this can be time-consuming since this possibly needs a DNS-lookup
if (image == null || image.url() == null) continue;

@ -80,7 +80,7 @@ public class Document {
private Object text; // the clear text, all that is visible
private final Collection<AnchorURL> anchors; // all links embedded as clickeable entities (anchor tags)
private final LinkedHashMap<DigestURL, String> rss; // all embedded rss feeds
private final LinkedHashMap<AnchorURL, ImageEntry> images; // all visible pictures in document
private final LinkedHashMap<DigestURL, ImageEntry> images; // all visible pictures in document
// the anchors and images - Maps are URL-to-EntityDescription mappings.
// The EntityDescription appear either as visible text in anchors or as alternative
// text in image tags.
@ -108,7 +108,7 @@ public class Document {
final Object text,
final Collection<AnchorURL> anchors,
final LinkedHashMap<DigestURL, String> rss,
final LinkedHashMap<AnchorURL, ImageEntry> images,
final LinkedHashMap<DigestURL, ImageEntry> images,
final boolean indexingDenied,
final Date date) {
this.source = location;
@ -132,7 +132,7 @@ public class Document {
}
this.anchors = (anchors == null) ? new ArrayList<AnchorURL>(0) : anchors;
this.rss = (rss == null) ? new LinkedHashMap<DigestURL, String>(0) : rss;
this.images = (images == null) ? new LinkedHashMap<AnchorURL, ImageEntry>() : images;
this.images = (images == null) ? new LinkedHashMap<DigestURL, ImageEntry>() : images;
this.publisher = publisher;
this.hyperlinks = null;
this.audiolinks = null;
@ -458,7 +458,7 @@ dc_rights
return this.videolinks;
}
public LinkedHashMap<AnchorURL, ImageEntry> getImages() {
public LinkedHashMap<DigestURL, ImageEntry> getImages() {
// returns all links embedded as pictures (visible in document)
// this returns a htmlFilterImageEntry collection
if (!this.resorted) resortLinks();
@ -505,7 +505,7 @@ dc_rights
this.applinks = new LinkedHashMap<AnchorURL, String>();
this.emaillinks = new LinkedHashMap<String, String>();
final Map<AnchorURL, ImageEntry> collectedImages = new HashMap<AnchorURL, ImageEntry>(); // this is a set that is collected now and joined later to the imagelinks
for (final Map.Entry<AnchorURL, ImageEntry> entry: this.images.entrySet()) {
for (final Map.Entry<DigestURL, ImageEntry> entry: this.images.entrySet()) {
if (entry.getKey() != null && entry.getKey().getHost() != null && entry.getKey().getHost().equals(thishost)) this.inboundlinks.put(entry.getKey(), "image"); else this.outboundlinks.put(entry.getKey(), "image");
}
for (final AnchorURL url: this.anchors) {
@ -629,13 +629,14 @@ dc_rights
int pos;
loop: while (i.hasNext())
try {
url = null;
o = i.next();
if (o instanceof AnchorURL)
url = (AnchorURL) o;
else if (o instanceof String)
url = new AnchorURL((String) o);
else if (o instanceof ImageEntry)
url = ((ImageEntry) o).url();
url = new AnchorURL(((ImageEntry) o).url());
else {
assert false;
continue loop;
@ -815,13 +816,13 @@ dc_rights
final StringBuilder authors = new StringBuilder(80);
final StringBuilder publishers = new StringBuilder(80);
final StringBuilder subjects = new StringBuilder(80);
final List<String> descriptions = new ArrayList<String>();
final Collection<String> titles = new LinkedHashSet<String>();
final Collection<String> sectionTitles = new LinkedHashSet<String>();
final List<AnchorURL> anchors = new ArrayList<AnchorURL>();
final LinkedHashMap<DigestURL, String> rss = new LinkedHashMap<DigestURL, String>();
final LinkedHashMap<AnchorURL, ImageEntry> images = new LinkedHashMap<AnchorURL, ImageEntry>();
final Set<String> languages = new HashSet<String>();
final List<String> descriptions = new ArrayList<>();
final Collection<String> titles = new LinkedHashSet<>();
final Collection<String> sectionTitles = new LinkedHashSet<>();
final List<AnchorURL> anchors = new ArrayList<>();
final LinkedHashMap<DigestURL, String> rss = new LinkedHashMap<>();
final LinkedHashMap<DigestURL, ImageEntry> images = new LinkedHashMap<>();
final Set<String> languages = new HashSet<>();
double lon = 0.0d, lat = 0.0d;
boolean indexingDenied = false;
Date date = new Date();

@ -368,7 +368,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
final String src = tag.opts.getProperty("src", EMPTY_STRING);
try {
if (src.length() > 0) {
final AnchorURL url = absolutePath(src);
final DigestURL url = absolutePath(src);
if (url != null) {
// use Numberformat.parse to allow parse of "550px"
NumberFormat intnum = NumberFormat.getIntegerInstance ();
@ -618,10 +618,15 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.anchors.add(entry);
}
String line = cleanLine(CharacterCoding.html2unicode(stripAllTags(scraper.content.getChars())));
StringBuilder altakk = new StringBuilder();
for (ImageEntry ie: scraper.images) {
if (linkurl != null) {
if (ie.alt() != null) altakk.append(ie.alt().trim()).append(' ');
linkurl.setImageURL(ie.url());
AnchorURL a = new AnchorURL(linkurl);
a.setTextProperty(line);
a.setImageAlt(ie.alt());
a.setImageURL(ie.url());
ie.setLinkurl(a);
}
// this image may have been added recently from the same location (as this is a recursive parse)
@ -631,6 +636,9 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
this.images.add(ie);
}
if (linkurl != null) {
linkurl.setImageAlt(altakk.toString().trim());
}
scraper.close();
return line;

@ -27,10 +27,11 @@ package net.yacy.document.parser.html;
import java.util.Comparator;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
public class ImageEntry implements Comparable<ImageEntry>, Comparator<ImageEntry> {
private final AnchorURL imageurl;
private final DigestURL imageurl;
private AnchorURL linkurl;
private final String alt;
private final int width, height;
@ -48,7 +49,7 @@ public class ImageEntry implements Comparable<ImageEntry>, Comparator<ImageEntry
* @param fileSize the number of bytes that the image uses on file or -1 if unknown
*/
public ImageEntry(
final AnchorURL imageurl,
final DigestURL imageurl,
final String alt,
final int width,
final int height,
@ -62,7 +63,7 @@ public class ImageEntry implements Comparable<ImageEntry>, Comparator<ImageEntry
this.fileSize = fileSize;
}
public AnchorURL url() {
public DigestURL url() {
return this.imageurl;
}

@ -123,7 +123,7 @@ public class htmlParser extends AbstractParser implements Parser {
sections[p++] = headline;
}
}
LinkedHashMap<AnchorURL, ImageEntry> noDoubleImages = new LinkedHashMap<AnchorURL, ImageEntry>();
LinkedHashMap<DigestURL, ImageEntry> noDoubleImages = new LinkedHashMap<>();
for (ImageEntry ie: scraper.getImages()) noDoubleImages.put(ie.url(), ie);
final Document ppd = new Document(
location,

@ -46,6 +46,7 @@ import java.util.Set;
import javax.imageio.ImageIO;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.AbstractParser;
@ -200,10 +201,10 @@ public class genericImageParser extends AbstractParser implements Parser {
final HashSet<String> languages = new HashSet<String>();
final List<AnchorURL> anchors = new ArrayList<AnchorURL>();
final LinkedHashMap<AnchorURL, ImageEntry> images = new LinkedHashMap<AnchorURL, ImageEntry>();
final LinkedHashMap<DigestURL, ImageEntry> images = new LinkedHashMap<>();
// add this image to the map of images
final String infoString = ii.info.toString();
images.put(ii.location, new ImageEntry(location, "", ii.width, ii.height, -1));
images.put(ii.location, new ImageEntry(ii.location, "", ii.width, ii.height, -1));
if (title == null || title.isEmpty()) title = MultiProtocolURL.unescape(filename);

@ -38,6 +38,7 @@ import net.yacy.cora.document.feed.Hit;
import net.yacy.cora.document.feed.RSSFeed;
import net.yacy.cora.document.feed.RSSReader;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
@ -98,7 +99,7 @@ public class rssParser extends AbstractParser implements Parser {
null,
anchors,
null,
new LinkedHashMap<AnchorURL, ImageEntry>(),
new LinkedHashMap<DigestURL, ImageEntry>(),
false,
item.getPubDate());
docs.add(doc);

@ -96,7 +96,7 @@ public class sitemapParser extends AbstractParser implements Parser {
null,
null,
null,
new LinkedHashMap<AnchorURL, ImageEntry>(),
new LinkedHashMap<DigestURL, ImageEntry>(),
false,
new Date());
docs.add(doc);

@ -571,7 +571,7 @@ public class Segment {
}
}
// media links as well!
for (AnchorURL image: document.getImages().keySet()) urlCitationIndex.add(image.hash(), new CitationReference(url.hash(), loadDate.getTime()));
for (DigestURL image: document.getImages().keySet()) urlCitationIndex.add(image.hash(), new CitationReference(url.hash(), loadDate.getTime()));
for (AnchorURL audio: document.getAudiolinks().keySet()) urlCitationIndex.add(audio.hash(), new CitationReference(url.hash(), loadDate.getTime()));
for (AnchorURL video: document.getVideolinks().keySet()) urlCitationIndex.add(video.hash(), new CitationReference(url.hash(), loadDate.getTime()));
} catch (Throwable e) {

@ -109,15 +109,7 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
for (final AnchorURL target_url: links) {
SolrInputDocument edge = getEdge(
subgraph, source, responseHeader, collections, crawldepth_source, processTypes,
sourceName, allAttr, generalNofollow, target_order, target_url, null);
target_order++;
// add the edge to the subgraph
edges.add(edge);
}
for (final ImageEntry image_url: images) {
SolrInputDocument edge = getEdge(
subgraph, source, responseHeader, collections, crawldepth_source, processTypes,
sourceName, allAttr, generalNofollow, target_order, image_url.url(), image_url.alt());
sourceName, allAttr, generalNofollow, target_order, target_url);
target_order++;
// add the edge to the subgraph
edges.add(edge);
@ -128,7 +120,7 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
public SolrInputDocument getEdge(
final Subgraph subgraph, final DigestURL source_url, final ResponseHeader responseHeader, Map<String, Pattern> collections,
int crawldepth_source, final Set<ProcessType> processTypes, final String sourceName, boolean allAttr, boolean generalNofollow, int target_order,
AnchorURL target_url, final String targetImageAlt /*only filled if target is an image, null otherwise*/) {
AnchorURL target_url) {
final String name = target_url.getNameProperty(); // the name attribute
final String text = target_url.getTextProperty(); // the text between the <a></a> tag
@ -219,10 +211,10 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
if (allAttr || contains(WebgraphSchema.target_linktext_charcount_i)) add(edge, WebgraphSchema.target_linktext_charcount_i, target_url.getTextProperty().length());
if (allAttr || contains(WebgraphSchema.target_linktext_wordcount_i)) add(edge, WebgraphSchema.target_linktext_wordcount_i, target_url.getTextProperty().length() > 0 ? CommonPattern.SPACE.split(target_url.getTextProperty()).length : 0);
if (targetImageAlt != null) {
if (allAttr || contains(WebgraphSchema.target_alt_t)) add(edge, WebgraphSchema.target_alt_t, targetImageAlt);
if (allAttr || contains(WebgraphSchema.target_alt_charcount_i)) add(edge, WebgraphSchema.target_alt_charcount_i, targetImageAlt.length());
if (allAttr || contains(WebgraphSchema.target_alt_wordcount_i)) add(edge, WebgraphSchema.target_alt_wordcount_i, targetImageAlt.length() > 0 ? CommonPattern.SPACE.split(targetImageAlt).length : 0);
if (target_url.getImageAlt() != null) {
if (allAttr || contains(WebgraphSchema.target_alt_t)) add(edge, WebgraphSchema.target_alt_t, target_url.getImageAlt());
if (allAttr || contains(WebgraphSchema.target_alt_charcount_i)) add(edge, WebgraphSchema.target_alt_charcount_i, target_url.getImageAlt().length());
if (allAttr || contains(WebgraphSchema.target_alt_wordcount_i)) add(edge, WebgraphSchema.target_alt_wordcount_i, target_url.getImageAlt().length() > 0 ? CommonPattern.SPACE.split(target_url.getImageAlt()).length : 0);
}
// add the target attributes

Loading…
Cancel
Save