Added RSS reader support for `enclosure` feed item sub element.

The enclosure element (see
http://www.rssboard.org/rss-specification#ltenclosuregtSubelementOfLtitemgt
) appears, for example, in podcast feeds.
pull/167/merge
luccioman 7 years ago
parent e5f5de0fc7
commit cf62b571bd

@ -57,6 +57,11 @@ public interface Hit {
public void setSize(long size); public void setSize(long size);
/**
* @param enclosure a URL describing a media object that is attached to a feed item
*/
public void setEnclosure(String enclosure);
public String getAuthor(); public String getAuthor();
public String getCopyright(); public String getCopyright();
@ -83,6 +88,11 @@ public interface Hit {
public long getSize(); public long getSize();
/**
* @return a URL describing a media object that is attached to a feed item
*/
public String getEnclosure();
public double getLon(); public double getLon();
public double getLat(); public double getLat();

@ -86,6 +86,9 @@ public class RSSMessage implements Hit, Comparable<RSSMessage>, Comparator<RSSMe
/** A string that uniquely identifies an item (RSS 2.0) */ /** A string that uniquely identifies an item (RSS 2.0) */
guid(new String[]{"guid"}), guid(new String[]{"guid"}),
/** URL describing a media object that is attached to a feed item */
enclosure(new String[]{"enclosure"}),
/** Time To Live : number of minutes that indicates how long a channel (RSS 2.0) can be cached before refreshing from the source. */ /** Time To Live : number of minutes that indicates how long a channel (RSS 2.0) can be cached before refreshing from the source. */
ttl(new String[]{"ttl"}), ttl(new String[]{"ttl"}),
@ -163,7 +166,9 @@ public class RSSMessage implements Hit, Comparable<RSSMessage>, Comparator<RSSMe
if (description.length() > 0) this.map.put(Token.description.name(), description); if (description.length() > 0) this.map.put(Token.description.name(), description);
this.map.put(Token.link.name(), link.toNormalform(true)); this.map.put(Token.link.name(), link.toNormalform(true));
this.map.put(Token.pubDate.name(), HeaderFramework.FORMAT_RFC1123.format(new Date())); this.map.put(Token.pubDate.name(), HeaderFramework.FORMAT_RFC1123.format(new Date()));
if (guid.length() > 0) this.map.put(Token.guid.name(), guid); if (guid.length() > 0) {
this.map.put(Token.guid.name(), guid);
}
} }
public RSSMessage() { public RSSMessage() {
@ -171,7 +176,9 @@ public class RSSMessage implements Hit, Comparable<RSSMessage>, Comparator<RSSMe
} }
public void setValue(final Token token, final String value) { public void setValue(final Token token, final String value) {
if (value.length() > 0) this.map.put(token.name(), value); if (value.length() > 0) {
this.map.put(token.name(), value);
}
} }
@Override @Override
@ -278,6 +285,11 @@ public class RSSMessage implements Hit, Comparable<RSSMessage>, Comparator<RSSMe
return guid; return guid;
} }
/**
 * Reads the value associated with the {@code enclosure} token of this message.
 *
 * @return the result of looking up {@code Token.enclosure} in this message's
 *         internal map; the empty string is passed as the second argument,
 *         presumably as the fallback when no enclosure was set
 *         (TODO confirm against Token.valueFrom)
 */
@Override
public String getEnclosure() {
return Token.enclosure.valueFrom(this.map, "");
}
public String getTTL() { public String getTTL() {
return Token.ttl.valueFrom(this.map, ""); return Token.ttl.valueFrom(this.map, "");
} }
@ -372,6 +384,11 @@ public class RSSMessage implements Hit, Comparable<RSSMessage>, Comparator<RSSMe
setValue(Token.guid, guid); setValue(Token.guid, guid);
} }
/**
 * Records the enclosure URL of this feed item by delegating to
 * {@code setValue} with the {@code enclosure} token.
 *
 * @param enclosure the enclosure URL as a string; must not be null, since
 *        {@code setValue} calls {@code length()} on it. An empty string is
 *        not stored, because {@code setValue} only puts values whose length
 *        is greater than zero into the map.
 */
@Override
public void setEnclosure(final String enclosure) {
setValue(Token.enclosure, enclosure);
}
@Override @Override
public void setLanguage(final String language) { public void setLanguage(final String language) {
setValue(Token.language, language); setValue(Token.language, language);

@ -30,6 +30,7 @@ import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory; import javax.xml.parsers.SAXParserFactory;
import org.apache.commons.lang.StringUtils;
import org.xml.sax.Attributes; import org.xml.sax.Attributes;
import org.xml.sax.EntityResolver; import org.xml.sax.EntityResolver;
import org.xml.sax.InputSource; import org.xml.sax.InputSource;
@ -168,10 +169,33 @@ public class RSSReader extends DefaultHandler {
} }
this.item = new RSSMessage(); this.item = new RSSMessage();
this.parsingItem = true; this.parsingItem = true;
} else if (this.parsingItem && this.type == Type.atom && "link".equals(tag) && (atts.getValue("rel") == null || atts.getValue("rel").equals("alternate"))) { } else if (this.parsingItem) {
if(this.type == Type.atom) {
if ("link".equals(tag)) {
final String linkRelation = atts.getValue("rel");
if (linkRelation == null || linkRelation.equals("alternate")) {
// atom link handling (rss link is handled in endElement) // atom link handling (rss link is handled in endElement)
final String url = atts.getValue("href"); final String url = atts.getValue("href");
if (url != null && url.length() > 0) this.item.setValue(Token.link, url); if (StringUtils.isNotBlank(url)) {
this.item.setValue(Token.link, url);
}
} else if("enclosure".equals(linkRelation)) {
/* Atom rel="enclosure" link type */
final String url = atts.getValue("href");
if(StringUtils.isNotBlank(url)) {
this.item.setEnclosure(url);
}
}
}
} else if(this.type == Type.rss) {
/* RSS 0.92 and 2.0 <enclosure> element */
if ("enclosure".equals(tag)) {
final String url = atts.getValue("url");
if(StringUtils.isNotBlank(url)) {
this.item.setEnclosure(url);
}
}
}
} else if ("rss".equals(tag)) { } else if ("rss".equals(tag)) {
this.type = Type.rss; this.type = Type.rss;
} }
@ -189,7 +213,9 @@ public class RSSReader extends DefaultHandler {
} else if (this.parsingItem) { } else if (this.parsingItem) {
final String value = this.buffer.toString().trim(); final String value = this.buffer.toString().trim();
this.buffer.setLength(0); this.buffer.setLength(0);
if (RSSMessage.tags.contains(tag) && value.length() > 0) this.item.setValue(RSSMessage.valueOfNick(tag), value); if (RSSMessage.tags.contains(tag) && value.length() > 0) {
this.item.setValue(RSSMessage.valueOfNick(tag), value);
}
} else if (this.parsingChannel) { } else if (this.parsingChannel) {
final String value = this.buffer.toString().trim(); final String value = this.buffer.toString().trim();
this.buffer.setLength(0); this.buffer.setLength(0);

@ -34,10 +34,14 @@ import java.util.LinkedHashMap;
import java.util.List; import java.util.List;
import java.util.Set; import java.util.Set;
import org.apache.commons.lang.StringUtils;
import net.yacy.cora.document.feed.Hit; import net.yacy.cora.document.feed.Hit;
import net.yacy.cora.document.feed.RSSFeed; import net.yacy.cora.document.feed.RSSFeed;
import net.yacy.cora.document.feed.RSSReader; import net.yacy.cora.document.feed.RSSReader;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.AbstractParser; import net.yacy.document.AbstractParser;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
@ -47,6 +51,8 @@ import net.yacy.document.parser.html.ImageEntry;
public class rssParser extends AbstractParser implements Parser { public class rssParser extends AbstractParser implements Parser {
private final static ConcurrentLog LOG = new ConcurrentLog(rssParser.class.getSimpleName());
public rssParser() { public rssParser() {
super("RSS Parser"); super("RSS Parser");
this.SUPPORTED_EXTENSIONS.add("rss"); this.SUPPORTED_EXTENSIONS.add("rss");
@ -67,7 +73,7 @@ public class rssParser extends AbstractParser implements Parser {
final int timezoneOffset, final int timezoneOffset,
final InputStream source) final InputStream source)
throws Failure, InterruptedException { throws Failure, InterruptedException {
RSSReader rssReader; final RSSReader rssReader;
try { try {
rssReader = new RSSReader(RSSFeed.DEFAULT_MAXSIZE, source); rssReader = new RSSReader(RSSFeed.DEFAULT_MAXSIZE, source);
} catch (final IOException e) { } catch (final IOException e) {
@ -89,11 +95,38 @@ public class rssParser extends AbstractParser implements Parser {
DigestURL itemuri; DigestURL itemuri;
Set<String> languages; Set<String> languages;
Document doc; Document doc;
for (final Hit item: feed) { for (final Hit item : feed) {
final String linkUrlString = item.getLink();
itemuri = null;
if(StringUtils.isNotBlank(linkUrlString)) {
/* Link element is optional in RSS 2.0 and Atom */
try { try {
itemuri = new DigestURL(item.getLink()); itemuri = new DigestURL(item.getLink());
} catch(final MalformedURLException e) {
LOG.warn("Malformed feed item link url : " + linkUrlString);
}
}
languages = new HashSet<String>(); languages = new HashSet<String>();
languages.add(item.getLanguage()); languages.add(item.getLanguage());
Set<AnchorURL> anchors = null;
final String enclosureUrlString = item.getEnclosure();
if(StringUtils.isNotBlank(enclosureUrlString)) {
try {
final AnchorURL enclosureUrl = new AnchorURL(enclosureUrlString);
if(itemuri == null) {
/* No <link> element in this item : the enclosure URL is used as the sub document main location URL */
itemuri = enclosureUrl;
} else {
anchors = new HashSet<>();
anchors.add(enclosureUrl);
}
} catch(final MalformedURLException e) {
LOG.warn("Malformed feed item enclosure url : " + enclosureUrlString);
}
}
if(itemuri != null) {
doc = new Document( doc = new Document(
itemuri, itemuri,
TextParser.mimeOf(itemuri), TextParser.mimeOf(itemuri),
@ -109,14 +142,12 @@ public class rssParser extends AbstractParser implements Parser {
item.getLon(), item.getLon(),
item.getLat(), item.getLat(),
null, null,
null, anchors,
null, null,
new LinkedHashMap<DigestURL, ImageEntry>(), new LinkedHashMap<DigestURL, ImageEntry>(),
false, false,
item.getPubDate()); item.getPubDate());
docs.add(doc); docs.add(doc);
} catch (final MalformedURLException e) {
continue;
} }
} }

Loading…
Cancel
Save