Added RSS reader support for the `enclosure` feed item sub-element.

The enclosure element (see
http://www.rssboard.org/rss-specification#ltenclosuregtSubelementOfLtitemgt
) can be found, for example, in podcast feeds.
pull/167/merge
luccioman 7 years ago
parent e5f5de0fc7
commit cf62b571bd

@ -57,6 +57,11 @@ public interface Hit {
public void setSize(long size); public void setSize(long size);
/**
* Sets the enclosure of this feed item.
*
* @param enclosure a URL describing a media object that is attached to a feed item
*/
public void setEnclosure(String enclosure);
public String getAuthor(); public String getAuthor();
public String getCopyright(); public String getCopyright();
@ -82,6 +87,11 @@ public interface Hit {
public String[] getSubject(); public String[] getSubject();
public long getSize(); public long getSize();
/**
* Gets the enclosure of this feed item.
*
* @return a URL describing a media object that is attached to a feed item
*/
public String getEnclosure();
public double getLon(); public double getLon();

@ -86,6 +86,9 @@ public class RSSMessage implements Hit, Comparable<RSSMessage>, Comparator<RSSMe
/** A string that uniquely identifies an item (RSS 2.0) */ /** A string that uniquely identifies an item (RSS 2.0) */
guid(new String[]{"guid"}), guid(new String[]{"guid"}),
/** URL describing a media object that is attached to a feed item */
enclosure(new String[]{"enclosure"}),
/** Time To Live : number of minutes that indicates how long a channel (RSS 2.0) can be cached before refreshing from the source. */ /** Time To Live : number of minutes that indicates how long a channel (RSS 2.0) can be cached before refreshing from the source. */
ttl(new String[]{"ttl"}), ttl(new String[]{"ttl"}),
@ -163,15 +166,19 @@ public class RSSMessage implements Hit, Comparable<RSSMessage>, Comparator<RSSMe
if (description.length() > 0) this.map.put(Token.description.name(), description); if (description.length() > 0) this.map.put(Token.description.name(), description);
this.map.put(Token.link.name(), link.toNormalform(true)); this.map.put(Token.link.name(), link.toNormalform(true));
this.map.put(Token.pubDate.name(), HeaderFramework.FORMAT_RFC1123.format(new Date())); this.map.put(Token.pubDate.name(), HeaderFramework.FORMAT_RFC1123.format(new Date()));
if (guid.length() > 0) this.map.put(Token.guid.name(), guid); if (guid.length() > 0) {
this.map.put(Token.guid.name(), guid);
}
} }
public RSSMessage() { public RSSMessage() {
this.map = new HashMap<String, String>(); this.map = new HashMap<String, String>();
} }
public void setValue(final Token token, final String value) { public void setValue(final Token token, final String value) {
if (value.length() > 0) this.map.put(token.name(), value); if (value.length() > 0) {
this.map.put(token.name(), value);
}
} }
@Override @Override
@ -277,7 +284,12 @@ public class RSSMessage implements Hit, Comparable<RSSMessage>, Comparator<RSSMe
} }
return guid; return guid;
} }
/**
* Looks up the value stored for {@link Token#enclosure} in this message's map.
*
* @return the result of the token lookup; the empty string is passed as the
*         second argument, presumably the fallback when no enclosure is
*         stored (confirm against Token.valueFrom)
*/
@Override
public String getEnclosure() {
return Token.enclosure.valueFrom(this.map, "");
}
public String getTTL() { public String getTTL() {
return Token.ttl.valueFrom(this.map, ""); return Token.ttl.valueFrom(this.map, "");
} }
@ -371,7 +383,12 @@ public class RSSMessage implements Hit, Comparable<RSSMessage>, Comparator<RSSMe
public void setGuid(final String guid) { public void setGuid(final String guid) {
setValue(Token.guid, guid); setValue(Token.guid, guid);
} }
/**
* Stores the enclosure URL of this message under {@link Token#enclosure}.
* Delegates to {@code setValue}, which only stores values whose length is
* greater than zero, so an empty string leaves the message unchanged.
*
* @param enclosure a URL describing a media object attached to the feed item;
*        NOTE(review): {@code setValue} calls {@code value.length()} without a
*        null check, so callers should pass a non-null string
*/
@Override
public void setEnclosure(final String enclosure) {
setValue(Token.enclosure, enclosure);
}
@Override @Override
public void setLanguage(final String language) { public void setLanguage(final String language) {
setValue(Token.language, language); setValue(Token.language, language);

@ -30,6 +30,7 @@ import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory; import javax.xml.parsers.SAXParserFactory;
import org.apache.commons.lang.StringUtils;
import org.xml.sax.Attributes; import org.xml.sax.Attributes;
import org.xml.sax.EntityResolver; import org.xml.sax.EntityResolver;
import org.xml.sax.InputSource; import org.xml.sax.InputSource;
@ -168,10 +169,33 @@ public class RSSReader extends DefaultHandler {
} }
this.item = new RSSMessage(); this.item = new RSSMessage();
this.parsingItem = true; this.parsingItem = true;
} else if (this.parsingItem && this.type == Type.atom && "link".equals(tag) && (atts.getValue("rel") == null || atts.getValue("rel").equals("alternate"))) { } else if (this.parsingItem) {
// atom link handling (rss link is handled in endElement) if(this.type == Type.atom) {
final String url = atts.getValue("href"); if ("link".equals(tag)) {
if (url != null && url.length() > 0) this.item.setValue(Token.link, url); final String linkRelation = atts.getValue("rel");
if (linkRelation == null || linkRelation.equals("alternate")) {
// atom link handling (rss link is handled in endElement)
final String url = atts.getValue("href");
if (StringUtils.isNotBlank(url)) {
this.item.setValue(Token.link, url);
}
} else if("enclosure".equals(linkRelation)) {
/* Atom rel="enclosure" link type */
final String url = atts.getValue("href");
if(StringUtils.isNotBlank(url)) {
this.item.setEnclosure(url);
}
}
}
} else if(this.type == Type.rss) {
/* RSS 0.92 and 2.0 <enclosure> element */
if ("enclosure".equals(tag)) {
final String url = atts.getValue("url");
if(StringUtils.isNotBlank(url)) {
this.item.setEnclosure(url);
}
}
}
} else if ("rss".equals(tag)) { } else if ("rss".equals(tag)) {
this.type = Type.rss; this.type = Type.rss;
} }
@ -189,7 +213,9 @@ public class RSSReader extends DefaultHandler {
} else if (this.parsingItem) { } else if (this.parsingItem) {
final String value = this.buffer.toString().trim(); final String value = this.buffer.toString().trim();
this.buffer.setLength(0); this.buffer.setLength(0);
if (RSSMessage.tags.contains(tag) && value.length() > 0) this.item.setValue(RSSMessage.valueOfNick(tag), value); if (RSSMessage.tags.contains(tag) && value.length() > 0) {
this.item.setValue(RSSMessage.valueOfNick(tag), value);
}
} else if (this.parsingChannel) { } else if (this.parsingChannel) {
final String value = this.buffer.toString().trim(); final String value = this.buffer.toString().trim();
this.buffer.setLength(0); this.buffer.setLength(0);

@ -34,10 +34,14 @@ import java.util.LinkedHashMap;
import java.util.List; import java.util.List;
import java.util.Set; import java.util.Set;
import org.apache.commons.lang.StringUtils;
import net.yacy.cora.document.feed.Hit; import net.yacy.cora.document.feed.Hit;
import net.yacy.cora.document.feed.RSSFeed; import net.yacy.cora.document.feed.RSSFeed;
import net.yacy.cora.document.feed.RSSReader; import net.yacy.cora.document.feed.RSSReader;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.AbstractParser; import net.yacy.document.AbstractParser;
import net.yacy.document.Document; import net.yacy.document.Document;
import net.yacy.document.Parser; import net.yacy.document.Parser;
@ -46,6 +50,8 @@ import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.ImageEntry; import net.yacy.document.parser.html.ImageEntry;
public class rssParser extends AbstractParser implements Parser { public class rssParser extends AbstractParser implements Parser {
private final static ConcurrentLog LOG = new ConcurrentLog(rssParser.class.getSimpleName());
public rssParser() { public rssParser() {
super("RSS Parser"); super("RSS Parser");
@ -67,7 +73,7 @@ public class rssParser extends AbstractParser implements Parser {
final int timezoneOffset, final int timezoneOffset,
final InputStream source) final InputStream source)
throws Failure, InterruptedException { throws Failure, InterruptedException {
RSSReader rssReader; final RSSReader rssReader;
try { try {
rssReader = new RSSReader(RSSFeed.DEFAULT_MAXSIZE, source); rssReader = new RSSReader(RSSFeed.DEFAULT_MAXSIZE, source);
} catch (final IOException e) { } catch (final IOException e) {
@ -89,34 +95,59 @@ public class rssParser extends AbstractParser implements Parser {
DigestURL itemuri; DigestURL itemuri;
Set<String> languages; Set<String> languages;
Document doc; Document doc;
for (final Hit item: feed) { for (final Hit item : feed) {
try { final String linkUrlString = item.getLink();
itemuri = new DigestURL(item.getLink()); itemuri = null;
languages = new HashSet<String>(); if(StringUtils.isNotBlank(linkUrlString)) {
languages.add(item.getLanguage()); /* Link element is optional in RSS 2.0 and Atom */
try {
itemuri = new DigestURL(item.getLink());
} catch(final MalformedURLException e) {
LOG.warn("Malformed feed item link url : " + linkUrlString);
}
}
languages = new HashSet<String>();
languages.add(item.getLanguage());
Set<AnchorURL> anchors = null;
final String enclosureUrlString = item.getEnclosure();
if(StringUtils.isNotBlank(enclosureUrlString)) {
try {
final AnchorURL enclosureUrl = new AnchorURL(enclosureUrlString);
if(itemuri == null) {
/* No <link> element in this item : the enclosure URL is used as the sub document main location URL */
itemuri = enclosureUrl;
} else {
anchors = new HashSet<>();
anchors.add(enclosureUrl);
}
} catch(final MalformedURLException e) {
LOG.warn("Malformed feed item enclosure url : " + enclosureUrlString);
}
}
if(itemuri != null) {
doc = new Document( doc = new Document(
itemuri, itemuri,
TextParser.mimeOf(itemuri), TextParser.mimeOf(itemuri),
charset, charset,
this, this,
languages, languages,
item.getSubject(), item.getSubject(),
singleList(item.getTitle()), singleList(item.getTitle()),
item.getAuthor(), item.getAuthor(),
item.getCopyright(), item.getCopyright(),
null, null,
item.getDescriptions(), item.getDescriptions(),
item.getLon(), item.getLon(),
item.getLat(), item.getLat(),
null, null,
null, anchors,
null, null,
new LinkedHashMap<DigestURL, ImageEntry>(), new LinkedHashMap<DigestURL, ImageEntry>(),
false, false,
item.getPubDate()); item.getPubDate());
docs.add(doc); docs.add(doc);
} catch (final MalformedURLException e) {
continue;
} }
} }

Loading…
Cancel
Save