Added RSS reader support for the `enclosure` feed item sub-element.

The enclosure element (see
http://www.rssboard.org/rss-specification#ltenclosuregtSubelementOfLtitemgt
) can be found, for example, in podcast feeds.
pull/167/merge
luccioman 7 years ago
parent e5f5de0fc7
commit cf62b571bd

@ -57,6 +57,11 @@ public interface Hit {
public void setSize(long size);
/**
* Sets the enclosure of this feed item.
*
* @param enclosure a URL describing a media object that is attached to a feed item
*/
public void setEnclosure(String enclosure);
public String getAuthor();
public String getCopyright();
@ -82,6 +87,11 @@ public interface Hit {
public String[] getSubject();
public long getSize();
/**
* Gets the enclosure of this feed item.
*
* @return a URL describing a media object that is attached to a feed item
*/
public String getEnclosure();
public double getLon();

@ -86,6 +86,9 @@ public class RSSMessage implements Hit, Comparable<RSSMessage>, Comparator<RSSMe
/** A string that uniquely identifies an item (RSS 2.0) */
guid(new String[]{"guid"}),
/** URL describing a media object that is attached to a feed item */
enclosure(new String[]{"enclosure"}),
/** Time To Live : number of minutes that indicates how long a channel (RSS 2.0) can be cached before refreshing from the source. */
ttl(new String[]{"ttl"}),
@ -163,15 +166,19 @@ public class RSSMessage implements Hit, Comparable<RSSMessage>, Comparator<RSSMe
if (description.length() > 0) this.map.put(Token.description.name(), description);
this.map.put(Token.link.name(), link.toNormalform(true));
this.map.put(Token.pubDate.name(), HeaderFramework.FORMAT_RFC1123.format(new Date()));
if (guid.length() > 0) this.map.put(Token.guid.name(), guid);
if (guid.length() > 0) {
this.map.put(Token.guid.name(), guid);
}
}
/**
* Creates a message with no values set: the backing map is initialized
* to a new, empty HashMap.
*/
public RSSMessage() {
this.map = new HashMap<String, String>();
}
/**
 * Stores a value for the given token in this message's map, using the
 * token name as the key.
 * <p>
 * A null or empty value is ignored: nothing is written and any value
 * previously stored for the token is left untouched.
 *
 * @param token the token whose name is used as the map key
 * @param value the value to store; ignored when null or empty
 */
public void setValue(final Token token, final String value) {
    // Guard against null as well as empty input so that setters delegating
    // here never fail with a NullPointerException on a missing value.
    if (value == null || value.length() == 0) {
        return;
    }
    this.map.put(token.name(), value);
}
@Override
@ -277,7 +284,12 @@ public class RSSMessage implements Hit, Comparable<RSSMessage>, Comparator<RSSMe
}
return guid;
}
/**
* Looks up the enclosure token in this message's map.
* The empty string is passed as the second argument to valueFrom
* (presumably the value returned when no enclosure is stored - confirm against Token.valueFrom).
*
* @return the value resolved for the enclosure token
*/
@Override
public String getEnclosure() {
return Token.enclosure.valueFrom(this.map, "");
}
/**
* Looks up the ttl (Time To Live) token in this message's map.
* The empty string is passed as the second argument to valueFrom
* (presumably the value returned when no ttl is stored - confirm against Token.valueFrom).
*
* @return the value resolved for the ttl token, as a String
*/
public String getTTL() {
return Token.ttl.valueFrom(this.map, "");
}
@ -371,7 +383,12 @@ public class RSSMessage implements Hit, Comparable<RSSMessage>, Comparator<RSSMe
public void setGuid(final String guid) {
setValue(Token.guid, guid);
}
/**
* Records the enclosure of this message by delegating to
* setValue with the enclosure token.
*
* @param enclosure a URL describing a media object that is attached to a feed item
*/
@Override
public void setEnclosure(final String enclosure) {
setValue(Token.enclosure, enclosure);
}
@Override
public void setLanguage(final String language) {
setValue(Token.language, language);

@ -30,6 +30,7 @@ import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.apache.commons.lang.StringUtils;
import org.xml.sax.Attributes;
import org.xml.sax.EntityResolver;
import org.xml.sax.InputSource;
@ -168,10 +169,33 @@ public class RSSReader extends DefaultHandler {
}
this.item = new RSSMessage();
this.parsingItem = true;
} else if (this.parsingItem && this.type == Type.atom && "link".equals(tag) && (atts.getValue("rel") == null || atts.getValue("rel").equals("alternate"))) {
// atom link handling (rss link is handled in endElement)
final String url = atts.getValue("href");
if (url != null && url.length() > 0) this.item.setValue(Token.link, url);
} else if (this.parsingItem) {
if(this.type == Type.atom) {
if ("link".equals(tag)) {
final String linkRelation = atts.getValue("rel");
if (linkRelation == null || linkRelation.equals("alternate")) {
// atom link handling (rss link is handled in endElement)
final String url = atts.getValue("href");
if (StringUtils.isNotBlank(url)) {
this.item.setValue(Token.link, url);
}
} else if("enclosure".equals(linkRelation)) {
/* Atom rel="enclosure" link type */
final String url = atts.getValue("href");
if(StringUtils.isNotBlank(url)) {
this.item.setEnclosure(url);
}
}
}
} else if(this.type == Type.rss) {
/* RSS 0.92 and 2.0 <enclosure> element */
if ("enclosure".equals(tag)) {
final String url = atts.getValue("url");
if(StringUtils.isNotBlank(url)) {
this.item.setEnclosure(url);
}
}
}
} else if ("rss".equals(tag)) {
this.type = Type.rss;
}
@ -189,7 +213,9 @@ public class RSSReader extends DefaultHandler {
} else if (this.parsingItem) {
final String value = this.buffer.toString().trim();
this.buffer.setLength(0);
if (RSSMessage.tags.contains(tag) && value.length() > 0) this.item.setValue(RSSMessage.valueOfNick(tag), value);
if (RSSMessage.tags.contains(tag) && value.length() > 0) {
this.item.setValue(RSSMessage.valueOfNick(tag), value);
}
} else if (this.parsingChannel) {
final String value = this.buffer.toString().trim();
this.buffer.setLength(0);

@ -34,10 +34,14 @@ import java.util.LinkedHashMap;
import java.util.List;
import java.util.Set;
import org.apache.commons.lang.StringUtils;
import net.yacy.cora.document.feed.Hit;
import net.yacy.cora.document.feed.RSSFeed;
import net.yacy.cora.document.feed.RSSReader;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
@ -46,6 +50,8 @@ import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.html.ImageEntry;
public class rssParser extends AbstractParser implements Parser {
private final static ConcurrentLog LOG = new ConcurrentLog(rssParser.class.getSimpleName());
public rssParser() {
super("RSS Parser");
@ -67,7 +73,7 @@ public class rssParser extends AbstractParser implements Parser {
final int timezoneOffset,
final InputStream source)
throws Failure, InterruptedException {
RSSReader rssReader;
final RSSReader rssReader;
try {
rssReader = new RSSReader(RSSFeed.DEFAULT_MAXSIZE, source);
} catch (final IOException e) {
@ -89,34 +95,59 @@ public class rssParser extends AbstractParser implements Parser {
DigestURL itemuri;
Set<String> languages;
Document doc;
for (final Hit item: feed) {
try {
itemuri = new DigestURL(item.getLink());
languages = new HashSet<String>();
languages.add(item.getLanguage());
for (final Hit item : feed) {
final String linkUrlString = item.getLink();
itemuri = null;
if(StringUtils.isNotBlank(linkUrlString)) {
/* Link element is optional in RSS 2.0 and Atom */
try {
itemuri = new DigestURL(item.getLink());
} catch(final MalformedURLException e) {
LOG.warn("Malformed feed item link url : " + linkUrlString);
}
}
languages = new HashSet<String>();
languages.add(item.getLanguage());
Set<AnchorURL> anchors = null;
final String enclosureUrlString = item.getEnclosure();
if(StringUtils.isNotBlank(enclosureUrlString)) {
try {
final AnchorURL enclosureUrl = new AnchorURL(enclosureUrlString);
if(itemuri == null) {
/* No <link> element in this item : the enclosure URL is used as the sub document main location URL */
itemuri = enclosureUrl;
} else {
anchors = new HashSet<>();
anchors.add(enclosureUrl);
}
} catch(final MalformedURLException e) {
LOG.warn("Malformed feed item enclosure url : " + enclosureUrlString);
}
}
if(itemuri != null) {
doc = new Document(
itemuri,
TextParser.mimeOf(itemuri),
charset,
this,
languages,
item.getSubject(),
singleList(item.getTitle()),
item.getAuthor(),
item.getCopyright(),
null,
item.getDescriptions(),
item.getLon(),
item.getLat(),
null,
null,
null,
new LinkedHashMap<DigestURL, ImageEntry>(),
false,
item.getPubDate());
itemuri,
TextParser.mimeOf(itemuri),
charset,
this,
languages,
item.getSubject(),
singleList(item.getTitle()),
item.getAuthor(),
item.getCopyright(),
null,
item.getDescriptions(),
item.getLon(),
item.getLat(),
null,
anchors,
null,
new LinkedHashMap<DigestURL, ImageEntry>(),
false,
item.getPubDate());
docs.add(doc);
} catch (final MalformedURLException e) {
continue;
}
}

Loading…
Cancel
Save