From cf62b571bd262f66984f40facdb5c0592a76ddb2 Mon Sep 17 00:00:00 2001 From: luccioman Date: Tue, 20 Mar 2018 07:38:29 +0100 Subject: [PATCH] Added RSS reader support for `enclosure` feed item sub element. Enclosure element (see http://www.rssboard.org/rss-specification#ltenclosuregtSubelementOfLtitemgt ) can be seen for example in podcasts feeds. --- source/net/yacy/cora/document/feed/Hit.java | 10 +++ .../yacy/cora/document/feed/RSSMessage.java | 27 ++++-- .../yacy/cora/document/feed/RSSReader.java | 36 ++++++-- .../net/yacy/document/parser/rssParser.java | 85 +++++++++++++------ 4 files changed, 121 insertions(+), 37 deletions(-) diff --git a/source/net/yacy/cora/document/feed/Hit.java b/source/net/yacy/cora/document/feed/Hit.java index 34f486ddd..3cb61ab71 100644 --- a/source/net/yacy/cora/document/feed/Hit.java +++ b/source/net/yacy/cora/document/feed/Hit.java @@ -57,6 +57,11 @@ public interface Hit { public void setSize(long size); + /** + * @param enclosure an URL describing a media object that is attached to a feed item + */ + public void setEnclosure(String enclosure); + public String getAuthor(); public String getCopyright(); @@ -82,6 +87,11 @@ public interface Hit { public String[] getSubject(); public long getSize(); + + /** + * @return an URL describing a media object that is attached to a feed item + */ + public String getEnclosure(); public double getLon(); diff --git a/source/net/yacy/cora/document/feed/RSSMessage.java b/source/net/yacy/cora/document/feed/RSSMessage.java index 804fb8487..a12768a24 100644 --- a/source/net/yacy/cora/document/feed/RSSMessage.java +++ b/source/net/yacy/cora/document/feed/RSSMessage.java @@ -86,6 +86,9 @@ public class RSSMessage implements Hit, Comparable, Comparator, Comparator 0) this.map.put(Token.description.name(), description); this.map.put(Token.link.name(), link.toNormalform(true)); this.map.put(Token.pubDate.name(), HeaderFramework.FORMAT_RFC1123.format(new Date())); - if (guid.length() > 0) 
this.map.put(Token.guid.name(), guid); + if (guid.length() > 0) { + this.map.put(Token.guid.name(), guid); + } } public RSSMessage() { this.map = new HashMap(); } - + public void setValue(final Token token, final String value) { - if (value.length() > 0) this.map.put(token.name(), value); + if (value.length() > 0) { + this.map.put(token.name(), value); + } } @Override @@ -277,7 +284,12 @@ public class RSSMessage implements Hit, Comparable, Comparator, Comparator 0) this.item.setValue(Token.link, url); + } else if (this.parsingItem) { + if(this.type == Type.atom) { + if ("link".equals(tag)) { + final String linkRelation = atts.getValue("rel"); + if (linkRelation == null || linkRelation.equals("alternate")) { + // atom link handling (rss link is handled in endElement) + final String url = atts.getValue("href"); + if (StringUtils.isNotBlank(url)) { + this.item.setValue(Token.link, url); + } + } else if("enclosure".equals(linkRelation)) { + /* Atom rel="enclosure" link type */ + final String url = atts.getValue("href"); + if(StringUtils.isNotBlank(url)) { + this.item.setEnclosure(url); + } + } + } + } else if(this.type == Type.rss) { + /* RSS 0.92 and 2.0 <enclosure> element */ + if ("enclosure".equals(tag)) { + final String url = atts.getValue("url"); + if(StringUtils.isNotBlank(url)) { + this.item.setEnclosure(url); + } + } + } } else if ("rss".equals(tag)) { this.type = Type.rss; } @@ -189,7 +213,9 @@ public class RSSReader extends DefaultHandler { } else if (this.parsingItem) { final String value = this.buffer.toString().trim(); this.buffer.setLength(0); - if (RSSMessage.tags.contains(tag) && value.length() > 0) this.item.setValue(RSSMessage.valueOfNick(tag), value); + if (RSSMessage.tags.contains(tag) && value.length() > 0) { + this.item.setValue(RSSMessage.valueOfNick(tag), value); + } } else if (this.parsingChannel) { final String value = this.buffer.toString().trim(); this.buffer.setLength(0); diff --git a/source/net/yacy/document/parser/rssParser.java 
b/source/net/yacy/document/parser/rssParser.java index 3b88ff10c..2da3e73da 100644 --- a/source/net/yacy/document/parser/rssParser.java +++ b/source/net/yacy/document/parser/rssParser.java @@ -34,10 +34,14 @@ import java.util.LinkedHashMap; import java.util.List; import java.util.Set; +import org.apache.commons.lang.StringUtils; + import net.yacy.cora.document.feed.Hit; import net.yacy.cora.document.feed.RSSFeed; import net.yacy.cora.document.feed.RSSReader; +import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.util.ConcurrentLog; import net.yacy.document.AbstractParser; import net.yacy.document.Document; import net.yacy.document.Parser; @@ -46,6 +50,8 @@ import net.yacy.document.VocabularyScraper; import net.yacy.document.parser.html.ImageEntry; public class rssParser extends AbstractParser implements Parser { + + private final static ConcurrentLog LOG = new ConcurrentLog(rssParser.class.getSimpleName()); public rssParser() { super("RSS Parser"); @@ -67,7 +73,7 @@ public class rssParser extends AbstractParser implements Parser { final int timezoneOffset, final InputStream source) throws Failure, InterruptedException { - RSSReader rssReader; + final RSSReader rssReader; try { rssReader = new RSSReader(RSSFeed.DEFAULT_MAXSIZE, source); } catch (final IOException e) { @@ -89,34 +95,59 @@ public class rssParser extends AbstractParser implements Parser { DigestURL itemuri; Set languages; Document doc; - for (final Hit item: feed) { - try { - itemuri = new DigestURL(item.getLink()); - languages = new HashSet(); - languages.add(item.getLanguage()); + for (final Hit item : feed) { + final String linkUrlString = item.getLink(); + itemuri = null; + if(StringUtils.isNotBlank(linkUrlString)) { + /* Link element is optional in RSS 2.0 and Atom */ + try { + itemuri = new DigestURL(item.getLink()); + } catch(final MalformedURLException e) { + LOG.warn("Malformed feed item link url : " + linkUrlString); + } + } + languages 
= new HashSet(); + languages.add(item.getLanguage()); + + Set anchors = null; + final String enclosureUrlString = item.getEnclosure(); + if(StringUtils.isNotBlank(enclosureUrlString)) { + try { + final AnchorURL enclosureUrl = new AnchorURL(enclosureUrlString); + if(itemuri == null) { + /* No <link> element in this item : the enclosure URL is used as the sub document main location URL */ + itemuri = enclosureUrl; + } else { + anchors = new HashSet<>(); + anchors.add(enclosureUrl); + } + } catch(final MalformedURLException e) { + LOG.warn("Malformed feed item enclosure url : " + enclosureUrlString); + } + } + + if(itemuri != null) { doc = new Document( - itemuri, - TextParser.mimeOf(itemuri), - charset, - this, - languages, - item.getSubject(), - singleList(item.getTitle()), - item.getAuthor(), - item.getCopyright(), - null, - item.getDescriptions(), - item.getLon(), - item.getLat(), - null, - null, - null, - new LinkedHashMap(), - false, - item.getPubDate()); + itemuri, + TextParser.mimeOf(itemuri), + charset, + this, + languages, + item.getSubject(), + singleList(item.getTitle()), + item.getAuthor(), + item.getCopyright(), + null, + item.getDescriptions(), + item.getLon(), + item.getLat(), + null, + anchors, + null, + new LinkedHashMap(), + false, + item.getPubDate()); docs.add(doc); - } catch (final MalformedURLException e) { - continue; } }