From e45afedee4b9c809a4e3ac3da002dd93283543be Mon Sep 17 00:00:00 2001 From: luccioman Date: Wed, 21 Mar 2018 08:22:29 +0100 Subject: [PATCH] Added support for enclosures (media links) to the RSS loader --- htroot/Load_RSS_p.html | 4 +- htroot/Load_RSS_p.java | 126 +++++++++++++----- .../net/yacy/crawler/retrieval/RSSLoader.java | 30 ++++- 3 files changed, 123 insertions(+), 37 deletions(-) diff --git a/htroot/Load_RSS_p.html b/htroot/Load_RSS_p.html index 520242dac..bf67c3fb9 100644 --- a/htroot/Load_RSS_p.html +++ b/htroot/Load_RSS_p.html @@ -154,14 +154,14 @@ #{item}# - #(indexable)# ::#(/indexable)# + #(indexable)# ::#(/indexable)# #(state)#new::enqueued::indexed#(/state)# #(hasLink)#::#(/hasLink)##[title]##(hasLink)#::#(/hasLink)# #(hasLink)#::#[link]##(/hasLink)# #[author]# #[language]# #[date]# - #[description]# + #[description]##(defaultMediaDesc)#::Attached media#(/defaultMediaDesc)# #{/item}# diff --git a/htroot/Load_RSS_p.java b/htroot/Load_RSS_p.java index b21adaf70..7ce395de3 100644 --- a/htroot/Load_RSS_p.java +++ b/htroot/Load_RSS_p.java @@ -60,6 +60,12 @@ import net.yacy.server.serverSwitch; public class Load_RSS_p { + /** Value prefix of checkbox inputs used to select items */ + private static final String CHECKBOX_ITEM_PREFIX = "mark_"; + + /** Value prefix of checkbox inputs used to select media items */ + private static final String CHECKBOX_MEDIA_ITEM_PREFIX = "media_"; + public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) { final serverObjects prop = new serverObjects(); @@ -79,8 +85,8 @@ public class Load_RSS_p { if (post != null && post.containsKey("removeSelectedFeedsNewList")) { for (final Map.Entry entry: post.entrySet()) { - if (entry.getValue().startsWith("mark_")) try { - sb.tables.delete("rss", entry.getValue().substring(5).getBytes()); + if (entry.getValue().startsWith(CHECKBOX_ITEM_PREFIX)) try { + sb.tables.delete("rss", 
entry.getValue().substring(CHECKBOX_ITEM_PREFIX.length()).getBytes()); } catch (final IOException e) { ConcurrentLog.logException(e); } @@ -114,8 +120,8 @@ public class Load_RSS_p { if (post != null && post.containsKey("removeSelectedFeedsScheduler")) { for (final Map.Entry entry: post.entrySet()) { - if (entry.getValue().startsWith("mark_")) try { - final byte[] pk = entry.getValue().substring(5).getBytes(); + if (entry.getValue().startsWith(CHECKBOX_ITEM_PREFIX)) try { + final byte[] pk = entry.getValue().substring(CHECKBOX_ITEM_PREFIX.length()).getBytes(); final Row rssRow = sb.tables.select("rss", pk); final byte[] schedulerPK = rssRow.get("api_pk", (byte[]) null); if (schedulerPK != null) sb.tables.delete("api", schedulerPK); @@ -161,10 +167,10 @@ public class Load_RSS_p { if (post != null && post.containsKey("addSelectedFeedScheduler")) { ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName)); for (final Map.Entry entry: post.entrySet()) { - if (entry.getValue().startsWith("mark_")) { + if (entry.getValue().startsWith(CHECKBOX_ITEM_PREFIX)) { Row row; try { - final byte [] pk = entry.getValue().substring(5).getBytes(); + final byte [] pk = entry.getValue().substring(CHECKBOX_ITEM_PREFIX.length()).getBytes(); row = sb.tables.select("rss", pk); } catch (final IOException e) { ConcurrentLog.logException(e); @@ -289,8 +295,9 @@ public class Load_RSS_p { final RSSFeed feed = rss.getFeed(); final Map hash2UrlMap = new HashMap(); loop: for (final Map.Entry entry: post.entrySet()) { - if (entry.getValue().startsWith("mark_")) { - final RSSMessage message = feed.getMessage(entry.getValue().substring(5)); + if (entry.getValue().startsWith(CHECKBOX_ITEM_PREFIX)) { + /* Process selected item links */ + final RSSMessage message = feed.getMessage(entry.getValue().substring(CHECKBOX_ITEM_PREFIX.length())); if(message == null || StringUtils.isBlank(message.getLink())) { /* Link element is 
optional in RSS 2.0 and Atom */ continue loop; @@ -306,6 +313,24 @@ public class Load_RSS_p { continue loop; } hash2UrlMap.put(ASCII.String(messageUrl.hash()), messageUrl); + } else if(entry.getValue().startsWith(CHECKBOX_MEDIA_ITEM_PREFIX)) { + /* Process selected item enclosure (media) links */ + final RSSMessage message = feed.getMessage(entry.getValue().substring(CHECKBOX_MEDIA_ITEM_PREFIX.length())); + if(message == null || StringUtils.isBlank(message.getEnclosure())) { + /* Enclosure element is optional */ + continue loop; + } + DigestURL mediaUrl; + try { + mediaUrl = new DigestURL(message.getEnclosure()); + } catch (MalformedURLException e) { + ConcurrentLog.warn("Load_RSS", "Malformed feed item enclosure URL : " + message.getEnclosure()); + continue loop; + } + if (RSSLoader.indexTriggered.containsKey(mediaUrl.hash())) { + continue loop; + } + hash2UrlMap.put(ASCII.String(mediaUrl.hash()), mediaUrl); } } @@ -366,6 +391,21 @@ public class Load_RSS_p { ConcurrentLog.warn("Load_RSS", "Malformed feed item link URL : " + linkStr); } } + + DigestURL enclosure = null; + final String enclosureStr = item.getEnclosure(); + if(StringUtils.isNotBlank(enclosureStr)) { + try { + enclosure = new DigestURL(enclosureStr); + } catch (final MalformedURLException e) { + ConcurrentLog.warn("Load_RSS", "Malformed feed item enclosure URL : " + enclosureStr); + } + } + + if(link == null) { + /* No link in this feed item : we use the enclosure media URL as the main link */ + link = enclosure; + } author = item.getAuthor(); if (author == null) { @@ -374,32 +414,56 @@ public class Load_RSS_p { pubDate = item.getPubDate(); HarvestProcess harvestProcess; - try { - if(link != null && StringUtils.isNotEmpty(item.getGuid())) { - harvestProcess = sb.urlExists(ASCII.String(link.hash())); + try { + if(link != null && StringUtils.isNotEmpty(item.getGuid())) { + harvestProcess = sb.urlExists(ASCII.String(link.hash())); + + prop.put("showitems_item_" + i + "_hasLink", true); + 
prop.putHTML("showitems_item_" + i + "_hasLink_link", link.toNormalform(true)); + final int state = harvestProcess != null ? 2 : RSSLoader.indexTriggered.containsKey(link.hash()) ? 1 : 0; + prop.put("showitems_item_" + i + "_state", state); + prop.put("showitems_item_" + i + "_indexable", state == 0); + prop.put("showitems_item_" + i + "_indexable_count", i); + prop.putHTML("showitems_item_" + i + "_indexable_inputValue", (link == enclosure ? CHECKBOX_MEDIA_ITEM_PREFIX : CHECKBOX_ITEM_PREFIX) + item.getGuid()); + } else { + prop.put("showitems_item_" + i + "_state", 0); + prop.put("showitems_item_" + i + "_indexable", false); + prop.put("showitems_item_" + i + "_hasLink", false); + } + prop.putHTML("showitems_item_" + i + "_author", author == null ? "" : author); + prop.putHTML("showitems_item_" + i + "_title", item.getTitle()); + prop.putHTML("showitems_item_" + i + "_description", item.getDescriptions().toString()); + prop.put("showitems_item_" + i + "_defaultMediaDesc", false); + prop.putHTML("showitems_item_" + i + "_language", item.getLanguage()); + prop.putHTML("showitems_item_" + i + "_date", (pubDate == null) ? "" : DateFormat.getDateTimeInstance().format(pubDate)); + i++; + } catch (IOException e) { + ConcurrentLog.logException(e); + } + + try { + if(enclosure != null && enclosure != link && StringUtils.isNotEmpty(item.getGuid())) { + harvestProcess = sb.urlExists(ASCII.String(enclosure.hash())); - prop.put("showitems_item_" + i + "_hasLink", true); - prop.putHTML("showitems_item_" + i + "_hasLink_link", link.toNormalform(true)); - final int state = harvestProcess != null ? 2 : RSSLoader.indexTriggered.containsKey(link.hash()) ? 
1 : 0; - prop.put("showitems_item_" + i + "_state", state); - prop.put("showitems_item_" + i + "_indexable", state == 0); - prop.put("showitems_item_" + i + "_indexable_count", i); - prop.putHTML("showitems_item_" + i + "_indexable_guid", item.getGuid()); - } else { - prop.put("showitems_item_" + i + "_state", 0); - prop.put("showitems_item_" + i + "_indexable", false); - prop.put("showitems_item_" + i + "_hasLink", false); - } - prop.putHTML("showitems_item_" + i + "_author", author == null ? "" : author); + prop.put("showitems_item_" + i + "_hasLink", true); + prop.putHTML("showitems_item_" + i + "_hasLink_link", enclosure.toNormalform(true)); + final int state = harvestProcess != null ? 2 : RSSLoader.indexTriggered.containsKey(enclosure.hash()) ? 1 : 0; + prop.put("showitems_item_" + i + "_state", state); + prop.put("showitems_item_" + i + "_indexable", state == 0); + prop.put("showitems_item_" + i + "_indexable_count", i); + prop.putHTML("showitems_item_" + i + "_indexable_inputValue", "media_" + item.getGuid()); + prop.putHTML("showitems_item_" + i + "_author", ""); prop.putHTML("showitems_item_" + i + "_title", item.getTitle()); - prop.putHTML("showitems_item_" + i + "_description", item.getDescriptions().toString()); - prop.putHTML("showitems_item_" + i + "_language", item.getLanguage()); - prop.putHTML("showitems_item_" + i + "_date", (pubDate == null) ? 
"" : DateFormat.getDateTimeInstance().format(pubDate)); + prop.putHTML("showitems_item_" + i + "_description", ""); + /* Description is already used for the main item link, use here a default one */ + prop.put("showitems_item_" + i + "_defaultMediaDesc", true); + prop.putHTML("showitems_item_" + i + "_language", ""); + prop.putHTML("showitems_item_" + i + "_date", ""); i++; - } catch (IOException e) { - ConcurrentLog.logException(e); - continue; - } + } + } catch (IOException e) { + ConcurrentLog.logException(e); + } } prop.put("showitems_item", i); prop.put("showitems_num", i); diff --git a/source/net/yacy/crawler/retrieval/RSSLoader.java b/source/net/yacy/crawler/retrieval/RSSLoader.java index d3e85526b..37324fdcf 100644 --- a/source/net/yacy/crawler/retrieval/RSSLoader.java +++ b/source/net/yacy/crawler/retrieval/RSSLoader.java @@ -97,7 +97,14 @@ public class RSSLoader extends Thread { recordAPI(this.sb, null, this.urlf, feed, 7, "seldays"); } - public static void indexAllRssFeed(final Switchboard sb, final DigestURL url, final RSSFeed feed, Map collections) { + /** + * Iterate over the given feed and add all item links and enclosures URLs to a new switchboard indexing task. + * @param sb the main environment switchboard instance. Must not be null. + * @param feedUrl the feed url. Must not be null. + * @param feed the parsed feed. Must not be null. 
+ * @param collections + */ + public static void indexAllRssFeed(final Switchboard sb, final DigestURL feedUrl, final RSSFeed feed, final Map collections) { int loadCount = 0; final Map urlmap = new HashMap(); for (final RSSMessage message: feed) { @@ -114,6 +121,21 @@ public class RSSLoader extends Thread { ConcurrentLog.warn("Load_RSS", "Malformed feed item link URL : " + linkStr); } } + + /* An enclosure (media) URL may also be defined for that item */ + final String enclosureStr = message.getEnclosure(); + if(StringUtils.isNotBlank(enclosureStr)) { // Enclosure element is optional + DigestURL enclosureUrl; + try { + enclosureUrl = new DigestURL(enclosureStr); + if (indexTriggered.containsKey(enclosureUrl.hash())) { + continue; + } + urlmap.put(ASCII.String(enclosureUrl.hash()), enclosureUrl); + } catch (MalformedURLException e1) { + ConcurrentLog.warn("Load_RSS", "Malformed feed item enclosure URL : " + enclosureStr); + } + } } final List list = new ArrayList(); @@ -135,7 +157,7 @@ public class RSSLoader extends Thread { // update info for loading try { - Tables.Data rssRow = sb.tables.select("rss", url.hash()); + Tables.Data rssRow = sb.tables.select("rss", feedUrl.hash()); if (rssRow == null) rssRow = new Tables.Data(); final Date lastLoadDate = rssRow.get("last_load_date", new Date(0)); final long deltaTime = Math.min(System.currentTimeMillis() - lastLoadDate.getTime(), 1000 * 60 * 60 * 24); @@ -143,13 +165,13 @@ public class RSSLoader extends Thread { final int lastAvg = rssRow.get("avg_upd_per_day", 0); final long thisAvg = 1000 * 60 * 60 * 24 / deltaTime * loadCount; final long nextAvg = lastAvg == 0 ? 
thisAvg : (thisAvg + lastAvg * 2) / 3; - rssRow.put("url", UTF8.getBytes(url.toNormalform(true))); + rssRow.put("url", UTF8.getBytes(feedUrl.toNormalform(true))); rssRow.put("title", feed.getChannel().getTitle()); rssRow.put("last_load_date", new Date()); rssRow.put("last_load_count", loadCount); rssRow.put("all_load_count", allLoadCount + loadCount); rssRow.put("avg_upd_per_day", nextAvg); - sb.tables.update("rss", url.hash(), rssRow); + sb.tables.update("rss", feedUrl.hash(), rssRow); } catch (final IOException e) { ConcurrentLog.logException(e); } catch (final SpaceExceededException e) {