From 0010cd9db1d960aed0357639b653aafba94c7e62 Mon Sep 17 00:00:00 2001 From: orbiter Date: Wed, 25 Aug 2010 18:24:54 +0000 Subject: [PATCH] Support for indexing of RSS feeds! - added a scanning in html parser for rss feeds - storage of rss feed addresses, can be viewed with http://localhost:8080/Tables_p.html?table=rss - rss items retrieved by http://localhost:8080/Load_RSS_p.html (in Index Creation menu) can be selected and indexed - a rss feed retrieved in http://localhost:8080/Load_RSS_p.html can now be fully indexed - indexing of rss feeds can be placed in scheduler git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7073 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/Load_RSS_p.html | 39 +++++++-- htroot/Load_RSS_p.java | 85 +++++++++++++++++-- htroot/Tables_p.html | 3 +- htroot/Tables_p.java | 53 ++++-------- htroot/api/table_p.java | 24 ++++-- source/de/anomic/search/Segment.java | 2 +- source/de/anomic/search/Switchboard.java | 32 +++++-- source/net/yacy/cora/document/RSSFeed.java | 6 +- source/net/yacy/cora/document/RSSMessage.java | 24 +++--- source/net/yacy/document/Document.java | 21 ++++- source/net/yacy/document/content/DCEntry.java | 1 + .../net/yacy/document/parser/csvParser.java | 1 + .../net/yacy/document/parser/docParser.java | 1 + .../document/parser/html/ContentScraper.java | 16 +++- .../net/yacy/document/parser/htmlParser.java | 1 + .../parser/images/genericImageParser.java | 1 + .../net/yacy/document/parser/odtParser.java | 1 + .../net/yacy/document/parser/ooxmlParser.java | 1 + .../net/yacy/document/parser/pdfParser.java | 1 + .../net/yacy/document/parser/pptParser.java | 1 + source/net/yacy/document/parser/psParser.java | 1 + .../net/yacy/document/parser/rssParser.java | 1 + .../net/yacy/document/parser/rtfParser.java | 1 + .../yacy/document/parser/sevenzipParser.java | 2 +- .../net/yacy/document/parser/swfParser.java | 2 +- .../yacy/document/parser/torrentParser.java | 1 + .../net/yacy/document/parser/vcfParser.java | 1 + .../net/yacy/document/parser/vsdParser.java | 1 + .../net/yacy/document/parser/xlsParser.java | 1 + source/net/yacy/kelondro/blob/Tables.java | 2 +- 30 files changed, 236 insertions(+), 91 deletions(-) diff --git a/htroot/Load_RSS_p.html b/htroot/Load_RSS_p.html index 4776b39a3..9a7b95937 100644 --- a/htroot/Load_RSS_p.html +++ b/htroot/Load_RSS_p.html @@ -36,11 +36,32 @@
URL of the RSS feed
-
Simulation Mode
+
Preview
-
Indexing Mode
-
#(showload)#Available after successful loading of rss feed in simulation mode:: - not yet implemented THIS INTERFACE IS A STUB - DEVELOPMENT IS ONGOING +
Indexing
+
#(showload)#Available after successful loading of rss feed in preview:: + + +
+
once
+
load this feed once now
+
scheduled
+
repeat the feed loading every
+ + automatically. +
+
#(/showload)#
@@ -49,7 +70,7 @@ #(showitems)#::
- +
Title
#[title]#
Author
#[author]#
@@ -62,6 +83,7 @@ + @@ -71,7 +93,8 @@ #{item}# - + + @@ -81,7 +104,11 @@ #{/item}#
State Title URL Author
#(state)#new::enqueued::indexed#(/state)# #[title]# #[link]# #[author]#
+

+ + +

#(/showitems)# diff --git a/htroot/Load_RSS_p.java b/htroot/Load_RSS_p.java index 1f02d8f30..57b7d2950 100644 --- a/htroot/Load_RSS_p.java +++ b/htroot/Load_RSS_p.java @@ -21,23 +21,33 @@ import java.io.IOException; import java.net.MalformedURLException; import java.text.DateFormat; +import java.util.Date; +import java.util.Map; import net.yacy.cora.document.Hit; import net.yacy.cora.document.RSSFeed; import net.yacy.cora.document.RSSMessage; import net.yacy.cora.document.RSSReader; import net.yacy.cora.protocol.RequestHeader; +import net.yacy.cora.storage.ARC; +import net.yacy.cora.storage.ComparableARC; +import net.yacy.document.Parser.Failure; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; +import net.yacy.kelondro.order.Base64Order; import de.anomic.crawler.CrawlProfile; import de.anomic.crawler.retrieval.Response; +import de.anomic.data.WorkTables; +import de.anomic.search.Segments; import de.anomic.search.Switchboard; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; public class Load_RSS_p { + private static final ARC indexTriggered = new ComparableARC(1000, Base64Order.enhancedCoder); + public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { final serverObjects prop = new serverObjects(); @@ -51,11 +61,17 @@ public class Load_RSS_p { prop.put("url", post.get("url", "")); + int repeat_time = Integer.parseInt(post.get("repeat_time", "-1")); + final String repeat_unit = post.get("repeat_unit", "seldays"); // selminutes, selhours, seldays + if (!post.get("repeat", "off").equals("on") && repeat_time > 0) repeat_time = -1; + + boolean record_api = false; + DigestURI url = null; try { url = post.containsKey("url") ? new DigestURI(post.get("url", ""), null) : null; } catch (MalformedURLException e) { - Log.logException(e); + Log.logWarning("Load_RSS_p", "url not well-formed: '" + post.get("url", "") + "'"); } // if we have an url then try to load the rss @@ -69,6 +85,54 @@ public class Load_RSS_p { Log.logException(e); } + // index all selected items: description only + if (rss != null && post.containsKey("indexSelectedItemContent")) { + RSSFeed feed = rss.getFeed(); + loop: for (Map.Entry entry: post.entrySet()) { + if (entry.getValue().startsWith("mark_")) try { + RSSMessage message = feed.getMessage(entry.getValue().substring(5)); + DigestURI messageurl = new DigestURI(message.getLink()); + if (indexTriggered.containsKey(messageurl.hash())) continue loop; + if (sb.urlExists(Segments.Process.LOCALCRAWLING, messageurl.hash()) != null) continue loop; + sb.addToIndex(messageurl, null, null); + indexTriggered.put(messageurl.hash(), new Date()); + } catch (IOException e) { + Log.logException(e); + } catch (Failure e) { + Log.logException(e); + } + } + } + if (rss != null && post.containsKey("indexAllItemContent")) { + record_api = true; + RSSFeed feed = rss.getFeed(); + loop: for (RSSMessage message: feed) { + try { + DigestURI messageurl = new DigestURI(message.getLink()); + if (indexTriggered.containsKey(messageurl.hash()) && post.containsKey("indexSelectedItemContent")) continue loop; + if (sb.urlExists(Segments.Process.LOCALCRAWLING, messageurl.hash()) != null) continue loop; + sb.addToIndex(messageurl, null, null); + indexTriggered.put(messageurl.hash(), new Date()); + } catch (IOException e) { + Log.logException(e); + } catch (Failure e) { + Log.logException(e); + } + } + } + + if (record_api) { + // record API action + if (repeat_time > 0) { + // store as scheduled api call + sb.tables.recordAPICall(post, "Load_RSS_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "import feed " + url.toNormalform(true, false), repeat_time, repeat_unit.substring(3)); + } else { + // store just a protocol + sb.tables.recordAPICall(post, "Load_RSS_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "import feed " + url.toNormalform(true, false)); + } + } + + // show items from rss if (rss != null) { prop.put("showitems", 1); RSSFeed feed = rss.getFeed(); @@ -76,27 +140,30 @@ public class Load_RSS_p { prop.putHTML("showitems_title", channel.getTitle()); String author = channel.getAuthor(); if (author == null || author.length() == 0) author = channel.getCopyright(); + Date pubDate = channel.getPubDate(); prop.putHTML("showitems_author", author == null ? "" : author); prop.putHTML("showitems_description", channel.getDescription()); prop.putHTML("showitems_language", channel.getLanguage()); - prop.putHTML("showitems_date", DateFormat.getDateTimeInstance().format(channel.getPubDate())); + prop.putHTML("showitems_date", (pubDate == null) ? "" : DateFormat.getDateTimeInstance().format(pubDate)); prop.putHTML("showitems_ttl", channel.getTTL()); prop.putHTML("showitems_docs", channel.getDocs()); int i = 0; for (final Hit item: feed) { try { - url = new DigestURI(item.getLink(), null); + DigestURI messageurl = new DigestURI(item.getLink(), null); author = item.getAuthor(); if (author == null) author = item.getCopyright(); + pubDate = item.getPubDate(); prop.put("showitems_item_" + i + "_count", i); - prop.putHTML("showitems_item_" + i + "_hash", new String(url.hash())); + prop.put("showitems_item_" + i + "_state", sb.urlExists(Segments.Process.LOCALCRAWLING, messageurl.hash()) != null ? 2 : indexTriggered.containsKey(messageurl.hash()) ? 1 : 0); + prop.putHTML("showitems_item_" + i + "_guid", item.getGuid()); prop.putHTML("showitems_item_" + i + "_author", author == null ? "" : author); prop.putHTML("showitems_item_" + i + "_title", item.getTitle()); - prop.putHTML("showitems_item_" + i + "_link", url.toNormalform(false, false)); + prop.putHTML("showitems_item_" + i + "_link", messageurl.toNormalform(false, false)); prop.putHTML("showitems_item_" + i + "_description", item.getDescription()); prop.putHTML("showitems_item_" + i + "_language", item.getLanguage()); - prop.putHTML("showitems_item_" + i + "_date", DateFormat.getDateTimeInstance().format(item.getPubDate())); + prop.putHTML("showitems_item_" + i + "_date", (pubDate == null) ? "" : DateFormat.getDateTimeInstance().format(pubDate)); i++; } catch (MalformedURLException e) { Log.logException(e); @@ -105,7 +172,11 @@ public class Load_RSS_p { } prop.put("showitems_item", i); prop.put("showitems_num", i); - if (i > 0) prop.put("showload", 1); + prop.putHTML("showitems_rss", url.toNormalform(true, false)); + if (i > 0) { + prop.put("showload", 1); + prop.put("showload_rss", url.toNormalform(true, false)); + } } return prop; diff --git a/htroot/Tables_p.html b/htroot/Tables_p.html index f8794c16f..a2ae0c488 100644 --- a/htroot/Tables_p.html +++ b/htroot/Tables_p.html @@ -28,7 +28,7 @@ #%env/templates/header.template%# #%env/templates/submenuConfig.template%# - + #(showselection)#::

Table Administration

Table Selection @@ -60,6 +60,7 @@
+ #(/showselection)# #(showtable)#::
diff --git a/htroot/Tables_p.java b/htroot/Tables_p.java index bd9d47edb..33bbe0333 100644 --- a/htroot/Tables_p.java +++ b/htroot/Tables_p.java @@ -39,52 +39,35 @@ public class Tables_p { public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { final Switchboard sb = (Switchboard) env; final serverObjects prop = new serverObjects(); + + prop.put("showtable", 0); + prop.put("showedit", 0); + prop.put("showselection", 0); - if (post == null) { - prop.put("pattern", ""); - - // show table selection - int count = 0; - Iterator ti = sb.tables.tables(); - String tablename; - while (ti.hasNext()) { - tablename = ti.next(); - prop.put("tables_" + count + "_name", tablename); - prop.put("tables_" + count + "_selected", 0); - count++; - } - prop.put("tables", count); - - // generate table - prop.put("showtable", 0); - prop.put("showedit", 0); - - // the peer address - prop.put("address", sb.peers.mySeed().getPublicAddress()); - - // return rewrite properties - return prop; - } - - String table = post.get("table", null); + String table = (post == null) ? null : post.get("table", null); if (table != null && !sb.tables.hasHeap(table)) table = null; - String counts = post.get("count", null); - int maxcount = (counts == null || counts.equals("all")) ? Integer.MAX_VALUE : Integer.parseInt(counts); - String pattern = post.get("search", ""); - Pattern matcher = (pattern.length() == 0 || pattern.equals(".*")) ? null : Pattern.compile(".*" + pattern + ".*"); - prop.put("pattern", pattern); // show table selection int count = 0; Iterator ti = sb.tables.tables(); String tablename; + prop.put("showselection", 1); while (ti.hasNext()) { tablename = ti.next(); - prop.put("tables_" + count + "_name", tablename); - prop.put("tables_" + count + "_selected", (table != null && table.equals(tablename)) ? 1 : 0); + prop.put("showselection_tables_" + count + "_name", tablename); + prop.put("showselection_tables_" + count + "_selected", (table != null && table.equals(tablename)) ? 1 : 0); count++; } - prop.put("tables", count); + prop.put("showselection_tables", count); + prop.put("showselection_pattern", ""); + + if (post == null) return prop; // return rewrite properties + + String counts = post.get("count", null); + int maxcount = (counts == null || counts.equals("all")) ? Integer.MAX_VALUE : Integer.parseInt(counts); + String pattern = post.get("search", ""); + Pattern matcher = (pattern.length() == 0 || pattern.equals(".*")) ? null : Pattern.compile(".*" + pattern + ".*"); + prop.put("pattern", pattern); List columns = null; if (table != null) try { diff --git a/htroot/api/table_p.java b/htroot/api/table_p.java index 4de9e5d3d..48dff143d 100644 --- a/htroot/api/table_p.java +++ b/htroot/api/table_p.java @@ -36,13 +36,15 @@ public class table_p { final serverObjects prop = new serverObjects(); String table = (post == null) ? null : post.get("table", null); if (table != null && !sb.tables.hasHeap(table)) table = null; - if (table == null) { - prop.put("showtable", 0); - return prop; - } + prop.put("showtable", 0); + + if (table == null) return prop; boolean showpk = post.containsKey("pk"); + String selectKey = post.containsKey("selectKey") ? post.get("selectKey") : null; + String selectValue = (selectKey != null && post.containsKey("selectValue")) ? post.get("selectValue") : null; + ArrayList columns = null; try { columns = sb.tables.columns(table); @@ -86,8 +88,8 @@ public class table_p { final Iterator mapIterator = sb.tables.orderByPK(plainIterator, maxCount).iterator(); Tables.Row trow; boolean dark = true; - byte[] cell; - while ((mapIterator.hasNext()) && (count < maxCount)) { + String cellName, cellValue; + rowloop: while ((mapIterator.hasNext()) && (count < maxCount)) { trow = mapIterator.next(); if (row == null) continue; prop.put("showtable_list_" + count + "_dark", ((dark) ? 1 : 0) ); dark=!dark; @@ -95,9 +97,13 @@ public class table_p { prop.put("showtable_list_" + count + "_showpk_pk", new String(trow.getPK())); prop.put("showtable_list_" + count + "_count", count); for (int i = 0; i < columns.size(); i++) { - cell = trow.get(columns.get(i)); - prop.putHTML("showtable_list_" + count + "_columns_" + i + "_column", columns.get(i)); - prop.putHTML("showtable_list_" + count + "_columns_" + i + "_cell", cell == null ? "" : new String(cell)); + cellName = columns.get(i); + cellValue = new String(trow.get(cellName)); + if (selectKey != null && cellName.equals(selectKey) && !cellValue.matches(selectValue)) { + continue rowloop; + } + prop.putHTML("showtable_list_" + count + "_columns_" + i + "_column", cellName); + prop.putHTML("showtable_list_" + count + "_columns_" + i + "_cell", cellValue); } prop.put("showtable_list_" + count + "_columns", columns.size()); diff --git a/source/de/anomic/search/Segment.java b/source/de/anomic/search/Segment.java index 751db3ff2..ac1fc4fb7 100644 --- a/source/de/anomic/search/Segment.java +++ b/source/de/anomic/search/Segment.java @@ -361,7 +361,7 @@ public class Segment { document.outboundLinks(), // outbound links searchEvent // a search event that can have results directly ); - + final long indexingEndTime = System.currentTimeMillis(); if (log.isInfo()) { diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java index 2159624ce..b9824b267 100644 --- a/source/de/anomic/search/Switchboard.java +++ b/source/de/anomic/search/Switchboard.java @@ -1903,6 +1903,23 @@ public final class Switchboard extends serverSwitch { return; } + // store rss feeds in document into rss table + for (Map.Entry rssEntry : document.getRSS().entrySet()) { + Tables.Data rssRow = new Tables.Data(); + rssRow.put("referrer", queueEntry.url().hash()); + rssRow.put("url", rssEntry.getKey().toNormalform(true, false).getBytes()); + rssRow.put("title", rssEntry.getValue().getBytes()); + rssRow.put("recording_date", new Date()); + //rssRow.put("last_load_date", "".getBytes()); + //rssRow.put("last_load_count", "".getBytes()); + //rssRow.put("avg_upd_per_day", "".getBytes()); + try { + this.tables.update("rss", new DigestURI(rssEntry.getKey()).hash(), rssRow); + } catch (IOException e) { + Log.logException(e); + } + } + // update url result list statistics crawlResults.stack( newEntry, // loaded url db entry @@ -1970,17 +1987,14 @@ public final class Switchboard extends serverSwitch { */ public void addToIndex(final DigestURI url, final SearchEvent searchEvent, final String heuristicName) throws IOException, Parser.Failure { final Segments.Process process = Segments.Process.LOCALCRAWLING; - if (indexSegments.segment(process).urlMetadata.exists(url.hash())) { - searchEvent.addHeuristic(url.hash(), heuristicName, true); - return; // don't do double-work - } + if (searchEvent != null) searchEvent.addHeuristic(url.hash(), heuristicName, true); + if (indexSegments.segment(process).urlMetadata.exists(url.hash())) return; // don't do double-work final Request request = loader.request(url, true, true); String acceptedError = this.crawlStacker.checkAcceptance(url, this.crawler.profilesActiveCrawls.getEntry(request.profileHandle()), 0); if (acceptedError != null) { - log.logInfo("Heuristic: cannot load " + url.toNormalform(false, false) + ": " + acceptedError); + log.logWarning("addToIndex: cannot load " + url.toNormalform(false, false) + ": " + acceptedError); return; } - searchEvent.addHeuristic(url.hash(), heuristicName, false); new Thread() {public void run() { try { Response response = loader.load(request, CacheStrategy.IFFRESH, Long.MAX_VALUE); @@ -1994,12 +2008,12 @@ public final class Switchboard extends serverSwitch { ResultImages.registerImages(url, document, true); webStructure.generateCitationReference(url, document, condenser, response.lastModified()); storeDocumentIndex(process, response, document, condenser, searchEvent); - log.logInfo("heuristic fill of url " + url.toNormalform(true, true) + " finished"); + log.logInfo("addToIndex fill of url " + url.toNormalform(true, true) + " finished"); } } catch (IOException e) { - //Log.logException(e); + log.logWarning("addToIndex: failed loading " + url.toNormalform(false, false) + ": " + e.getMessage()); } catch (Parser.Failure e) { - //Log.logException(e); + log.logWarning("addToIndex: failed parsing " + url.toNormalform(false, false) + ": " + e.getMessage()); } }}.start(); } diff --git a/source/net/yacy/cora/document/RSSFeed.java b/source/net/yacy/cora/document/RSSFeed.java index 3ba9bd717..bc5cead92 100644 --- a/source/net/yacy/cora/document/RSSFeed.java +++ b/source/net/yacy/cora/document/RSSFeed.java @@ -26,7 +26,7 @@ import java.util.Iterator; import java.util.LinkedHashMap; import java.util.Map; -public class RSSFeed implements Iterable { +public class RSSFeed implements Iterable { public static final int DEFAULT_MAXSIZE = 1000; @@ -78,7 +78,7 @@ public class RSSFeed implements Iterable { return messages.size(); } - public Iterator iterator() { + public Iterator iterator() { return new messageIterator(); } @@ -92,7 +92,7 @@ public class RSSFeed implements Iterable { } } - public class messageIterator implements Iterator{ + public class messageIterator implements Iterator{ Iterator GUIDiterator; String lastGUID; diff --git a/source/net/yacy/cora/document/RSSMessage.java b/source/net/yacy/cora/document/RSSMessage.java index 6341eafde..ab1f657af 100644 --- a/source/net/yacy/cora/document/RSSMessage.java +++ b/source/net/yacy/cora/document/RSSMessage.java @@ -33,9 +33,6 @@ import net.yacy.kelondro.util.DateFormatter; public class RSSMessage implements Hit { - // statics for item generation and automatic categorization - private static int guidcount = 0; - public static enum Token { title("title"), @@ -73,7 +70,8 @@ public class RSSMessage implements Hit { return this.keys; } } - + + private static String artificialGuidPrefix = "c0_"; public static final RSSMessage POISON = new RSSMessage("", "", ""); public static final HashSet tags = new HashSet(); @@ -86,21 +84,25 @@ public class RSSMessage implements Hit { private final Map map; public RSSMessage(final String title, final String description, final String link) { - this(); - setValue("title", title); - setValue("description", description); - setValue("link", link); - setValue("pubDate", DateFormatter.formatShortSecond(new Date())); - setValue("guid", Integer.toHexString((title + description + link).hashCode())); + this.map = new ConcurrentHashMap(); + map.put("title", title); + map.put("description", description); + map.put("link", link); + map.put("pubDate", DateFormatter.formatShortSecond(new Date())); + map.put("guid", artificialGuidPrefix + Integer.toHexString((title + description + link).hashCode())); } public RSSMessage() { this.map = new ConcurrentHashMap(); - this.map.put("guid", Long.toHexString(System.currentTimeMillis()) + ":" + guidcount++); } public void setValue(final String name, final String value) { map.put(name, value); + // if possible generate a guid if not existent so far + if ((name.equals("title") || name.equals("description") || name.equals("link")) && + (!map.containsKey("guid") || map.get("guid").startsWith(artificialGuidPrefix))) { + map.put("guid", artificialGuidPrefix + Integer.toHexString((getTitle() + getDescription() + getLink()).hashCode())); + } } public String getTitle() { diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java index 36b54351e..5ee74b72b 100644 --- a/source/net/yacy/document/Document.java +++ b/source/net/yacy/document/Document.java @@ -67,6 +67,7 @@ public class Document { private final StringBuilder description; // an abstract, if present: short content description private Object text; // the clear text, all that is visible private final Map anchors; // all links embedded as clickeable entities (anchor tags) + private final Map rss; // all embedded rss feeds private final HashMap images; // all visible pictures in document // the anchors and images - Maps are URL-to-EntityDescription mappings. // The EntityDescription appear either as visible text in anchors or as alternative @@ -83,7 +84,10 @@ public class Document { public Document(final MultiProtocolURI location, final String mimeType, final String charset, final Set languages, final String[] keywords, final String title, final String author, final String publisher, final String[] sections, final String abstrct, - final Object text, final Map anchors, final HashMap images, + final Object text, + final Map anchors, + final Map rss, + final HashMap images, boolean indexingDenied) { this.source = location; this.mimeType = (mimeType == null) ? "application/octet-stream" : mimeType; @@ -94,6 +98,7 @@ public class Document { this.sections = (sections == null) ? new LinkedList() : Arrays.asList(sections); this.description = (abstrct == null) ? new StringBuilder(0) : new StringBuilder(abstrct); this.anchors = (anchors == null) ? new HashMap(0) : anchors; + this.rss = (rss == null) ? new HashMap(0) : rss; this.images = (images == null) ? new HashMap() : images; this.publisher = publisher; this.hyperlinks = null; @@ -287,6 +292,12 @@ dc_rights return anchors; } + public Map getRSS() { + // returns all links embedded as anchors (clickeable entities) + // this is a url(String)/text(String) map + return rss; + } + // the next three methods provide a calculated view on the getAnchors/getImages: @@ -504,8 +515,9 @@ dc_rights this.text = new ByteArrayOutputStream(); } FileUtils.copy(doc.getText(), (ByteArrayOutputStream) this.text); - + anchors.putAll(doc.getAnchors()); + rss.putAll(doc.getRSS()); ContentScraper.addAllImages(images, doc.getImages()); } } @@ -618,8 +630,9 @@ dc_rights final StringBuilder title = new StringBuilder(); final StringBuilder description = new StringBuilder(); final LinkedList sectionTitles = new LinkedList(); - + final Map anchors = new HashMap(); + final Map rss = new HashMap(); final HashMap images = new HashMap(); for (Document doc: docs) { @@ -659,6 +672,7 @@ dc_rights } } anchors.putAll(doc.getAnchors()); + rss.putAll(doc.getRSS()); ContentScraper.addAllImages(images, doc.getImages()); } return new Document( @@ -674,6 +688,7 @@ dc_rights description.toString(), content.getBytes(), anchors, + rss, images, false); } diff --git a/source/net/yacy/document/content/DCEntry.java b/source/net/yacy/document/content/DCEntry.java index d1a4565b2..3c0e41c9a 100644 --- a/source/net/yacy/document/content/DCEntry.java +++ b/source/net/yacy/document/content/DCEntry.java @@ -258,6 +258,7 @@ public class DCEntry extends TreeMap { getDescription().getBytes("UTF-8"), null, null, + null, false); } catch (UnsupportedEncodingException e) { Log.logException(e); diff --git a/source/net/yacy/document/parser/csvParser.java b/source/net/yacy/document/parser/csvParser.java index cea903d45..521243e63 100644 --- a/source/net/yacy/document/parser/csvParser.java +++ b/source/net/yacy/document/parser/csvParser.java @@ -68,6 +68,7 @@ public class csvParser extends AbstractParser implements Parser { sb.toString().getBytes(charset), null, null, + null, false)}; } catch (UnsupportedEncodingException e) { throw new Parser.Failure("error in csvParser, getBytes: " + e.getMessage(), location); diff --git a/source/net/yacy/document/parser/docParser.java b/source/net/yacy/document/parser/docParser.java index 7d33da415..49fcd55e3 100644 --- a/source/net/yacy/document/parser/docParser.java +++ b/source/net/yacy/document/parser/docParser.java @@ -98,6 +98,7 @@ public class docParser extends AbstractParser implements Parser { contents.toString().getBytes("UTF-8"), null, null, + null, false)}; } catch (UnsupportedEncodingException e) { throw new Parser.Failure("error in docParser, getBytes: " + e.getMessage(), location); diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index 203c7cbde..10089b08b 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -79,6 +79,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { } // class variables: collectors for links + private HashMap rss; private HashMap anchors; private HashMap images; // urlhash/image relation private final HashMap metas; @@ -104,6 +105,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { // it is only the reference for relative links super(linkTags0, linkTags1); this.root = root; + this.rss = new HashMap(); this.anchors = new HashMap(); this.images = new HashMap(); this.metas = new HashMap(); @@ -188,14 +190,17 @@ public class ContentScraper extends AbstractScraper implements Scraper { final MultiProtocolURI newLink = absolutePath(tagopts.getProperty("href", "")); if (newLink != null) { - final String type = tagopts.getProperty("rel", ""); + final String rel = tagopts.getProperty("rel", ""); final String linktitle = tagopts.getProperty("title", ""); + final String type = tagopts.getProperty("type", ""); - if (type.equalsIgnoreCase("shortcut icon")) { + if (rel.equalsIgnoreCase("shortcut icon")) { final ImageEntry ie = new ImageEntry(newLink, linktitle, -1, -1, -1); images.put(ie.url(), ie); this.favicon = newLink; - } else if (!type.equalsIgnoreCase("stylesheet") && !type.equalsIgnoreCase("alternate stylesheet")) { + } else if (rel.equalsIgnoreCase("alternate") && type.equalsIgnoreCase("application/rss+xml")) { + rss.put(newLink, linktitle); + } else if (!rel.equalsIgnoreCase("stylesheet") && !rel.equalsIgnoreCase("alternate stylesheet")) { anchors.put(newLink, linktitle); } } @@ -355,6 +360,11 @@ public class ContentScraper extends AbstractScraper implements Scraper { return anchors; } + public Map getRSS() { + // returns a url (String) / name (String) relation + return rss; + } + /** * get all images * @return a map of diff --git a/source/net/yacy/document/parser/htmlParser.java b/source/net/yacy/document/parser/htmlParser.java index 1a8a284b4..46fd08481 100644 --- a/source/net/yacy/document/parser/htmlParser.java +++ b/source/net/yacy/document/parser/htmlParser.java @@ -142,6 +142,7 @@ public class htmlParser extends AbstractParser implements Parser { scraper.getDescription(), scraper.getText(), scraper.getAnchors(), + scraper.getRSS(), scraper.getImages(), scraper.indexingDenied())}; //scraper.close(); diff --git a/source/net/yacy/document/parser/images/genericImageParser.java b/source/net/yacy/document/parser/images/genericImageParser.java index 9d21be5a5..49ceff670 100644 --- a/source/net/yacy/document/parser/images/genericImageParser.java +++ b/source/net/yacy/document/parser/images/genericImageParser.java @@ -199,6 +199,7 @@ public class genericImageParser extends AbstractParser implements Parser { description == null ? "" : description, // description infoString.getBytes(), // content text anchors, // anchors + null, images, false)}; // images } diff --git a/source/net/yacy/document/parser/odtParser.java b/source/net/yacy/document/parser/odtParser.java index 29651c4d3..8f77a6564 100644 --- a/source/net/yacy/document/parser/odtParser.java +++ b/source/net/yacy/document/parser/odtParser.java @@ -169,6 +169,7 @@ public class odtParser extends AbstractParser implements Parser { contentBytes, null, null, + null, false)}; return docs; } catch (final Exception e) { diff --git a/source/net/yacy/document/parser/ooxmlParser.java b/source/net/yacy/document/parser/ooxmlParser.java index 376dd0a01..04a0f1a3f 100644 --- a/source/net/yacy/document/parser/ooxmlParser.java +++ b/source/net/yacy/document/parser/ooxmlParser.java @@ -158,6 +158,7 @@ public class ooxmlParser extends AbstractParser implements Parser { contentBytes, null, null, + null, false)}; return docs; } catch (final Exception e) { diff --git a/source/net/yacy/document/parser/pdfParser.java b/source/net/yacy/document/parser/pdfParser.java index 55adbbbe3..61f1383eb 100644 --- a/source/net/yacy/document/parser/pdfParser.java +++ b/source/net/yacy/document/parser/pdfParser.java @@ -157,6 +157,7 @@ public class pdfParser extends AbstractParser implements Parser { contentBytes, null, null, + null, false)}; } diff --git a/source/net/yacy/document/parser/pptParser.java b/source/net/yacy/document/parser/pptParser.java index 2400951de..c88c5a84e 100644 --- a/source/net/yacy/document/parser/pptParser.java +++ b/source/net/yacy/document/parser/pptParser.java @@ -95,6 +95,7 @@ public class pptParser extends AbstractParser implements Parser { contents.getBytes("UTF-8"), null, null, + null, false)}; return docs; } catch (final Exception e) { diff --git a/source/net/yacy/document/parser/psParser.java b/source/net/yacy/document/parser/psParser.java index 7bcb19887..b881944d4 100644 --- a/source/net/yacy/document/parser/psParser.java +++ b/source/net/yacy/document/parser/psParser.java @@ -111,6 +111,7 @@ public class psParser extends AbstractParser implements Parser { null, // abstract outputFile, // fulltext null, // anchors + null, // rss null, // images false)}; // indexingdenied diff --git a/source/net/yacy/document/parser/rssParser.java b/source/net/yacy/document/parser/rssParser.java index 41bc6e1cd..b38b4b57b 100644 --- a/source/net/yacy/document/parser/rssParser.java +++ b/source/net/yacy/document/parser/rssParser.java @@ -87,6 +87,7 @@ public class rssParser extends AbstractParser implements Parser { item.getDescription(), null, anchors, + null, new HashMap(), false); docs.add(doc); diff --git a/source/net/yacy/document/parser/rtfParser.java b/source/net/yacy/document/parser/rtfParser.java index 652a70162..e1924bd49 100644 --- a/source/net/yacy/document/parser/rtfParser.java +++ b/source/net/yacy/document/parser/rtfParser.java @@ -79,6 +79,7 @@ public class rtfParser extends AbstractParser implements Parser { bodyText.getBytes("UTF-8"), null, null, + null, false)}; } catch (final Exception e) { diff --git a/source/net/yacy/document/parser/sevenzipParser.java b/source/net/yacy/document/parser/sevenzipParser.java index 61ea53918..f22ddc8b2 100644 --- a/source/net/yacy/document/parser/sevenzipParser.java +++ b/source/net/yacy/document/parser/sevenzipParser.java @@ -56,7 +56,7 @@ public class sevenzipParser extends AbstractParser implements Parser { } public Document parse(final MultiProtocolURI location, final String mimeType, final String charset, final IInStream source) throws Parser.Failure, InterruptedException { - final Document doc = new Document(location, mimeType, charset, null, null, null, null, null, null, null, (Object)null, null, null, false); + final Document doc = new Document(location, mimeType, charset, null, null, null, null, null, null, null, (Object)null, null, null, null, false); Handler archive; super.log.logFine("opening 7zip archive..."); try { diff --git a/source/net/yacy/document/parser/swfParser.java b/source/net/yacy/document/parser/swfParser.java index aac095016..6af4a9ea9 100644 --- a/source/net/yacy/document/parser/swfParser.java +++ b/source/net/yacy/document/parser/swfParser.java @@ -65,7 +65,6 @@ public class swfParser extends AbstractParser implements Parser { Log.logException(e); throw new Parser.Failure(e.getMessage(), location); } catch (IOException e) { - Log.logException(e); throw new Parser.Failure(e.getMessage(), location); } catch (Exception e) { Log.logException(e); @@ -117,6 +116,7 @@ public class swfParser extends AbstractParser implements Parser { contents.getBytes("UTF-8"), // the parsed document text anchors, // a map of extracted anchors null, + null, false)}; // a treeset of image URLs } catch (final Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; diff --git a/source/net/yacy/document/parser/torrentParser.java b/source/net/yacy/document/parser/torrentParser.java index 8395de7ec..55d55648e 100644 --- a/source/net/yacy/document/parser/torrentParser.java +++ b/source/net/yacy/document/parser/torrentParser.java @@ -97,6 +97,7 @@ public class torrentParser extends AbstractParser implements Parser { filenames.toString().getBytes(charset), null, null, + null, false)}; } catch (UnsupportedEncodingException e) { throw new Parser.Failure("error in torrentParser, getBytes: " + e.getMessage(), location); diff --git a/source/net/yacy/document/parser/vcfParser.java b/source/net/yacy/document/parser/vcfParser.java index 53ba78860..266dea507 100644 --- a/source/net/yacy/document/parser/vcfParser.java +++ b/source/net/yacy/document/parser/vcfParser.java @@ -212,6 +212,7 @@ public class vcfParser extends AbstractParser implements Parser { "vCard", // an abstract text, // the parsed document text anchors, // a map of extracted anchors + null, null, // a treeset of image URLs false)}; } catch (final Exception e) { diff --git a/source/net/yacy/document/parser/vsdParser.java b/source/net/yacy/document/parser/vsdParser.java index 33c04a0e8..ddf0d476f 100644 --- a/source/net/yacy/document/parser/vsdParser.java +++ b/source/net/yacy/document/parser/vsdParser.java @@ -113,6 +113,7 @@ public class vsdParser extends AbstractParser implements Parser { abstrct, // an abstract contents.getBytes("UTF-8"), // the parsed document text null, // a map of extracted anchors + null, null, // a treeset of image URLs false)}; } catch (final Exception e) { diff --git a/source/net/yacy/document/parser/xlsParser.java b/source/net/yacy/document/parser/xlsParser.java index 63d9be29d..75433d1a2 100644 --- a/source/net/yacy/document/parser/xlsParser.java +++ b/source/net/yacy/document/parser/xlsParser.java @@ -125,6 +125,7 @@ public class xlsParser extends AbstractParser implements Parser { contents.getBytes("UTF-8"), null, null, + null, false)}; } catch (final Exception e) { if (e instanceof InterruptedException) throw (InterruptedException) e; diff --git a/source/net/yacy/kelondro/blob/Tables.java b/source/net/yacy/kelondro/blob/Tables.java index 09e8465f6..fd0971542 100644 --- a/source/net/yacy/kelondro/blob/Tables.java +++ b/source/net/yacy/kelondro/blob/Tables.java @@ -362,7 +362,7 @@ public class Tables { } - public class Data extends LinkedHashMap { + public static class Data extends LinkedHashMap { private static final long serialVersionUID = 978426054043749337L;