From e9eae45b55981df6061147b142bf84e6cd0c1de9 Mon Sep 17 00:00:00 2001 From: reger Date: Sun, 10 Aug 2014 01:29:16 +0200 Subject: [PATCH] simplify rssreader and improve atom feed link extraction - type detection (rss/atom) - init type parameter overwritten during parse, parameter obsolete - detection by endtag changed to simpler first-tag evaluation - channel image not used, removed related extra parser handling - remove unused code (set/getImage) in rssfeed - atom link extraction to account for possible multiple link tags - spec limits link to one with rel="alternate" or one without rel attribute not accounting for the following type & hreflang exception yet: o atom:entry elements MUST NOT contain more than one atom:link element with a rel attribute value of "alternate" that has the same combination of type and hreflang attribute values. --- .../net/yacy/cora/document/feed/RSSFeed.java | 9 -- .../yacy/cora/document/feed/RSSReader.java | 102 +++++------------- .../net/yacy/document/parser/rssParser.java | 2 +- 3 files changed, 28 insertions(+), 85 deletions(-) diff --git a/source/net/yacy/cora/document/feed/RSSFeed.java b/source/net/yacy/cora/document/feed/RSSFeed.java index 45a33514f..009c500ea 100644 --- a/source/net/yacy/cora/document/feed/RSSFeed.java +++ b/source/net/yacy/cora/document/feed/RSSFeed.java @@ -37,7 +37,6 @@ public class RSSFeed implements Iterable { // class variables private RSSMessage channel; - private String imageURL; private final Map messages; // a guid:Item map private final int maxsize; @@ -73,14 +72,6 @@ public class RSSFeed implements Iterable { return this.channel; } - public void setImage(final String imageURL) { - this.imageURL = imageURL; - } - - public String getImage() { - return this.imageURL; - } - public Set getLinks() { Set links = new HashSet(); for (RSSMessage message: this.messages.values()) { diff --git a/source/net/yacy/cora/document/feed/RSSReader.java b/source/net/yacy/cora/document/feed/RSSReader.java index 
1feb51f5c..ec8fbfde9 100644 --- a/source/net/yacy/cora/document/feed/RSSReader.java +++ b/source/net/yacy/cora/document/feed/RSSReader.java @@ -45,7 +45,7 @@ public class RSSReader extends DefaultHandler { // class variables private RSSMessage item; private final StringBuilder buffer; - private boolean parsingChannel, parsingImage, parsingItem; + private boolean parsingChannel, parsingItem; private final RSSFeed theChannel; private Type type; @@ -56,38 +56,36 @@ public class RSSReader extends DefaultHandler { this.buffer = new StringBuilder(300); this.item = null; this.parsingChannel = false; - this.parsingImage = false; this.parsingItem = false; this.type = Type.none; } private static final ThreadLocal tlSax = new ThreadLocal(); private static SAXParser getParser() throws SAXException { - SAXParser parser = tlSax.get(); - if (parser == null) { - try { - parser = SAXParserFactory.newInstance().newSAXParser(); - } catch (final ParserConfigurationException e) { - throw new SAXException(e.getMessage(), e); - } - tlSax.set(parser); - } - return parser; + SAXParser parser = tlSax.get(); + if (parser == null) { + try { + parser = SAXParserFactory.newInstance().newSAXParser(); + } catch (final ParserConfigurationException e) { + throw new SAXException(e.getMessage(), e); + } + tlSax.set(parser); + } + return parser; } - public RSSReader(final int maxsize, InputStream stream, final Type type) throws IOException { + public RSSReader(final int maxsize, InputStream stream) throws IOException { this(maxsize); - this.type = type; if (!(stream instanceof ByteArrayInputStream) && !(stream instanceof BufferedInputStream)) stream = new BufferedInputStream(stream); try { final SAXParser saxParser = getParser(); // do not look at external dtd - see: http://www.ibm.com/developerworks/xml/library/x-tipcfsx/index.html saxParser.getXMLReader().setEntityResolver(new EntityResolver() { - @Override - public InputSource resolveEntity(final String arg0, final String arg1) - throws 
SAXException, IOException { - return new InputSource(new StringReader("")); - } + @Override + public InputSource resolveEntity(final String arg0, final String arg1) + throws SAXException, IOException { + return new InputSource(new StringReader("")); + } }); saxParser.parse(stream, this); } catch (final SAXException e) { @@ -106,15 +104,7 @@ public class RSSReader extends DefaultHandler { throw new IOException("response=null"); } if (a.length < 100) { - throw new IOException("response=" + UTF8.String(a)); - } - if (!equals(a, UTF8.getBytes(" 1)? 10 : 0))); - i++; - } while(!end.contains(" 0) type = Type.rss; - if (end.indexOf("feed",0) > 0) type = Type.atom; - if (end.indexOf("rdf",0) > 0) type = Type.rdf; - return type; - } - - private final static boolean equals(final byte[] buffer, final byte[] pattern) { - // compares two byte arrays: true, if pattern appears completely at offset position - if (buffer.length < pattern.length) return false; - for (int i = 0; i < pattern.length; i++) if (buffer[i] != pattern[i]) return false; - return true; - } - @Override public void startElement(final String uri, final String name, final String tag, final Attributes atts) throws SAXException { if ("channel".equals(tag)) { @@ -185,16 +139,17 @@ public class RSSReader extends DefaultHandler { } this.item = new RSSMessage(); this.parsingItem = true; - } else if (this.parsingItem && this.type == Type.atom && "link".equals(tag) && (atts.getValue("type") == null || this.item.getLink().length() == 0 || atts.getValue("type").startsWith("text") || atts.getValue("type").equals("application/xhtml+xml"))) { + } else if (this.parsingItem && this.type == Type.atom && "link".equals(tag) && (atts.getValue("rel") == null || atts.getValue("rel").equals("alternate"))) { + // atom link handling (rss link is handled in endElement) final String url = atts.getValue("href"); if (url != null && url.length() > 0) this.item.setValue(Token.link, url); - } else if ("image".equals(tag) || (this.parsingItem 
&& this.type == Type.atom && "link".equals(tag) && (atts.getValue("type") == null || atts.getValue("type").startsWith("image")))) { - this.parsingImage = true; + } else if ("rss".equals(tag)) { + this.type = Type.rss; } } @Override - public void endElement(final String uri, final String name, final String tag) { + public void endElement(final String uri, final String name, final String tag) throws SAXException { if (tag == null) return; if ("channel".equals(tag) || "feed".equals(tag)) { if (this.parsingChannel) this.theChannel.setChannel(this.item); @@ -202,12 +157,6 @@ public class RSSReader extends DefaultHandler { } else if ("item".equals(tag) || "entry".equals(tag)) { this.theChannel.addMessage(this.item); this.parsingItem = false; - } else if ("image".equals(tag)) { - this.parsingImage = false; - } else if ((this.parsingImage) && (this.parsingChannel)) { - final String value = this.buffer.toString().trim(); - this.buffer.setLength(0); - if ("url".equals(tag)) this.theChannel.setImage(value); } else if (this.parsingItem) { final String value = this.buffer.toString().trim(); this.buffer.setLength(0); @@ -216,6 +165,9 @@ public class RSSReader extends DefaultHandler { final String value = this.buffer.toString().trim(); this.buffer.setLength(0); if (RSSMessage.tags.contains(tag)) this.item.setValue(RSSMessage.valueOfNick(tag), value); + } else if (this.type == Type.none) { + // give up if we don't known the feed format + throw new SAXException("response incomplete or unknown feed format"); } } diff --git a/source/net/yacy/document/parser/rssParser.java b/source/net/yacy/document/parser/rssParser.java index 07e2d3770..9d88faad6 100644 --- a/source/net/yacy/document/parser/rssParser.java +++ b/source/net/yacy/document/parser/rssParser.java @@ -63,7 +63,7 @@ public class rssParser extends AbstractParser implements Parser { throws Failure, InterruptedException { RSSReader rssReader; try { - rssReader = new RSSReader(RSSFeed.DEFAULT_MAXSIZE, source, 
RSSReader.Type.none); + rssReader = new RSSReader(RSSFeed.DEFAULT_MAXSIZE, source); } catch (final IOException e) { throw new Parser.Failure("Load error:" + e.getMessage(), url, e); }