simplify rssreader and improve atom feed link extraction

- type detection (rss/atom) 
    - init type parameter overwritten during parse, parameter obsolete
    - detection by endtag changed to simpler first-tag evaluation
- channel image not used, removed related extra parser handling
    - remove unused code (set/getImage) in rssfeed
- atom link extraction to account for possible multiple link tags
   - spec limits link to one with rel="alternate" or one without rel attribute
     not accounting for the following type & hreflang exception yet:

   o  atom:entry elements MUST NOT contain more than one atom:link
      element with a rel attribute value of "alternate" that has the
      same combination of type and hreflang attribute values.
pull/1/head
reger 11 years ago
parent a8508417d1
commit e9eae45b55

@ -37,7 +37,6 @@ public class RSSFeed implements Iterable<RSSMessage> {
// class variables
private RSSMessage channel;
private String imageURL;
private final Map<String, RSSMessage> messages; // a guid:Item map
private final int maxsize;
@ -73,14 +72,6 @@ public class RSSFeed implements Iterable<RSSMessage> {
return this.channel;
}
public void setImage(final String imageURL) {
this.imageURL = imageURL;
}
public String getImage() {
return this.imageURL;
}
public Set<MultiProtocolURL> getLinks() {
Set<MultiProtocolURL> links = new HashSet<MultiProtocolURL>();
for (RSSMessage message: this.messages.values()) {

@ -45,7 +45,7 @@ public class RSSReader extends DefaultHandler {
// class variables
private RSSMessage item;
private final StringBuilder buffer;
private boolean parsingChannel, parsingImage, parsingItem;
private boolean parsingChannel, parsingItem;
private final RSSFeed theChannel;
private Type type;
@ -56,38 +56,36 @@ public class RSSReader extends DefaultHandler {
this.buffer = new StringBuilder(300);
this.item = null;
this.parsingChannel = false;
this.parsingImage = false;
this.parsingItem = false;
this.type = Type.none;
}
private static final ThreadLocal<SAXParser> tlSax = new ThreadLocal<SAXParser>();
private static SAXParser getParser() throws SAXException {
SAXParser parser = tlSax.get();
if (parser == null) {
try {
parser = SAXParserFactory.newInstance().newSAXParser();
} catch (final ParserConfigurationException e) {
throw new SAXException(e.getMessage(), e);
}
tlSax.set(parser);
}
return parser;
SAXParser parser = tlSax.get();
if (parser == null) {
try {
parser = SAXParserFactory.newInstance().newSAXParser();
} catch (final ParserConfigurationException e) {
throw new SAXException(e.getMessage(), e);
}
tlSax.set(parser);
}
return parser;
}
public RSSReader(final int maxsize, InputStream stream, final Type type) throws IOException {
public RSSReader(final int maxsize, InputStream stream) throws IOException {
this(maxsize);
this.type = type;
if (!(stream instanceof ByteArrayInputStream) && !(stream instanceof BufferedInputStream)) stream = new BufferedInputStream(stream);
try {
final SAXParser saxParser = getParser();
// do not look at external dtd - see: http://www.ibm.com/developerworks/xml/library/x-tipcfsx/index.html
saxParser.getXMLReader().setEntityResolver(new EntityResolver() {
@Override
public InputSource resolveEntity(final String arg0, final String arg1)
throws SAXException, IOException {
return new InputSource(new StringReader(""));
}
@Override
public InputSource resolveEntity(final String arg0, final String arg1)
throws SAXException, IOException {
return new InputSource(new StringReader(""));
}
});
saxParser.parse(stream, this);
} catch (final SAXException e) {
@ -106,15 +104,7 @@ public class RSSReader extends DefaultHandler {
throw new IOException("response=null");
}
if (a.length < 100) {
throw new IOException("response=" + UTF8.String(a));
}
if (!equals(a, UTF8.getBytes("<?xml")) && !equals(a, UTF8.getBytes("<rss"))) {
throw new IOException("response does not contain valid xml");
}
final Type type = findOutType(a);
if (type == Type.none) {
throw new IOException("response incomplete");
throw new IOException("response to short=" + UTF8.String(a));
}
// make input stream
@ -123,7 +113,7 @@ public class RSSReader extends DefaultHandler {
// parse stream
RSSReader reader = null;
try {
reader = new RSSReader(maxsize, bais, type);
reader = new RSSReader(maxsize, bais);
} catch (final Exception e) {
throw new IOException("parse exception: " + e.getMessage(), e);
}
@ -131,42 +121,6 @@ public class RSSReader extends DefaultHandler {
return reader;
}
/**
* Tries to find out the type of feed by stepping through its data
* starting in the end and looking at the last XML tag. Just grabbing
* the last few characters of the data does not work since some
* people add quite long comments at the end of their feeds.
* @param a contains the feed
* @return type of feed
*/
private static Type findOutType(final byte[] a) {
String end;
int i = 1;
do {
/* In first iteration grab the last 80 characters, after that
* move towards the start of the data and take some more (90)
* to have an overlap in order to not miss anything if the tag
* is on the border of two 80 character blocks.
*/
end = UTF8.String(a, a.length - (i * 80), (80 + ((i > 1)? 10 : 0)));
i++;
} while(!end.contains("</"));
Type type = Type.none;
if (end.indexOf("rss",0) > 0) type = Type.rss;
if (end.indexOf("feed",0) > 0) type = Type.atom;
if (end.indexOf("rdf",0) > 0) type = Type.rdf;
return type;
}
private final static boolean equals(final byte[] buffer, final byte[] pattern) {
// compares two byte arrays: true, if pattern appears completely at offset position
if (buffer.length < pattern.length) return false;
for (int i = 0; i < pattern.length; i++) if (buffer[i] != pattern[i]) return false;
return true;
}
@Override
public void startElement(final String uri, final String name, final String tag, final Attributes atts) throws SAXException {
if ("channel".equals(tag)) {
@ -185,16 +139,17 @@ public class RSSReader extends DefaultHandler {
}
this.item = new RSSMessage();
this.parsingItem = true;
} else if (this.parsingItem && this.type == Type.atom && "link".equals(tag) && (atts.getValue("type") == null || this.item.getLink().length() == 0 || atts.getValue("type").startsWith("text") || atts.getValue("type").equals("application/xhtml+xml"))) {
} else if (this.parsingItem && this.type == Type.atom && "link".equals(tag) && (atts.getValue("rel") == null || atts.getValue("rel").equals("alternate"))) {
// atom link handling (rss link is handled in endElement)
final String url = atts.getValue("href");
if (url != null && url.length() > 0) this.item.setValue(Token.link, url);
} else if ("image".equals(tag) || (this.parsingItem && this.type == Type.atom && "link".equals(tag) && (atts.getValue("type") == null || atts.getValue("type").startsWith("image")))) {
this.parsingImage = true;
} else if ("rss".equals(tag)) {
this.type = Type.rss;
}
}
@Override
public void endElement(final String uri, final String name, final String tag) {
public void endElement(final String uri, final String name, final String tag) throws SAXException {
if (tag == null) return;
if ("channel".equals(tag) || "feed".equals(tag)) {
if (this.parsingChannel) this.theChannel.setChannel(this.item);
@ -202,12 +157,6 @@ public class RSSReader extends DefaultHandler {
} else if ("item".equals(tag) || "entry".equals(tag)) {
this.theChannel.addMessage(this.item);
this.parsingItem = false;
} else if ("image".equals(tag)) {
this.parsingImage = false;
} else if ((this.parsingImage) && (this.parsingChannel)) {
final String value = this.buffer.toString().trim();
this.buffer.setLength(0);
if ("url".equals(tag)) this.theChannel.setImage(value);
} else if (this.parsingItem) {
final String value = this.buffer.toString().trim();
this.buffer.setLength(0);
@ -216,6 +165,9 @@ public class RSSReader extends DefaultHandler {
final String value = this.buffer.toString().trim();
this.buffer.setLength(0);
if (RSSMessage.tags.contains(tag)) this.item.setValue(RSSMessage.valueOfNick(tag), value);
} else if (this.type == Type.none) {
// give up if we don't know the feed format
throw new SAXException("response incomplete or unknown feed format");
}
}

@ -63,7 +63,7 @@ public class rssParser extends AbstractParser implements Parser {
throws Failure, InterruptedException {
RSSReader rssReader;
try {
rssReader = new RSSReader(RSSFeed.DEFAULT_MAXSIZE, source, RSSReader.Type.none);
rssReader = new RSSReader(RSSFeed.DEFAULT_MAXSIZE, source);
} catch (final IOException e) {
throw new Parser.Failure("Load error:" + e.getMessage(), url, e);
}

Loading…
Cancel
Save