Added the ability for the RSS reader to parse Atom feeds

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7094 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 15 years ago
parent 5de70c3d7c
commit 3988a95fb5

@ -35,10 +35,10 @@ public class RSSMessage implements Hit {
public static enum Token { public static enum Token {
title("title"), title("title,atom:title,rss:title"),
link("link"), link("link,rss:link,atom:link"),
description("description"), description("description,rss:description,subtitle,atom:subtitle"),
pubDate("pubDate,lastBuildDate"), pubDate("pubDate,lastBuildDate,rss:lastBuildDate,updated,rss:updated"),
copyright("copyright,dc:publisher,publisher"), copyright("copyright,dc:publisher,publisher"),
author("author,dc:creator,creator"), author("author,dc:creator,creator"),
subject("subject,dc:subject"), subject("subject,dc:subject"),

@ -40,6 +40,9 @@ public class RSSReader extends DefaultHandler {
private final StringBuilder buffer; private final StringBuilder buffer;
private boolean parsingChannel, parsingImage, parsingItem; private boolean parsingChannel, parsingImage, parsingItem;
private final RSSFeed theChannel; private final RSSFeed theChannel;
private Type type;
/**
 * Kind of feed a reader is working on.
 */
public enum Type {
    /** Feed whose container element is {@code channel}. */
    rss,
    /** Feed whose container element is {@code feed}. */
    atom,
    /** No feed kind has been determined. */
    none
}
private RSSReader(int maxsize) { private RSSReader(int maxsize) {
theChannel = new RSSFeed(maxsize); theChannel = new RSSFeed(maxsize);
@ -48,6 +51,7 @@ public class RSSReader extends DefaultHandler {
parsingChannel = false; parsingChannel = false;
parsingImage = false; parsingImage = false;
parsingItem = false; parsingItem = false;
type = Type.none;
} }
public RSSReader(int maxsize, final String path) throws IOException { public RSSReader(int maxsize, final String path) throws IOException {
@ -63,8 +67,9 @@ public class RSSReader extends DefaultHandler {
} }
} }
public RSSReader(int maxsize, final InputStream stream) throws IOException { public RSSReader(int maxsize, final InputStream stream, Type type) throws IOException {
this(maxsize); this(maxsize);
this.type = type;
final SAXParserFactory factory = SAXParserFactory.newInstance(); final SAXParserFactory factory = SAXParserFactory.newInstance();
try { try {
final SAXParser saxParser = factory.newSAXParser(); final SAXParser saxParser = factory.newSAXParser();
@ -76,6 +81,10 @@ public class RSSReader extends DefaultHandler {
} }
} }
/**
 * Returns the feed type currently recorded by this reader.
 *
 * @return the value of the {@code type} field
 */
public Type getType() {
    return type;
}
public static RSSReader parse(int maxsize, final byte[] a) throws IOException { public static RSSReader parse(int maxsize, final byte[] a) throws IOException {
// check integrity of array // check integrity of array
@ -89,7 +98,10 @@ public class RSSReader extends DefaultHandler {
throw new IOException("response does not contain valid xml"); throw new IOException("response does not contain valid xml");
} }
final String end = new String(a, a.length - 10, 10); final String end = new String(a, a.length - 10, 10);
if (end.indexOf("rss") < 0) { Type type = Type.none;
if (end.indexOf("rss") > 0) type = Type.rss;
if (end.indexOf("feed") > 0) type = Type.atom;
if (type == Type.none) {
throw new IOException("response incomplete"); throw new IOException("response incomplete");
} }
@ -99,7 +111,7 @@ public class RSSReader extends DefaultHandler {
// parse stream // parse stream
RSSReader reader = null; RSSReader reader = null;
try { try {
reader = new RSSReader(maxsize, bais); reader = new RSSReader(maxsize, bais, type);
} catch (final Exception e) { } catch (final Exception e) {
throw new IOException("parse exception: " + e.getMessage(), e); throw new IOException("parse exception: " + e.getMessage(), e);
} }
@ -117,9 +129,14 @@ public class RSSReader extends DefaultHandler {
@Override @Override
public void startElement(final String uri, final String name, final String tag, final Attributes atts) throws SAXException { public void startElement(final String uri, final String name, final String tag, final Attributes atts) throws SAXException {
if ("channel".equals(tag)) { if ("channel".equals(tag)) {
this.type = Type.rss;
item = new RSSMessage(); item = new RSSMessage();
parsingChannel = true; parsingChannel = true;
} else if ("item".equals(tag)) { } else if ("feed".equals(tag)) {
this.type = Type.atom;
item = new RSSMessage();
parsingChannel = true;
} else if ("item".equals(tag) || "entry".equals(tag)) {
if (parsingChannel) { if (parsingChannel) {
// the channel ends with the first item not with the channel close tag // the channel ends with the first item not with the channel close tag
theChannel.setChannel(item); theChannel.setChannel(item);
@ -127,6 +144,9 @@ public class RSSReader extends DefaultHandler {
} }
item = new RSSMessage(); item = new RSSMessage();
parsingItem = true; parsingItem = true;
} else if (parsingItem && this.type == Type.atom && "link".equals(tag)) {
String url = atts.getValue("href");
if (url != null && url.length() > 0) item.setValue("link", url);
} else if ("image".equals(tag)) { } else if ("image".equals(tag)) {
parsingImage = true; parsingImage = true;
} }
@ -135,10 +155,10 @@ public class RSSReader extends DefaultHandler {
@Override @Override
public void endElement(final String uri, final String name, final String tag) { public void endElement(final String uri, final String name, final String tag) {
if (tag == null) return; if (tag == null) return;
if ("channel".equals(tag)) { if ("channel".equals(tag) || "feed".equals(tag)) {
if (parsingChannel) theChannel.setChannel(item); if (parsingChannel) theChannel.setChannel(item);
parsingChannel = false; parsingChannel = false;
} else if ("item".equals(tag)) { } else if ("item".equals(tag) || "entry".equals(tag)) {
theChannel.addMessage(item); theChannel.addMessage(item);
parsingItem = false; parsingItem = false;
} else if ("image".equals(tag)) { } else if ("image".equals(tag)) {
@ -150,7 +170,7 @@ public class RSSReader extends DefaultHandler {
} else if (parsingItem) { } else if (parsingItem) {
final String value = buffer.toString().trim(); final String value = buffer.toString().trim();
buffer.setLength(0); buffer.setLength(0);
if (RSSMessage.tags.contains(tag)) item.setValue(tag, value); if (RSSMessage.tags.contains(tag) && value.length() > 0) item.setValue(tag, value);
} else if (parsingChannel) { } else if (parsingChannel) {
final String value = buffer.toString().trim(); final String value = buffer.toString().trim();
buffer.setLength(0); buffer.setLength(0);

@ -56,7 +56,7 @@ public class rssParser extends AbstractParser implements Parser {
public Document[] parse(MultiProtocolURI url, String mimeType, String charset, InputStream source) throws Failure, InterruptedException { public Document[] parse(MultiProtocolURI url, String mimeType, String charset, InputStream source) throws Failure, InterruptedException {
RSSReader rssReader; RSSReader rssReader;
try { try {
rssReader = new RSSReader(RSSFeed.DEFAULT_MAXSIZE, source); rssReader = new RSSReader(RSSFeed.DEFAULT_MAXSIZE, source, RSSReader.Type.none);
} catch (IOException e) { } catch (IOException e) {
throw new Parser.Failure("Load error:" + e.getMessage(), url); throw new Parser.Failure("Load error:" + e.getMessage(), url);
} }

Loading…
Cancel
Save