diff --git a/source/net/yacy/cora/document/feed/RSSFeed.java b/source/net/yacy/cora/document/feed/RSSFeed.java
index e7b5aba4c..b9b95be8c 100644
--- a/source/net/yacy/cora/document/feed/RSSFeed.java
+++ b/source/net/yacy/cora/document/feed/RSSFeed.java
@@ -36,9 +36,16 @@ public class RSSFeed implements Iterable {
     public static final int DEFAULT_MAXSIZE = 10000;
 
     // class variables
-    private RSSMessage channel = null; // single required element see http://www.rssboard.org/rss-profile#element-channel
-    private final Map messages; // a guid:Item map
+
+    /** Single required element see http://www.rssboard.org/rss-profile#element-channel */
+    private RSSMessage channel = null;
+
+    /** A guid:Item map */
+    private final Map messages;
     private final int maxsize;
+
+    /** Set to true when maxsize messages limit has been exceeded and exceeding messages have been discarded */
+    private boolean maxSizeExceeded;
 
 
 
@@ -67,6 +74,7 @@ public class RSSFeed implements Iterable {
         this.messages = Collections.synchronizedMap(new LinkedHashMap());
         this.channel = null;
         this.maxsize = maxsize;
+        this.maxSizeExceeded = false;
     }
 
     /**
@@ -115,7 +123,10 @@ public class RSSFeed implements Iterable {
         final String guid = item.getGuid();
         this.messages.put(guid, item);
         // in case that the feed is full (size > maxsize) flush the oldest element
-        while (this.messages.size() > this.maxsize) pollMessage();
+        while (this.messages.size() > this.maxsize) {
+            this.maxSizeExceeded = true;
+            pollMessage();
+        }
     }
 
     public RSSMessage getMessage(final String guid) {
@@ -130,6 +141,13 @@ public class RSSFeed implements Iterable {
     public int size() {
         return this.messages.size();
     }
+
+    /**
+     * @return true when maxsize messages limit has been exceeded and exceeding messages have been discarded
+     */
+    public boolean isMaxSizeExceeded() {
+        return this.maxSizeExceeded;
+    }
 
     @Override
     public Iterator iterator() {
diff --git a/source/net/yacy/cora/document/feed/RSSReader.java b/source/net/yacy/cora/document/feed/RSSReader.java
index fe92436e7..2634d6cf4 100644
--- a/source/net/yacy/cora/document/feed/RSSReader.java
+++ b/source/net/yacy/cora/document/feed/RSSReader.java
@@ -30,14 +30,16 @@ import javax.xml.parsers.ParserConfigurationException;
 import javax.xml.parsers.SAXParser;
 import javax.xml.parsers.SAXParserFactory;
 
-import net.yacy.cora.document.feed.RSSMessage.Token;
-
 import org.xml.sax.Attributes;
 import org.xml.sax.EntityResolver;
 import org.xml.sax.InputSource;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.DefaultHandler;
 
+import net.yacy.cora.document.feed.RSSMessage.Token;
+import net.yacy.cora.util.StreamLimitException;
+import net.yacy.cora.util.StrictLimitInputStream;
+
 public class RSSReader extends DefaultHandler {
 
 
@@ -47,6 +49,9 @@ public class RSSReader extends DefaultHandler {
     private boolean parsingChannel, parsingItem;
     private final RSSFeed theChannel;
     private Type type;
+
+    /** When a parsing limit on instance construction has been exceeded */
+    private boolean maxBytesExceeded;
 
     public enum Type { rss, atom, rdf, none }
 
@@ -57,6 +62,7 @@ public class RSSReader extends DefaultHandler {
         this.parsingChannel = false;
         this.parsingItem = false;
         this.type = Type.none;
+        this.maxBytesExceeded = false;
     }
 
     private static final ThreadLocal tlSax = new ThreadLocal();
@@ -91,6 +97,44 @@ public class RSSReader extends DefaultHandler {
             throw new IOException (e.getMessage());
         }
     }
+
+    /**
+     * Parse a feed from the given stream, reading it through a StrictLimitInputStream limited to maxBytes.
+     * When a StreamLimitException is raised while parsing, no error is thrown : the instance is
+     * only flagged, see {@link #isMaxBytesExceeded()}.
+     * @param maxsize the size limit handed over to the constructor this one delegates to
+     * @param maxBytes the bytes limit applied to the stream
+     * @param stream the feed content; wrapped in a BufferedInputStream when it is neither buffered nor a ByteArrayInputStream
+     * @throws IOException when reading fails or when the parser raises a SAXException
+     */
+    public RSSReader(final int maxsize, final long maxBytes, InputStream stream) throws IOException {
+        this(maxsize);
+
+        if (!(stream instanceof ByteArrayInputStream) && !(stream instanceof BufferedInputStream)) {
+            stream = new BufferedInputStream(stream);
+        }
+
+        StrictLimitInputStream limitedSource = new StrictLimitInputStream(stream, maxBytes);
+
+        try {
+            final SAXParser saxParser = getParser();
+            // do not look at external dtd - see: http://www.ibm.com/developerworks/xml/library/x-tipcfsx/index.html
+            saxParser.getXMLReader().setEntityResolver(new EntityResolver() {
+                @Override
+                public InputSource resolveEntity(final String arg0, final String arg1)
+                        throws SAXException, IOException {
+                    return new InputSource(new StringReader(""));
+                }
+            });
+            saxParser.parse(limitedSource, this);
+        } catch (final SAXException e) {
+            // keep the SAXException as cause so that the original stack trace is not lost
+            throw new IOException(e.getMessage(), e);
+        } catch(StreamLimitException e) {
+            // intentionally not rethrown : only record that the bytes limit has been reached
+            this.maxBytesExceeded = true;
+        }
+    }
 
     public Type getType() {
         return this.type;
@@ -177,5 +221,12 @@ public class RSSReader extends DefaultHandler {
     public RSSFeed getFeed() {
         return this.theChannel;
     }
+
+    /**
+     * @return true when a parsing limit on instance construction has been exceeded
+     */
+    public boolean isMaxBytesExceeded() {
+        return this.maxBytesExceeded;
+    }
 
 }
\ No newline at end of file
diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java
index 5925d3acd..2f0d86570 100644
--- a/source/net/yacy/document/Document.java
+++ b/source/net/yacy/document/Document.java
@@ -955,6 +955,7 @@ dc_rights
         final Set languages = new HashSet<>();
         double lon = 0.0d, lat = 0.0d;
         boolean indexingDenied = false;
+        boolean partiallyParsed = false;
         Date date = null;
         String charset = null;
 
@@ -1015,6 +1016,7 @@ dc_rights
             if (doc.dc_language() != null) languages.add(doc.dc_language());
 
             indexingDenied |= doc.indexingDenied;
+            partiallyParsed |= doc.isPartiallyParsed();
         }
 
         // clean up parser data
@@ -1050,6 +1052,7 @@ dc_rights
                 indexingDenied,
                 date);
         newDoc.setDepth(mindepth);
+        newDoc.setPartiallyParsed(partiallyParsed);
         return newDoc;
     }
 
diff --git a/source/net/yacy/document/parser/rssParser.java b/source/net/yacy/document/parser/rssParser.java
index 093f4cc77..3b88ff10c 100644
--- a/source/net/yacy/document/parser/rssParser.java
+++ b/source/net/yacy/document/parser/rssParser.java
@@ -74,17 +74,27 @@ public class rssParser extends AbstractParser implements Parser {
             throw new Parser.Failure("Load error:" + e.getMessage(), location, e);
         }
 
-        final RSSFeed feed = rssReader.getFeed();
+        return rssFeedToDocuments(charset, rssReader.getFeed());
+    }
+
+    /**
+     * Create parsed documents from the given feed.
+     * @param charset the charset name of the feed, if known
+     * @param feed the feed instance
+     * @return an array of documents : a document per feed item
+     */
+    private Document[] rssFeedToDocuments(final String charset, final RSSFeed feed) {
         //RSSMessage channel = feed.getChannel();
         final List docs = new ArrayList();
         DigestURL itemuri;
         Set languages;
         Document doc;
-        for (final Hit item: feed) try {
-            itemuri = new DigestURL(item.getLink());
-            languages = new HashSet();
-            languages.add(item.getLanguage());
-            doc = new Document(
+        for (final Hit item: feed) {
+            try {
+                itemuri = new DigestURL(item.getLink());
+                languages = new HashSet();
+                languages.add(item.getLanguage());
+                doc = new Document(
                     itemuri,
                     TextParser.mimeOf(itemuri),
                     charset,
@@ -104,14 +114,40 @@ public class rssParser extends AbstractParser implements Parser {
                     new LinkedHashMap(),
                     false,
                     item.getPubDate());
-            docs.add(doc);
-        } catch (final MalformedURLException e) {
-            continue;
-        }
+                docs.add(doc);
+            } catch (final MalformedURLException e) {
+                continue;
+            }
+        }
 
         final Document[] da = new Document[docs.size()];
         docs.toArray(da);
         return da;
+    }
+
+    @Override
+    public boolean isParseWithLimitsSupported() {
+        return true;
+    }
+
+    @Override
+    public Document[] parseWithLimits(final DigestURL url, final String mimeType, final String charset, final VocabularyScraper scraper,
+            final int timezoneOffset, final InputStream source, final int maxLinks, final long maxBytes)
+            throws Failure, InterruptedException, UnsupportedOperationException {
+        RSSReader rssReader;
+        try {
+            rssReader = new RSSReader(maxLinks, maxBytes, source);
+        } catch (final IOException e) {
+            throw new Parser.Failure("Load error:" + e.getMessage(), url, e);
+        }
+
+        Document[] documents = rssFeedToDocuments(charset, rssReader.getFeed());
+        if (documents != null && documents.length > 0
+                && (rssReader.isMaxBytesExceeded() || rssReader.getFeed().isMaxSizeExceeded())) {
+            /* A limit has been exceeded : mark the last document as partially parsed for information of the caller */
+            documents[documents.length - 1].setPartiallyParsed(true);
+        }
+        return documents;
     }
 
 }