From bf55f1d6e582eb126e74d18e0bea2be542efda68 Mon Sep 17 00:00:00 2001 From: luccioman Date: Sat, 8 Jul 2017 09:04:03 +0200 Subject: [PATCH] Started support of partial parsing on large streamed resources. Thus enable getpageinfo_p API to return something in a reasonable amount of time on resources over MegaBytes size range. Support added first with the generic XML parser, for other formats regular crawler limits apply as usual. --- htroot/api/getpageinfo_p.java | 19 ++- .../crawler/retrieval/StreamResponse.java | 50 ++++++++ source/net/yacy/document/AbstractParser.java | 17 +++ source/net/yacy/document/Document.java | 18 +++ source/net/yacy/document/Parser.java | 57 ++++++++- source/net/yacy/document/TextParser.java | 117 ++++++++++++++++-- .../document/parser/GenericXMLParser.java | 85 +++++++++++-- .../document/parser/html/ContentScraper.java | 24 +++- .../parser/xml/GenericXMLContentHandler.java | 51 ++++++-- source/net/yacy/kelondro/util/FileUtils.java | 1 + .../net/yacy/repository/LoaderDispatcher.java | 67 +++++++++- .../document/parser/GenericXMLParserTest.java | 99 +++++++++++++++ .../parser/html/ContentScraperTest.java | 26 ++++ 13 files changed, 589 insertions(+), 42 deletions(-) diff --git a/htroot/api/getpageinfo_p.java b/htroot/api/getpageinfo_p.java index a24f3ccc5..309421a63 100644 --- a/htroot/api/getpageinfo_p.java +++ b/htroot/api/getpageinfo_p.java @@ -87,7 +87,8 @@ public class getpageinfo_p { * * *
  • agentName (optional) : the string identifying the agent used to fetch the resource. Example : "YaCy Internet (cautious)"
  • - maxLinks (optional) : the maximum number of links, sitemap URLs or icons to return
  • + maxLinks (optional integer value) : the maximum number of links, sitemap URLs or icons to return for the 'title' action
  • + maxBytes (optional long integer value) : the maximum number of bytes to load and parse from the URL for the 'title' action (see the example request below)
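For illustration, here is a minimal client-side sketch of a 'title' request that uses the two new limit parameters. The parameter names (actions, url, maxLinks, maxBytes) come from this patch; the host, port and the .xml servlet suffix are assumptions about a typical local YaCy instance, not something this change defines.

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URL;
import java.nio.charset.StandardCharsets;

public class GetPageInfoLimitsExample {
    public static void main(String[] args) throws Exception {
        // Ask for title/links of a large resource, but parse at most 1 MB
        // of content and report at most 100 links.
        String request = "http://localhost:8090/api/getpageinfo_p.xml" // host/port assumed
                + "?actions=title"
                + "&url=http://yacy.net"
                + "&maxLinks=100"
                + "&maxBytes=1048576";
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(new URL(request).openStream(), StandardCharsets.UTF_8))) {
            String line;
            while ((line = in.readLine()) != null) {
                System.out.println(line);
            }
        }
    }
}

When the limits cut processing short, the servlet change above sets the hasMoreLinks field of the response to 1.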
  • * * @param env * server environment @@ -139,7 +140,17 @@ public class getpageinfo_p { net.yacy.document.Document scraper = null; if (u != null) try { ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName)); - scraper = sb.loader.loadDocumentAsStream(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER, agent); + + if(post.containsKey("maxBytes")) { + /* A maxBytes limit is specified : let's try to parse only the amount of bytes given */ + final long maxBytes = post.getLong("maxBytes", sb.loader.protocolMaxFileSize(u)); + scraper = sb.loader.loadDocumentAsLimitedStream(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER, agent, maxLinks, maxBytes); + } else { + /* No maxBytes limit : apply regular parsing with default crawler limits. + * Eventual maxLinks limit will apply after loading and parsing the document. */ + scraper = sb.loader.loadDocumentAsStream(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER, agent); + } + } catch (final IOException e) { ConcurrentLog.logException(e); // bad things are possible, i.e. that the Server responds with "403 Bad Behavior" @@ -151,7 +162,7 @@ public class getpageinfo_p { // put the icons that belong to the document Set iconURLs = scraper.getIcons().keySet(); - int count = 0; + long count = 0; for (DigestURL iconURL : iconURLs) { if(count >= maxLinks) { break; @@ -199,7 +210,7 @@ public class getpageinfo_p { count++; } prop.put("links", count); - prop.put("hasMoreLinks", (count >= maxLinks && urisIt.hasNext()) ? "1" : "0"); + prop.put("hasMoreLinks", scraper.isPartiallyParsed() || (count >= maxLinks && urisIt.hasNext()) ? "1" : "0"); prop.putXML("sitelist", links.length() > 0 ? links.substring(1) : ""); prop.putXML("filter", filter.length() > 0 ? filter.substring(1) : ".*"); } diff --git a/source/net/yacy/crawler/retrieval/StreamResponse.java b/source/net/yacy/crawler/retrieval/StreamResponse.java index 19a78501b..7f99b0135 100644 --- a/source/net/yacy/crawler/retrieval/StreamResponse.java +++ b/source/net/yacy/crawler/retrieval/StreamResponse.java @@ -116,5 +116,55 @@ public class StreamResponse { } } + + /** + * Parse and close the content stream and return the parsed documents when + * possible.
    + * Try to limit the parser processing with a maximum total number of detected + * links (anchors, image links, media links...) or a maximum amount of + * content bytes to parse.
    + * Limits apply only when the available parsers for the resource media type + * support parsing within limits (see + * {@link Parser#isParseWithLimitsSupported()}. When available parsers do + * not support parsing within limits, an exception is thrown when + * content size is beyond maxBytes. + * + * @param maxLinks + * the maximum total number of links to parse and add to the + * result documents + * @param maxBytes + * the maximum number of content bytes to process + * @return the parsed documents or null when an error occurred + * @throws Parser.Failure + * when no parser support the content, or an error occurred while parsing + */ + public Document[] parseWithLimits(final int maxLinks, final long maxBytes) throws Parser.Failure { + final String supportError = TextParser.supports(this.response.url(), + this.response.getResponseHeader() == null ? null : this.response.getResponseHeader().getContentType()); + if (supportError != null) { + throw new Parser.Failure("no parser support:" + supportError, this.response.url()); + } + try { + final String mimeType = this.response.getResponseHeader() == null ? null + : this.response.getResponseHeader().getContentType(); + final String charsetName = this.response.getResponseHeader() == null ? StandardCharsets.UTF_8.name() + : this.response.getResponseHeader().getCharacterEncoding(); + + return TextParser.parseWithLimits(this.response.url(), mimeType, charsetName, + this.response.getRequest().timezoneOffset(), this.response.size(), this.contentStream, maxLinks, + maxBytes); + } catch (final Exception e) { + return null; + } finally { + if (this.contentStream != null) { + try { + this.contentStream.close(); + } catch (IOException ignored) { + log.warn("Could not close content stream on url " + this.response.url()); + } + } + } + + } } diff --git a/source/net/yacy/document/AbstractParser.java b/source/net/yacy/document/AbstractParser.java index fc56784f1..76895027e 100644 --- a/source/net/yacy/document/AbstractParser.java +++ b/source/net/yacy/document/AbstractParser.java @@ -23,12 +23,14 @@ package net.yacy.document; +import java.io.InputStream; import java.util.ArrayList; import java.util.HashSet; import java.util.LinkedHashSet; import java.util.List; import java.util.Set; +import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.util.ConcurrentLog; public abstract class AbstractParser implements Parser { @@ -98,5 +100,20 @@ public abstract class AbstractParser implements Parser { if (t != null) c.add(t); return c; } + + @Override + public Document[] parseWithLimits(DigestURL url, String mimeType, String charset, VocabularyScraper scraper, + int timezoneOffset, InputStream source, int maxLinks, long maxBytes) + throws Failure, InterruptedException, UnsupportedOperationException { + /* Please override on subclasses when implementation is possible */ + throw new UnsupportedOperationException(); + } + + @Override + public boolean isParseWithLimitsSupported() { + /* Please override on subclasses when parseWithLimits is supported */ + return false; + } + } diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java index e75e84c63..5925d3acd 100644 --- a/source/net/yacy/document/Document.java +++ b/source/net/yacy/document/Document.java @@ -99,6 +99,9 @@ public class Document { private final Map> generic_facets; // a map from vocabulary names to the set of tags for that vocabulary which apply for this document private final Date lastModified; // creation or last modification date of the source document private 
int crawldepth; + + /** True when this document is the result of a partially parsed resource, for example due to resource content size exceeding a given limit */ + private boolean partiallyParsed; public Document(final DigestURL location, final String mimeType, final String charset, final Parser parserObject, @@ -152,6 +155,7 @@ public class Document { this.lastModified = lastModified == null ? new Date() : lastModified; this.crawldepth = 999; // unknown yet this.scraperObject = null; // will be set by setScraperObject() + this.partiallyParsed = false; } /** @@ -212,6 +216,20 @@ public class Document { return this.generic_facets; } + /** + * @return true when this document is the result of a partially parsed resource, for example due to resource content size exceeding a given limit + */ + public boolean isPartiallyParsed() { + return this.partiallyParsed; + } + + /** + * @param partiallyParsed set to true to indicates this document is the result of a partially parsed resource, for example due to resource content size exceeding a given limit + */ + public void setPartiallyParsed(final boolean partiallyParsed) { + this.partiallyParsed = partiallyParsed; + } + /** * compute a set of languages that this document contains * the language is not computed using a statistical analysis of the content, only from given metadata that came with the document diff --git a/source/net/yacy/document/Parser.java b/source/net/yacy/document/Parser.java index cd9a352bc..e75e7f089 100644 --- a/source/net/yacy/document/Parser.java +++ b/source/net/yacy/document/Parser.java @@ -47,12 +47,13 @@ public interface Parser { * parse an input stream * @param url the url of the source * @param mimeType the mime type of the source, if known - * @param charset the charset of the source, if known + * @param charset the charset name of the source, if known * @param scraper an entity scraper to detect facets from text annotation context + * @param timezoneOffset the local time zone offset * @param source a input stream * @return a list of documents that result from parsing the source - * @throws Parser.Failure - * @throws InterruptedException + * @throws Parser.Failure when the parser processing failed + * @throws InterruptedException when the processing was interrupted before termination */ public Document[] parse( DigestURL url, @@ -62,7 +63,55 @@ public interface Parser { int timezoneOffset, InputStream source ) throws Parser.Failure, InterruptedException; - + + /** + * Parse an input stream, eventually terminating processing when a total of + * maxLinks URLS (anchors, images links, media links...) have been reached, + * or when maxBytes content bytes have been processed, thus potentially + * resulting in partially parsed documents (with + * {@link Document#isPartiallyParsed()} returning true). Some parser + * implementations will not support parsing within maxLinks or maxBytes + * limits : make sure to check this by calling fist + * {@link #isParseWithLimitsSupported()}, or a UnsupportedOperationException + * could be thrown. 
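To make this contract concrete, here is a minimal caller-side sketch of the check-then-fallback pattern (the same logic TextParser applies elsewhere in this patch); parser, url, mimeType, charset, scraper, timezoneOffset and source are placeholders, and the limit values are purely illustrative:

// Prefer the limited variant only when the parser implementation supports it,
// otherwise fall back to the regular parse() call.
final Document[] docs;
if (parser.isParseWithLimitsSupported()) {
    docs = parser.parseWithLimits(url, mimeType, charset, scraper, timezoneOffset,
            source, 1000 /* maxLinks */, 10L * 1024 * 1024 /* maxBytes */);
} else {
    docs = parser.parse(url, mimeType, charset, scraper, timezoneOffset, source);
}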
+ * + * @param url + * the URL of the source + * @param mimeType + * the mime type of the source, if known + * @param charset + * the charset name of the source, if known + * @param scraper + * an entity scraper to detect facets from text annotation + * context + * @param timezoneOffset + * the local time zone offset + * @param source + * a input stream + * @param maxLinks + * the maximum total number of links to parse and add to the + * result documents + * @param maxBytes + * the maximum number of content bytes to process + * @return a list of documents that result from parsing the source, with + * empty or null text. + * @throws Parser.Failure + * when the parser processing failed + * @throws InterruptedException + * when the processing was interrupted before termination + * @throws UnsupportedOperationException + * when the parser implementation doesn't support parsing within + * limits + */ + public Document[] parseWithLimits(DigestURL url, String mimeType, String charset, VocabularyScraper scraper, + int timezoneOffset, InputStream source, int maxLinks, long maxBytes) + throws Parser.Failure, InterruptedException, UnsupportedOperationException; + + /** + * @return true when the parser implementation supports the + * parseWithLimits() operation. + */ + public boolean isParseWithLimitsSupported(); // methods to that shall make it possible to put Parser objects into a hashtable diff --git a/source/net/yacy/document/TextParser.java b/source/net/yacy/document/TextParser.java index 33acb90c7..5a1e6b898 100644 --- a/source/net/yacy/document/TextParser.java +++ b/source/net/yacy/document/TextParser.java @@ -34,6 +34,8 @@ import java.util.Map; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; +import org.apache.commons.fileupload.util.LimitedInputStream; + import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.MultiProtocolURL; @@ -228,12 +230,12 @@ public final class TextParser { } assert !idioms.isEmpty() : "no parsers applied for url " + location.toNormalform(true); - Document[] docs = parseSource(location, mimeType, idioms, charset, scraper, timezoneOffset, depth, content); + Document[] docs = parseSource(location, mimeType, idioms, charset, scraper, timezoneOffset, depth, content, Integer.MAX_VALUE, Long.MAX_VALUE); return docs; } - - public static Document[] parseSource( + + private static Document[] parseSource( final DigestURL location, String mimeType, final String charset, @@ -241,7 +243,9 @@ public final class TextParser { final int timezoneOffset, final int depth, final long contentLength, - final InputStream sourceStream + final InputStream sourceStream, + final int maxLinks, + final long maxBytes ) throws Parser.Failure { if (AbstractParser.log.isFine()) AbstractParser.log.fine("Parsing '" + location + "' from stream"); mimeType = normalizeMimeType(mimeType); @@ -283,22 +287,79 @@ public final class TextParser { // then we use only one stream-oriented parser. if (canStream || contentLength > Integer.MAX_VALUE || contentLength > MemoryControl.available()) { // use a specific stream-oriented parser - return parseSource(location, mimeType, streamParser, charset, scraper, timezoneOffset, sourceStream); + return parseSource(location, mimeType, streamParser, charset, scraper, timezoneOffset, sourceStream, maxLinks, maxBytes); } // in case that we know more parsers we first transform the content into a byte[] and use that as base // for a number of different parse attempts. 
+ + /* Content length may be known from headers : check it now */ + if(contentLength >= 0 && contentLength > maxBytes) { + throw new Parser.Failure("Content size is over maximum size of " + maxBytes + "", location); + } byte[] b = null; try { b = FileUtils.read(sourceStream, (int) contentLength); + + /* Check content size now if contentLength was unknown */ + if(contentLength < 0) { + if(b.length > maxBytes) { + throw new Parser.Failure("Content size is over maximum size of " + maxBytes + "", location); + } + } } catch (final IOException e) { throw new Parser.Failure(e.getMessage(), location); } - Document[] docs = parseSource(location, mimeType, idioms, charset, scraper, timezoneOffset, depth, b); + Document[] docs = parseSource(location, mimeType, idioms, charset, scraper, timezoneOffset, depth, b, maxLinks, maxBytes); return docs; } + public static Document[] parseSource(final DigestURL location, String mimeType, final String charset, + final VocabularyScraper scraper, final int timezoneOffset, final int depth, final long contentLength, + final InputStream sourceStream) throws Parser.Failure { + return parseSource(location, mimeType, charset, scraper, timezoneOffset, depth, contentLength, sourceStream, + Integer.MAX_VALUE, Long.MAX_VALUE); + } + + /** + * Try to limit the parser processing with a maximum total number of links detection (anchors, images links, media links...) + * or a maximum amount of content bytes to parse. Limits apply only when the available parsers for the resource media type support parsing within limits + * (see {@link Parser#isParseWithLimitsSupported()}. When available parsers do + * not support parsing within limits, an exception is thrown when + * content size is beyond maxBytes. + * @param location the URL of the source + * @param mimeType the mime type of the source, if known + * @param charset the charset name of the source, if known + * @param timezoneOffset the local time zone offset + * @param contentLength the length of the source, if known (else -1 should be used) + * @param source a input stream + * @param maxLinks the maximum total number of links to parse and add to the result documents + * @param maxBytes the maximum number of content bytes to process + * @return a list of documents that result from parsing the source, with empty or null text. 
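As a usage sketch of this new static entry point (placeholders only: the DigestURL, the sourceStream and the limit values are illustrative, and handling of MalformedURLException and Parser.Failure is omitted):

// Parse at most the first megabyte of a large remote XML resource and stop
// after 100 detected links; isPartiallyParsed() tells whether truncation occurred.
Document[] docs = TextParser.parseWithLimits(
        new DigestURL("http://example.org/huge-feed.xml"), // hypothetical URL
        "text/xml",                        // mimeType, if known
        StandardCharsets.UTF_8.name(),     // charset name, if known
        0,                                 // timezoneOffset
        -1,                                // contentLength unknown
        sourceStream,                      // an open InputStream on the resource
        100,                               // maxLinks
        1024L * 1024L);                    // maxBytes
boolean partial = docs[0].isPartiallyParsed();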
+ * @throws Parser.Failure when the parser processing failed + */ + public static Document[] parseWithLimits(final DigestURL location, String mimeType, final String charset, + final int timezoneOffset, final long contentLength, final InputStream sourceStream, int maxLinks, + long maxBytes) throws Parser.Failure{ + return parseSource(location, mimeType, charset, new VocabularyScraper(), timezoneOffset, 0, contentLength, + sourceStream, maxLinks, maxBytes); + } + + /** + * + * @param location the URL of the source + * @param mimeType the mime type of the source, if known + * @param parser a parser supporting the resource at location + * @param charset the charset name of the source, if known + * @param scraper a vocabulary scraper + * @param timezoneOffset the local time zone offset + * @param sourceStream an open input stream on the source + * @param maxLinks the maximum total number of links to parse and add to the result documents + * @param maxBytes the maximum number of content bytes to process + * @return a list of documents that result from parsing the source + * @throws Parser.Failure when the source could not be parsed + */ private static Document[] parseSource( final DigestURL location, final String mimeType, @@ -306,7 +367,9 @@ public final class TextParser { final String charset, final VocabularyScraper scraper, final int timezoneOffset, - final InputStream sourceStream + final InputStream sourceStream, + final int maxLinks, + final long maxBytes ) throws Parser.Failure { if (AbstractParser.log.isFine()) AbstractParser.log.fine("Parsing '" + location + "' from stream"); final String fileExt = MultiProtocolURL.getFileExtension(location.getFileName()); @@ -315,13 +378,41 @@ public final class TextParser { if (AbstractParser.log.isFine()) AbstractParser.log.fine("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + fileExt + "'."); try { - final Document[] docs = parser.parse(location, mimeType, documentCharset, scraper, timezoneOffset, sourceStream); + final Document[] docs; + if(parser.isParseWithLimitsSupported()) { + docs = parser.parseWithLimits(location, mimeType, documentCharset, scraper, timezoneOffset, sourceStream, maxLinks, maxBytes); + } else { + /* Parser do not support partial parsing within limits : let's control it here*/ + InputStream limitedSource = new LimitedInputStream(sourceStream, maxBytes) { + + @Override + protected void raiseError(long pSizeMax, long pCount) throws IOException { + throw new IOException("Reached maximum bytes to parse : " + maxBytes); + + } + }; + docs = parser.parse(location, mimeType, documentCharset, scraper, timezoneOffset, limitedSource); + } return docs; } catch (final Exception e) { throw new Parser.Failure("parser failed: " + parser.getName(), location); } } + /** + * @param location the URL of the source + * @param mimeType the mime type of the source, if known + * @param parsers a set of parsers supporting the resource at location + * @param charset the charset name of the source, if known + * @param scraper a vocabulary scraper + * @param timezoneOffset the local time zone offset + * @param depth the current crawling depth + * @param sourceArray the resource content bytes + * @param maxLinks the maximum total number of links to parse and add to the result documents + * @param maxBytes the maximum number of content bytes to process + * @return a list of documents that result from parsing the source + * @throws Parser.Failure when the source could not be parsed + */ private static Document[] parseSource( 
final DigestURL location, final String mimeType, @@ -330,7 +421,9 @@ public final class TextParser { final VocabularyScraper scraper, final int timezoneOffset, final int depth, - final byte[] sourceArray + final byte[] sourceArray, + final int maxLinks, + final long maxBytes ) throws Parser.Failure { final String fileExt = MultiProtocolURL.getFileExtension(location.getFileName()); if (AbstractParser.log.isFine()) AbstractParser.log.fine("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + fileExt + "' from byte[]"); @@ -351,7 +444,11 @@ public final class TextParser { bis = new ByteArrayInputStream(sourceArray); } try { - docs = parser.parse(location, mimeType, documentCharset, scraper, timezoneOffset, bis); + if(parser.isParseWithLimitsSupported()) { + docs = parser.parseWithLimits(location, mimeType, documentCharset, scraper, timezoneOffset, bis, maxLinks, maxBytes); + } else { + docs = parser.parse(location, mimeType, documentCharset, scraper, timezoneOffset, bis); + } } catch (final Parser.Failure e) { failedParser.put(parser, e); //log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e); diff --git a/source/net/yacy/document/parser/GenericXMLParser.java b/source/net/yacy/document/parser/GenericXMLParser.java index 34814b749..0d37c8c1f 100644 --- a/source/net/yacy/document/parser/GenericXMLParser.java +++ b/source/net/yacy/document/parser/GenericXMLParser.java @@ -22,15 +22,20 @@ package net.yacy.document.parser; +import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.Date; +import java.util.HashSet; import java.util.List; +import java.util.Set; +import javax.naming.SizeLimitExceededException; import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; +import org.apache.commons.fileupload.util.LimitedInputStream; import org.apache.commons.io.input.XmlStreamReader; import org.xml.sax.InputSource; import org.xml.sax.SAXException; @@ -89,7 +94,7 @@ public class GenericXMLParser extends AbstractParser implements Parser { final VocabularyScraper scraper, final int timezoneOffset, final InputStream source) - throws Failure, InterruptedException { + throws Failure { /* Limit the size of the in-memory buffer to at most 25% of the available memory : * because some room is needed, and before being garbage collected the buffer will be converted to a String, then to a byte array. @@ -128,17 +133,81 @@ public class GenericXMLParser extends AbstractParser implements Parser { docs = new Document[] { new Document(location, mimeType, detectedCharset, this, null, null, null, null, "", null, null, 0.0d, 0.0d, contentBytes, detectedURLs, null, null, false, new Date()) }; return docs; + } catch(Parser.Failure e) { + throw e; } catch (final Exception e) { - if (e instanceof InterruptedException) { - throw (InterruptedException) e; - } - if (e instanceof Parser.Failure) { - throw (Parser.Failure) e; - } - throw new Parser.Failure("Unexpected error while parsing XML file. " + e.getMessage(), location); } } + + @Override + public boolean isParseWithLimitsSupported() { + return true; + } + + /** + * {@inheritDoc} + * @param maxBytes the maximum number of content bytes to process. 
Be careful with to small values : + * a Failure exception can eventually be thrown when maxBytes value is so small that the parser can even not fill its buffers on input stream and parse the document declaration. + */ + @Override + public Document[] parseWithLimits(DigestURL location, String mimeType, String charsetName, VocabularyScraper scraper, + int timezoneOffset, InputStream source, int maxLinks, long maxBytes) + throws Failure, InterruptedException, UnsupportedOperationException { + /* Limit the size of the in-memory buffer to at most 25% of the available memory : + * because some room is needed, and before being garbage collected the buffer will be converted to a String, then to a byte array. + * Eventual stricter limits should be handled by the caller (see for example crawler.[protocol].maxFileSize configuration setting). */ + final long availableMemory = MemoryControl.available(); + final long maxTextBytes = (long)(availableMemory * 0.25); + final int maxChars; + if((maxTextBytes / Character.BYTES) > Integer.MAX_VALUE) { + maxChars = Integer.MAX_VALUE; + } else { + maxChars = ((int)maxTextBytes) / Character.BYTES; + } + + try (/* Automatically closed by this try-with-resources statement*/ CharBuffer writer = new CharBuffer(maxChars);){ + + final Set detectedURLs = new HashSet<>(); + final GenericXMLContentHandler saxHandler = new GenericXMLContentHandler(writer, detectedURLs, maxLinks); + + InputStream limitedSource = new LimitedInputStream(source, maxBytes) { + + @Override + protected void raiseError(long pSizeMax, long pCount) throws IOException { + throw new IOException(new SizeLimitExceededException("Reached maximum bytes to parse : " + maxBytes)); + + } + }; + + /* Use commons-io XmlStreamReader advanced rules to help with charset detection when source contains no BOM or XML declaration + * (detection algorithm notably also include ContentType transmitted by HTTP headers, here eventually present as mimeType and charset parameters), */ + final XmlStreamReader reader = new XmlStreamReader(limitedSource, mimeType, true, charsetName); + final InputSource saxSource = new InputSource(reader); + final String detectedCharset = reader.getEncoding(); + + final SAXParser saxParser = getParser(); + boolean limitExceeded = false; + try { + saxParser.parse(saxSource, saxHandler); + } catch(SAXException | IOException e) { + if(!(e.getCause() instanceof SizeLimitExceededException)) { + /* Only transmit to upper layer exceptions that are not caused by the maxLinks or maxBytes limits being reached */ + throw e; + } + limitExceeded = true; + } + + + /* create the parsed document with empty text content */ + Document[] docs = new Document[] { new Document(location, mimeType, detectedCharset, this, null, null, null, null, "", + null, null, 0.0d, 0.0d, new byte[0], detectedURLs, null, null, false, new Date()) }; + docs[0].setPartiallyParsed(limitExceeded); + return docs; + } catch (final Exception e) { + throw new Parser.Failure("Unexpected error while parsing XML file. 
" + e.getMessage(), location); + } + } } diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index e83190ae9..e52b2aa1a 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -375,14 +375,16 @@ public class ContentScraper extends AbstractScraper implements Scraper { private final static Pattern WHITESPACE_PATTERN = Pattern.compile("\\s"); /** - * Try to detect and parse absolute URLs in text, then update the urls collection and fire anchorAdded event on listeners. Any parameter are can be null. + * Try to detect and parse absolute URLs in text (at most maxURLs) , then update the urls collection and fire anchorAdded event on listeners. Any parameter can be null. * @param text the text to parse * @param urls a mutable collection of URLs to fill. * @param listeners a collection of listeners to trigger. + * @param maxURLs maximum URLs number to add to the urls collection. Be careful with urls collection capacity when this collection is not null and maxURLs value is beyond Integer.MAX_VALUE. + * @return the number of well formed URLs detected */ - public static void findAbsoluteURLs(final String text, final Collection urls, final Collection listeners) { + public static long findAbsoluteURLs(final String text, final Collection urls, final Collection listeners, final long maxURLs) { if(text == null) { - return; + return 0; } int schemePosition, offset = 0; boolean hasWhiteSpace; @@ -391,8 +393,8 @@ public class ContentScraper extends AbstractScraper implements Scraper { final Matcher urlSchemeMatcher = protp.matcher(text); final Matcher whiteSpaceMatcher = WHITESPACE_PATTERN.matcher(text); - - while (offset < text.length()) { + long detectedURLsCount = 0; + while (offset < text.length() && detectedURLsCount < maxURLs) { if(!urlSchemeMatcher.find(offset)) { break; } @@ -413,6 +415,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { offset = schemePosition + urlString.length(); try { url = new AnchorURL(urlString); + detectedURLsCount++; if(urls != null) { urls.add(url); } @@ -423,6 +426,17 @@ public class ContentScraper extends AbstractScraper implements Scraper { } } catch (final MalformedURLException ignored) {} } + return detectedURLsCount; + } + + /** + * Try to detect and parse absolute URLs in text, then update the urls collection and fire anchorAdded event on listeners. Any parameter can be null. + * @param text the text to parse + * @param urls a mutable collection of URLs to fill. + * @param listeners a collection of listeners to trigger. 
+ */ + public static void findAbsoluteURLs(final String text, final Collection urls, final Collection listeners) { + findAbsoluteURLs(text, urls, listeners, Long.MAX_VALUE); } /** diff --git a/source/net/yacy/document/parser/xml/GenericXMLContentHandler.java b/source/net/yacy/document/parser/xml/GenericXMLContentHandler.java index 3f28d3a7c..e9154457a 100644 --- a/source/net/yacy/document/parser/xml/GenericXMLContentHandler.java +++ b/source/net/yacy/document/parser/xml/GenericXMLContentHandler.java @@ -26,6 +26,8 @@ import java.io.IOException; import java.io.Writer; import java.util.Collection; +import javax.naming.SizeLimitExceededException; + import org.apache.commons.io.input.ClosedInputStream; import org.xml.sax.Attributes; import org.xml.sax.InputSource; @@ -51,6 +53,12 @@ public class GenericXMLContentHandler extends DefaultHandler { /** Detected URLs */ private final Collection urls; + /** Maximum number of URLs to parse */ + private final int maxURLs; + + /** Number of parsed URLs in the document */ + private long detectedURLs; + /** Text of the currently parsed element. May not contain the whole text when the element has nested elements embedded in its own text */ private StringBuilder currentElementText; @@ -62,7 +70,7 @@ public class GenericXMLContentHandler extends DefaultHandler { /** Set to false until some text is detected in at least one element of the document */ private boolean documentHasText; - + /** * @param out * the output writer to write extracted text. Must not be null. @@ -71,6 +79,18 @@ public class GenericXMLContentHandler extends DefaultHandler { * when out is null */ public GenericXMLContentHandler(final Writer out, final Collection urls) throws IllegalArgumentException { + this(out, urls, Integer.MAX_VALUE); + } + + /** + * @param out + * the output writer to write extracted text. Must not be null. 
+ * @param urls the mutable collection of URLs to fill with detected URLs + * @param maxURLs the maximum number of urls to parse + * @throws IllegalArgumentException + * when out is null + */ + public GenericXMLContentHandler(final Writer out, final Collection urls, final int maxURLs) throws IllegalArgumentException { if (out == null) { throw new IllegalArgumentException("out writer must not be null"); } @@ -79,6 +99,8 @@ public class GenericXMLContentHandler extends DefaultHandler { } this.out = out; this.urls = urls; + this.maxURLs = maxURLs; + this.detectedURLs = 0; } /** @@ -96,10 +118,12 @@ public class GenericXMLContentHandler extends DefaultHandler { this.lastAppendedIsSpace = false; this.currentElementTextChunks = 0; this.documentHasText = false; + this.detectedURLs = 0; } /** * Try to detect URLs eventually contained in attributes + * @throws SAXException when the calling parser reached the maximum bytes limit on the input source */ @Override public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException { @@ -109,19 +133,25 @@ public class GenericXMLContentHandler extends DefaultHandler { if (attributes != null) { for (int i = 0; i < attributes.getLength(); i++) { String attribute = attributes.getValue(i); - ContentScraper.findAbsoluteURLs(attribute, this.urls, null); + this.detectedURLs += ContentScraper.findAbsoluteURLs(attribute, this.urls, null, this.maxURLs - this.detectedURLs); + if (this.detectedURLs >= this.maxURLs) { + throw new SAXException( + new SizeLimitExceededException("Reached maximum URLs to parse : " + this.maxURLs)); + } } } } - + /** * Write characters to the output writer + * @throws SAXException when the calling parser reached the maximum bytes limit on the input source */ @Override - public void characters(final char ch[], final int start, final int length) { + public void characters(final char ch[], final int start, final int length) throws SAXException { try { if(this.currentElementTextChunks == 0 && this.documentHasText) { - /* We are on the first text chunk of the element, or the first text chunk after processing nested elements : + /* We are but on the first text chunk of the element (not on the first text chunk of the whole document), + * or on the first text chunk after processing nested elements : * if necessary we add a space to separate text content of different elements */ if(length > 0 && !this.lastAppendedIsSpace && !Character.isWhitespace(ch[0])) { this.out.write(" "); @@ -137,8 +167,8 @@ public class GenericXMLContentHandler extends DefaultHandler { this.documentHasText = true; this.lastAppendedIsSpace = Character.isWhitespace(ch[length - 1]); } - } catch (final IOException e) { - ConcurrentLog.logException(e); + } catch (final IOException ignored) { + ConcurrentLog.logException(ignored); } } @@ -148,7 +178,10 @@ public class GenericXMLContentHandler extends DefaultHandler { */ @Override public void endElement(String uri, String localName, String qName) throws SAXException { - ContentScraper.findAbsoluteURLs(this.currentElementText.toString(), urls, null); + this.detectedURLs += ContentScraper.findAbsoluteURLs(this.currentElementText.toString(), this.urls, null, this.maxURLs - this.detectedURLs); + if (this.detectedURLs >= this.maxURLs) { + throw new SAXException(new SizeLimitExceededException("Reached maximum URLs to parse : " + this.maxURLs)); + } this.currentElementText.setLength(0); this.currentElementTextChunks = 0; } @@ -158,5 +191,5 @@ public class GenericXMLContentHandler extends 
DefaultHandler { /* Release the StringBuilder now useless */ this.currentElementText = null; } - + } \ No newline at end of file diff --git a/source/net/yacy/kelondro/util/FileUtils.java b/source/net/yacy/kelondro/util/FileUtils.java index fde87e7a8..f1f892b2d 100644 --- a/source/net/yacy/kelondro/util/FileUtils.java +++ b/source/net/yacy/kelondro/util/FileUtils.java @@ -377,6 +377,7 @@ public final class FileUtils { * Read the specified amount of bytes from a source stream. * Important : it is the responsibility of the caller to close the stream. * @param source InputStream instance. Must not be null + * @param count maximum amount of bytes to read. A negative value means no limit. * @return source content as a byte array. * @throws IOException when a read/write error occurred * @throws NullPointerException when source parameter is null diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java index 6c0f00fd9..bc3a7d997 100644 --- a/source/net/yacy/repository/LoaderDispatcher.java +++ b/source/net/yacy/repository/LoaderDispatcher.java @@ -439,7 +439,11 @@ public final class LoaderDispatcher { } } - private int protocolMaxFileSize(final DigestURL url) { + /** + * @param url the URL of a resource to load + * @return the crawler configured maximum size allowed to load for the protocol of the URL + */ + public int protocolMaxFileSize(final DigestURL url) { if (url.isHTTP() || url.isHTTPS()) return this.sb.getConfigInt("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE); if (url.isFTP()) @@ -583,7 +587,7 @@ public final class LoaderDispatcher { * @throws IOException when the content can not be fetched or no parser support it */ public Document loadDocumentAsStream(final DigestURL location, final CacheStrategy cachePolicy, - BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException { + final BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException { // load resource Request request = request(location, true, false); final StreamResponse streamResponse = this.openInputStream(request, cachePolicy, blacklistType, agent); @@ -611,6 +615,65 @@ public final class LoaderDispatcher { throw new IOException(e.getMessage()); } } + + /** + * Similar to the loadDocument method, but streaming the resource content + * when possible instead of fully loading it in memory.
    + * Also try to limit the parser processing with a maximum total number of + * detected links (anchors, image links, media links...) or a maximum + * amount of content bytes to parse.
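For orientation, a hedged sketch of how this entry point is intended to be called, mirroring what getpageinfo_p does in this patch; the URL and the limit values are illustrative, sb is assumed to be the running Switchboard instance, and exception handling is omitted:

final DigestURL url = new DigestURL("http://example.org/huge-sitemap.xml"); // hypothetical URL
final ClientIdentification.Agent agent =
        ClientIdentification.getAgent(ClientIdentification.yacyInternetCrawlerAgentName);
// Load the resource as a stream and parse at most 2 MB / 100 links of it.
final Document doc = sb.loader.loadDocumentAsLimitedStream(
        url, CacheStrategy.IFEXIST, BlacklistType.CRAWLER, agent,
        100,                 // maxLinks
        2L * 1024 * 1024);   // maxBytes
if (doc.isPartiallyParsed()) {
    // only part of the resource content was fetched and parsed
}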
    + * Limits apply only when the available parsers for the resource media type + * support parsing within limits (see + * {@link Parser#isParseWithLimitsSupported()}. When available parsers do + * not support parsing within limits, an exception is thrown when + * content size is beyond maxBytes. + * + * @param location + * URL of the resource to load + * @param cachePolicy + * cache policy strategy + * @param blacklistType + * blacklist to use + * @param agent + * user agent identifier + * @param maxLinks + * the maximum total number of links to parse and add to the + * result document + * @param maxBytes + * the maximum number of content bytes to process + * @return on parsed document or null when an error occurred while parsing + * @throws IOException + * when the content can not be fetched or no parser support it + */ + public Document loadDocumentAsLimitedStream(final DigestURL location, final CacheStrategy cachePolicy, + final BlacklistType blacklistType, final ClientIdentification.Agent agent, final int maxLinks, final long maxBytes) throws IOException { + // load resource + Request request = request(location, true, false); + final StreamResponse streamResponse = this.openInputStream(request, cachePolicy, blacklistType, agent, -1); + final Response response = streamResponse.getResponse(); + final DigestURL url = request.url(); + if (response == null) throw new IOException("no Response for url " + url); + + // if it is still not available, report an error + if (streamResponse.getContentStream() == null || response.getResponseHeader() == null) { + throw new IOException("no Content available for url " + url); + } + + // parse resource + try { + Document[] documents = streamResponse.parseWithLimits(maxLinks, maxBytes); + Document merged = Document.mergeDocuments(location, response.getMimeType(), documents); + + String x_robots_tag = response.getResponseHeader().getXRobotsTag(); + if (x_robots_tag.indexOf("noindex",0) >= 0) { + merged.setIndexingDenied(true); + } + + return merged; + } catch(final Parser.Failure e) { + throw new IOException(e.getMessage()); + } + } /** * load all links from a resource diff --git a/test/java/net/yacy/document/parser/GenericXMLParserTest.java b/test/java/net/yacy/document/parser/GenericXMLParserTest.java index d475b037b..b76e4c0fd 100644 --- a/test/java/net/yacy/document/parser/GenericXMLParserTest.java +++ b/test/java/net/yacy/document/parser/GenericXMLParserTest.java @@ -25,6 +25,7 @@ package net.yacy.document.parser; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertTrue; +import static org.junit.Assert.assertFalse; import java.io.ByteArrayInputStream; import java.io.File; @@ -358,5 +359,103 @@ public class GenericXMLParserTest { inStream.close(); } } + + /** + * Test URLs detection when applying limits. + * + * @throws Exception + * when an unexpected error occurred + */ + @Test + public void testParseWithLimits() throws Exception { + String xhtml = "" + + "" + + "" + "" + + "" + + "XHTML content URLs test" + "" + "" + "

    Here are some YaCy URLs: " + + "Home page : http://yacy.net - International Forum : " + + "http://forum.yacy.de " + + "and this is a mention to a relative URL : /document.html

    " + + "

    Here are YaCybug tracker and Wiki." + + "And this is a relative link to another sub document

    " + + "" + ""; + + /* Content within limits */ + InputStream inStream = new ByteArrayInputStream(xhtml.getBytes(StandardCharsets.UTF_8.name())); + final String contentTypeHeader = "text/xhtml"; + String charsetFromHttpHeader = HeaderFramework.getCharacterEncoding(contentTypeHeader); + DigestURL location = new DigestURL("http://localhost/testfile.xml"); + try { + Document[] documents = this.parser.parseWithLimits(location, contentTypeHeader, charsetFromHttpHeader, new VocabularyScraper(), 0, inStream, Integer.MAX_VALUE, Long.MAX_VALUE); + assertEquals(1, documents.length); + assertFalse(documents[0].isPartiallyParsed()); + + Collection detectedAnchors = documents[0].getAnchors(); + assertNotNull(detectedAnchors); + assertEquals(5, detectedAnchors.size()); + assertTrue(detectedAnchors.contains(new AnchorURL("http://www.w3.org/1999/xhtml"))); + assertTrue(detectedAnchors.contains(new AnchorURL("http://yacy.net"))); + assertTrue(detectedAnchors.contains(new AnchorURL("http://forum.yacy.de"))); + assertTrue(detectedAnchors.contains(new AnchorURL("http://mantis.tokeek.de"))); + assertTrue(detectedAnchors.contains(new AnchorURL("http://www.yacy-websearch.net/wiki/"))); + } finally { + inStream.close(); + } + + /* Links limit exceeded */ + inStream = new ByteArrayInputStream(xhtml.getBytes(StandardCharsets.UTF_8.name())); + try { + Document[] documents = this.parser.parseWithLimits(location, contentTypeHeader, charsetFromHttpHeader, + new VocabularyScraper(), 0, inStream, 2, Long.MAX_VALUE); + assertEquals(1, documents.length); + assertTrue(documents[0].isPartiallyParsed()); + + Collection detectedAnchors = documents[0].getAnchors(); + assertNotNull(detectedAnchors); + assertEquals(2, detectedAnchors.size()); + assertTrue(detectedAnchors.contains(new AnchorURL("http://www.w3.org/1999/xhtml"))); + assertTrue(detectedAnchors.contains(new AnchorURL("http://yacy.net"))); + } finally { + inStream.close(); + } + + /* Bytes limit exceeded */ + StringBuilder xhtmlBuilder = new StringBuilder("") + .append("") + .append("") + .append("") + .append("") + .append("XHTML content URLs test") + .append("") + .append("

    Here are some YaCy URLs: ") + .append("Home page : http://yacy.net - International Forum : ") + .append("http://forum.yacy.de ") + .append("and this is a mention to a relative URL : /document.html

    "); + + /* Add some filler text to reach a total size beyond SAX parser internal input stream buffers */ + while(xhtmlBuilder.length() < 1024 * 10) { + xhtmlBuilder.append("

    Some text to parse

    "); + } + + int firstBytes = xhtmlBuilder.toString().getBytes(StandardCharsets.UTF_8.name()).length; + xhtmlBuilder.append("

    Here are YaCybug tracker and Wiki.") + .append("And this is a relative link to another sub document

    ") + .append(""); + inStream = new ByteArrayInputStream(xhtmlBuilder.toString().getBytes(StandardCharsets.UTF_8.name())); + try { + Document[] documents = this.parser.parseWithLimits(location, contentTypeHeader, charsetFromHttpHeader, new VocabularyScraper(), 0, inStream, Integer.MAX_VALUE, firstBytes); + assertEquals(1, documents.length); + assertTrue(documents[0].isPartiallyParsed()); + + Collection detectedAnchors = documents[0].getAnchors(); + assertNotNull(detectedAnchors); + assertEquals(3, detectedAnchors.size()); + assertTrue(detectedAnchors.contains(new AnchorURL("http://www.w3.org/1999/xhtml"))); + assertTrue(detectedAnchors.contains(new AnchorURL("http://yacy.net"))); + assertTrue(detectedAnchors.contains(new AnchorURL("http://forum.yacy.de"))); + } finally { + inStream.close(); + } + } } diff --git a/test/java/net/yacy/document/parser/html/ContentScraperTest.java b/test/java/net/yacy/document/parser/html/ContentScraperTest.java index a37510816..021141fe3 100644 --- a/test/java/net/yacy/document/parser/html/ContentScraperTest.java +++ b/test/java/net/yacy/document/parser/html/ContentScraperTest.java @@ -267,6 +267,32 @@ public class ContentScraperTest { Assert.assertEquals(0, detectedURLs.size()); } + /** + * Test absolute URLs detection in plain text with maxURLs parameter + * @throws MalformedURLException should not happen + */ + @Test + public void testFindAbsoluteURLsMaxURLs() throws MalformedURLException { + final String text = "Some test URLS : http://yacy.net - http://forum.yacy.de - https://en.wikipedia.org"; + + /* No limit */ + ArrayList detectedURLs = new ArrayList<>(); + ContentScraper.findAbsoluteURLs(text, detectedURLs, null, Long.MAX_VALUE); + Assert.assertEquals(3, detectedURLs.size()); + + /* Test from zero limit, to limit value equals to the total number of URLs in text */ + for(int limit = 0; limit <=3; limit++) { + detectedURLs = new ArrayList<>(); + ContentScraper.findAbsoluteURLs(text, detectedURLs, null, limit); + Assert.assertEquals(limit, detectedURLs.size()); + } + + /* Limit greater than total number of URLs in text */ + detectedURLs = new ArrayList<>(); + ContentScraper.findAbsoluteURLs(text, detectedURLs, null, 4); + Assert.assertEquals(3, detectedURLs.size()); + } + /** * Test unpaired brackets cleaning */