Started support for partial parsing of large streamed resources.

This enables the getpageinfo_p API to return results in a reasonable amount
of time on resources in the multi-megabyte size range.
Support is added first in the generic XML parser; for other formats the
regular crawler limits apply as usual.
pull/127/head
luccioman 8 years ago
parent 2a87b08cea
commit bf55f1d6e5
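
For orientation, a minimal client-side sketch of the new parameters (the servlet path /api/getpageinfo_p.xml and the default port 8090 are assumptions following the usual YaCy API layout; only maxLinks and maxBytes are introduced by this commit):

// Illustrative only : ask getpageinfo_p for the title of a very large XML resource,
// loading and parsing at most 1 MB of content and reporting at most 1000 links.
String apiCall = "http://localhost:8090/api/getpageinfo_p.xml"
        + "?actions=title"
        + "&url=http://example.org/huge-sitemap.xml"
        + "&maxLinks=1000"
        + "&maxBytes=" + (1024L * 1024L);
// When parsing stopped at one of the limits, the servlet fills the hasMoreLinks
// property with "1" (see the getpageinfo_p hunk below).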

@@ -87,7 +87,8 @@ public class getpageinfo_p {
* </ul>
* </li>
* <li>agentName (optional) : the string identifying the agent used to fetch the resource. Example : "YaCy Internet (cautious)"</li>
* <li>maxLinks (optional) : the maximum number of links, sitemap URLs or icons to return</li>
* <li>maxLinks (optional integer value) : the maximum number of links, sitemap URLs or icons to return on 'title' action</li>
* <li>maxBytes (optional long integer value) : the maximum number of bytes to load and parse from the url on 'title' action</li>
* </ul>
* @param env
* server environment
@@ -139,7 +140,17 @@ public class getpageinfo_p {
net.yacy.document.Document scraper = null;
if (u != null) try {
ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
scraper = sb.loader.loadDocumentAsStream(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER, agent);
if(post.containsKey("maxBytes")) {
/* A maxBytes limit is specified : let's try to parse only the amount of bytes given */
final long maxBytes = post.getLong("maxBytes", sb.loader.protocolMaxFileSize(u));
scraper = sb.loader.loadDocumentAsLimitedStream(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER, agent, maxLinks, maxBytes);
} else {
/* No maxBytes limit : apply regular parsing with the default crawler limits.
* Any maxLinks limit will apply after loading and parsing the document. */
scraper = sb.loader.loadDocumentAsStream(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER, agent);
}
} catch (final IOException e) {
ConcurrentLog.logException(e);
// bad things are possible, i.e. that the Server responds with "403 Bad Behavior"
@@ -151,7 +162,7 @@ public class getpageinfo_p {
// put the icons that belong to the document
Set<DigestURL> iconURLs = scraper.getIcons().keySet();
int count = 0;
long count = 0;
for (DigestURL iconURL : iconURLs) {
if(count >= maxLinks) {
break;
@@ -199,7 +210,7 @@ public class getpageinfo_p {
count++;
}
prop.put("links", count);
prop.put("hasMoreLinks", (count >= maxLinks && urisIt.hasNext()) ? "1" : "0");
prop.put("hasMoreLinks", scraper.isPartiallyParsed() || (count >= maxLinks && urisIt.hasNext()) ? "1" : "0");
prop.putXML("sitelist", links.length() > 0 ? links.substring(1) : "");
prop.putXML("filter", filter.length() > 0 ? filter.substring(1) : ".*");
}

@@ -116,5 +116,55 @@ public class StreamResponse {
}
}
/**
* Parse and close the content stream and return the parsed documents when
* possible.<br>
* Try to limit the parser processing with a maximum total number of detected
* links (anchors, image links, media links...) or a maximum amount of
* content bytes to parse.<br>
* Limits apply only when the available parsers for the resource media type
* support parsing within limits (see
* {@link Parser#isParseWithLimitsSupported()}). When the available parsers do
* not support parsing within limits, an exception is thrown when the
* content size is beyond maxBytes.
*
* @param maxLinks
* the maximum total number of links to parse and add to the
* result documents
* @param maxBytes
* the maximum number of content bytes to process
* @return the parsed documents or null when an error occurred
* @throws Parser.Failure
* when no parser supports the content, or an error occurred while parsing
*/
public Document[] parseWithLimits(final int maxLinks, final long maxBytes) throws Parser.Failure {
final String supportError = TextParser.supports(this.response.url(),
this.response.getResponseHeader() == null ? null : this.response.getResponseHeader().getContentType());
if (supportError != null) {
throw new Parser.Failure("no parser support:" + supportError, this.response.url());
}
try {
final String mimeType = this.response.getResponseHeader() == null ? null
: this.response.getResponseHeader().getContentType();
final String charsetName = this.response.getResponseHeader() == null ? StandardCharsets.UTF_8.name()
: this.response.getResponseHeader().getCharacterEncoding();
return TextParser.parseWithLimits(this.response.url(), mimeType, charsetName,
this.response.getRequest().timezoneOffset(), this.response.size(), this.contentStream, maxLinks,
maxBytes);
} catch (final Exception e) {
return null;
} finally {
if (this.contentStream != null) {
try {
this.contentStream.close();
} catch (IOException ignored) {
log.warn("Could not close content stream on url " + this.response.url());
}
}
}
}
}

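A minimal caller-side sketch of the StreamResponse method added above (assuming streamResponse was obtained from the loader, as LoaderDispatcher.loadDocumentAsLimitedStream does further down):

// Parse at most 1000 links and 1 MB of content from the open response stream;
// parseWithLimits() closes the content stream itself and returns null on parsing errors.
final Document[] docs = streamResponse.parseWithLimits(1000, 1024L * 1024L);
if (docs != null && docs.length > 0 && docs[0].isPartiallyParsed()) {
    // the limits were reached : links or bytes beyond them were skipped
}
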
@@ -23,12 +23,14 @@
package net.yacy.document;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.util.ConcurrentLog;
public abstract class AbstractParser implements Parser {
@@ -98,5 +100,20 @@ public abstract class AbstractParser implements Parser {
if (t != null) c.add(t);
return c;
}
@Override
public Document[] parseWithLimits(DigestURL url, String mimeType, String charset, VocabularyScraper scraper,
int timezoneOffset, InputStream source, int maxLinks, long maxBytes)
throws Failure, InterruptedException, UnsupportedOperationException {
/* Please override on subclasses when implementation is possible */
throw new UnsupportedOperationException();
}
@Override
public boolean isParseWithLimitsSupported() {
/* Please override on subclasses when parseWithLimits is supported */
return false;
}
}

@@ -99,6 +99,9 @@ public class Document {
private final Map<String, Set<String>> generic_facets; // a map from vocabulary names to the set of tags for that vocabulary which apply for this document
private final Date lastModified; // creation or last modification date of the source document
private int crawldepth;
/** True when this document is the result of a partially parsed resource, for example due to resource content size exceeding a given limit */
private boolean partiallyParsed;
public Document(final DigestURL location, final String mimeType, final String charset,
final Parser parserObject,
@@ -152,6 +155,7 @@ public class Document {
this.lastModified = lastModified == null ? new Date() : lastModified;
this.crawldepth = 999; // unknown yet
this.scraperObject = null; // will be set by setScraperObject()
this.partiallyParsed = false;
}
/**
@@ -212,6 +216,20 @@ public class Document {
return this.generic_facets;
}
/**
* @return true when this document is the result of a partially parsed resource, for example due to resource content size exceeding a given limit
*/
public boolean isPartiallyParsed() {
return this.partiallyParsed;
}
/**
* @param partiallyParsed set to true to indicate that this document is the result of a partially parsed resource, for example due to resource content size exceeding a given limit
*/
public void setPartiallyParsed(final boolean partiallyParsed) {
this.partiallyParsed = partiallyParsed;
}
/**
* compute a set of languages that this document contains
* the language is not computed using a statistical analysis of the content, only from given metadata that came with the document

@@ -47,12 +47,13 @@ public interface Parser {
* parse an input stream
* @param url the url of the source
* @param mimeType the mime type of the source, if known
* @param charset the charset of the source, if known
* @param charset the charset name of the source, if known
* @param scraper an entity scraper to detect facets from text annotation context
* @param timezoneOffset the local time zone offset
* @param source a input stream
* @return a list of documents that result from parsing the source
* @throws Parser.Failure
* @throws InterruptedException
* @throws Parser.Failure when the parser processing failed
* @throws InterruptedException when the processing was interrupted before termination
*/
public Document[] parse(
DigestURL url,
@@ -62,7 +63,55 @@ public interface Parser {
int timezoneOffset,
InputStream source
) throws Parser.Failure, InterruptedException;
/**
* Parse an input stream, possibly terminating processing when a total of
* maxLinks URLs (anchors, image links, media links...) have been reached,
* or when maxBytes content bytes have been processed, thus potentially
* resulting in partially parsed documents (with
* {@link Document#isPartiallyParsed()} returning true). Some parser
* implementations will not support parsing within maxLinks or maxBytes
* limits : make sure to check this first by calling
* {@link #isParseWithLimitsSupported()}, or an UnsupportedOperationException
* could be thrown.
*
* @param url
* the URL of the source
* @param mimeType
* the mime type of the source, if known
* @param charset
* the charset name of the source, if known
* @param scraper
* an entity scraper to detect facets from text annotation
* context
* @param timezoneOffset
* the local time zone offset
* @param source
* an input stream
* @param maxLinks
* the maximum total number of links to parse and add to the
* result documents
* @param maxBytes
* the maximum number of content bytes to process
* @return a list of documents that result from parsing the source, with
* empty or null text.
* @throws Parser.Failure
* when the parser processing failed
* @throws InterruptedException
* when the processing was interrupted before termination
* @throws UnsupportedOperationException
* when the parser implementation doesn't support parsing within
* limits
*/
public Document[] parseWithLimits(DigestURL url, String mimeType, String charset, VocabularyScraper scraper,
int timezoneOffset, InputStream source, int maxLinks, long maxBytes)
throws Parser.Failure, InterruptedException, UnsupportedOperationException;
/**
* @return true when the parser implementation supports the
* parseWithLimits() operation.
*/
public boolean isParseWithLimitsSupported();
// methods to that shall make it possible to put Parser objects into a hashtable

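A sketch of the calling pattern the two new interface methods are designed for (parser, url, mimeType, charset, scraper, timezoneOffset, source and the limit values are assumed to be in scope; TextParser below follows exactly this pattern):

final Document[] docs;
if (parser.isParseWithLimitsSupported()) {
    // the implementation can stop by itself once maxLinks or maxBytes is reached
    docs = parser.parseWithLimits(url, mimeType, charset, scraper, timezoneOffset,
            source, maxLinks, maxBytes);
} else {
    // no support for limits : fall back to the regular parse() call
    docs = parser.parse(url, mimeType, charset, scraper, timezoneOffset, source);
}
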
@@ -34,6 +34,8 @@ import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import org.apache.commons.fileupload.util.LimitedInputStream;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
@@ -228,12 +230,12 @@ public final class TextParser {
}
assert !idioms.isEmpty() : "no parsers applied for url " + location.toNormalform(true);
Document[] docs = parseSource(location, mimeType, idioms, charset, scraper, timezoneOffset, depth, content);
Document[] docs = parseSource(location, mimeType, idioms, charset, scraper, timezoneOffset, depth, content, Integer.MAX_VALUE, Long.MAX_VALUE);
return docs;
}
public static Document[] parseSource(
private static Document[] parseSource(
final DigestURL location,
String mimeType,
final String charset,
@@ -241,7 +243,9 @@ public final class TextParser {
final int timezoneOffset,
final int depth,
final long contentLength,
final InputStream sourceStream
final InputStream sourceStream,
final int maxLinks,
final long maxBytes
) throws Parser.Failure {
if (AbstractParser.log.isFine()) AbstractParser.log.fine("Parsing '" + location + "' from stream");
mimeType = normalizeMimeType(mimeType);
@@ -283,22 +287,79 @@ public final class TextParser {
// then we use only one stream-oriented parser.
if (canStream || contentLength > Integer.MAX_VALUE || contentLength > MemoryControl.available()) {
// use a specific stream-oriented parser
return parseSource(location, mimeType, streamParser, charset, scraper, timezoneOffset, sourceStream);
return parseSource(location, mimeType, streamParser, charset, scraper, timezoneOffset, sourceStream, maxLinks, maxBytes);
}
// in case that we know more parsers we first transform the content into a byte[] and use that as base
// for a number of different parse attempts.
/* Content length may be known from headers : check it now */
if(contentLength >= 0 && contentLength > maxBytes) {
throw new Parser.Failure("Content size is over maximum size of " + maxBytes + "", location);
}
byte[] b = null;
try {
b = FileUtils.read(sourceStream, (int) contentLength);
/* Check content size now if contentLength was unknown */
if(contentLength < 0) {
if(b.length > maxBytes) {
throw new Parser.Failure("Content size is over maximum size of " + maxBytes + "", location);
}
}
} catch (final IOException e) {
throw new Parser.Failure(e.getMessage(), location);
}
Document[] docs = parseSource(location, mimeType, idioms, charset, scraper, timezoneOffset, depth, b);
Document[] docs = parseSource(location, mimeType, idioms, charset, scraper, timezoneOffset, depth, b, maxLinks, maxBytes);
return docs;
}
public static Document[] parseSource(final DigestURL location, String mimeType, final String charset,
final VocabularyScraper scraper, final int timezoneOffset, final int depth, final long contentLength,
final InputStream sourceStream) throws Parser.Failure {
return parseSource(location, mimeType, charset, scraper, timezoneOffset, depth, contentLength, sourceStream,
Integer.MAX_VALUE, Long.MAX_VALUE);
}
/**
* Try to limit the parser processing with a maximum total number of detected links (anchors, image links, media links...)
* or a maximum amount of content bytes to parse. Limits apply only when the available parsers for the resource media type support parsing within limits
* (see {@link Parser#isParseWithLimitsSupported()}). When the available parsers do
* not support parsing within limits, an exception is thrown when the
* content size is beyond maxBytes.
* @param location the URL of the source
* @param mimeType the mime type of the source, if known
* @param charset the charset name of the source, if known
* @param timezoneOffset the local time zone offset
* @param contentLength the length of the source, if known (else -1 should be used)
* @param sourceStream an input stream on the source content
* @param maxLinks the maximum total number of links to parse and add to the result documents
* @param maxBytes the maximum number of content bytes to process
* @return a list of documents that result from parsing the source, with empty or null text.
* @throws Parser.Failure when the parser processing failed
*/
public static Document[] parseWithLimits(final DigestURL location, String mimeType, final String charset,
final int timezoneOffset, final long contentLength, final InputStream sourceStream, int maxLinks,
long maxBytes) throws Parser.Failure{
return parseSource(location, mimeType, charset, new VocabularyScraper(), timezoneOffset, 0, contentLength,
sourceStream, maxLinks, maxBytes);
}
/**
*
* @param location the URL of the source
* @param mimeType the mime type of the source, if known
* @param parser a parser supporting the resource at location
* @param charset the charset name of the source, if known
* @param scraper a vocabulary scraper
* @param timezoneOffset the local time zone offset
* @param sourceStream an open input stream on the source
* @param maxLinks the maximum total number of links to parse and add to the result documents
* @param maxBytes the maximum number of content bytes to process
* @return a list of documents that result from parsing the source
* @throws Parser.Failure when the source could not be parsed
*/
private static Document[] parseSource(
final DigestURL location,
final String mimeType,
@@ -306,7 +367,9 @@ public final class TextParser {
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream sourceStream
final InputStream sourceStream,
final int maxLinks,
final long maxBytes
) throws Parser.Failure {
if (AbstractParser.log.isFine()) AbstractParser.log.fine("Parsing '" + location + "' from stream");
final String fileExt = MultiProtocolURL.getFileExtension(location.getFileName());
@@ -315,13 +378,41 @@ public final class TextParser {
if (AbstractParser.log.isFine()) AbstractParser.log.fine("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + fileExt + "'.");
try {
final Document[] docs = parser.parse(location, mimeType, documentCharset, scraper, timezoneOffset, sourceStream);
final Document[] docs;
if(parser.isParseWithLimitsSupported()) {
docs = parser.parseWithLimits(location, mimeType, documentCharset, scraper, timezoneOffset, sourceStream, maxLinks, maxBytes);
} else {
/* Parser does not support partial parsing within limits : let's control it here */
InputStream limitedSource = new LimitedInputStream(sourceStream, maxBytes) {
@Override
protected void raiseError(long pSizeMax, long pCount) throws IOException {
throw new IOException("Reached maximum bytes to parse : " + maxBytes);
}
};
docs = parser.parse(location, mimeType, documentCharset, scraper, timezoneOffset, limitedSource);
}
return docs;
} catch (final Exception e) {
throw new Parser.Failure("parser failed: " + parser.getName(), location);
}
}
/**
* @param location the URL of the source
* @param mimeType the mime type of the source, if known
* @param parsers a set of parsers supporting the resource at location
* @param charset the charset name of the source, if known
* @param scraper a vocabulary scraper
* @param timezoneOffset the local time zone offset
* @param depth the current crawling depth
* @param sourceArray the resource content bytes
* @param maxLinks the maximum total number of links to parse and add to the result documents
* @param maxBytes the maximum number of content bytes to process
* @return a list of documents that result from parsing the source
* @throws Parser.Failure when the source could not be parsed
*/
private static Document[] parseSource(
final DigestURL location,
final String mimeType,
@@ -330,7 +421,9 @@ public final class TextParser {
final VocabularyScraper scraper,
final int timezoneOffset,
final int depth,
final byte[] sourceArray
final byte[] sourceArray,
final int maxLinks,
final long maxBytes
) throws Parser.Failure {
final String fileExt = MultiProtocolURL.getFileExtension(location.getFileName());
if (AbstractParser.log.isFine()) AbstractParser.log.fine("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + fileExt + "' from byte[]");
@@ -351,7 +444,11 @@ public final class TextParser {
bis = new ByteArrayInputStream(sourceArray);
}
try {
docs = parser.parse(location, mimeType, documentCharset, scraper, timezoneOffset, bis);
if(parser.isParseWithLimitsSupported()) {
docs = parser.parseWithLimits(location, mimeType, documentCharset, scraper, timezoneOffset, bis, maxLinks, maxBytes);
} else {
docs = parser.parse(location, mimeType, documentCharset, scraper, timezoneOffset, bis);
}
} catch (final Parser.Failure e) {
failedParser.put(parser, e);
//log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e);

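The new public TextParser.parseWithLimits() entry point can also be called directly; a hedged sketch (the URL, stream and limit values are illustrative, and -1 marks an unknown content length so the size check happens while reading):

final Document[] docs = TextParser.parseWithLimits(
        new DigestURL("http://example.org/huge.xml"), // location
        "text/xml",                                   // mimeType, if known
        StandardCharsets.UTF_8.name(),                // charset name, if known
        0,                                            // timezoneOffset
        -1,                                           // contentLength unknown
        inputStream,                                  // an open InputStream, assumed in scope
        1000,                                         // maxLinks
        1024L * 1024L);                               // maxBytes
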
@@ -22,15 +22,20 @@
package net.yacy.document.parser;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import javax.naming.SizeLimitExceededException;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.apache.commons.fileupload.util.LimitedInputStream;
import org.apache.commons.io.input.XmlStreamReader;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
@@ -89,7 +94,7 @@ public class GenericXMLParser extends AbstractParser implements Parser {
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source)
throws Failure, InterruptedException {
throws Failure {
/* Limit the size of the in-memory buffer to at most 25% of the available memory :
* because some room is needed, and before being garbage collected the buffer will be converted to a String, then to a byte array.
@@ -128,17 +133,81 @@ public class GenericXMLParser extends AbstractParser implements Parser {
docs = new Document[] { new Document(location, mimeType, detectedCharset, this, null, null, null, null, "",
null, null, 0.0d, 0.0d, contentBytes, detectedURLs, null, null, false, new Date()) };
return docs;
} catch(Parser.Failure e) {
throw e;
} catch (final Exception e) {
if (e instanceof InterruptedException) {
throw (InterruptedException) e;
}
if (e instanceof Parser.Failure) {
throw (Parser.Failure) e;
}
throw new Parser.Failure("Unexpected error while parsing XML file. " + e.getMessage(), location);
}
}
@Override
public boolean isParseWithLimitsSupported() {
return true;
}
/**
* {@inheritDoc}
* @param maxBytes the maximum number of content bytes to process. Be careful with too small values :
* a Failure exception can be thrown when the maxBytes value is so small that the parser cannot even fill its buffers from the input stream and parse the document declaration.
*/
@Override
public Document[] parseWithLimits(DigestURL location, String mimeType, String charsetName, VocabularyScraper scraper,
int timezoneOffset, InputStream source, int maxLinks, long maxBytes)
throws Failure, InterruptedException, UnsupportedOperationException {
/* Limit the size of the in-memory buffer to at most 25% of the available memory :
* because some room is needed, and before being garbage collected the buffer will be converted to a String, then to a byte array.
* Eventual stricter limits should be handled by the caller (see for example crawler.[protocol].maxFileSize configuration setting). */
final long availableMemory = MemoryControl.available();
final long maxTextBytes = (long)(availableMemory * 0.25);
final int maxChars;
if((maxTextBytes / Character.BYTES) > Integer.MAX_VALUE) {
maxChars = Integer.MAX_VALUE;
} else {
maxChars = ((int)maxTextBytes) / Character.BYTES;
}
try (/* Automatically closed by this try-with-resources statement*/ CharBuffer writer = new CharBuffer(maxChars);){
final Set<AnchorURL> detectedURLs = new HashSet<>();
final GenericXMLContentHandler saxHandler = new GenericXMLContentHandler(writer, detectedURLs, maxLinks);
InputStream limitedSource = new LimitedInputStream(source, maxBytes) {
@Override
protected void raiseError(long pSizeMax, long pCount) throws IOException {
throw new IOException(new SizeLimitExceededException("Reached maximum bytes to parse : " + maxBytes));
}
};
/* Use commons-io XmlStreamReader advanced rules to help with charset detection when the source contains no BOM or XML declaration
* (the detection algorithm notably also includes the Content-Type transmitted by HTTP headers, here possibly present as the mimeType and charset parameters). */
final XmlStreamReader reader = new XmlStreamReader(limitedSource, mimeType, true, charsetName);
final InputSource saxSource = new InputSource(reader);
final String detectedCharset = reader.getEncoding();
final SAXParser saxParser = getParser();
boolean limitExceeded = false;
try {
saxParser.parse(saxSource, saxHandler);
} catch(SAXException | IOException e) {
if(!(e.getCause() instanceof SizeLimitExceededException)) {
/* Only transmit to upper layer exceptions that are not caused by the maxLinks or maxBytes limits being reached */
throw e;
}
limitExceeded = true;
}
/* create the parsed document with empty text content */
Document[] docs = new Document[] { new Document(location, mimeType, detectedCharset, this, null, null, null, null, "",
null, null, 0.0d, 0.0d, new byte[0], detectedURLs, null, null, false, new Date()) };
docs[0].setPartiallyParsed(limitExceeded);
return docs;
} catch (final Exception e) {
throw new Parser.Failure("Unexpected error while parsing XML file. " + e.getMessage(), location);
}
}
}

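A short usage sketch of the limited XML parsing added above (values are illustrative and the no-argument parser constructor is assumed, as in the unit tests; per the javadoc warning, maxBytes must at least cover the SAX parser's initial buffering and the XML declaration, so a generous value is used here):

final GenericXMLParser parser = new GenericXMLParser();
final Document[] docs = parser.parseWithLimits(
        new DigestURL("http://example.org/feed.xml"), "text/xml", "UTF-8",
        new VocabularyScraper(), 0, xmlStream, 1000, 10L * 1024L * 1024L);
// true when the links or bytes limit was hit and the document is therefore incomplete
final boolean truncated = docs[0].isPartiallyParsed();
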
@@ -375,14 +375,16 @@ public class ContentScraper extends AbstractScraper implements Scraper {
private final static Pattern WHITESPACE_PATTERN = Pattern.compile("\\s");
/**
* Try to detect and parse absolute URLs in text, then update the urls collection and fire anchorAdded event on listeners. Any parameter are can be null.
* Try to detect and parse absolute URLs in text (at most maxURLs), then update the urls collection and fire the anchorAdded event on listeners. Any parameter can be null.
* @param text the text to parse
* @param urls a mutable collection of URLs to fill.
* @param listeners a collection of listeners to trigger.
* @param maxURLs maximum URLs number to add to the urls collection. Be careful with urls collection capacity when this collection is not null and maxURLs value is beyond Integer.MAX_VALUE.
* @return the number of well formed URLs detected
*/
public static void findAbsoluteURLs(final String text, final Collection<AnchorURL> urls, final Collection<ContentScraperListener> listeners) {
public static long findAbsoluteURLs(final String text, final Collection<AnchorURL> urls, final Collection<ContentScraperListener> listeners, final long maxURLs) {
if(text == null) {
return;
return 0;
}
int schemePosition, offset = 0;
boolean hasWhiteSpace;
@@ -391,8 +393,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
final Matcher urlSchemeMatcher = protp.matcher(text);
final Matcher whiteSpaceMatcher = WHITESPACE_PATTERN.matcher(text);
while (offset < text.length()) {
long detectedURLsCount = 0;
while (offset < text.length() && detectedURLsCount < maxURLs) {
if(!urlSchemeMatcher.find(offset)) {
break;
}
@@ -413,6 +415,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
offset = schemePosition + urlString.length();
try {
url = new AnchorURL(urlString);
detectedURLsCount++;
if(urls != null) {
urls.add(url);
}
@@ -423,6 +426,17 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
} catch (final MalformedURLException ignored) {}
}
return detectedURLsCount;
}
/**
* Try to detect and parse absolute URLs in text, then update the urls collection and fire anchorAdded event on listeners. Any parameter can be null.
* @param text the text to parse
* @param urls a mutable collection of URLs to fill.
* @param listeners a collection of listeners to trigger.
*/
public static void findAbsoluteURLs(final String text, final Collection<AnchorURL> urls, final Collection<ContentScraperListener> listeners) {
findAbsoluteURLs(text, urls, listeners, Long.MAX_VALUE);
}
/**

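A compact sketch of the new counting variant of findAbsoluteURLs (the expected behaviour mirrors the ContentScraperTest case added at the end of this commit):

final List<AnchorURL> urls = new ArrayList<>();
// stop after the first two well-formed absolute URLs found in the text
final long count = ContentScraper.findAbsoluteURLs(
        "See http://yacy.net and http://forum.yacy.de and https://en.wikipedia.org",
        urls, null, 2);
// count is expected to be 2 and urls to hold the first two detected URLs
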
@@ -26,6 +26,8 @@ import java.io.IOException;
import java.io.Writer;
import java.util.Collection;
import javax.naming.SizeLimitExceededException;
import org.apache.commons.io.input.ClosedInputStream;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
@@ -51,6 +53,12 @@ public class GenericXMLContentHandler extends DefaultHandler {
/** Detected URLs */
private final Collection<AnchorURL> urls;
/** Maximum number of URLs to parse */
private final int maxURLs;
/** Number of parsed URLs in the document */
private long detectedURLs;
/** Text of the currently parsed element. May not contain the whole text when the element has nested elements embedded in its own text */
private StringBuilder currentElementText;
@@ -62,7 +70,7 @@ public class GenericXMLContentHandler extends DefaultHandler {
/** Set to false until some text is detected in at least one element of the document */
private boolean documentHasText;
/**
* @param out
* the output writer to write extracted text. Must not be null.
@@ -71,6 +79,18 @@ public class GenericXMLContentHandler extends DefaultHandler {
* when out is null
*/
public GenericXMLContentHandler(final Writer out, final Collection<AnchorURL> urls) throws IllegalArgumentException {
this(out, urls, Integer.MAX_VALUE);
}
/**
* @param out
* the output writer to write extracted text. Must not be null.
* @param urls the mutable collection of URLs to fill with detected URLs
* @param maxURLs the maximum number of urls to parse
* @throws IllegalArgumentException
* when out is null
*/
public GenericXMLContentHandler(final Writer out, final Collection<AnchorURL> urls, final int maxURLs) throws IllegalArgumentException {
if (out == null) {
throw new IllegalArgumentException("out writer must not be null");
}
@@ -79,6 +99,8 @@ public class GenericXMLContentHandler extends DefaultHandler {
}
this.out = out;
this.urls = urls;
this.maxURLs = maxURLs;
this.detectedURLs = 0;
}
/**
@@ -96,10 +118,12 @@ public class GenericXMLContentHandler extends DefaultHandler {
this.lastAppendedIsSpace = false;
this.currentElementTextChunks = 0;
this.documentHasText = false;
this.detectedURLs = 0;
}
/**
* Try to detect URLs possibly contained in attributes
* @throws SAXException when the maximum number of URLs to parse has been reached
*/
@Override
public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
@@ -109,19 +133,25 @@ public class GenericXMLContentHandler extends DefaultHandler {
if (attributes != null) {
for (int i = 0; i < attributes.getLength(); i++) {
String attribute = attributes.getValue(i);
ContentScraper.findAbsoluteURLs(attribute, this.urls, null);
this.detectedURLs += ContentScraper.findAbsoluteURLs(attribute, this.urls, null, this.maxURLs - this.detectedURLs);
if (this.detectedURLs >= this.maxURLs) {
throw new SAXException(
new SizeLimitExceededException("Reached maximum URLs to parse : " + this.maxURLs));
}
}
}
}
/**
* Write characters to the output writer
* @throws SAXException when the calling parser reached the maximum bytes limit on the input source
*/
@Override
public void characters(final char ch[], final int start, final int length) {
public void characters(final char ch[], final int start, final int length) throws SAXException {
try {
if(this.currentElementTextChunks == 0 && this.documentHasText) {
/* We are on the first text chunk of the element, or the first text chunk after processing nested elements :
/* We are on the first text chunk of the element (though not on the first text chunk of the whole document),
* or on the first text chunk after processing nested elements :
* if necessary we add a space to separate text content of different elements */
if(length > 0 && !this.lastAppendedIsSpace && !Character.isWhitespace(ch[0])) {
this.out.write(" ");
@@ -137,8 +167,8 @@ public class GenericXMLContentHandler extends DefaultHandler {
this.documentHasText = true;
this.lastAppendedIsSpace = Character.isWhitespace(ch[length - 1]);
}
} catch (final IOException e) {
ConcurrentLog.logException(e);
} catch (final IOException ignored) {
ConcurrentLog.logException(ignored);
}
}
@@ -148,7 +178,10 @@ public class GenericXMLContentHandler extends DefaultHandler {
*/
@Override
public void endElement(String uri, String localName, String qName) throws SAXException {
ContentScraper.findAbsoluteURLs(this.currentElementText.toString(), urls, null);
this.detectedURLs += ContentScraper.findAbsoluteURLs(this.currentElementText.toString(), this.urls, null, this.maxURLs - this.detectedURLs);
if (this.detectedURLs >= this.maxURLs) {
throw new SAXException(new SizeLimitExceededException("Reached maximum URLs to parse : " + this.maxURLs));
}
this.currentElementText.setLength(0);
this.currentElementTextChunks = 0;
}
@@ -158,5 +191,5 @@ public class GenericXMLContentHandler extends DefaultHandler {
/* Release the StringBuilder now useless */
this.currentElementText = null;
}
}

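A sketch of how a handler with a URL limit is meant to be wired into a SAX parse, mirroring GenericXMLParser.parseWithLimits above (writer, reader and saxParser are assumed to be set up as in that method):

final Set<AnchorURL> detectedURLs = new HashSet<>();
final GenericXMLContentHandler handler = new GenericXMLContentHandler(writer, detectedURLs, 100);
try {
    saxParser.parse(new InputSource(reader), handler);
} catch (final SAXException | IOException e) {
    if (e.getCause() instanceof SizeLimitExceededException) {
        // the maxURLs limit (here 100) was reached : detectedURLs holds the first matches
    } else {
        throw e;
    }
}
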
@@ -377,6 +377,7 @@ public final class FileUtils {
* Read the specified amount of bytes from a source stream.
* Important : it is the responsibility of the caller to close the stream.
* @param source InputStream instance. Must not be null
* @param count maximum amount of bytes to read. A negative value means no limit.
* @return source content as a byte array.
* @throws IOException when a read/write error occurred
* @throws NullPointerException when source parameter is null

@@ -439,7 +439,11 @@ public final class LoaderDispatcher {
}
}
private int protocolMaxFileSize(final DigestURL url) {
/**
* @param url the URL of a resource to load
* @return the maximum file size the crawler is configured to load for the protocol of the URL
*/
public int protocolMaxFileSize(final DigestURL url) {
if (url.isHTTP() || url.isHTTPS())
return this.sb.getConfigInt("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
if (url.isFTP())
@@ -583,7 +587,7 @@ public final class LoaderDispatcher {
* @throws IOException when the content can not be fetched or no parser support it
*/
public Document loadDocumentAsStream(final DigestURL location, final CacheStrategy cachePolicy,
BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
final BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
// load resource
Request request = request(location, true, false);
final StreamResponse streamResponse = this.openInputStream(request, cachePolicy, blacklistType, agent);
@@ -611,6 +615,65 @@ public final class LoaderDispatcher {
throw new IOException(e.getMessage());
}
}
/**
* Similar to the loadDocument method, but streaming the resource content
* when possible instead of fully loading it in memory.<br>
* Also try to limit the parser processing with a maximum total number of
* detected links (anchors, image links, media links...) or a maximum
* amount of content bytes to parse.<br>
* Limits apply only when the available parsers for the resource media type
* support parsing within limits (see
* {@link Parser#isParseWithLimitsSupported()}). When the available parsers do
* not support parsing within limits, an exception is thrown when the
* content size is beyond maxBytes.
*
* @param location
* URL of the resource to load
* @param cachePolicy
* cache policy strategy
* @param blacklistType
* blacklist to use
* @param agent
* user agent identifier
* @param maxLinks
* the maximum total number of links to parse and add to the
* result document
* @param maxBytes
* the maximum number of content bytes to process
* @return the parsed document or null when an error occurred while parsing
* @throws IOException
* when the content cannot be fetched or no parser supports it
*/
public Document loadDocumentAsLimitedStream(final DigestURL location, final CacheStrategy cachePolicy,
final BlacklistType blacklistType, final ClientIdentification.Agent agent, final int maxLinks, final long maxBytes) throws IOException {
// load resource
Request request = request(location, true, false);
final StreamResponse streamResponse = this.openInputStream(request, cachePolicy, blacklistType, agent, -1);
final Response response = streamResponse.getResponse();
final DigestURL url = request.url();
if (response == null) throw new IOException("no Response for url " + url);
// if it is still not available, report an error
if (streamResponse.getContentStream() == null || response.getResponseHeader() == null) {
throw new IOException("no Content available for url " + url);
}
// parse resource
try {
Document[] documents = streamResponse.parseWithLimits(maxLinks, maxBytes);
Document merged = Document.mergeDocuments(location, response.getMimeType(), documents);
String x_robots_tag = response.getResponseHeader().getXRobotsTag();
if (x_robots_tag.indexOf("noindex",0) >= 0) {
merged.setIndexingDenied(true);
}
return merged;
} catch(final Parser.Failure e) {
throw new IOException(e.getMessage());
}
}
/**
* load all links from a resource

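A hedged sketch of calling the new loader method from servlet code, mirroring the getpageinfo_p change above (sb is assumed to be the Switchboard, as in that servlet; the fallback to protocolMaxFileSize() is the same one getpageinfo_p applies when no maxBytes parameter is given):

final ClientIdentification.Agent agent = ClientIdentification
        .getAgent(ClientIdentification.yacyInternetCrawlerAgentName);
// cap parsing at the crawler's per-protocol file size limit and at 1000 links
final long maxBytes = sb.loader.protocolMaxFileSize(url);
final Document doc = sb.loader.loadDocumentAsLimitedStream(url, CacheStrategy.IFEXIST,
        BlacklistType.CRAWLER, agent, 1000, maxBytes);
if (doc != null && doc.isPartiallyParsed()) {
    // only the beginning of the resource was parsed
}
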
@@ -25,6 +25,7 @@ package net.yacy.document.parser;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.assertFalse;
import java.io.ByteArrayInputStream;
import java.io.File;
@@ -358,5 +359,103 @@ public class GenericXMLParserTest {
inStream.close();
}
}
/**
* Test URLs detection when applying limits.
*
* @throws Exception
* when an unexpected error occurred
*/
@Test
public void testParseWithLimits() throws Exception {
String xhtml = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>"
+ "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">"
+ "<html xmlns=\"http://www.w3.org/1999/xhtml\">" + "<head>"
+ "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />"
+ "<title>XHTML content URLs test</title>" + "</head>" + "<body>" + "<p>Here are some YaCy URLs: "
+ "Home page : http://yacy.net - International Forum : "
+ "http://forum.yacy.de "
+ "and this is a mention to a relative URL : /document.html</p>"
+ "<p>Here are YaCy<a href=\"http://mantis.tokeek.de\">bug tracker</a> and <a href=\"http://www.yacy-websearch.net/wiki/\">Wiki</a>."
+ "And this is a relative link to another <a href=\"/document2.html\">sub document</a></p>"
+ "</body>" + "</html>";
/* Content within limits */
InputStream inStream = new ByteArrayInputStream(xhtml.getBytes(StandardCharsets.UTF_8.name()));
final String contentTypeHeader = "text/xhtml";
String charsetFromHttpHeader = HeaderFramework.getCharacterEncoding(contentTypeHeader);
DigestURL location = new DigestURL("http://localhost/testfile.xml");
try {
Document[] documents = this.parser.parseWithLimits(location, contentTypeHeader, charsetFromHttpHeader, new VocabularyScraper(), 0, inStream, Integer.MAX_VALUE, Long.MAX_VALUE);
assertEquals(1, documents.length);
assertFalse(documents[0].isPartiallyParsed());
Collection<AnchorURL> detectedAnchors = documents[0].getAnchors();
assertNotNull(detectedAnchors);
assertEquals(5, detectedAnchors.size());
assertTrue(detectedAnchors.contains(new AnchorURL("http://www.w3.org/1999/xhtml")));
assertTrue(detectedAnchors.contains(new AnchorURL("http://yacy.net")));
assertTrue(detectedAnchors.contains(new AnchorURL("http://forum.yacy.de")));
assertTrue(detectedAnchors.contains(new AnchorURL("http://mantis.tokeek.de")));
assertTrue(detectedAnchors.contains(new AnchorURL("http://www.yacy-websearch.net/wiki/")));
} finally {
inStream.close();
}
/* Links limit exceeded */
inStream = new ByteArrayInputStream(xhtml.getBytes(StandardCharsets.UTF_8.name()));
try {
Document[] documents = this.parser.parseWithLimits(location, contentTypeHeader, charsetFromHttpHeader,
new VocabularyScraper(), 0, inStream, 2, Long.MAX_VALUE);
assertEquals(1, documents.length);
assertTrue(documents[0].isPartiallyParsed());
Collection<AnchorURL> detectedAnchors = documents[0].getAnchors();
assertNotNull(detectedAnchors);
assertEquals(2, detectedAnchors.size());
assertTrue(detectedAnchors.contains(new AnchorURL("http://www.w3.org/1999/xhtml")));
assertTrue(detectedAnchors.contains(new AnchorURL("http://yacy.net")));
} finally {
inStream.close();
}
/* Bytes limit exceeded */
StringBuilder xhtmlBuilder = new StringBuilder("<?xml version=\"1.0\" encoding=\"UTF-8\" ?>")
.append("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">")
.append("<html xmlns=\"http://www.w3.org/1999/xhtml\">")
.append("<head>")
.append("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />")
.append("<title>XHTML content URLs test</title>")
.append("</head>")
.append("<body><p>Here are some YaCy URLs: ")
.append("Home page : http://yacy.net - International Forum : ")
.append("http://forum.yacy.de ")
.append("and this is a mention to a relative URL : /document.html</p>");
/* Add some filler text to reach a total size beyond SAX parser internal input stream buffers */
while(xhtmlBuilder.length() < 1024 * 10) {
xhtmlBuilder.append("<p>Some text to parse</p>");
}
int firstBytes = xhtmlBuilder.toString().getBytes(StandardCharsets.UTF_8.name()).length;
xhtmlBuilder.append("<p>Here are YaCy<a href=\"http://mantis.tokeek.de\">bug tracker</a> and <a href=\"http://www.yacy-websearch.net/wiki/\">Wiki</a>.")
.append("And this is a relative link to another <a href=\"/document2.html\">sub document</a></p>")
.append("</body></html>");
inStream = new ByteArrayInputStream(xhtmlBuilder.toString().getBytes(StandardCharsets.UTF_8.name()));
try {
Document[] documents = this.parser.parseWithLimits(location, contentTypeHeader, charsetFromHttpHeader, new VocabularyScraper(), 0, inStream, Integer.MAX_VALUE, firstBytes);
assertEquals(1, documents.length);
assertTrue(documents[0].isPartiallyParsed());
Collection<AnchorURL> detectedAnchors = documents[0].getAnchors();
assertNotNull(detectedAnchors);
assertEquals(3, detectedAnchors.size());
assertTrue(detectedAnchors.contains(new AnchorURL("http://www.w3.org/1999/xhtml")));
assertTrue(detectedAnchors.contains(new AnchorURL("http://yacy.net")));
assertTrue(detectedAnchors.contains(new AnchorURL("http://forum.yacy.de")));
} finally {
inStream.close();
}
}
}

@@ -267,6 +267,32 @@ public class ContentScraperTest {
Assert.assertEquals(0, detectedURLs.size());
}
/**
* Test absolute URLs detection in plain text with maxURLs parameter
* @throws MalformedURLException should not happen
*/
@Test
public void testFindAbsoluteURLsMaxURLs() throws MalformedURLException {
final String text = "Some test URLS : http://yacy.net - http://forum.yacy.de - https://en.wikipedia.org";
/* No limit */
ArrayList<AnchorURL> detectedURLs = new ArrayList<>();
ContentScraper.findAbsoluteURLs(text, detectedURLs, null, Long.MAX_VALUE);
Assert.assertEquals(3, detectedURLs.size());
/* Test from a zero limit up to a limit equal to the total number of URLs in the text */
for(int limit = 0; limit <=3; limit++) {
detectedURLs = new ArrayList<>();
ContentScraper.findAbsoluteURLs(text, detectedURLs, null, limit);
Assert.assertEquals(limit, detectedURLs.size());
}
/* Limit greater than total number of URLs in text */
detectedURLs = new ArrayList<>();
ContentScraper.findAbsoluteURLs(text, detectedURLs, null, 4);
Assert.assertEquals(3, detectedURLs.size());
}
/**
* Test unpaired brackets cleaning
*/
