Started support for partial parsing of large streamed resources.

This enables the getpageinfo_p API to return results in a reasonable amount
of time on resources in the multi-megabyte size range.
Support is added first in the generic XML parser; for other formats the
regular crawler limits apply as usual.
pull/127/head
luccioman 8 years ago
parent 2a87b08cea
commit bf55f1d6e5
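
For orientation, a minimal client-side sketch of the new parameters (the servlet path /api/getpageinfo_p.xml and the default port 8090 are assumptions following the usual YaCy API layout; only maxLinks and maxBytes are introduced by this commit):

// Illustrative only : ask getpageinfo_p for the title of a very large XML resource,
// loading and parsing at most 1 MB of content and reporting at most 1000 links.
String apiCall = "http://localhost:8090/api/getpageinfo_p.xml"
        + "?actions=title"
        + "&url=http://example.org/huge-sitemap.xml"
        + "&maxLinks=1000"
        + "&maxBytes=" + (1024L * 1024L);
// When parsing stopped at one of the limits, the servlet fills the hasMoreLinks
// property with "1" (see the getpageinfo_p hunk below).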

@@ -87,7 +87,8 @@ public class getpageinfo_p {
* </ul>
* </li>
* <li>agentName (optional) : the string identifying the agent used to fetch the resource. Example : "YaCy Internet (cautious)"</li>
* <li>maxLinks (optional) : the maximum number of links, sitemap URLs or icons to return</li>
* <li>maxLinks (optional integer value) : the maximum number of links, sitemap URLs or icons to return on 'title' action</li>
* <li>maxBytes (optional long integer value) : the maximum number of bytes to load and parse from the url on 'title' action</li>
* </ul>
* @param env
* server environment
@@ -139,7 +140,17 @@ public class getpageinfo_p {
net.yacy.document.Document scraper = null;
if (u != null) try {
ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
scraper = sb.loader.loadDocumentAsStream(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER, agent);
if(post.containsKey("maxBytes")) {
/* A maxBytes limit is specified : let's try to parse only the amount of bytes given */
final long maxBytes = post.getLong("maxBytes", sb.loader.protocolMaxFileSize(u));
scraper = sb.loader.loadDocumentAsLimitedStream(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER, agent, maxLinks, maxBytes);
} else {
/* No maxBytes limit : apply regular parsing with the default crawler limits.
* Any maxLinks limit will apply after loading and parsing the document. */
scraper = sb.loader.loadDocumentAsStream(u, CacheStrategy.IFEXIST, BlacklistType.CRAWLER, agent);
}
} catch (final IOException e) {
ConcurrentLog.logException(e);
// bad things are possible, i.e. that the Server responds with "403 Bad Behavior"
@@ -151,7 +162,7 @@ public class getpageinfo_p {
// put the icons that belong to the document
Set<DigestURL> iconURLs = scraper.getIcons().keySet();
int count = 0;
long count = 0;
for (DigestURL iconURL : iconURLs) {
if(count >= maxLinks) {
break;
@@ -199,7 +210,7 @@ public class getpageinfo_p {
count++;
}
prop.put("links", count);
prop.put("hasMoreLinks", (count >= maxLinks && urisIt.hasNext()) ? "1" : "0");
prop.put("hasMoreLinks", scraper.isPartiallyParsed() || (count >= maxLinks && urisIt.hasNext()) ? "1" : "0");
prop.putXML("sitelist", links.length() > 0 ? links.substring(1) : "");
prop.putXML("filter", filter.length() > 0 ? filter.substring(1) : ".*");
}

@@ -116,5 +116,55 @@ public class StreamResponse {
}
}
/**
* Parse and close the content stream and return the parsed documents when
* possible.<br>
* Try to limit the parser processing with a maximum total number of detected
* links (anchors, image links, media links...) or a maximum amount of
* content bytes to parse.<br>
* Limits apply only when the available parsers for the resource media type
* support parsing within limits (see
* {@link Parser#isParseWithLimitsSupported()}). When the available parsers do
* not support parsing within limits, an exception is thrown when the
* content size is beyond maxBytes.
*
* @param maxLinks
* the maximum total number of links to parse and add to the
* result documents
* @param maxBytes
* the maximum number of content bytes to process
* @return the parsed documents or null when an error occurred
* @throws Parser.Failure
* when no parser supports the content, or an error occurred while parsing
*/
public Document[] parseWithLimits(final int maxLinks, final long maxBytes) throws Parser.Failure {
final String supportError = TextParser.supports(this.response.url(),
this.response.getResponseHeader() == null ? null : this.response.getResponseHeader().getContentType());
if (supportError != null) {
throw new Parser.Failure("no parser support:" + supportError, this.response.url());
}
try {
final String mimeType = this.response.getResponseHeader() == null ? null
: this.response.getResponseHeader().getContentType();
final String charsetName = this.response.getResponseHeader() == null ? StandardCharsets.UTF_8.name()
: this.response.getResponseHeader().getCharacterEncoding();
return TextParser.parseWithLimits(this.response.url(), mimeType, charsetName,
this.response.getRequest().timezoneOffset(), this.response.size(), this.contentStream, maxLinks,
maxBytes);
} catch (final Exception e) {
return null;
} finally {
if (this.contentStream != null) {
try {
this.contentStream.close();
} catch (IOException ignored) {
log.warn("Could not close content stream on url " + this.response.url());
}
}
}
}
}

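A minimal caller-side sketch of the StreamResponse method added above (assuming streamResponse was obtained from the loader, as LoaderDispatcher.loadDocumentAsLimitedStream does further down):

// Parse at most 1000 links and 1 MB of content from the open response stream;
// parseWithLimits() closes the content stream itself and returns null on parsing errors.
final Document[] docs = streamResponse.parseWithLimits(1000, 1024L * 1024L);
if (docs != null && docs.length > 0 && docs[0].isPartiallyParsed()) {
    // the limits were reached : links or bytes beyond them were skipped
}
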
@@ -23,12 +23,14 @@
package net.yacy.document;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.util.ConcurrentLog;
public abstract class AbstractParser implements Parser {
@@ -98,5 +100,20 @@ public abstract class AbstractParser implements Parser {
if (t != null) c.add(t);
return c;
}
@Override
public Document[] parseWithLimits(DigestURL url, String mimeType, String charset, VocabularyScraper scraper,
int timezoneOffset, InputStream source, int maxLinks, long maxBytes)
throws Failure, InterruptedException, UnsupportedOperationException {
/* Please override on subclasses when implementation is possible */
throw new UnsupportedOperationException();
}
@Override
public boolean isParseWithLimitsSupported() {
/* Please override on subclasses when parseWithLimits is supported */
return false;
}
}

@@ -99,6 +99,9 @@ public class Document {
private final Map<String, Set<String>> generic_facets; // a map from vocabulary names to the set of tags for that vocabulary which apply for this document
private final Date lastModified; // creation or last modification date of the source document
private int crawldepth;
/** True when this document is the result of a partially parsed resource, for example due to resource content size exceeding a given limit */
private boolean partiallyParsed;
public Document(final DigestURL location, final String mimeType, final String charset,
final Parser parserObject,
@@ -152,6 +155,7 @@ public class Document {
this.lastModified = lastModified == null ? new Date() : lastModified;
this.crawldepth = 999; // unknown yet
this.scraperObject = null; // will be set by setScraperObject()
this.partiallyParsed = false;
}
/**
@@ -212,6 +216,20 @@ public class Document {
return this.generic_facets;
}
/**
* @return true when this document is the result of a partially parsed resource, for example due to resource content size exceeding a given limit
*/
public boolean isPartiallyParsed() {
return this.partiallyParsed;
}
/**
* @param partiallyParsed set to true to indicate that this document is the result of a partially parsed resource, for example due to resource content size exceeding a given limit
*/
public void setPartiallyParsed(final boolean partiallyParsed) {
this.partiallyParsed = partiallyParsed;
}
/**
* compute a set of languages that this document contains
* the language is not computed using a statistical analysis of the content, only from given metadata that came with the document

@@ -47,12 +47,13 @@ public interface Parser {
* parse an input stream
* @param url the url of the source
* @param mimeType the mime type of the source, if known
* @param charset the charset of the source, if known
* @param charset the charset name of the source, if known
* @param scraper an entity scraper to detect facets from text annotation context
* @param timezoneOffset the local time zone offset
* @param source a input stream
* @return a list of documents that result from parsing the source
* @throws Parser.Failure
* @throws InterruptedException
* @throws Parser.Failure when the parser processing failed
* @throws InterruptedException when the processing was interrupted before termination
*/
public Document[] parse(
DigestURL url,
@@ -62,7 +63,55 @@ public interface Parser {
int timezoneOffset,
InputStream source
) throws Parser.Failure, InterruptedException;
/**
* Parse an input stream, possibly terminating processing when a total of
* maxLinks URLs (anchors, image links, media links...) have been reached,
* or when maxBytes content bytes have been processed, thus potentially
* resulting in partially parsed documents (with
* {@link Document#isPartiallyParsed()} returning true). Some parser
* implementations will not support parsing within maxLinks or maxBytes
* limits : make sure to check this first by calling
* {@link #isParseWithLimitsSupported()}, or an UnsupportedOperationException
* could be thrown.
*
* @param url
* the URL of the source
* @param mimeType
* the mime type of the source, if known
* @param charset
* the charset name of the source, if known
* @param scraper
* an entity scraper to detect facets from text annotation
* context
* @param timezoneOffset
* the local time zone offset
* @param source
* an input stream
* @param maxLinks
* the maximum total number of links to parse and add to the
* result documents
* @param maxBytes
* the maximum number of content bytes to process
* @return a list of documents that result from parsing the source, with
* empty or null text.
* @throws Parser.Failure
* when the parser processing failed
* @throws InterruptedException
* when the processing was interrupted before termination
* @throws UnsupportedOperationException
* when the parser implementation doesn't support parsing within
* limits
*/
public Document[] parseWithLimits(DigestURL url, String mimeType, String charset, VocabularyScraper scraper,
int timezoneOffset, InputStream source, int maxLinks, long maxBytes)
throws Parser.Failure, InterruptedException, UnsupportedOperationException;
/**
* @return true when the parser implementation supports the
* parseWithLimits() operation.
*/
public boolean isParseWithLimitsSupported();
// methods to that shall make it possible to put Parser objects into a hashtable

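A sketch of the calling pattern the two new interface methods are designed for (parser, url, mimeType, charset, scraper, timezoneOffset, source and the limit values are assumed to be in scope; TextParser below follows exactly this pattern):

final Document[] docs;
if (parser.isParseWithLimitsSupported()) {
    // the implementation can stop by itself once maxLinks or maxBytes is reached
    docs = parser.parseWithLimits(url, mimeType, charset, scraper, timezoneOffset,
            source, maxLinks, maxBytes);
} else {
    // no support for limits : fall back to the regular parse() call
    docs = parser.parse(url, mimeType, charset, scraper, timezoneOffset, source);
}
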
@@ -34,6 +34,8 @@ import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import org.apache.commons.fileupload.util.LimitedInputStream;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
@@ -228,12 +230,12 @@ public final class TextParser {
}
assert !idioms.isEmpty() : "no parsers applied for url " + location.toNormalform(true);
Document[] docs = parseSource(location, mimeType, idioms, charset, scraper, timezoneOffset, depth, content);
Document[] docs = parseSource(location, mimeType, idioms, charset, scraper, timezoneOffset, depth, content, Integer.MAX_VALUE, Long.MAX_VALUE);
return docs;
}
public static Document[] parseSource(
private static Document[] parseSource(
final DigestURL location,
String mimeType,
final String charset,
@@ -241,7 +243,9 @@ public final class TextParser {
final int timezoneOffset,
final int depth,
final long contentLength,
final InputStream sourceStream
final InputStream sourceStream,
final int maxLinks,
final long maxBytes
) throws Parser.Failure {
if (AbstractParser.log.isFine()) AbstractParser.log.fine("Parsing '" + location + "' from stream");
mimeType = normalizeMimeType(mimeType);
@@ -283,22 +287,79 @@ public final class TextParser {
// then we use only one stream-oriented parser.
if (canStream || contentLength > Integer.MAX_VALUE || contentLength > MemoryControl.available()) {
// use a specific stream-oriented parser
return parseSource(location, mimeType, streamParser, charset, scraper, timezoneOffset, sourceStream);
return parseSource(location, mimeType, streamParser, charset, scraper, timezoneOffset, sourceStream, maxLinks, maxBytes);
}
// in case that we know more parsers we first transform the content into a byte[] and use that as base
// for a number of different parse attempts.
/* Content length may be known from headers : check it now */
if(contentLength >= 0 && contentLength > maxBytes) {
throw new Parser.Failure("Content size is over maximum size of " + maxBytes + "", location);
}
byte[] b = null;
try {
b = FileUtils.read(sourceStream, (int) contentLength);
/* Check content size now if contentLength was unknown */
if(contentLength < 0) {
if(b.length > maxBytes) {
throw new Parser.Failure("Content size is over maximum size of " + maxBytes + "", location);
}
}
} catch (final IOException e) {
throw new Parser.Failure(e.getMessage(), location);
}
Document[] docs = parseSource(location, mimeType, idioms, charset, scraper, timezoneOffset, depth, b);
Document[] docs = parseSource(location, mimeType, idioms, charset, scraper, timezoneOffset, depth, b, maxLinks, maxBytes);
return docs;
}
public static Document[] parseSource(final DigestURL location, String mimeType, final String charset,
final VocabularyScraper scraper, final int timezoneOffset, final int depth, final long contentLength,
final InputStream sourceStream) throws Parser.Failure {
return parseSource(location, mimeType, charset, scraper, timezoneOffset, depth, contentLength, sourceStream,
Integer.MAX_VALUE, Long.MAX_VALUE);
}
/**
* Try to limit the parser processing with a maximum total number of detected links (anchors, image links, media links...)
* or a maximum amount of content bytes to parse. Limits apply only when the available parsers for the resource media type support parsing within limits
* (see {@link Parser#isParseWithLimitsSupported()}). When the available parsers do
* not support parsing within limits, an exception is thrown when the
* content size is beyond maxBytes.
* @param location the URL of the source
* @param mimeType the mime type of the source, if known
* @param charset the charset name of the source, if known
* @param timezoneOffset the local time zone offset
* @param contentLength the length of the source, if known (else -1 should be used)
* @param sourceStream an input stream on the source content
* @param maxLinks the maximum total number of links to parse and add to the result documents
* @param maxBytes the maximum number of content bytes to process
* @return a list of documents that result from parsing the source, with empty or null text.
* @throws Parser.Failure when the parser processing failed
*/
public static Document[] parseWithLimits(final DigestURL location, String mimeType, final String charset,
final int timezoneOffset, final long contentLength, final InputStream sourceStream, int maxLinks,
long maxBytes) throws Parser.Failure{
return parseSource(location, mimeType, charset, new VocabularyScraper(), timezoneOffset, 0, contentLength,
sourceStream, maxLinks, maxBytes);
}
/**
*
* @param location the URL of the source
* @param mimeType the mime type of the source, if known
* @param parser a parser supporting the resource at location
* @param charset the charset name of the source, if known
* @param scraper a vocabulary scraper
* @param timezoneOffset the local time zone offset
* @param sourceStream an open input stream on the source
* @param maxLinks the maximum total number of links to parse and add to the result documents
* @param maxBytes the maximum number of content bytes to process
* @return a list of documents that result from parsing the source
* @throws Parser.Failure when the source could not be parsed
*/
private static Document[] parseSource(
final DigestURL location,
final String mimeType,
@@ -306,7 +367,9 @@ public final class TextParser {
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream sourceStream
final InputStream sourceStream,
final int maxLinks,
final long maxBytes
) throws Parser.Failure {
if (AbstractParser.log.isFine()) AbstractParser.log.fine("Parsing '" + location + "' from stream");
final String fileExt = MultiProtocolURL.getFileExtension(location.getFileName());
@@ -315,13 +378,41 @@ public final class TextParser {
if (AbstractParser.log.isFine()) AbstractParser.log.fine("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + fileExt + "'.");
try {
final Document[] docs = parser.parse(location, mimeType, documentCharset, scraper, timezoneOffset, sourceStream);
final Document[] docs;
if(parser.isParseWithLimitsSupported()) {
docs = parser.parseWithLimits(location, mimeType, documentCharset, scraper, timezoneOffset, sourceStream, maxLinks, maxBytes);
} else {
/* Parser does not support partial parsing within limits : let's control it here */
InputStream limitedSource = new LimitedInputStream(sourceStream, maxBytes) {
@Override
protected void raiseError(long pSizeMax, long pCount) throws IOException {
throw new IOException("Reached maximum bytes to parse : " + maxBytes);
}
};
docs = parser.parse(location, mimeType, documentCharset, scraper, timezoneOffset, limitedSource);
}
return docs;
} catch (final Exception e) {
throw new Parser.Failure("parser failed: " + parser.getName(), location);
}
}
/**
* @param location the URL of the source
* @param mimeType the mime type of the source, if known
* @param parsers a set of parsers supporting the resource at location
* @param charset the charset name of the source, if known
* @param scraper a vocabulary scraper
* @param timezoneOffset the local time zone offset
* @param depth the current crawling depth
* @param sourceArray the resource content bytes
* @param maxLinks the maximum total number of links to parse and add to the result documents
* @param maxBytes the maximum number of content bytes to process
* @return a list of documents that result from parsing the source
* @throws Parser.Failure when the source could not be parsed
*/
private static Document[] parseSource(
final DigestURL location,
final String mimeType,
@@ -330,7 +421,9 @@ public final class TextParser {
final VocabularyScraper scraper,
final int timezoneOffset,
final int depth,
final byte[] sourceArray
final byte[] sourceArray,
final int maxLinks,
final long maxBytes
) throws Parser.Failure {
final String fileExt = MultiProtocolURL.getFileExtension(location.getFileName());
if (AbstractParser.log.isFine()) AbstractParser.log.fine("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + fileExt + "' from byte[]");
@@ -351,7 +444,11 @@ public final class TextParser {
bis = new ByteArrayInputStream(sourceArray);
}
try {
docs = parser.parse(location, mimeType, documentCharset, scraper, timezoneOffset, bis);
if(parser.isParseWithLimitsSupported()) {
docs = parser.parseWithLimits(location, mimeType, documentCharset, scraper, timezoneOffset, bis, maxLinks, maxBytes);
} else {
docs = parser.parse(location, mimeType, documentCharset, scraper, timezoneOffset, bis);
}
} catch (final Parser.Failure e) {
failedParser.put(parser, e);
//log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e);

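The new public TextParser.parseWithLimits() entry point can also be called directly; a hedged sketch (the URL, stream and limit values are illustrative, and -1 marks an unknown content length so the size check happens while reading):

final Document[] docs = TextParser.parseWithLimits(
        new DigestURL("http://example.org/huge.xml"), // location
        "text/xml",                                   // mimeType, if known
        StandardCharsets.UTF_8.name(),                // charset name, if known
        0,                                            // timezoneOffset
        -1,                                           // contentLength unknown
        inputStream,                                  // an open InputStream, assumed in scope
        1000,                                         // maxLinks
        1024L * 1024L);                               // maxBytes
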
@@ -22,15 +22,20 @@
package net.yacy.document.parser;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import javax.naming.SizeLimitExceededException;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.apache.commons.fileupload.util.LimitedInputStream;
import org.apache.commons.io.input.XmlStreamReader;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
@@ -89,7 +94,7 @@ public class GenericXMLParser extends AbstractParser implements Parser {
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source)
throws Failure, InterruptedException {
throws Failure {
/* Limit the size of the in-memory buffer to at most 25% of the available memory :
* because some room is needed, and before being garbage collected the buffer will be converted to a String, then to a byte array.
@@ -128,17 +133,81 @@ public class GenericXMLParser extends AbstractParser implements Parser {
docs = new Document[] { new Document(location, mimeType, detectedCharset, this, null, null, null, null, "",
null, null, 0.0d, 0.0d, contentBytes, detectedURLs, null, null, false, new Date()) };
return docs;
} catch(Parser.Failure e) {
throw e;
} catch (final Exception e) {
if (e instanceof InterruptedException) {
throw (InterruptedException) e;
}
if (e instanceof Parser.Failure) {
throw (Parser.Failure) e;
}
throw new Parser.Failure("Unexpected error while parsing XML file. " + e.getMessage(), location);
}
}
@Override
public boolean isParseWithLimitsSupported() {
return true;
}
/**
* {@inheritDoc}
* @param maxBytes the maximum number of content bytes to process. Be careful with too small values :
* a Failure exception can be thrown when the maxBytes value is so small that the parser cannot even fill its buffers from the input stream and parse the document declaration.
*/
@Override
public Document[] parseWithLimits(DigestURL location, String mimeType, String charsetName, VocabularyScraper scraper,
int timezoneOffset, InputStream source, int maxLinks, long maxBytes)
throws Failure, InterruptedException, UnsupportedOperationException {
/* Limit the size of the in-memory buffer to at most 25% of the available memory :
* because some room is needed, and before being garbage collected the buffer will be converted to a String, then to a byte array.
* Eventual stricter limits should be handled by the caller (see for example crawler.[protocol].maxFileSize configuration setting). */
final long availableMemory = MemoryControl.available();
final long maxTextBytes = (long)(availableMemory * 0.25);
final int maxChars;
if((maxTextBytes / Character.BYTES) > Integer.MAX_VALUE) {
maxChars = Integer.MAX_VALUE;
} else {
maxChars = ((int)maxTextBytes) / Character.BYTES;
}
try (/* Automatically closed by this try-with-resources statement*/ CharBuffer writer = new CharBuffer(maxChars);){
final Set<AnchorURL> detectedURLs = new HashSet<>();
final GenericXMLContentHandler saxHandler = new GenericXMLContentHandler(writer, detectedURLs, maxLinks);
InputStream limitedSource = new LimitedInputStream(source, maxBytes) {
@Override
protected void raiseError(long pSizeMax, long pCount) throws IOException {
throw new IOException(new SizeLimitExceededException("Reached maximum bytes to parse : " + maxBytes));
}
};
/* Use commons-io XmlStreamReader advanced rules to help with charset detection when the source contains no BOM or XML declaration
* (the detection algorithm notably also includes the Content-Type transmitted by HTTP headers, here possibly present as the mimeType and charset parameters). */
final XmlStreamReader reader = new XmlStreamReader(limitedSource, mimeType, true, charsetName);
final InputSource saxSource = new InputSource(reader);
final String detectedCharset = reader.getEncoding();
final SAXParser saxParser = getParser();
boolean limitExceeded = false;
try {
saxParser.parse(saxSource, saxHandler);
} catch(SAXException | IOException e) {
if(!(e.getCause() instanceof SizeLimitExceededException)) {
/* Only transmit to upper layer exceptions that are not caused by the maxLinks or maxBytes limits being reached */
throw e;
}
limitExceeded = true;
}
/* create the parsed document with empty text content */
Document[] docs = new Document[] { new Document(location, mimeType, detectedCharset, this, null, null, null, null, "",
null, null, 0.0d, 0.0d, new byte[0], detectedURLs, null, null, false, new Date()) };
docs[0].setPartiallyParsed(limitExceeded);
return docs;
} catch (final Exception e) {
throw new Parser.Failure("Unexpected error while parsing XML file. " + e.getMessage(), location);
}
}
}

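A short usage sketch of the limited XML parsing added above (values are illustrative and the no-argument parser constructor is assumed, as in the unit tests; per the javadoc warning, maxBytes must at least cover the SAX parser's initial buffering and the XML declaration, so a generous value is used here):

final GenericXMLParser parser = new GenericXMLParser();
final Document[] docs = parser.parseWithLimits(
        new DigestURL("http://example.org/feed.xml"), "text/xml", "UTF-8",
        new VocabularyScraper(), 0, xmlStream, 1000, 10L * 1024L * 1024L);
// true when the links or bytes limit was hit and the document is therefore incomplete
final boolean truncated = docs[0].isPartiallyParsed();
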
@@ -375,14 +375,16 @@ public class ContentScraper extends AbstractScraper implements Scraper {
private final static Pattern WHITESPACE_PATTERN = Pattern.compile("\\s");
/**
* Try to detect and parse absolute URLs in text, then update the urls collection and fire anchorAdded event on listeners. Any parameter are can be null.
* Try to detect and parse absolute URLs in text (at most maxURLs), then update the urls collection and fire the anchorAdded event on listeners. Any parameter can be null.
* @param text the text to parse
* @param urls a mutable collection of URLs to fill.
* @param listeners a collection of listeners to trigger.
* @param maxURLs maximum URLs number to add to the urls collection. Be careful with urls collection capacity when this collection is not null and maxURLs value is beyond Integer.MAX_VALUE.
* @return the number of well formed URLs detected
*/
public static void findAbsoluteURLs(final String text, final Collection<AnchorURL> urls, final Collection<ContentScraperListener> listeners) {
public static long findAbsoluteURLs(final String text, final Collection<AnchorURL> urls, final Collection<ContentScraperListener> listeners, final long maxURLs) {
if(text == null) {
return;
return 0;
}
int schemePosition, offset = 0;
boolean hasWhiteSpace;
@@ -391,8 +393,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
final Matcher urlSchemeMatcher = protp.matcher(text);
final Matcher whiteSpaceMatcher = WHITESPACE_PATTERN.matcher(text);
while (offset < text.length()) {
long detectedURLsCount = 0;
while (offset < text.length() && detectedURLsCount < maxURLs) {
if(!urlSchemeMatcher.find(offset)) {
break;
}
@@ -413,6 +415,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
offset = schemePosition + urlString.length();
try {
url = new AnchorURL(urlString);
detectedURLsCount++;
if(urls != null) {
urls.add(url);
}
@@ -423,6 +426,17 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
} catch (final MalformedURLException ignored) {}
}
return detectedURLsCount;
}
/**
* Try to detect and parse absolute URLs in text, then update the urls collection and fire anchorAdded event on listeners. Any parameter can be null.
* @param text the text to parse
* @param urls a mutable collection of URLs to fill.
* @param listeners a collection of listeners to trigger.
*/
public static void findAbsoluteURLs(final String text, final Collection<AnchorURL> urls, final Collection<ContentScraperListener> listeners) {
findAbsoluteURLs(text, urls, listeners, Long.MAX_VALUE);
}
/**

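A compact sketch of the new counting variant of findAbsoluteURLs (the expected behaviour mirrors the ContentScraperTest case added at the end of this commit):

final List<AnchorURL> urls = new ArrayList<>();
// stop after the first two well-formed absolute URLs found in the text
final long count = ContentScraper.findAbsoluteURLs(
        "See http://yacy.net and http://forum.yacy.de and https://en.wikipedia.org",
        urls, null, 2);
// count is expected to be 2 and urls to hold the first two detected URLs
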
@@ -26,6 +26,8 @@ import java.io.IOException;
import java.io.Writer;
import java.util.Collection;
import javax.naming.SizeLimitExceededException;
import org.apache.commons.io.input.ClosedInputStream;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
@@ -51,6 +53,12 @@ public class GenericXMLContentHandler extends DefaultHandler {
/** Detected URLs */
private final Collection<AnchorURL> urls;
/** Maximum number of URLs to parse */
private final int maxURLs;
/** Number of parsed URLs in the document */
private long detectedURLs;
/** Text of the currently parsed element. May not contain the whole text when the element has nested elements embedded in its own text */
private StringBuilder currentElementText;
@@ -62,7 +70,7 @@ public class GenericXMLContentHandler extends DefaultHandler {
/** Set to false until some text is detected in at least one element of the document */
private boolean documentHasText;
/**
* @param out
* the output writer to write extracted text. Must not be null.
@@ -71,6 +79,18 @@ public class GenericXMLContentHandler extends DefaultHandler {
* when out is null
*/
public GenericXMLContentHandler(final Writer out, final Collection<AnchorURL> urls) throws IllegalArgumentException {
this(out, urls, Integer.MAX_VALUE);
}
/**
* @param out
* the output writer to write extracted text. Must not be null.
* @param urls the mutable collection of URLs to fill with detected URLs
* @param maxURLs the maximum number of urls to parse
* @throws IllegalArgumentException
* when out is null
*/
public GenericXMLContentHandler(final Writer out, final Collection<AnchorURL> urls, final int maxURLs) throws IllegalArgumentException {
if (out == null) {
throw new IllegalArgumentException("out writer must not be null");
}
@@ -79,6 +99,8 @@ public class GenericXMLContentHandler extends DefaultHandler {
}
this.out = out;
this.urls = urls;
this.maxURLs = maxURLs;
this.detectedURLs = 0;
}
/**
@@ -96,10 +118,12 @@ public class GenericXMLContentHandler extends DefaultHandler {
this.lastAppendedIsSpace = false;
this.currentElementTextChunks = 0;
this.documentHasText = false;
this.detectedURLs = 0;
}
/**
* Try to detect URLs possibly contained in attributes
* @throws SAXException when the maximum number of URLs to parse has been reached
*/
@Override
public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
@@ -109,19 +133,25 @@ public class GenericXMLContentHandler extends DefaultHandler {
if (attributes != null) {
for (int i = 0; i < attributes.getLength(); i++) {
String attribute = attributes.getValue(i);
ContentScraper.findAbsoluteURLs(attribute, this.urls, null);
this.detectedURLs += ContentScraper.findAbsoluteURLs(attribute, this.urls, null, this.maxURLs - this.detectedURLs);
if (this.detectedURLs >= this.maxURLs) {
throw new SAXException(
new SizeLimitExceededException("Reached maximum URLs to parse : " + this.maxURLs));
}
}
}
}
/**
* Write characters to the output writer
* @throws SAXException when the calling parser reached the maximum bytes limit on the input source
*/
@Override
public void characters(final char ch[], final int start, final int length) {
public void characters(final char ch[], final int start, final int length) throws SAXException {
try {
if(this.currentElementTextChunks == 0 && this.documentHasText) {
/* We are on the first text chunk of the element, or the first text chunk after processing nested elements :
/* We are on the first text chunk of the element (though not on the first text chunk of the whole document),
* or on the first text chunk after processing nested elements :
* if necessary we add a space to separate text content of different elements */
if(length > 0 && !this.lastAppendedIsSpace && !Character.isWhitespace(ch[0])) {
this.out.write(" ");
@@ -137,8 +167,8 @@ public class GenericXMLContentHandler extends DefaultHandler {
this.documentHasText = true;
this.lastAppendedIsSpace = Character.isWhitespace(ch[length - 1]);
}
} catch (final IOException e) {
ConcurrentLog.logException(e);
} catch (final IOException ignored) {
ConcurrentLog.logException(ignored);
}
}
@@ -148,7 +178,10 @@ public class GenericXMLContentHandler extends DefaultHandler {
*/
@Override
public void endElement(String uri, String localName, String qName) throws SAXException {
ContentScraper.findAbsoluteURLs(this.currentElementText.toString(), urls, null);
this.detectedURLs += ContentScraper.findAbsoluteURLs(this.currentElementText.toString(), this.urls, null, this.maxURLs - this.detectedURLs);
if (this.detectedURLs >= this.maxURLs) {
throw new SAXException(new SizeLimitExceededException("Reached maximum URLs to parse : " + this.maxURLs));
}
this.currentElementText.setLength(0);
this.currentElementTextChunks = 0;
}
@@ -158,5 +191,5 @@ public class GenericXMLContentHandler extends DefaultHandler {
/* Release the StringBuilder now useless */
this.currentElementText = null;
}
}

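A sketch of how a handler with a URL limit is meant to be wired into a SAX parse, mirroring GenericXMLParser.parseWithLimits above (writer, reader and saxParser are assumed to be set up as in that method):

final Set<AnchorURL> detectedURLs = new HashSet<>();
final GenericXMLContentHandler handler = new GenericXMLContentHandler(writer, detectedURLs, 100);
try {
    saxParser.parse(new InputSource(reader), handler);
} catch (final SAXException | IOException e) {
    if (e.getCause() instanceof SizeLimitExceededException) {
        // the maxURLs limit (here 100) was reached : detectedURLs holds the first matches
    } else {
        throw e;
    }
}
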
@@ -377,6 +377,7 @@ public final class FileUtils {
* Read the specified amount of bytes from a source stream.
* Important : it is the responsibility of the caller to close the stream.
* @param source InputStream instance. Must not be null
* @param count maximum amount of bytes to read. A negative value means no limit.
* @return source content as a byte array.
* @throws IOException when a read/write error occurred
* @throws NullPointerException when source parameter is null

@@ -439,7 +439,11 @@ public final class LoaderDispatcher {
}
}
private int protocolMaxFileSize(final DigestURL url) {
/**
* @param url the URL of a resource to load
* @return the maximum file size the crawler is configured to load for the protocol of the URL
*/
public int protocolMaxFileSize(final DigestURL url) {
if (url.isHTTP() || url.isHTTPS())
return this.sb.getConfigInt("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
if (url.isFTP())
@@ -583,7 +587,7 @@ public final class LoaderDispatcher {
* @throws IOException when the content can not be fetched or no parser support it
*/
public Document loadDocumentAsStream(final DigestURL location, final CacheStrategy cachePolicy,
BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
final BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
// load resource
Request request = request(location, true, false);
final StreamResponse streamResponse = this.openInputStream(request, cachePolicy, blacklistType, agent);
@@ -611,6 +615,65 @@ public final class LoaderDispatcher {
throw new IOException(e.getMessage());
}
}
/**
* Similar to the loadDocument method, but streaming the resource content
* when possible instead of fully loading it in memory.<br>
* Also try to limit the parser processing with a maximum total number of
* detected links (anchors, image links, media links...) or a maximum
* amount of content bytes to parse.<br>
* Limits apply only when the available parsers for the resource media type
* support parsing within limits (see
* {@link Parser#isParseWithLimitsSupported()}). When the available parsers do
* not support parsing within limits, an exception is thrown when the
* content size is beyond maxBytes.
*
* @param location
* URL of the resource to load
* @param cachePolicy
* cache policy strategy
* @param blacklistType
* blacklist to use
* @param agent
* user agent identifier
* @param maxLinks
* the maximum total number of links to parse and add to the
* result document
* @param maxBytes
* the maximum number of content bytes to process
* @return the parsed document or null when an error occurred while parsing
* @throws IOException
* when the content cannot be fetched or no parser supports it
*/
public Document loadDocumentAsLimitedStream(final DigestURL location, final CacheStrategy cachePolicy,
final BlacklistType blacklistType, final ClientIdentification.Agent agent, final int maxLinks, final long maxBytes) throws IOException {
// load resource
Request request = request(location, true, false);
final StreamResponse streamResponse = this.openInputStream(request, cachePolicy, blacklistType, agent, -1);
final Response response = streamResponse.getResponse();
final DigestURL url = request.url();
if (response == null) throw new IOException("no Response for url " + url);
// if it is still not available, report an error
if (streamResponse.getContentStream() == null || response.getResponseHeader() == null) {
throw new IOException("no Content available for url " + url);
}
// parse resource
try {
Document[] documents = streamResponse.parseWithLimits(maxLinks, maxBytes);
Document merged = Document.mergeDocuments(location, response.getMimeType(), documents);
String x_robots_tag = response.getResponseHeader().getXRobotsTag();
if (x_robots_tag.indexOf("noindex",0) >= 0) {
merged.setIndexingDenied(true);
}
return merged;
} catch(final Parser.Failure e) {
throw new IOException(e.getMessage());
}
}
/**
* load all links from a resource

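A hedged sketch of calling the new loader method from servlet code, mirroring the getpageinfo_p change above (sb is assumed to be the Switchboard, as in that servlet; the fallback to protocolMaxFileSize() is the same one getpageinfo_p applies when no maxBytes parameter is given):

final ClientIdentification.Agent agent = ClientIdentification
        .getAgent(ClientIdentification.yacyInternetCrawlerAgentName);
// cap parsing at the crawler's per-protocol file size limit and at 1000 links
final long maxBytes = sb.loader.protocolMaxFileSize(url);
final Document doc = sb.loader.loadDocumentAsLimitedStream(url, CacheStrategy.IFEXIST,
        BlacklistType.CRAWLER, agent, 1000, maxBytes);
if (doc != null && doc.isPartiallyParsed()) {
    // only the beginning of the resource was parsed
}
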
@@ -25,6 +25,7 @@ package net.yacy.document.parser;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.assertFalse;
import java.io.ByteArrayInputStream;
import java.io.File;
@@ -358,5 +359,103 @@ public class GenericXMLParserTest {
inStream.close();
}
}
/**
* Test URLs detection when applying limits.
*
* @throws Exception
* when an unexpected error occurred
*/
@Test
public void testParseWithLimits() throws Exception {
String xhtml = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>"
+ "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">"
+ "<html xmlns=\"http://www.w3.org/1999/xhtml\">" + "<head>"
+ "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />"
+ "<title>XHTML content URLs test</title>" + "</head>" + "<body>" + "<p>Here are some YaCy URLs: "
+ "Home page : http://yacy.net - International Forum : "
+ "http://forum.yacy.de "
+ "and this is a mention to a relative URL : /document.html</p>"
+ "<p>Here are YaCy<a href=\"http://mantis.tokeek.de\">bug tracker</a> and <a href=\"http://www.yacy-websearch.net/wiki/\">Wiki</a>."
+ "And this is a relative link to another <a href=\"/document2.html\">sub document</a></p>"
+ "</body>" + "</html>";
/* Content within limits */
InputStream inStream = new ByteArrayInputStream(xhtml.getBytes(StandardCharsets.UTF_8.name()));
final String contentTypeHeader = "text/xhtml";
String charsetFromHttpHeader = HeaderFramework.getCharacterEncoding(contentTypeHeader);
DigestURL location = new DigestURL("http://localhost/testfile.xml");
try {
Document[] documents = this.parser.parseWithLimits(location, contentTypeHeader, charsetFromHttpHeader, new VocabularyScraper(), 0, inStream, Integer.MAX_VALUE, Long.MAX_VALUE);
assertEquals(1, documents.length);
assertFalse(documents[0].isPartiallyParsed());
Collection<AnchorURL> detectedAnchors = documents[0].getAnchors();
assertNotNull(detectedAnchors);
assertEquals(5, detectedAnchors.size());
assertTrue(detectedAnchors.contains(new AnchorURL("http://www.w3.org/1999/xhtml")));
assertTrue(detectedAnchors.contains(new AnchorURL("http://yacy.net")));
assertTrue(detectedAnchors.contains(new AnchorURL("http://forum.yacy.de")));
assertTrue(detectedAnchors.contains(new AnchorURL("http://mantis.tokeek.de")));
assertTrue(detectedAnchors.contains(new AnchorURL("http://www.yacy-websearch.net/wiki/")));
} finally {
inStream.close();
}
/* Links limit exceeded */
inStream = new ByteArrayInputStream(xhtml.getBytes(StandardCharsets.UTF_8.name()));
try {
Document[] documents = this.parser.parseWithLimits(location, contentTypeHeader, charsetFromHttpHeader,
new VocabularyScraper(), 0, inStream, 2, Long.MAX_VALUE);
assertEquals(1, documents.length);
assertTrue(documents[0].isPartiallyParsed());
Collection<AnchorURL> detectedAnchors = documents[0].getAnchors();
assertNotNull(detectedAnchors);
assertEquals(2, detectedAnchors.size());
assertTrue(detectedAnchors.contains(new AnchorURL("http://www.w3.org/1999/xhtml")));
assertTrue(detectedAnchors.contains(new AnchorURL("http://yacy.net")));
} finally {
inStream.close();
}
/* Bytes limit exceeded */
StringBuilder xhtmlBuilder = new StringBuilder("<?xml version=\"1.0\" encoding=\"UTF-8\" ?>")
.append("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">")
.append("<html xmlns=\"http://www.w3.org/1999/xhtml\">")
.append("<head>")
.append("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />")
.append("<title>XHTML content URLs test</title>")
.append("</head>")
.append("<body><p>Here are some YaCy URLs: ")
.append("Home page : http://yacy.net - International Forum : ")
.append("http://forum.yacy.de ")
.append("and this is a mention to a relative URL : /document.html</p>");
/* Add some filler text to reach a total size beyond SAX parser internal input stream buffers */
while(xhtmlBuilder.length() < 1024 * 10) {
xhtmlBuilder.append("<p>Some text to parse</p>");
}
int firstBytes = xhtmlBuilder.toString().getBytes(StandardCharsets.UTF_8.name()).length;
xhtmlBuilder.append("<p>Here are YaCy<a href=\"http://mantis.tokeek.de\">bug tracker</a> and <a href=\"http://www.yacy-websearch.net/wiki/\">Wiki</a>.")
.append("And this is a relative link to another <a href=\"/document2.html\">sub document</a></p>")
.append("</body></html>");
inStream = new ByteArrayInputStream(xhtmlBuilder.toString().getBytes(StandardCharsets.UTF_8.name()));
try {
Document[] documents = this.parser.parseWithLimits(location, contentTypeHeader, charsetFromHttpHeader, new VocabularyScraper(), 0, inStream, Integer.MAX_VALUE, firstBytes);
assertEquals(1, documents.length);
assertTrue(documents[0].isPartiallyParsed());
Collection<AnchorURL> detectedAnchors = documents[0].getAnchors();
assertNotNull(detectedAnchors);
assertEquals(3, detectedAnchors.size());
assertTrue(detectedAnchors.contains(new AnchorURL("http://www.w3.org/1999/xhtml")));
assertTrue(detectedAnchors.contains(new AnchorURL("http://yacy.net")));
assertTrue(detectedAnchors.contains(new AnchorURL("http://forum.yacy.de")));
} finally {
inStream.close();
}
}
}

@@ -267,6 +267,32 @@ public class ContentScraperTest {
Assert.assertEquals(0, detectedURLs.size());
}
/**
* Test absolute URLs detection in plain text with maxURLs parameter
* @throws MalformedURLException should not happen
*/
@Test
public void testFindAbsoluteURLsMaxURLs() throws MalformedURLException {
final String text = "Some test URLS : http://yacy.net - http://forum.yacy.de - https://en.wikipedia.org";
/* No limit */
ArrayList<AnchorURL> detectedURLs = new ArrayList<>();
ContentScraper.findAbsoluteURLs(text, detectedURLs, null, Long.MAX_VALUE);
Assert.assertEquals(3, detectedURLs.size());
/* Test from a zero limit up to a limit equal to the total number of URLs in the text */
for(int limit = 0; limit <=3; limit++) {
detectedURLs = new ArrayList<>();
ContentScraper.findAbsoluteURLs(text, detectedURLs, null, limit);
Assert.assertEquals(limit, detectedURLs.size());
}
/* Limit greater than total number of URLs in text */
detectedURLs = new ArrayList<>();
ContentScraper.findAbsoluteURLs(text, detectedURLs, null, 4);
Assert.assertEquals(3, detectedURLs.size());
}
/**
* Test unpaired brackets cleaning
*/
