From acab6a6defb3307d27fb97004f99e0be4be8f55f Mon Sep 17 00:00:00 2001 From: luccioman Date: Mon, 14 Aug 2017 14:47:01 +0200 Subject: [PATCH] Also handle text content when parsing XML within limits. --- source/net/yacy/document/parser/GenericXMLParser.java | 10 ++++++++-- .../net/yacy/document/parser/GenericXMLParserTest.java | 8 ++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/source/net/yacy/document/parser/GenericXMLParser.java b/source/net/yacy/document/parser/GenericXMLParser.java index 0673260e6..25d429143 100644 --- a/source/net/yacy/document/parser/GenericXMLParser.java +++ b/source/net/yacy/document/parser/GenericXMLParser.java @@ -193,11 +193,17 @@ public class GenericXMLParser extends AbstractParser implements Parser { } catch(StreamLimitException e) { limitExceeded = true; } + + if (writer.isOverflow()) { + throw new Parser.Failure("Not enough Memory available for generic the XML parser : " + + Formatter.bytesToString(availableMemory), location); + } - /* create the parsed document with empty text content */ + /* Create the parsed document, possibly with only part of the text and links */ + final byte[] contentBytes = UTF8.getBytes(writer.toString()); Document[] docs = new Document[] { new Document(location, mimeType, detectedCharset, this, null, null, null, null, "", - null, null, 0.0d, 0.0d, new byte[0], detectedURLs, null, null, false, new Date()) }; + null, null, 0.0d, 0.0d, contentBytes, detectedURLs, null, null, false, new Date()) }; docs[0].setPartiallyParsed(limitExceeded); return docs; } catch (final Exception e) { diff --git a/test/java/net/yacy/document/parser/GenericXMLParserTest.java b/test/java/net/yacy/document/parser/GenericXMLParserTest.java index d4d6affe4..18b6cb438 100644 --- a/test/java/net/yacy/document/parser/GenericXMLParserTest.java +++ b/test/java/net/yacy/document/parser/GenericXMLParserTest.java @@ -390,6 +390,8 @@ public class GenericXMLParserTest { assertEquals(1, documents.length);
assertFalse(documents[0].isPartiallyParsed()); + assertTrue(documents[0].getTextString().contains("And this is a relative link")); + Collection detectedAnchors = documents[0].getAnchors(); assertNotNull(detectedAnchors); assertEquals(5, detectedAnchors.size()); @@ -410,6 +412,9 @@ public class GenericXMLParserTest { assertEquals(1, documents.length); assertTrue(documents[0].isPartiallyParsed()); + assertTrue(documents[0].getTextString().contains("Home page")); + assertFalse(documents[0].getTextString().contains("And this is a relative link")); + Collection detectedAnchors = documents[0].getAnchors(); assertNotNull(detectedAnchors); assertEquals(2, detectedAnchors.size()); @@ -447,6 +452,9 @@ public class GenericXMLParserTest { assertEquals(1, documents.length); assertTrue(documents[0].isPartiallyParsed()); + assertTrue(documents[0].getTextString().contains("and this is a mention to a relative URL")); + assertFalse(documents[0].getTextString().contains("And this is a relative link to another")); + Collection detectedAnchors = documents[0].getAnchors(); assertNotNull(detectedAnchors); assertEquals(3, detectedAnchors.size());