From acab6a6defb3307d27fb97004f99e0be4be8f55f Mon Sep 17 00:00:00 2001
From: luccioman <luccioman@users.noreply.github.com>
Date: Mon, 14 Aug 2017 14:47:01 +0200
Subject: [PATCH] Also handle text content when parsing XML within limits.

---
 source/net/yacy/document/parser/GenericXMLParser.java  | 10 ++++++++--
 .../net/yacy/document/parser/GenericXMLParserTest.java |  8 ++++++++
 2 files changed, 16 insertions(+), 2 deletions(-)
diff --git a/source/net/yacy/document/parser/GenericXMLParser.java b/source/net/yacy/document/parser/GenericXMLParser.java
index 0673260e6..25d429143 100644
--- a/source/net/yacy/document/parser/GenericXMLParser.java
+++ b/source/net/yacy/document/parser/GenericXMLParser.java
@@ -193,11 +193,17 @@ public class GenericXMLParser extends AbstractParser implements Parser {
 			} catch(StreamLimitException e) {
 				limitExceeded = true;
 			}
+			
+			if (writer.isOverflow()) {
+				throw new Parser.Failure("Not enough Memory available for generic the XML parser : "
+						+ Formatter.bytesToString(availableMemory), location);
+			}
 
 
-			/* create the parsed document with empty text content */
+			/* Create the parsed document, possibly with only part of the text and links when the limit was exceeded */
+			final byte[] contentBytes = UTF8.getBytes(writer.toString());
 			Document[] docs = new Document[] { new Document(location, mimeType, detectedCharset, this, null, null, null, null, "",
-					null, null, 0.0d, 0.0d, new byte[0], detectedURLs, null, null, false, new Date()) };
+					null, null, 0.0d, 0.0d, contentBytes, detectedURLs, null, null, false, new Date()) };
 			docs[0].setPartiallyParsed(limitExceeded);
 			return docs;
 		} catch (final Exception e) {
diff --git a/test/java/net/yacy/document/parser/GenericXMLParserTest.java b/test/java/net/yacy/document/parser/GenericXMLParserTest.java
index d4d6affe4..18b6cb438 100644
--- a/test/java/net/yacy/document/parser/GenericXMLParserTest.java
+++ b/test/java/net/yacy/document/parser/GenericXMLParserTest.java
@@ -390,6 +390,8 @@ public class GenericXMLParserTest {
 			assertEquals(1, documents.length);
 			assertFalse(documents[0].isPartiallyParsed());
 			
+			assertTrue(documents[0].getTextString().contains("And this is a relative link"));
+			
 			Collection<AnchorURL> detectedAnchors = documents[0].getAnchors();
 			assertNotNull(detectedAnchors);
 			assertEquals(5, detectedAnchors.size());
@@ -410,6 +412,9 @@ public class GenericXMLParserTest {
 			assertEquals(1, documents.length);
 			assertTrue(documents[0].isPartiallyParsed());
 			
+			assertTrue(documents[0].getTextString().contains("Home page"));
+			assertFalse(documents[0].getTextString().contains("And this is a relative link"));
+			
 			Collection<AnchorURL> detectedAnchors = documents[0].getAnchors();
 			assertNotNull(detectedAnchors);
 			assertEquals(2, detectedAnchors.size());
@@ -447,6 +452,9 @@ public class GenericXMLParserTest {
 			assertEquals(1, documents.length);
 			assertTrue(documents[0].isPartiallyParsed());
 			
+			assertTrue(documents[0].getTextString().contains("and this is a mention to a relative URL"));
+			assertFalse(documents[0].getTextString().contains("And this is a relative link to another"));
+			
 			Collection<AnchorURL> detectedAnchors = documents[0].getAnchors();
 			assertNotNull(detectedAnchors);
 			assertEquals(3, detectedAnchors.size());