Added a generic XML parser, able to parse elements text and URLs.

This parser adds support for any XML-based format other than the already supported XML vocabularies such as XHTML, RSS/Atom feeds... It will eventually be used as a fallback when one of these specific parsers fails, before falling back to the existing genericParser, which extracts little useful information apart from URL tokens.
8 years ago · 319231a458
parent aeeb8a7dd5
commit 319231a458
8 changed files with 787 additions and 24 deletions
--- a/source/net/yacy/document/TextParser.java
+++ b/source/net/yacy/document/TextParser.java
@ -37,6 +37,7 @@ import net.yacy.cora.document.encoding.UTF8;
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.document.id.MultiProtocolURL;
 import net.yacy.cora.util.CommonPattern;
+import net.yacy.document.parser.GenericXMLParser;
 import net.yacy.document.parser.apkParser;
 import net.yacy.document.parser.audioTagParser;
 import net.yacy.document.parser.bzipParser;
@ -73,6 +74,10 @@ public final class TextParser {
    private static final Object v = new Object();

    private static final Parser genericIdiom = new genericParser();
+    
+    /** A generic XML parser instance */
+    private static final Parser genericXMLIdiom = new GenericXMLParser();
+    
    //use LinkedHashSet for parser collection to use (init) order to prefered parser for same ext or mime
    private static final Map<String, LinkedHashSet<Parser>> mime2parser = new ConcurrentHashMap<String, LinkedHashSet<Parser>>();
    private static final ConcurrentHashMap<String, LinkedHashSet<Parser>> ext2parser = new ConcurrentHashMap<String, LinkedHashSet<Parser>>();
@ -112,7 +117,9 @@ public final class TextParser {
        initParser(new xlsParser());
        initParser(new zipParser());
        initParser(new audioTagParser());
-        
+        /* Order is important : the generic XML parser must be initialized last, so that it is effectively used only as a fallback
+         * whenever a specialized parser exists for an XML based format (examples : rssParser or ooxmlParser must be tried first) */
+        initParser(genericXMLIdiom);
    }

    public static Set<Parser> parsers() {
@ -426,7 +433,7 @@ public final class TextParser {
            if (idiom != null) idioms.addAll(idiom);
        }

-        // check extension and add as backup (in case no, wrong or unknown/unsupported mime was suppied)
+        // check extension and add as backup (in case no, wrong or unknown/unsupported mime was supplied)
        String ext = MultiProtocolURL.getFileExtension(url.getFileName());
        if (ext != null && ext.length() > 0) {
            if (denyExtensionx.containsKey(ext)) throw new Parser.Failure("file extension '" + ext + "' is denied (1)", url);
@ -441,6 +448,12 @@ public final class TextParser {
        if (mimeType2 != null && (idiom = mime2parser.get(mimeType2)) != null && !idioms.containsAll(idiom)) { // use containsAll -> idiom is a Set of parser
            idioms.addAll(idiom);
        }
+        
+        /* No matching idiom has been found : let's check if the media type ends with the "+xml" suffix so we can handle it with a generic XML parser 
+         * (see RFC 7303 - Using '+xml' when Registering XML-Based Media Types : https://tools.ietf.org/html/rfc7303#section-4.2) */
+        if(idioms.isEmpty() && mimeType1 != null && mimeType1.endsWith("+xml")) {
+        	idioms.add(genericXMLIdiom);
+        }

        // always add the generic parser (make sure it is the last in access order)
        idioms.add(genericIdiom);
@ -456,10 +469,20 @@ public final class TextParser {
     * @return an error if the mime type is not supported, null otherwise
     */
    public static String supportsMime(String mimeType) {
-        if (mimeType == null) return null;
+        if (mimeType == null) {
+        	return null;
+        }
        mimeType = normalizeMimeType(mimeType);
-        if (denyMime.containsKey(mimeType)) return "mime type '" + mimeType + "' is denied (2)";
-        if (mime2parser.get(mimeType) == null) return "no parser for mime '" + mimeType + "' available";
+        if (denyMime.containsKey(mimeType)) {
+        	return "mime type '" + mimeType + "' is denied (2)";
+        }
+        if (mime2parser.get(mimeType) == null) {
+            /* No matching idiom has been found : let's check if the media type ends with the "+xml" suffix so we can handle it with a generic XML parser 
+             * (see RFC 7303 - Using '+xml' when Registering XML-Based Media Types : https://tools.ietf.org/html/rfc7303#section-4.2) */
+        	if(!mimeType.endsWith("+xml")) {
+        		return "no parser for mime '" + mimeType + "' available";
+        	}
+        }
        return null;
    }

--- a/source/net/yacy/document/parser/GenericXMLParser.java
+++ b/source/net/yacy/document/parser/GenericXMLParser.java
@ -0,0 +1,144 @@
+// GenericXMLParser.java
+// ---------------------------
+// Copyright 2017 by luccioman; https://github.com/luccioman
+//
+// This is a part of YaCy, a peer-to-peer based web search engine
+//
+// LICENSE
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+package net.yacy.document.parser;
+
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Date;
+import java.util.List;
+
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+
+import org.apache.commons.io.input.XmlStreamReader;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+
+import net.yacy.cora.document.encoding.UTF8;
+import net.yacy.cora.document.id.AnchorURL;
+import net.yacy.cora.document.id.DigestURL;
+import net.yacy.document.AbstractParser;
+import net.yacy.document.Document;
+import net.yacy.document.Parser;
+import net.yacy.document.VocabularyScraper;
+import net.yacy.document.parser.xml.GenericXMLContentHandler;
+import net.yacy.kelondro.io.CharBuffer;
+import net.yacy.kelondro.util.Formatter;
+import net.yacy.kelondro.util.MemoryControl;
+
+/**
+ * A generic XML parser without knowledge of the specific XML vocabulary.
+ * <p>
+ * The source is read through a SAX parser driven by a {@link GenericXMLContentHandler},
+ * which writes the elements text to an in-memory buffer and collects detected URLs.
+ * </p>
+ * @author luccioman
+ *
+ */
+public class GenericXMLParser extends AbstractParser implements Parser {
+
+    /** SAX parser instance local to each thread */
+    private static final ThreadLocal<SAXParser> tlSax = new ThreadLocal<SAXParser>();
+
+    /**
+     * Lazily create the SAX parser of the current thread, then reuse it on next calls.
+     * @return a SAXParser instance for the current thread
+     * @throws SAXException when an error prevented parser creation
+     */
+    private static SAXParser getParser() throws SAXException {
+        SAXParser parser = tlSax.get();
+        if (parser == null) {
+            try {
+                parser = SAXParserFactory.newInstance().newSAXParser();
+            } catch (final ParserConfigurationException e) {
+                throw new SAXException(e.getMessage(), e);
+            }
+            tlSax.set(parser);
+        }
+        return parser;
+    }
+
+    /**
+     * Register the "xml" file extension and the "application/xml" and "text/xml" media types as supported.
+     */
+    public GenericXMLParser() {
+        super("XML Parser");
+        this.SUPPORTED_EXTENSIONS.add("xml");
+        this.SUPPORTED_MIME_TYPES.add("application/xml");
+        this.SUPPORTED_MIME_TYPES.add("text/xml");
+    }
+
+    /**
+     * Parse an XML source, extracting its text content and the URLs detected by the SAX handler.
+     * @param location the URL of the parsed resource
+     * @param mimeType the media type of the resource, also used to help charset detection
+     * @param charset the charset name used as default encoding when none can be detected from the content
+     * @param scraper not used by this parser
+     * @param timezoneOffset not used by this parser
+     * @param source the stream to parse. It is not closed by this method.
+     * @return an array containing the single parsed document
+     * @throws Failure when the in-memory buffer limit was exceeded or when any error occurred while parsing
+     * @throws InterruptedException when such an exception is raised while processing
+     */
+    @Override
+    public Document[] parse(
+            final DigestURL location,
+            final String mimeType,
+            final String charset,
+            final VocabularyScraper scraper, 
+            final int timezoneOffset,
+            final InputStream source)
+            throws Failure, InterruptedException {
+
+        /* Limit the size of the in-memory buffer to at most 25% of the available memory :
+         * because some room is needed, and before being garbage collected the buffer will be converted to a String, then to a byte array. 
+         * Eventual stricter limits should be handled by the caller (see for example crawler.[protocol].maxFileSize configuration setting). */
+        final long availableMemory = MemoryControl.available();
+        final long maxBytes = (long)(availableMemory * 0.25);
+        /* Divide as a long BEFORE narrowing to int : casting maxBytes first would silently truncate
+         * any value above Integer.MAX_VALUE and produce a wrong (possibly negative) limit */
+        final int maxChars = (int) Math.min(Integer.MAX_VALUE, maxBytes / Character.BYTES);
+
+        try (/* Automatically closed by this try-with-resources statement*/ CharBuffer writer = new CharBuffer(maxChars);){
+
+            /* Use commons-io XmlStreamReader advanced rules to help with charset detection when source contains no BOM or XML declaration
+             * (detection algorithm notably also include ContentType transmitted by HTTP headers, here eventually present as mimeType and charset parameters),  */
+            final XmlStreamReader reader = new XmlStreamReader(source, mimeType, true, charset);
+            final InputSource saxSource = new InputSource(reader);
+            final String detectedCharset = reader.getEncoding();
+
+            final List<AnchorURL> detectedURLs = new ArrayList<>();
+
+            final GenericXMLContentHandler saxHandler = new GenericXMLContentHandler(writer, detectedURLs);
+            final SAXParser saxParser = getParser();
+            saxParser.parse(saxSource, saxHandler);
+
+            /* The buffer silently drops writes beyond its limit : the overflow flag is the only way to detect truncated text */
+            if (writer.isOverflow()) {
+                throw new Parser.Failure("Not enough Memory available for generic the XML parser : "
+                        + Formatter.bytesToString(availableMemory), location);
+            }
+
+            /* create the parsed document */
+            final byte[] contentBytes = UTF8.getBytes(writer.toString());
+            return new Document[] { new Document(location, mimeType, detectedCharset, this, null, null, null, null, "",
+                    null, null, 0.0d, 0.0d, contentBytes, detectedURLs, null, null, false, new Date()) };
+        } catch (final Exception e) {
+            /* Let the exception types declared by this method pass through unchanged */
+            if (e instanceof InterruptedException) {
+                throw (InterruptedException) e;
+            }
+            if (e instanceof Parser.Failure) {
+                throw (Parser.Failure) e;
+            }
+
+            throw new Parser.Failure("Unexpected error while parsing XML file. " + e.getMessage(), location);
+        }
+
+    }
+
+}
--- a/source/net/yacy/document/parser/html/ContentScraper.java
+++ b/source/net/yacy/document/parser/html/ContentScraper.java
@ -344,24 +344,17 @@ public class ContentScraper extends AbstractScraper implements Scraper {
            if ((b.length() != 0) && (!(SentenceReader.punctuation(b.charAt(b.length() - 1))))) b = b + '.';
            //System.out.println("*** Appended dot: " + b.toString());
        }
-        // find http links inside text
-        s = 0;
-        String u;
-        while (s < b.length()) {
-            p = find(b, dpssp, s);
-            if (p == Integer.MAX_VALUE) break;
-            s = Math.max(0, p - 5);
-            p = find(b, protp, s);
-            if (p == Integer.MAX_VALUE) break;
-            q = b.indexOf(" ", p + 1);
-            u = b.substring(p, q < 0 ? b.length() : q);
-            if (u.endsWith(".")) u = u.substring(0, u.length() - 1); // remove the '.' that was appended above
-            s = p + 6;
-            try {
-                this.addAnchor(new AnchorURL(u));
-                continue;
-            } catch (final MalformedURLException e) {}
+        // find absolute URLs inside text
+        final Object[] listeners = this.htmlFilterEventListeners.getListenerList();
+        List<ContentScraperListener> anchorListeners = new ArrayList<>();
+        for (int i = 0; i < listeners.length; i += 2) {
+            if (listeners[i] == ContentScraperListener.class) {
+            	anchorListeners.add((ContentScraperListener)listeners[i+1]);
+            }
        }
+        
+        findAbsoluteURLs(b, this.anchors, anchorListeners);
+        
        // append string to content
        if (!b.isEmpty()) {
            this.content.append(b);
@ -371,6 +364,47 @@ public class ContentScraper extends AbstractScraper implements Scraper {

    private final static Pattern dpssp = Pattern.compile("://");
    private final static Pattern protp = Pattern.compile("smb://|ftp://|http://|https://");
+    
+    /**
+     * Scan a text for absolute URLs (schemes matched by the protp pattern), add each successfully parsed URL
+     * to the given collection and notify each listener with the URL normal form.
+     * Any parameter can be null : a null text is ignored, a null collection is not filled, null listeners are not notified.
+     * @param text the text to parse
+     * @param urls a mutable collection of URLs to fill.
+     * @param listeners a collection of listeners to trigger.
+     */
+    public static void findAbsoluteURLs(final String text, final Collection<AnchorURL> urls, final Collection<ContentScraperListener> listeners) {
+        if (text == null) {
+            return;
+        }
+        final int textLength = text.length();
+        int cursor = 0;
+        while (cursor < textLength) {
+            /* Integer.MAX_VALUE is treated as "no match" : nothing more to scan */
+            final int separatorIndex = find(text, dpssp, cursor);
+            if (separatorIndex == Integer.MAX_VALUE) {
+                return;
+            }
+            /* Step back up to 5 characters (length of the longest scheme in protp, "https") before looking for a known scheme */
+            final int urlStart = find(text, protp, Math.max(0, separatorIndex - 5));
+            if (urlStart == Integer.MAX_VALUE) {
+                return;
+            }
+            /* The URL candidate ends at the next space, or at the end of the text */
+            final int blankIndex = text.indexOf(" ", urlStart + 1);
+            final int urlEnd = blankIndex < 0 ? textLength : blankIndex;
+            String candidate = text.substring(urlStart, urlEnd);
+            if (candidate.endsWith(".")) {
+                /* drop a trailing period, such as sentence punctuation following the URL */
+                candidate = candidate.substring(0, candidate.length() - 1);
+            }
+            /* Resume scanning after the beginning of this match, whether or not the candidate is a valid URL */
+            cursor = urlStart + 6;
+            try {
+                final AnchorURL detected = new AnchorURL(candidate);
+                if (urls != null) {
+                    urls.add(detected);
+                }
+                if (listeners != null) {
+                    for (final ContentScraperListener target : listeners) {
+                        target.anchorAdded(detected.toNormalform(false));
+                    }
+                }
+            } catch (final MalformedURLException ignored) {
+                /* the candidate is not a valid URL : skip it */
+            }
+        }
+    }

    private static final int find(final String s, final Pattern m, final int start) {
        final Matcher mm = m.matcher(s.subSequence(start, s.length()));
--- a/source/net/yacy/document/parser/xml/GenericXMLContentHandler.java
+++ b/source/net/yacy/document/parser/xml/GenericXMLContentHandler.java
@ -0,0 +1,162 @@
+// GenericXMLContentHandler.java
+// ---------------------------
+// Copyright 2017 by luccioman; https://github.com/luccioman
+//
+// This is a part of YaCy, a peer-to-peer based web search engine
+//
+// LICENSE
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+package net.yacy.document.parser.xml;
+
+import java.io.IOException;
+import java.io.Writer;
+import java.util.Collection;
+
+import org.apache.commons.io.input.ClosedInputStream;
+import org.xml.sax.Attributes;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import net.yacy.cora.document.id.AnchorURL;
+import net.yacy.cora.util.ConcurrentLog;
+import net.yacy.document.parser.html.ContentScraper;
+
+/**
+ * SAX handler for XML contents, only extracting text and eventual URLs from
+ * XML.
+ * 
+ * @author luccioman
+ *
+ */
+public class GenericXMLContentHandler extends DefaultHandler {
+
+	/** Output writer */
+	private final Writer out;
+
+	/** Detected URLs */
+	private final Collection<AnchorURL> urls;
+
+	/** Text of the currently parsed element. May not contain the whole text when the element has nested elements embedded in its own text */
+	private StringBuilder currentElementText;
+
+	/** Set to true when the last character written to the output writer is a space */
+	private boolean lastAppendedIsSpace;
+
+	/** The number of text chunks handled in the current element (reset to zero when the element has nested elements) */
+	private int currentElementTextChunks;
+
+	/** Set to false until some text is detected in at least one element of the document */
+	private boolean documentHasText;
+
+	/**
+	 * @param out
+	 *            the output writer to write extracted text. Must not be null.
+	 * @param urls the mutable collection of URLs to fill with detected URLs. Must not be null.
+	 * @throws IllegalArgumentException
+	 *             when out or urls is null
+	 */
+	public GenericXMLContentHandler(final Writer out, final Collection<AnchorURL> urls) throws IllegalArgumentException {
+		if (out == null) {
+			throw new IllegalArgumentException("out writer must not be null");
+		}
+		if (urls == null) {
+			throw new IllegalArgumentException("urls collection must not be null");
+		}
+		this.out = out;
+		this.urls = urls;
+	}
+
+	/**
+	 * @return an empty source to prevent the SAX parser opening an unwanted
+	 *         connection to resolve an external entity
+	 */
+	@Override
+	public InputSource resolveEntity(String publicId, String systemId) throws IOException, SAXException {
+		return new InputSource(new ClosedInputStream());
+	}
+
+	/**
+	 * Reset the handler state so that the same instance starts each document from scratch
+	 */
+	@Override
+	public void startDocument() throws SAXException {
+		this.currentElementText = new StringBuilder();
+		this.lastAppendedIsSpace = false;
+		this.currentElementTextChunks = 0;
+		this.documentHasText = false;
+	}
+
+	/**
+	 * Reset the current element text, then try to detect URLs eventually contained in attributes
+	 */
+	@Override
+	public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
+		this.currentElementText.setLength(0);
+		this.currentElementTextChunks = 0;
+
+		if (attributes != null) {
+			for (int i = 0; i < attributes.getLength(); i++) {
+				String attribute = attributes.getValue(i);
+				ContentScraper.findAbsoluteURLs(attribute, this.urls, null);
+			}
+		}
+	}
+
+	/**
+	 * Write characters to the output writer, and append them to the current element text.
+	 * Only the range [start, start + length) of the ch array belongs to this event.
+	 * An eventual IOException raised by the writer is logged and not propagated.
+	 */
+	@Override
+	public void characters(final char ch[], final int start, final int length) {
+		try {
+			if(this.currentElementTextChunks == 0 && this.documentHasText) {
+				/* We are on the first text chunk of the element, or the first text chunk after processing nested elements : 
+				 * if necessary we add a space to separate text content of different elements */
+				/* The first character of this chunk is at index start, not at index 0 of the parser buffer */
+				if(length > 0 && !this.lastAppendedIsSpace && !Character.isWhitespace(ch[start])) {
+					this.out.write(" ");
+					this.currentElementText.append(" ");
+				}
+			}
+
+			this.out.write(ch, start, length);
+			this.currentElementText.append(ch, start, length);
+
+			if(length > 0) {
+				this.currentElementTextChunks++;
+				this.documentHasText = true;
+				/* The last character of this chunk is at index start + length - 1 */
+				this.lastAppendedIsSpace = Character.isWhitespace(ch[start + length - 1]);
+			}
+		} catch (final IOException e) {
+			ConcurrentLog.logException(e);
+		}
+	}
+
+	/**
+	 * Try to detect absolute URLs in the text collected for the element being closed,
+	 * then reset the current element text and its chunks counter.
+	 */
+	@Override
+	public void endElement(String uri, String localName, String qName) throws SAXException {
+		ContentScraper.findAbsoluteURLs(this.currentElementText.toString(), urls, null);
+		this.currentElementText.setLength(0);
+		this.currentElementTextChunks = 0;
+	}
+
+	@Override
+	public void endDocument() throws SAXException {
+		/* Release the StringBuilder now useless */
+		this.currentElementText = null;
+	}
+
+}
--- a/source/net/yacy/kelondro/io/CharBuffer.java
+++ b/source/net/yacy/kelondro/io/CharBuffer.java
@ -43,12 +43,16 @@ public final class CharBuffer extends Writer {
    private int offset;
    private int length;
    private final int maximumLength;
+    
+    /** Set to true when write attempts beyond the maximumLength have been tried */
+    private boolean overflow;

    public CharBuffer(final int maximumLength) {
        this.buffer = new char[10];
        this.length = 0;
        this.offset = 0;
        this.maximumLength = maximumLength;
+        this.overflow = false;
    }

    public CharBuffer(final int maximumLength, final int initLength) {
@ -56,6 +60,7 @@ public final class CharBuffer extends Writer {
        this.length = 0;
        this.offset = 0;
        this.maximumLength = maximumLength;
+        this.overflow = false;
    }

    public CharBuffer(final int maximumLength, final char[] bb) {
@ -63,6 +68,7 @@ public final class CharBuffer extends Writer {
        this.length = bb.length;
        this.offset = 0;
        this.maximumLength = maximumLength;
+        this.overflow = false;
    }

    public CharBuffer(final int maximumLength, final char[] bb, final int initLength) {
@ -71,6 +77,7 @@ public final class CharBuffer extends Writer {
        this.length = bb.length;
        this.offset = 0;
        this.maximumLength = maximumLength;
+        this.overflow = false;
    }

    public CharBuffer(final File f) throws IOException {
@ -81,6 +88,7 @@ public final class CharBuffer extends Writer {
        this.length = 0;
        this.buffer = new char[(int) f.length()*2];
        this.offset = 0;
+        this.overflow = false;

        FileReader fr = null;
        try {
@ -102,6 +110,7 @@ public final class CharBuffer extends Writer {
        this.buffer = new char[0];
        this.length = 0;
        this.offset = 0;
+        this.overflow = false;
    }

    public int length() {
@ -111,6 +120,13 @@ public final class CharBuffer extends Writer {
    public boolean isEmpty() {
        return this.length == 0;
    }
+    
+    /**
+     * @return true when write attempts beyond the maximumLength have been tried
+     */
+    public boolean isOverflow() {
+		return this.overflow;
+	}

    private void grow(int minSize) {
        int newsize = 12 * Math.max(this.buffer.length, minSize) / 10; // grow by 20%
@ -126,7 +142,10 @@ public final class CharBuffer extends Writer {
    }

    public void write(final char b) {
-        if (this.buffer.length > this.maximumLength) return;
+        if (this.buffer.length > this.maximumLength) {
+        	this.overflow = true;
+        	return;
+        }
        if (this.offset + this.length + 1 > this.buffer.length) grow(this.offset + this.length + 1);
        this.buffer[this.offset + this.length++] = b;
    }
@ -138,7 +157,10 @@ public final class CharBuffer extends Writer {

    @Override
    public void write(final char[] bb, final int of, final int le) {
-        if (this.buffer.length > this.maximumLength) return;
+        if (this.buffer.length > this.maximumLength) {
+        	this.overflow = true;
+        	return;
+        }
        if (this.offset + this.length + le > this.buffer.length) grow(this.offset + this.length + le);
        System.arraycopy(bb, of, this.buffer, this.offset + this.length, le);
        this.length += le;
--- a/test/java/net/yacy/document/parser/GenericXMLParserTest.java
+++ b/test/java/net/yacy/document/parser/GenericXMLParserTest.java
@ -0,0 +1,362 @@
+// GenericXMLParserTest.java
+// ---------------------------
+// Copyright 2017 by luccioman; https://github.com/luccioman
+//
+// This is a part of YaCy, a peer-to-peer based web search engine
+//
+// LICENSE
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+package net.yacy.document.parser;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertTrue;
+
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.Collection;
+
+import org.junit.Before;
+import org.junit.Test;
+
+import net.yacy.cora.document.id.AnchorURL;
+import net.yacy.cora.document.id.DigestURL;
+import net.yacy.cora.protocol.HeaderFramework;
+import net.yacy.document.Document;
+import net.yacy.document.VocabularyScraper;
+
+/**
+ * Unit tests for the {@link GenericXMLParser} class
+ * 
+ * @author luccioman
+ *
+ */
+public class GenericXMLParserTest {
+
+	/** Example test tag including non-ascii characters */
+	private static final String UMLAUT_TEXT_TAG = "<text>In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen</text>";
+
+	private GenericXMLParser parser;
+
+	@Before
+	public void setUp() {
+		this.parser = new GenericXMLParser();
+	}
+
+	/**
+	 * Unit test for the GenericXMLParser.parse() function with some small XML
+	 * test files.
+	 * 
+	 * @throws Exception
+	 *             when an unexpected error occurred
+	 */
+	@Test
+	public void testParse() throws Exception {
+		final String[] fileNames = { "umlaute_dc_xml_iso.xml", "umlaute_dc_xml_utf8.xml" };
+		final File folder = new File("test" + File.separator + "parsertest" + File.separator);
+
+		for (String fileName : fileNames) {
+			FileInputStream inStream = new FileInputStream(new File(folder, fileName));
+			DigestURL location = new DigestURL("http://localhost/" + fileName);
+			try {
+				Document[] documents = this.parser.parse(location, "text/xml", null, new VocabularyScraper(), 0,
+						inStream);
+				assertNotNull("Parser result must not be null for file " + fileName, documents);
+				assertNotNull("Parsed text must not be empty for file " + fileName, documents[0].getTextString());
+				assertTrue("Parsed text must contain test word with umlaut char in file " + fileName,
+						documents[0].getTextString().contains("Maßkrügen"));
+			} finally {
+				inStream.close();
+			}
+		}
+	}
+
+	/**
+	 * 
+	 * @param parser
+	 *            generic xml parser instance. Must not be null.
+	 * @param encodedXML
+	 *            xml encoded bytes to test
+	 * @param contentTypeHeader
+	 *            Content-Type header value
+	 * @param expectedCharset
+	 *            expected character set name to be detected
+	 * @param expectedConntainedText
+	 *            expected text to be contained in the parsed text
+	 * @throws Exception
+	 *             when an unexpected error occurred
+	 */
+	private void testCharsetDetection(final GenericXMLParser parser, final byte[] encodedXML,
+			final String contentTypeHeader, final String expectedCharset, final String expectedConntainedText)
+			throws Exception {
+		InputStream inStream = new ByteArrayInputStream(encodedXML);
+		String charsetFromHttpHeader = HeaderFramework.getCharacterEncoding(contentTypeHeader);
+		DigestURL location = new DigestURL("http://localhost/testfile.xml");
+		try {
+			Document[] documents = parser.parse(location, contentTypeHeader, charsetFromHttpHeader,
+					new VocabularyScraper(), 0, inStream);
+			assertEquals(expectedCharset, documents[0].getCharset());
+			assertNotNull(documents[0].getTextString());
+			assertTrue(documents[0].getTextString().contains(expectedConntainedText));
+		} finally {
+			inStream.close();
+		}
+	}
+
+	/**
+	 * Test UTF-8 charset detection
+	 * 
+	 * @see RFC 7303 "UTF-8 Charset" example
+	 *      (https://tools.ietf.org/html/rfc7303#section-8.1)
+	 * 
+	 * @throws Exception
+	 *             when an unexpected error occurred
+	 */
+	@Test
+	public void testParseUTF8Charset() throws Exception {
+		/*
+		 * UTF-8 charset provided both in Content-Type HTTP header and in XML
+		 * declaration
+		 */
+		byte[] encodedXML = ("<?xml version=\"1.0\" encoding=\"utf-8\"?>" + UMLAUT_TEXT_TAG)
+				.getBytes(StandardCharsets.UTF_8);
+		testCharsetDetection(this.parser, encodedXML, "application/xml; charset=utf-8", StandardCharsets.UTF_8.name(),
+				"Maßkrügen");
+
+		/*
+		 * Charset provided in Content-Type HTTP header but omitted in XML
+		 * declaration
+		 */
+		encodedXML = ("<?xml version=\"1.0\"?>" + UMLAUT_TEXT_TAG).getBytes(StandardCharsets.UTF_8);
+		testCharsetDetection(this.parser, encodedXML, "application/xml; charset=utf-8", StandardCharsets.UTF_8.name(),
+				"Maßkrügen");
+	}
+
+	/**
+	 * Test UTF-16 charset detection
+	 * 
+	 * @see RFC 7303 "UTF-16 Charset" and
+	 *      "Omitted Charset and 16-Bit MIME Entity" examples
+	 *      (https://tools.ietf.org/html/rfc7303#section-8.2 and
+	 *      https://tools.ietf.org/html/rfc7303#section-8.4)
+	 * 
+	 * @throws Exception
+	 *             when an unexpected error occurred
+	 */
+	@Test
+	public void testParseUTF16Charset() throws Exception {
+		/*
+		 * UTF-16 charset provided both in Content-Type HTTP header and in XML
+		 * declaration with BOM (Byte Order Mark)
+		 */
+		byte[] encodedXML = ("<?xml version=\"1.0\" encoding=\"utf-16\"?>" + UMLAUT_TEXT_TAG)
+				.getBytes(StandardCharsets.UTF_16);
+		testCharsetDetection(this.parser, encodedXML, "application/xml; charset=utf-16", StandardCharsets.UTF_16.name(),
+				"Maßkrügen");
+
+		/*
+		 * UTF-16 charset provided in Content-Type HTTP header but omitted in
+		 * XML declaration having only BOM (Byte Order Mark)
+		 */
+		encodedXML = ("<?xml version=\"1.0\"?>" + UMLAUT_TEXT_TAG).getBytes(StandardCharsets.UTF_16);
+		testCharsetDetection(this.parser, encodedXML, "application/xml; charset=utf-16",
+				StandardCharsets.UTF_16BE.name(), "Maßkrügen");
+
+		/*
+		 * Charset is omitted in Content-Type HTTP header, but provided in the
+		 * XML declaration with BOM (Byte Order Mark)
+		 */
+		encodedXML = ("<?xml version=\"1.0\" encoding=\"utf-16\"?>" + UMLAUT_TEXT_TAG)
+				.getBytes(StandardCharsets.UTF_16);
+		testCharsetDetection(this.parser, encodedXML, "application/xml", StandardCharsets.UTF_16.name(), "Maßkrügen");
+
+		/*
+		 * Charset is omitted in both Content-Type HTTP header and XML
+		 * declaration with BOM (Byte Order Mark)
+		 */
+		encodedXML = ("<?xml version=\"1.0\"?>" + UMLAUT_TEXT_TAG).getBytes(StandardCharsets.UTF_16);
+		testCharsetDetection(this.parser, encodedXML, "application/xml", StandardCharsets.UTF_16BE.name(), "Maßkrügen");
+	}
+
+	/**
+	 * Test ISO-8859-1 charset detection
+	 * 
+	 * @see RFC 7303 "Omitted Charset and 8-Bit MIME Entity" example
+	 *      (https://tools.ietf.org/html/rfc7303#section-8.3)
+	 * 
+	 * @throws Exception
+	 *             when an unexpected error occurred
+	 */
+	@Test
+	public void testParseISO_8859_1Charset() throws Exception {
+		/*
+		 * ISO-8859-1 charset provided only in XML declaration without BOM (Byte
+		 * Order Mark)
+		 */
+		byte[] encodedXML = ("<?xml version=\"1.0\" encoding=\"iso-8859-1\"?>" + UMLAUT_TEXT_TAG)
+				.getBytes(StandardCharsets.ISO_8859_1);
+		testCharsetDetection(this.parser, encodedXML, "application/xml", StandardCharsets.ISO_8859_1.name(),
+				"Maßkrügen");
+	}
+
+	/**
+	 * Checks charset detection when neither the Content-Type header nor the XML
+	 * declaration names a character encoding.
+	 * <p>
+	 * Two inputs are submitted with the same Content-Type and the same UTF-8
+	 * charset name: one encoded as UTF-8 without BOM, one encoded as US-ASCII
+	 * with the non ASCII characters written as numeric character references.
+	 * </p>
+	 *
+	 * @see <a href="https://tools.ietf.org/html/rfc7303#section-8.5">RFC 7303
+	 *      "Omitted Charset, No Internal Encoding Declaration" example</a>
+	 *
+	 * @throws Exception
+	 *             when an unexpected error occurred
+	 */
+	@Test
+	public void testParseOmittedCharsetNoInternalEncoding() throws Exception {
+		final String xmlDeclaration = "<?xml version=\"1.0\"?>";
+		final String contentType = "application/xml";
+		final String utf8Name = StandardCharsets.UTF_8.name();
+
+		/* First input : UTF-8 encoded bytes, no BOM (Byte Order Mark) */
+		final byte[] utf8Bytes = (xmlDeclaration + UMLAUT_TEXT_TAG).getBytes(StandardCharsets.UTF_8);
+		testCharsetDetection(this.parser, utf8Bytes, contentType, utf8Name, "Maßkrügen");
+
+		/* Second input : pure ASCII bytes, umlauts and sharp s written as entities */
+		final String entityEncodedText = "<text>In M&#x000FC;nchen steht ein Hofbr&#x000E4;uhaus, dort gibt es Bier in Ma&#x000DF;kr&#x000FC;gen</text>";
+		final byte[] asciiBytes = (xmlDeclaration + entityEncodedText).getBytes(StandardCharsets.US_ASCII);
+		testCharsetDetection(this.parser, asciiBytes, contentType, utf8Name, "Maßkrügen");
+	}
+
+	/**
+	 * Checks charset detection on a document encoded in UTF-16BE (despite the
+	 * "UTF8_16BE" spelling of the method name, only UTF-16BE is exercised here).
+	 * <p>
+	 * The charset is given both in the Content-Type header and in the XML
+	 * declaration, and the bytes carry no BOM (Byte Order Mark).
+	 * </p>
+	 *
+	 * @see <a href="https://tools.ietf.org/html/rfc7303#section-8.6">RFC 7303
+	 *      "UTF-16BE Charset" example</a>
+	 *
+	 * @throws Exception
+	 *             when an unexpected error occurred
+	 */
+	@Test
+	public void testParseUTF8_16BECharset() throws Exception {
+		final String xmlSource = "<?xml version='1.0' encoding='utf-16be'?>" + UMLAUT_TEXT_TAG;
+		final byte[] utf16beBytes = xmlSource.getBytes(StandardCharsets.UTF_16BE);
+		final String contentType = "application/xml; charset=utf-16be";
+		final String charsetName = StandardCharsets.UTF_16BE.name();
+
+		testCharsetDetection(this.parser, utf16beBytes, contentType, charsetName, "Maßkrügen");
+	}
+
+	/**
+	 * Test absolute URLs detection in XML elements attributes.
+	 * <p>
+	 * The sample XHTML document holds two absolute links and one relative link
+	 * in <code>href</code> attributes. Exactly three anchors are expected : the
+	 * XHTML namespace URI and the two absolute links. The relative link and the
+	 * DOCTYPE system identifier are therefore not expected among the results.
+	 * </p>
+	 *
+	 * @throws Exception
+	 *             when an unexpected error occurred
+	 */
+	@Test
+	public void testParseAttributeURLs() throws Exception {
+		final String xhtml = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>"
+				+ "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">"
+				+ "<html xmlns=\"http://www.w3.org/1999/xhtml\">" + "<head>"
+				+ "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />"
+				+ "<title>XHTML attributes URLs test</title>" + "</head>" + "<body>"
+				+ "Here are YaCy<a href=\"http://yacy.net\">home page</a> and <a href=\"http://forum.yacy.de\">International Forum</a>."
+				+ "And this is a relative link to a <a href=\"/document.html\">sub document</a>." + "</body>"
+				+ "</html>";
+
+		final String contentTypeHeader = "text/xhtml";
+		final String charsetFromHttpHeader = HeaderFramework.getCharacterEncoding(contentTypeHeader);
+		final DigestURL location = new DigestURL("http://localhost/testfile.xml");
+
+		/*
+		 * Encode with the Charset constant rather than its name : no charset lookup
+		 * by name, consistently with the charset detection tests of this class.
+		 * The stream is closed automatically at the end of the try block.
+		 */
+		try (InputStream inStream = new ByteArrayInputStream(xhtml.getBytes(StandardCharsets.UTF_8))) {
+			final Document[] documents = this.parser.parse(location, contentTypeHeader, charsetFromHttpHeader,
+					new VocabularyScraper(), 0, inStream);
+			assertEquals(1, documents.length);
+
+			final Collection<AnchorURL> detectedAnchors = documents[0].getAnchors();
+			assertNotNull(detectedAnchors);
+			assertEquals(3, detectedAnchors.size());
+			assertTrue(detectedAnchors.contains(new AnchorURL("http://www.w3.org/1999/xhtml")));
+			assertTrue(detectedAnchors.contains(new AnchorURL("http://yacy.net")));
+			assertTrue(detectedAnchors.contains(new AnchorURL("http://forum.yacy.de")));
+		}
+	}
+
+	/**
+	 * Test absolute URLs detection in XML elements text.
+	 * <p>
+	 * The sample XHTML document mentions two absolute URLs and one relative path
+	 * as plain text content. Exactly three anchors are expected : the XHTML
+	 * namespace URI and the two absolute URLs. The relative path and the DOCTYPE
+	 * system identifier are therefore not expected among the results.
+	 * </p>
+	 *
+	 * @throws Exception
+	 *             when an unexpected error occurred
+	 */
+	@Test
+	public void testParseContentURLs() throws Exception {
+		final String xhtml = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>"
+				+ "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">"
+				+ "<html xmlns=\"http://www.w3.org/1999/xhtml\">" + "<head>"
+				+ "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />"
+				+ "<title>XHTML content URLs test</title>" + "</head>" + "<body>" + "Here are some YaCy links:" + "<dl>"
+				+ "<dt>Home page</dt>" + "<dd>http://yacy.net</dd>" + "<dt>International Forum</dt>"
+				+ "<dd>http://forum.yacy.de</dd>" + "</dl>"
+				+ "And this is a mention to a relative link : /document.html " + "</body>" + "</html>";
+
+		final String contentTypeHeader = "text/xhtml";
+		final String charsetFromHttpHeader = HeaderFramework.getCharacterEncoding(contentTypeHeader);
+		final DigestURL location = new DigestURL("http://localhost/testfile.xml");
+
+		/*
+		 * Encode with the Charset constant rather than its name : no charset lookup
+		 * by name, consistently with the charset detection tests of this class.
+		 * The stream is closed automatically at the end of the try block.
+		 */
+		try (InputStream inStream = new ByteArrayInputStream(xhtml.getBytes(StandardCharsets.UTF_8))) {
+			final Document[] documents = this.parser.parse(location, contentTypeHeader, charsetFromHttpHeader,
+					new VocabularyScraper(), 0, inStream);
+			assertEquals(1, documents.length);
+
+			final Collection<AnchorURL> detectedAnchors = documents[0].getAnchors();
+			assertNotNull(detectedAnchors);
+			assertEquals(3, detectedAnchors.size());
+			assertTrue(detectedAnchors.contains(new AnchorURL("http://www.w3.org/1999/xhtml")));
+			assertTrue(detectedAnchors.contains(new AnchorURL("http://yacy.net")));
+			assertTrue(detectedAnchors.contains(new AnchorURL("http://forum.yacy.de")));
+		}
+	}
+	
+	/**
+	 * Test parsing well-formed XML fragment (no XML declaration, no DTD or schema).
+	 * <p>
+	 * A single document is expected, whose text is the content of the two leaf
+	 * elements separated by one space.
+	 * </p>
+	 *
+	 * @throws Exception
+	 *             when an unexpected error occurred
+	 */
+	@Test
+	public void testParseXMLFragment() throws Exception {
+		final String xmlFragment = "<root><node><subNode1>Node content1</subNode1><subNode2>Node content2</subNode2></node></root>";
+
+		final String contentTypeHeader = "text/xml";
+		final String charsetFromHttpHeader = HeaderFramework.getCharacterEncoding(contentTypeHeader);
+		final DigestURL location = new DigestURL("http://localhost/testfile.xml");
+
+		/*
+		 * Encode with the Charset constant rather than its name : no charset lookup
+		 * by name, consistently with the charset detection tests of this class.
+		 * The stream is closed automatically at the end of the try block.
+		 */
+		try (InputStream inStream = new ByteArrayInputStream(xmlFragment.getBytes(StandardCharsets.UTF_8))) {
+			final Document[] documents = this.parser.parse(location, contentTypeHeader, charsetFromHttpHeader,
+					new VocabularyScraper(), 0, inStream);
+			assertEquals(1, documents.length);
+			assertEquals("Node content1 Node content2", documents[0].getTextString());
+		}
+	}
+
+}
--- a/test/parsertest/umlaute_dc_xml_iso.xml
+++ b/test/parsertest/umlaute_dc_xml_iso.xml
@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="iso-8859-1"?>
+<!DOCTYPE rdf:RDF SYSTEM "http://dublincore.org/2000/12/01-dcmes-xml-dtd.dtd">
+<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+         xmlns:dc="http://purl.org/dc/elements/1.1/">
+  <rdf:Description>
+    <dc:description>In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen</dc:description>
+  </rdf:Description>
+</rdf:RDF>
--- a/test/parsertest/umlaute_dc_xml_utf8.xml
+++ b/test/parsertest/umlaute_dc_xml_utf8.xml
@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE rdf:RDF SYSTEM "http://dublincore.org/2000/12/01-dcmes-xml-dtd.dtd">
+<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+         xmlns:dc="http://purl.org/dc/elements/1.1/">
+  <rdf:Description>
+    <dc:description>In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen</dc:description>
+  </rdf:Description>
+</rdf:RDF>