Added a generic XML parser, able to parse element text and URLs.

This parser adds support for any XML based format other than the already
supported XML vocabularies such as XHTML, RSS/Atom feeds... It will
eventually be used as a fallback if one of these specific parsers fails,
before falling back to the existing genericParser, which extracts not
much useful information except URL tokens.
luccioman 8 years ago
parent aeeb8a7dd5
commit 319231a458

@ -37,6 +37,7 @@ import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.util.CommonPattern;
import net.yacy.document.parser.GenericXMLParser;
import net.yacy.document.parser.apkParser;
import net.yacy.document.parser.audioTagParser;
import net.yacy.document.parser.bzipParser;
@ -73,6 +74,10 @@ public final class TextParser {
private static final Object v = new Object();
private static final Parser genericIdiom = new genericParser();
/** A generic XML parser instance */
private static final Parser genericXMLIdiom = new GenericXMLParser();
//use LinkedHashSet for parser collection to use (init) order to prefered parser for same ext or mime
private static final Map<String, LinkedHashSet<Parser>> mime2parser = new ConcurrentHashMap<String, LinkedHashSet<Parser>>();
private static final ConcurrentHashMap<String, LinkedHashSet<Parser>> ext2parser = new ConcurrentHashMap<String, LinkedHashSet<Parser>>();
@ -112,7 +117,9 @@ public final class TextParser {
initParser(new xlsParser());
initParser(new zipParser());
initParser(new audioTagParser());
/* Order is important : the generic XML parser must be initialized last, so that it is effectively used only as a fallback
* when a specialized parser exists for an XML based format (examples : rssParser or ooxmlParser must be tried first) */
public static Set<Parser> parsers() {
@ -426,7 +433,7 @@ public final class TextParser {
if (idiom != null) idioms.addAll(idiom);
// check extension and add as backup (in case no, wrong or unknown/unsupported mime was suppied)
// check extension and add as backup (in case no, wrong or unknown/unsupported mime was supplied)
String ext = MultiProtocolURL.getFileExtension(url.getFileName());
if (ext != null && ext.length() > 0) {
if (denyExtensionx.containsKey(ext)) throw new Parser.Failure("file extension '" + ext + "' is denied (1)", url);
@ -441,6 +448,12 @@ public final class TextParser {
if (mimeType2 != null && (idiom = mime2parser.get(mimeType2)) != null && !idioms.containsAll(idiom)) { // use containsAll -> idiom is a Set of parser
/* No matching idiom has been found : let's check if the media type ends with the "+xml" suffix so we can handle it with a generic XML parser
* (see RFC 7303, section 4.2 - Using '+xml' when Registering XML-Based Media Types) */
if(idioms.isEmpty() && mimeType1 != null && mimeType1.endsWith("+xml")) {
// always add the generic parser (make sure it is the last in access order)
@ -456,10 +469,20 @@ public final class TextParser {
* @return an error if the mime type is not supported, null otherwise
public static String supportsMime(String mimeType) {
if (mimeType == null) return null;
if (mimeType == null) {
return null;
mimeType = normalizeMimeType(mimeType);
if (denyMime.containsKey(mimeType)) return "mime type '" + mimeType + "' is denied (2)";
if (mime2parser.get(mimeType) == null) return "no parser for mime '" + mimeType + "' available";
if (denyMime.containsKey(mimeType)) {
return "mime type '" + mimeType + "' is denied (2)";
if (mime2parser.get(mimeType) == null) {
/* No matching idiom has been found : let's check if the media type ends with the "+xml" suffix so we can handle it with a generic XML parser
* (see RFC 7303, section 4.2 - Using '+xml' when Registering XML-Based Media Types) */
if(!mimeType.endsWith("+xml")) {
return "no parser for mime '" + mimeType + "' available";
return null;

@ -0,0 +1,144 @@
// ---------------------------
// Copyright 2017 by luccioman;
// This is a part of YaCy, a peer-to-peer based web search engine
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.document.parser;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.xml.GenericXMLContentHandler;
import net.yacy.kelondro.util.Formatter;
import net.yacy.kelondro.util.MemoryControl;
* A generic XML parser without knowledge of the specific XML vocabulary.
* @author luccioman
public class GenericXMLParser extends AbstractParser implements Parser {
/** SAX parser instance local to each thread */
private static final ThreadLocal<SAXParser> tlSax = new ThreadLocal<SAXParser>();
* @return a SAXParser instance for the current thread
* @throws SAXException when an error prevented parser creation
private static SAXParser getParser() throws SAXException {
SAXParser parser = tlSax.get();
if (parser == null) {
try {
parser = SAXParserFactory.newInstance().newSAXParser();
} catch (final ParserConfigurationException e) {
throw new SAXException(e.getMessage(), e);
return parser;
public GenericXMLParser() {
super("XML Parser");
public Document[] parse(
final DigestURL location,
final String mimeType,
final String charset,
final VocabularyScraper scraper,
final int timezoneOffset,
final InputStream source)
throws Failure, InterruptedException {
/* Limit the size of the in-memory buffer to at most 25% of the available memory :
* because some room is needed, and before being garbage collected the buffer will be converted to a String, then to a byte array.
* Eventual stricter limits should be handled by the caller (see for example crawler.[protocol].maxFileSize configuration setting). */
final long availableMemory = MemoryControl.available();
final long maxBytes = (long)(availableMemory * 0.25);
final int maxChars;
if((maxBytes / Character.BYTES) > Integer.MAX_VALUE) {
maxChars = Integer.MAX_VALUE;
} else {
maxChars = ((int)maxBytes) / Character.BYTES;
try (/* Automatically closed by this try-with-resources statement*/ CharBuffer writer = new CharBuffer(maxChars);){
/* Use commons-io XmlStreamReader advanced rules to help with charset detection when source contains no BOM or XML declaration
* (detection algorithm notably also include ContentType transmitted by HTTP headers, here eventually present as mimeType and charset parameters), */
final XmlStreamReader reader = new XmlStreamReader(source, mimeType, true, charset);
final InputSource saxSource = new InputSource(reader);
final String detectedCharset = reader.getEncoding();
final List<AnchorURL> detectedURLs = new ArrayList<>();
final GenericXMLContentHandler saxHandler = new GenericXMLContentHandler(writer, detectedURLs);
final SAXParser saxParser = getParser();
saxParser.parse(saxSource, saxHandler);
if (writer.isOverflow()) {
throw new Parser.Failure("Not enough Memory available for generic the XML parser : "
+ Formatter.bytesToString(availableMemory), location);
/* create the parsed document */
Document[] docs = null;
final byte[] contentBytes = UTF8.getBytes(writer.toString());
docs = new Document[] { new Document(location, mimeType, detectedCharset, this, null, null, null, null, "",
null, null, 0.0d, 0.0d, contentBytes, detectedURLs, null, null, false, new Date()) };
return docs;
} catch (final Exception e) {
if (e instanceof InterruptedException) {
throw (InterruptedException) e;
if (e instanceof Parser.Failure) {
throw (Parser.Failure) e;
throw new Parser.Failure("Unexpected error while parsing XML file. " + e.getMessage(), location);

@ -344,24 +344,17 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if ((b.length() != 0) && (!(SentenceReader.punctuation(b.charAt(b.length() - 1))))) b = b + '.';
//System.out.println("*** Appended dot: " + b.toString());
// find http links inside text
s = 0;
String u;
while (s < b.length()) {
p = find(b, dpssp, s);
if (p == Integer.MAX_VALUE) break;
s = Math.max(0, p - 5);
p = find(b, protp, s);
if (p == Integer.MAX_VALUE) break;
q = b.indexOf(" ", p + 1);
u = b.substring(p, q < 0 ? b.length() : q);
if (u.endsWith(".")) u = u.substring(0, u.length() - 1); // remove the '.' that was appended above
s = p + 6;
try {
this.addAnchor(new AnchorURL(u));
} catch (final MalformedURLException e) {}
// find absolute URLs inside text
final Object[] listeners = this.htmlFilterEventListeners.getListenerList();
List<ContentScraperListener> anchorListeners = new ArrayList<>();
for (int i = 0; i < listeners.length; i += 2) {
if (listeners[i] == ContentScraperListener.class) {
findAbsoluteURLs(b, this.anchors, anchorListeners);
// append string to content
if (!b.isEmpty()) {
@ -371,6 +364,47 @@ public class ContentScraper extends AbstractScraper implements Scraper {
private final static Pattern dpssp = Pattern.compile("://");
private final static Pattern protp = Pattern.compile("smb://|ftp://|http://|https://");
* Try to detect and parse absolute URLs in text, then update the urls collection and fire anchorAdded event on listeners. Any parameter can be null.
* @param text the text to parse
* @param urls a mutable collection of URLs to fill.
* @param listeners a collection of listeners to trigger.
public static void findAbsoluteURLs(final String text, final Collection<AnchorURL> urls, final Collection<ContentScraperListener> listeners) {
if(text == null) {
int schemePosition, spacePosition, offset = 0;
String urlString;
AnchorURL url;
while (offset < text.length()) {
schemePosition = find(text, dpssp, offset);
if (schemePosition == Integer.MAX_VALUE) {
offset = Math.max(0, schemePosition - 5);
schemePosition = find(text, protp, offset);
if (schemePosition == Integer.MAX_VALUE) {
spacePosition = text.indexOf(" ", schemePosition + 1);
urlString = text.substring(schemePosition, spacePosition < 0 ? text.length() : spacePosition);
if (urlString.endsWith(".")) urlString = urlString.substring(0, urlString.length() - 1); // remove the '.' that was appended above
offset = schemePosition + 6;
try {
url = new AnchorURL(urlString);
if(urls != null) {
if(listeners != null) {
for(ContentScraperListener listener : listeners) {
} catch (final MalformedURLException ignored) {}
private static final int find(final String s, final Pattern m, final int start) {
final Matcher mm = m.matcher(s.subSequence(start, s.length()));

@ -0,0 +1,162 @@
// ---------------------------
// Copyright 2017 by luccioman;
// This is a part of YaCy, a peer-to-peer based web search engine
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.document.parser.xml;
import java.util.Collection;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.parser.html.ContentScraper;
* SAX handler for XML contents, only extracting text and eventual URLs from
* XML.
* @author luccioman
public class GenericXMLContentHandler extends DefaultHandler {
/** Output writer */
private final Writer out;
/** Detected URLs */
private final Collection<AnchorURL> urls;
/** Text of the currently parsed element. May not contain the whole text when the element has nested elements embedded in its own text */
private StringBuilder currentElementText;
/** Set to true when the last character written to the output writer is a space */
private boolean lastAppendedIsSpace;
/** The number of text chunks handled in the current element (reset to zero when the element has nested elements) */
private int currentElementTextChunks;
/** Set to false until some text is detected in at least one element of the document */
private boolean documentHasText;
* @param out
* the output writer to write extracted text. Must not be null.
* @param urls the mutable collection of URLs to fill with detected URLs. Must not be null.
* @throws IllegalArgumentException
* when out or urls is null
public GenericXMLContentHandler(final Writer out, final Collection<AnchorURL> urls) throws IllegalArgumentException {
if (out == null) {
throw new IllegalArgumentException("out writer must not be null");
if (urls == null) {
throw new IllegalArgumentException("urls collection must not be null");
this.out = out;
this.urls = urls;
* @return an empty source to prevent the SAX parser opening an unwanted
* connection to resolve an external entity
public InputSource resolveEntity(String publicId, String systemId) throws IOException, SAXException {
return new InputSource(new ClosedInputStream());
public void startDocument() throws SAXException {
this.currentElementText = new StringBuilder();
this.lastAppendedIsSpace = false;
this.currentElementTextChunks = 0;
this.documentHasText = false;
* Try to detect URLs eventually contained in attributes
public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
this.currentElementTextChunks = 0;
if (attributes != null) {
for (int i = 0; i < attributes.getLength(); i++) {
String attribute = attributes.getValue(i);
ContentScraper.findAbsoluteURLs(attribute, this.urls, null);
* Write characters to the output writer
public void characters(final char ch[], final int start, final int length) {
try {
if(this.currentElementTextChunks == 0 && this.documentHasText) {
/* We are on the first text chunk of the element, or the first text chunk after processing nested elements :
* if necessary we add a space to separate text content of different elements */
if(length > 0 && !this.lastAppendedIsSpace && !Character.isWhitespace(ch[0])) {
this.out.write(" ");
this.currentElementText.append(" ");
this.out.write(ch, start, length);
this.currentElementText.append(ch, start, length);
if(length > 0) {
this.documentHasText = true;
this.lastAppendedIsSpace = Character.isWhitespace(ch[length - 1]);
} catch (final IOException e) {
* When the eventual element text doesn't end with a terminal punctuation character,
* add a period ('.' character) to help future SentenceReader work.
public void endElement(String uri, String localName, String qName) throws SAXException {
ContentScraper.findAbsoluteURLs(this.currentElementText.toString(), urls, null);
this.currentElementTextChunks = 0;
public void endDocument() throws SAXException {
/* Release the StringBuilder now useless */
this.currentElementText = null;

@ -43,12 +43,16 @@ public final class CharBuffer extends Writer {
private int offset;
private int length;
private final int maximumLength;
/** Set to true when write attempts beyond the maximumLength have been tried */
private boolean overflow;
public CharBuffer(final int maximumLength) {
this.buffer = new char[10];
this.length = 0;
this.offset = 0;
this.maximumLength = maximumLength;
this.overflow = false;
public CharBuffer(final int maximumLength, final int initLength) {
@ -56,6 +60,7 @@ public final class CharBuffer extends Writer {
this.length = 0;
this.offset = 0;
this.maximumLength = maximumLength;
this.overflow = false;
public CharBuffer(final int maximumLength, final char[] bb) {
@ -63,6 +68,7 @@ public final class CharBuffer extends Writer {
this.length = bb.length;
this.offset = 0;
this.maximumLength = maximumLength;
this.overflow = false;
public CharBuffer(final int maximumLength, final char[] bb, final int initLength) {
@ -71,6 +77,7 @@ public final class CharBuffer extends Writer {
this.length = bb.length;
this.offset = 0;
this.maximumLength = maximumLength;
this.overflow = false;
public CharBuffer(final File f) throws IOException {
@ -81,6 +88,7 @@ public final class CharBuffer extends Writer {
this.length = 0;
this.buffer = new char[(int) f.length()*2];
this.offset = 0;
this.overflow = false;
FileReader fr = null;
try {
@ -102,6 +110,7 @@ public final class CharBuffer extends Writer {
this.buffer = new char[0];
this.length = 0;
this.offset = 0;
this.overflow = false;
public int length() {
@ -111,6 +120,13 @@ public final class CharBuffer extends Writer {
public boolean isEmpty() {
return this.length == 0;
* @return true when write attempts beyond the maximumLength have been tried
public boolean isOverflow() {
return this.overflow;
private void grow(int minSize) {
int newsize = 12 * Math.max(this.buffer.length, minSize) / 10; // grow by 20%
@ -126,7 +142,10 @@ public final class CharBuffer extends Writer {
public void write(final char b) {
if (this.buffer.length > this.maximumLength) return;
if (this.buffer.length > this.maximumLength) {
this.overflow = true;
if (this.offset + this.length + 1 > this.buffer.length) grow(this.offset + this.length + 1);
this.buffer[this.offset + this.length++] = b;
@ -138,7 +157,10 @@ public final class CharBuffer extends Writer {
public void write(final char[] bb, final int of, final int le) {
if (this.buffer.length > this.maximumLength) return;
if (this.buffer.length > this.maximumLength) {
this.overflow = true;
if (this.offset + this.length + le > this.buffer.length) grow(this.offset + this.length + le);
System.arraycopy(bb, of, this.buffer, this.offset + this.length, le);
this.length += le;

@ -0,0 +1,362 @@
// ---------------------------
// Copyright 2017 by luccioman;
// This is a part of YaCy, a peer-to-peer based web search engine
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.document.parser;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import java.nio.charset.StandardCharsets;
import java.util.Collection;
import org.junit.Before;
import org.junit.Test;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.document.Document;
import net.yacy.document.VocabularyScraper;
* Unit tests for the {@link GenericXMLParser} class
* @author luccioman
public class GenericXMLParserTest {
/** Example test tag including non-ascii characters */
private static final String UMLAUT_TEXT_TAG = "<text>In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen</text>";
private GenericXMLParser parser;
public void setUp() {
this.parser = new GenericXMLParser();
* Unit test for the GenericXMLParser.parse() function with some small XML
* test files.
* @throws Exception
* when an unexpected error occurred
public void testParse() throws Exception {
final String[] fileNames = { "umlaute_dc_xml_iso.xml", "umlaute_dc_xml_utf8.xml" };
final File folder = new File("test" + File.separator + "parsertest" + File.separator);
for (String fileName : fileNames) {
FileInputStream inStream = new FileInputStream(new File(folder, fileName));
DigestURL location = new DigestURL("http://localhost/" + fileName);
try {
Document[] documents = this.parser.parse(location, "text/xml", null, new VocabularyScraper(), 0,
assertNotNull("Parser result must not be null for file " + fileName, documents);
assertNotNull("Parsed text must not be empty for file " + fileName, documents[0].getTextString());
assertTrue("Parsed text must contain test word with umlaut char in file " + fileName,
} finally {
* @param parser
* generic xml parser instance. Must not be null.
* @param encodedXML
* xml encoded bytes to test
* @param contentTypeHeader
* Content-Type header value
* @param expectedCharset
* expected character set name to be detected
* @param expectedConntainedText
* expected text to be contained in the parsed text
* @throws Exception
* when an unexpected error occurred
private void testCharsetDetection(final GenericXMLParser parser, final byte[] encodedXML,
final String contentTypeHeader, final String expectedCharset, final String expectedConntainedText)
throws Exception {
InputStream inStream = new ByteArrayInputStream(encodedXML);
String charsetFromHttpHeader = HeaderFramework.getCharacterEncoding(contentTypeHeader);
DigestURL location = new DigestURL("http://localhost/testfile.xml");
try {
Document[] documents = parser.parse(location, contentTypeHeader, charsetFromHttpHeader,
new VocabularyScraper(), 0, inStream);
assertEquals(expectedCharset, documents[0].getCharset());
} finally {
* Test UTF-8 charset detection
* @see RFC 7303 "UTF-8 Charset" example
* (
* @throws Exception
* when an unexpected error occurred
public void testParseUTF8Charset() throws Exception {
* UTF-8 charset provided both in Content-Type HTTP header and in XML
* declaration
byte[] encodedXML = ("<?xml version=\"1.0\" encoding=\"utf-8\"?>" + UMLAUT_TEXT_TAG)
testCharsetDetection(this.parser, encodedXML, "application/xml; charset=utf-8",,
* Charset provided in Content-Type HTTP header but omitted in XML
* declaration
encodedXML = ("<?xml version=\"1.0\"?>" + UMLAUT_TEXT_TAG).getBytes(StandardCharsets.UTF_8);
testCharsetDetection(this.parser, encodedXML, "application/xml; charset=utf-8",,
* Test UTF-16 charset detection
* @see RFC 7303 "UTF-16 Charset" and
* "Omitted Charset and 16-Bit MIME Entity" examples
* ( and
* @throws Exception
* when an unexpected error occurred
public void testParseUTF16Charset() throws Exception {
* UTF-16 charset provided both in Content-Type HTTP header and in XML
* declaration with BOM (Byte Order Mark)
byte[] encodedXML = ("<?xml version=\"1.0\" encoding=\"utf-16\"?>" + UMLAUT_TEXT_TAG)
testCharsetDetection(this.parser, encodedXML, "application/xml; charset=utf-16",,
* UTF-16 charset provided in Content-Type HTTP header but omitted in
* XML declaration having only BOM (Byte Order Mark)
encodedXML = ("<?xml version=\"1.0\"?>" + UMLAUT_TEXT_TAG).getBytes(StandardCharsets.UTF_16);
testCharsetDetection(this.parser, encodedXML, "application/xml; charset=utf-16",, "Maßkrügen");
* Charset is omitted in Content-Type HTTP header, but provided in the
* XML declaration with BOM (Byte Order Mark)
encodedXML = ("<?xml version=\"1.0\" encoding=\"utf-16\"?>" + UMLAUT_TEXT_TAG)
testCharsetDetection(this.parser, encodedXML, "application/xml",, "Maßkrügen");
* Charset is omitted in both Content-Type HTTP header and XML
* declaration with BOM (Byte Order Mark)
encodedXML = ("<?xml version=\"1.0\"?>" + UMLAUT_TEXT_TAG).getBytes(StandardCharsets.UTF_16);
testCharsetDetection(this.parser, encodedXML, "application/xml",, "Maßkrügen");
* Test ISO-8859-1 charset detection
* @see RFC 7303 "Omitted Charset and 8-Bit MIME Entity" example
* (
* @throws Exception
* when an unexpected error occurred
public void testParseISO_8859_1Charset() throws Exception {
* ISO-8859-1 charset provided only in XML declaration without BOM (Byte
* Order Mark)
byte[] encodedXML = ("<?xml version=\"1.0\" encoding=\"iso-8859-1\"?>" + UMLAUT_TEXT_TAG)
testCharsetDetection(this.parser, encodedXML, "application/xml",,
* Test charset detection when the character encoding is omitted in
* Content-Type header, and content has a XML declaration with no encoding
* declaration
* @see RFC 7303 "Omitted Charset, No Internal Encoding Declaration" example
* (
* @throws Exception
* when an unexpected error occurred
public void testParseOmittedCharsetNoInternalEncoding() throws Exception {
* XML encoded as UTF-8 without BOM (Byte Order Mark)
byte[] encodedXML = ("<?xml version=\"1.0\"?>" + UMLAUT_TEXT_TAG).getBytes(StandardCharsets.UTF_8);
testCharsetDetection(parser, encodedXML, "application/xml",, "Maßkrügen");
* XML encoded as ASCII, with non ascii chars encoded as entities
encodedXML = ("<?xml version=\"1.0\"?>"
+ "<text>In M&#x000FC;nchen steht ein Hofbr&#x000E4;uhaus, dort gibt es Bier in Ma&#x000DF;kr&#x000FC;gen</text>")
testCharsetDetection(this.parser, encodedXML, "application/xml",, "Maßkrügen");
* Test UTF-16BE charset detection
* @see RFC 7303 "UTF-16BE Charset" example
* (
* @throws Exception
* when an unexpected error occurred
public void testParseUTF8_16BECharset() throws Exception {
* UTF-16BE charset provided both in Content-Type HTTP header and in XML
* declaration, without BOM (Byte Order Mark)
byte[] encodedXML = ("<?xml version='1.0' encoding='utf-16be'?>" + UMLAUT_TEXT_TAG)
testCharsetDetection(this.parser, encodedXML, "application/xml; charset=utf-16be",, "Maßkrügen");
* Test absolute URLs detection in XML elements attributes.
* @throws Exception
* when an unexpected error occurred
public void testParseAttributeURLs() throws Exception {
final String xhtml = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>"
+ "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"\">"
+ "<html xmlns=\"\">" + "<head>"
+ "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />"
+ "<title>XHTML attributes URLs test</title>" + "</head>" + "<body>"
+ "Here are YaCy<a href=\"\">home page</a> and <a href=\"\">International Forum</a>."
+ "And this is a relative link to a <a href=\"/document.html\">sub document</a>." + "</body>"
+ "</html>";
InputStream inStream = new ByteArrayInputStream(xhtml.getBytes(;
final String contentTypeHeader = "text/xhtml";
String charsetFromHttpHeader = HeaderFramework.getCharacterEncoding(contentTypeHeader);
DigestURL location = new DigestURL("http://localhost/testfile.xml");
try {
Document[] documents = this.parser.parse(location, contentTypeHeader, charsetFromHttpHeader,
new VocabularyScraper(), 0, inStream);
assertEquals(1, documents.length);
Collection<AnchorURL> detectedAnchors = documents[0].getAnchors();
assertEquals(3, detectedAnchors.size());
assertTrue(detectedAnchors.contains(new AnchorURL("")));
assertTrue(detectedAnchors.contains(new AnchorURL("")));
assertTrue(detectedAnchors.contains(new AnchorURL("")));
} finally {
* Test absolute URLs detection in XML elements text.
* @throws Exception
* when an unexpected error occurred
public void testParseContentURLs() throws Exception {
final String xhtml = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>"
+ "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"\">"
+ "<html xmlns=\"\">" + "<head>"
+ "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />"
+ "<title>XHTML content URLs test</title>" + "</head>" + "<body>" + "Here are some YaCy links:" + "<dl>"
+ "<dt>Home page</dt>" + "<dd></dd>" + "<dt>International Forum</dt>"
+ "<dd></dd>" + "</dl>"
+ "And this is a mention to a relative link : /document.html " + "</body>" + "</html>";
InputStream inStream = new ByteArrayInputStream(xhtml.getBytes(;
final String contentTypeHeader = "text/xhtml";
String charsetFromHttpHeader = HeaderFramework.getCharacterEncoding(contentTypeHeader);
DigestURL location = new DigestURL("http://localhost/testfile.xml");
try {
Document[] documents = this.parser.parse(location, contentTypeHeader, charsetFromHttpHeader,
new VocabularyScraper(), 0, inStream);
assertEquals(1, documents.length);
Collection<AnchorURL> detectedAnchors = documents[0].getAnchors();
assertEquals(3, detectedAnchors.size());
assertTrue(detectedAnchors.contains(new AnchorURL("")));
assertTrue(detectedAnchors.contains(new AnchorURL("")));
assertTrue(detectedAnchors.contains(new AnchorURL("")));
} finally {
* Test parsing well-formed XML fragment (no XML declaration, no DTD or schema)
* @throws Exception when an unexpected error occurred
public void testParseXMLFragment() throws Exception {
final String xhtml = "<root><node><subNode1>Node content1</subNode1><subNode2>Node content2</subNode2></node></root>";
InputStream inStream = new ByteArrayInputStream(xhtml.getBytes(;
final String contentTypeHeader = "text/xml";
String charsetFromHttpHeader = HeaderFramework.getCharacterEncoding(contentTypeHeader);
DigestURL location = new DigestURL("http://localhost/testfile.xml");
try {
Document[] documents = this.parser.parse(location, contentTypeHeader, charsetFromHttpHeader,
new VocabularyScraper(), 0, inStream);
assertEquals(1, documents.length);
assertEquals("Node content1 Node content2", documents[0].getTextString());
} finally {

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="iso-8859-1"?>
<rdf:RDF xmlns:rdf=""
<dc:description>In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen</dc:description>

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<rdf:RDF xmlns:rdf=""
<dc:description>In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen</dc:description>