Added a generic XML parser, able to parse elements text and URLs.

This parser adds support for any XML-based format other than the already
supported XML vocabularies such as XHTML, RSS/Atom feeds... It will
eventually be used as a fallback if one of these specific parsers fails,
before falling back to the existing genericParser, which extracts little
useful information apart from URL tokens.
pull/127/head
luccioman 8 years ago
parent aeeb8a7dd5
commit 319231a458

@ -37,6 +37,7 @@ import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.util.CommonPattern;
import net.yacy.document.parser.GenericXMLParser;
import net.yacy.document.parser.apkParser;
import net.yacy.document.parser.audioTagParser;
import net.yacy.document.parser.bzipParser;
@ -73,6 +74,10 @@ public final class TextParser {
private static final Object v = new Object();
private static final Parser genericIdiom = new genericParser();
/** A generic XML parser instance */
private static final Parser genericXMLIdiom = new GenericXMLParser();
//use LinkedHashSet for parser collection to use (init) order to prefered parser for same ext or mime
private static final Map<String, LinkedHashSet<Parser>> mime2parser = new ConcurrentHashMap<String, LinkedHashSet<Parser>>();
private static final ConcurrentHashMap<String, LinkedHashSet<Parser>> ext2parser = new ConcurrentHashMap<String, LinkedHashSet<Parser>>();
@ -112,7 +117,9 @@ public final class TextParser {
initParser(new xlsParser());
initParser(new zipParser());
initParser(new audioTagParser());
/* Order is important : the generic XML parser must be initialized last, so that it is effectively used only as a fallback
* when a specialized parser exists for an XML-based format (examples : rssParser or ooxmlParser must be tried first) */
initParser(genericXMLIdiom);
}
public static Set<Parser> parsers() {
@ -426,7 +433,7 @@ public final class TextParser {
if (idiom != null) idioms.addAll(idiom);
}
// check extension and add as backup (in case no, wrong or unknown/unsupported mime was suppied)
// check extension and add as backup (in case no, wrong or unknown/unsupported mime was supplied)
String ext = MultiProtocolURL.getFileExtension(url.getFileName());
if (ext != null && ext.length() > 0) {
if (denyExtensionx.containsKey(ext)) throw new Parser.Failure("file extension '" + ext + "' is denied (1)", url);
@ -441,6 +448,12 @@ public final class TextParser {
if (mimeType2 != null && (idiom = mime2parser.get(mimeType2)) != null && !idioms.containsAll(idiom)) { // use containsAll -> idiom is a Set of parser
idioms.addAll(idiom);
}
/* No matching idiom has been found : let's check if the media type ends with the "+xml" suffix so we can handle it with a generic XML parser
* (see RFC 7303 - Using '+xml' when Registering XML-Based Media Types : https://tools.ietf.org/html/rfc7303#section-4.2) */
if(idioms.isEmpty() && mimeType1 != null && mimeType1.endsWith("+xml")) {
idioms.add(genericXMLIdiom);
}
// always add the generic parser (make sure it is the last in access order)
idioms.add(genericIdiom);
@ -456,10 +469,20 @@ public final class TextParser {
* @return an error if the mime type is not supported, null otherwise
*/
public static String supportsMime(String mimeType) {
if (mimeType == null) return null;
if (mimeType == null) {
return null;
}
mimeType = normalizeMimeType(mimeType);
if (denyMime.containsKey(mimeType)) return "mime type '" + mimeType + "' is denied (2)";
if (mime2parser.get(mimeType) == null) return "no parser for mime '" + mimeType + "' available";
if (denyMime.containsKey(mimeType)) {
return "mime type '" + mimeType + "' is denied (2)";
}
if (mime2parser.get(mimeType) == null) {
/* No matching idiom has been found : let's check if the media type ends with the "+xml" suffix, so that we can handle it with a generic XML parser
* (see RFC 7303 - Using '+xml' when Registering XML-Based Media Types : https://tools.ietf.org/html/rfc7303#section-4.2) */
if(!mimeType.endsWith("+xml")) {
return "no parser for mime '" + mimeType + "' available";
}
}
return null;
}

@ -0,0 +1,144 @@
// GenericXMLParser.java
// ---------------------------
// Copyright 2017 by luccioman; https://github.com/luccioman
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.document.parser;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.apache.commons.io.input.XmlStreamReader;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.xml.GenericXMLContentHandler;
import net.yacy.kelondro.io.CharBuffer;
import net.yacy.kelondro.util.Formatter;
import net.yacy.kelondro.util.MemoryControl;
/**
* A generic XML parser without knowledge of the specific XML vocabulary.
* @author luccioman
*
*/
public class GenericXMLParser extends AbstractParser implements Parser {

    /** SAX parser instance local to each thread */
    private static final ThreadLocal<SAXParser> tlSax = new ThreadLocal<SAXParser>();

    /**
     * Returns the SAX parser bound to the current thread, creating and caching it on first use.
     *
     * @return a SAXParser instance for the current thread
     * @throws SAXException when an error prevented parser creation
     */
    private static SAXParser getParser() throws SAXException {
        SAXParser parser = tlSax.get();
        if (parser == null) {
            try {
                parser = SAXParserFactory.newInstance().newSAXParser();
            } catch (final ParserConfigurationException e) {
                throw new SAXException(e.getMessage(), e);
            }
            tlSax.set(parser);
        }
        return parser;
    }

    /**
     * Registers the "xml" file extension and the "application/xml" and "text/xml" media types.
     */
    public GenericXMLParser() {
        super("XML Parser");
        this.SUPPORTED_EXTENSIONS.add("xml");
        this.SUPPORTED_MIME_TYPES.add("application/xml");
        this.SUPPORTED_MIME_TYPES.add("text/xml");
    }

    /**
     * Parses an XML stream with a SAX parser, collecting the text of its elements and the absolute URLs
     * detected by the content handler into a single document.
     *
     * @param location the URL of the parsed resource
     * @param mimeType the media type of the resource, also used as a hint for charset detection
     * @param charset the charset name used as default encoding hint for charset detection. May be null.
     * @param scraper not used by this parser
     * @param timezoneOffset not used by this parser
     * @param source the stream to parse. It is not closed by this method.
     * @return an array containing exactly one parsed document
     * @throws Failure when the in-memory text buffer overflowed or when any other error occurred while parsing
     * @throws InterruptedException when an InterruptedException was raised while processing
     */
    @Override
    public Document[] parse(
            final DigestURL location,
            final String mimeType,
            final String charset,
            final VocabularyScraper scraper,
            final int timezoneOffset,
            final InputStream source)
            throws Failure, InterruptedException {

        /* Limit the size of the in-memory buffer to at most 25% of the available memory :
         * because some room is needed, and before being garbage collected the buffer will be converted to a String, then to a byte array.
         * Eventual stricter limits should be handled by the caller (see for example crawler.[protocol].maxFileSize configuration setting). */
        final long availableMemory = MemoryControl.available();
        final long maxBytes = (long) (availableMemory * 0.25);

        /* The division must be done on the long value BEFORE narrowing to int : casting maxBytes first
         * silently wraps around for budgets above Integer.MAX_VALUE bytes and would produce a negative limit. */
        final int maxChars = (int) Math.min(Integer.MAX_VALUE, maxBytes / Character.BYTES);

        try (/* Automatically closed by this try-with-resources statement */ CharBuffer writer = new CharBuffer(maxChars)) {

            /* Use commons-io XmlStreamReader advanced rules to help with charset detection when source contains no BOM or XML declaration
             * (detection algorithm notably also include ContentType transmitted by HTTP headers, here eventually present as mimeType and charset parameters) */
            final XmlStreamReader reader = new XmlStreamReader(source, mimeType, true, charset);
            final InputSource saxSource = new InputSource(reader);
            final String detectedCharset = reader.getEncoding();

            final List<AnchorURL> detectedURLs = new ArrayList<>();

            final GenericXMLContentHandler saxHandler = new GenericXMLContentHandler(writer, detectedURLs);
            final SAXParser saxParser = getParser();
            saxParser.parse(saxSource, saxHandler);

            if (writer.isOverflow()) {
                throw new Parser.Failure("Not enough Memory available for the generic XML parser : "
                        + Formatter.bytesToString(availableMemory), location);
            }

            /* create the parsed document */
            final byte[] contentBytes = UTF8.getBytes(writer.toString());
            return new Document[] { new Document(location, mimeType, detectedCharset, this, null, null, null, null, "",
                    null, null, 0.0d, 0.0d, contentBytes, detectedURLs, null, null, false, new Date()) };
        } catch (final Exception e) {
            /* Let the exception types declared by this method go through unchanged */
            if (e instanceof InterruptedException) {
                throw (InterruptedException) e;
            }
            if (e instanceof Parser.Failure) {
                throw (Parser.Failure) e;
            }
            throw new Parser.Failure("Unexpected error while parsing XML file. " + e.getMessage(), location);
        }
    }
}

@ -344,24 +344,17 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if ((b.length() != 0) && (!(SentenceReader.punctuation(b.charAt(b.length() - 1))))) b = b + '.';
//System.out.println("*** Appended dot: " + b.toString());
}
// find http links inside text
s = 0;
String u;
while (s < b.length()) {
p = find(b, dpssp, s);
if (p == Integer.MAX_VALUE) break;
s = Math.max(0, p - 5);
p = find(b, protp, s);
if (p == Integer.MAX_VALUE) break;
q = b.indexOf(" ", p + 1);
u = b.substring(p, q < 0 ? b.length() : q);
if (u.endsWith(".")) u = u.substring(0, u.length() - 1); // remove the '.' that was appended above
s = p + 6;
try {
this.addAnchor(new AnchorURL(u));
continue;
} catch (final MalformedURLException e) {}
// find absolute URLs inside text
final Object[] listeners = this.htmlFilterEventListeners.getListenerList();
List<ContentScraperListener> anchorListeners = new ArrayList<>();
for (int i = 0; i < listeners.length; i += 2) {
if (listeners[i] == ContentScraperListener.class) {
anchorListeners.add((ContentScraperListener)listeners[i+1]);
}
}
findAbsoluteURLs(b, this.anchors, anchorListeners);
// append string to content
if (!b.isEmpty()) {
this.content.append(b);
@ -371,6 +364,47 @@ public class ContentScraper extends AbstractScraper implements Scraper {
private final static Pattern dpssp = Pattern.compile("://");
private final static Pattern protp = Pattern.compile("smb://|ftp://|http://|https://");
/**
 * Scans a text for absolute URLs (smb, ftp, http or https schemes), adds each successfully parsed URL to the
 * given collection and notifies each given listener with the normal form of that URL.
 * Any parameter may be null : a null text is ignored, a null collection is not filled, null listeners are not notified.
 * A URL candidate runs from its scheme up to the next space character, or to the end of the text.
 * @param text the text to parse
 * @param urls a mutable collection of URLs to fill.
 * @param listeners a collection of listeners to trigger.
 */
public static void findAbsoluteURLs(final String text, final Collection<AnchorURL> urls, final Collection<ContentScraperListener> listeners) {
    if (text == null) {
        return;
    }
    final int textLength = text.length();
    int cursor = 0;
    while (cursor < textLength) {
        /* Locate the next "://" separator ; Integer.MAX_VALUE is handled as "nothing found" */
        final int separatorIndex = find(text, dpssp, cursor);
        if (separatorIndex == Integer.MAX_VALUE) {
            return;
        }

        /* Step back a few characters (presumably to cover the longest supported scheme name) and look for a supported scheme */
        cursor = Math.max(0, separatorIndex - 5);
        final int urlStart = find(text, protp, cursor);
        if (urlStart == Integer.MAX_VALUE) {
            return;
        }

        final int urlEnd = text.indexOf(" ", urlStart + 1);
        String candidate = (urlEnd < 0) ? text.substring(urlStart) : text.substring(urlStart, urlEnd);
        if (candidate.endsWith(".")) {
            /* drop a trailing '.' (for example a sentence terminator) which is not considered part of the URL */
            candidate = candidate.substring(0, candidate.length() - 1);
        }

        /* The next search resumes shortly after the start of this candidate, whether it is valid or not */
        cursor = urlStart + 6;

        try {
            final AnchorURL anchor = new AnchorURL(candidate);
            if (urls != null) {
                urls.add(anchor);
            }
            if (listeners != null) {
                for (final ContentScraperListener listener : listeners) {
                    listener.anchorAdded(anchor.toNormalform(false));
                }
            }
        } catch (final MalformedURLException ignored) {
            /* the candidate is not a valid URL : simply skip it */
        }
    }
}
private static final int find(final String s, final Pattern m, final int start) {
final Matcher mm = m.matcher(s.subSequence(start, s.length()));

@ -0,0 +1,162 @@
// GenericXMLContentHandler.java
// ---------------------------
// Copyright 2017 by luccioman; https://github.com/luccioman
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.document.parser.xml;
import java.io.IOException;
import java.io.Writer;
import java.util.Collection;
import org.apache.commons.io.input.ClosedInputStream;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.parser.html.ContentScraper;
/**
* SAX handler for XML contents, only extracting text and eventual URLs from
* XML.
*
* @author luccioman
*
*/
public class GenericXMLContentHandler extends DefaultHandler {

    /** Output writer */
    private final Writer out;

    /** Detected URLs */
    private final Collection<AnchorURL> urls;

    /** Text of the currently parsed element. May not contain the whole text when the element has nested elements embedded in its own text */
    private StringBuilder currentElementText;

    /** Set to true when the last character written to the output writer is a space */
    private boolean lastAppendedIsSpace;

    /** The number of text chunks handled in the current element (reset to zero when the element has nested elements) */
    private int currentElementTextChunks;

    /** Set to false until some text is detected in at least one element of the document */
    private boolean documentHasText;

    /**
     * @param out
     *            the output writer to write extracted text. Must not be null.
     * @param urls the mutable collection of URLs to fill with detected URLs. Must not be null.
     * @throws IllegalArgumentException
     *             when out or urls is null
     */
    public GenericXMLContentHandler(final Writer out, final Collection<AnchorURL> urls) throws IllegalArgumentException {
        if (out == null) {
            throw new IllegalArgumentException("out writer must not be null");
        }
        if (urls == null) {
            throw new IllegalArgumentException("urls collection must not be null");
        }
        this.out = out;
        this.urls = urls;
    }

    /**
     * @return an empty source to prevent the SAX parser opening an unwanted
     *         connection to resolve an external entity
     */
    @Override
    public InputSource resolveEntity(String publicId, String systemId) throws IOException, SAXException {
        return new InputSource(new ClosedInputStream());
    }

    /**
     * Reset the handler state before processing a new document
     */
    @Override
    public void startDocument() throws SAXException {
        this.currentElementText = new StringBuilder();
        this.lastAppendedIsSpace = false;
        this.currentElementTextChunks = 0;
        this.documentHasText = false;
    }

    /**
     * Try to detect URLs eventually contained in attributes.
     * The text accumulated so far for the enclosing element is discarded.
     */
    @Override
    public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
        this.currentElementText.setLength(0);
        this.currentElementTextChunks = 0;

        if (attributes != null) {
            for (int i = 0; i < attributes.getLength(); i++) {
                String attribute = attributes.getValue(i);
                ContentScraper.findAbsoluteURLs(attribute, this.urls, null);
            }
        }
    }

    /**
     * Write characters to the output writer, separating with a space the text of different elements when necessary.
     * Only the range ch[start] to ch[start + length - 1] belongs to the current chunk : the SAX parser
     * may pass a larger working buffer, so the characters outside of this range must never be inspected.
     */
    @Override
    public void characters(final char ch[], final int start, final int length) {
        try {
            if (this.currentElementTextChunks == 0 && this.documentHasText) {
                /* We are on the first text chunk of the element, or the first text chunk after processing nested elements :
                 * if necessary we add a space to separate text content of different elements */
                if (length > 0 && !this.lastAppendedIsSpace && !Character.isWhitespace(ch[start])) {
                    this.out.write(" ");
                    this.currentElementText.append(" ");
                }
            }

            this.out.write(ch, start, length);
            this.currentElementText.append(ch, start, length);

            if (length > 0) {
                this.currentElementTextChunks++;
                this.documentHasText = true;
                /* the last character of the chunk is at start + length - 1, not at length - 1 */
                this.lastAppendedIsSpace = Character.isWhitespace(ch[start + length - 1]);
            }
        } catch (final IOException e) {
            ConcurrentLog.logException(e);
        }
    }

    /**
     * Try to detect URLs in the text accumulated for the element being closed, then reset that text.
     */
    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {
        ContentScraper.findAbsoluteURLs(this.currentElementText.toString(), urls, null);
        this.currentElementText.setLength(0);
        this.currentElementTextChunks = 0;
    }

    @Override
    public void endDocument() throws SAXException {
        /* Release the StringBuilder now useless */
        this.currentElementText = null;
    }
}

@ -43,12 +43,16 @@ public final class CharBuffer extends Writer {
private int offset;
private int length;
private final int maximumLength;
/** Set to true when write attempts beyond the maximumLength have been tried */
private boolean overflow;
/**
* Creates an empty buffer backed by an initial array of 10 chars.
* @param maximumLength the limit stored for later write checks
* (NOTE(review): the visible write methods compare it with the backing array length - confirm the exact semantics against the full class)
*/
public CharBuffer(final int maximumLength) {
this.buffer = new char[10];
this.length = 0;
this.offset = 0;
this.maximumLength = maximumLength;
// no write has been rejected yet
this.overflow = false;
}
public CharBuffer(final int maximumLength, final int initLength) {
@ -56,6 +60,7 @@ public final class CharBuffer extends Writer {
this.length = 0;
this.offset = 0;
this.maximumLength = maximumLength;
this.overflow = false;
}
public CharBuffer(final int maximumLength, final char[] bb) {
@ -63,6 +68,7 @@ public final class CharBuffer extends Writer {
this.length = bb.length;
this.offset = 0;
this.maximumLength = maximumLength;
this.overflow = false;
}
public CharBuffer(final int maximumLength, final char[] bb, final int initLength) {
@ -71,6 +77,7 @@ public final class CharBuffer extends Writer {
this.length = bb.length;
this.offset = 0;
this.maximumLength = maximumLength;
this.overflow = false;
}
public CharBuffer(final File f) throws IOException {
@ -81,6 +88,7 @@ public final class CharBuffer extends Writer {
this.length = 0;
this.buffer = new char[(int) f.length()*2];
this.offset = 0;
this.overflow = false;
FileReader fr = null;
try {
@ -102,6 +110,7 @@ public final class CharBuffer extends Writer {
this.buffer = new char[0];
this.length = 0;
this.offset = 0;
this.overflow = false;
}
public int length() {
@ -111,6 +120,13 @@ public final class CharBuffer extends Writer {
public boolean isEmpty() {
return this.length == 0;
}
/**
* Reports the current value of the overflow flag of this buffer.
* @return true when write attempts beyond the maximumLength have been tried
*/
public boolean isOverflow() {
return this.overflow;
}
private void grow(int minSize) {
int newsize = 12 * Math.max(this.buffer.length, minSize) / 10; // grow by 20%
@ -126,7 +142,10 @@ public final class CharBuffer extends Writer {
}
public void write(final char b) {
if (this.buffer.length > this.maximumLength) return;
if (this.buffer.length > this.maximumLength) {
this.overflow = true;
return;
}
if (this.offset + this.length + 1 > this.buffer.length) grow(this.offset + this.length + 1);
this.buffer[this.offset + this.length++] = b;
}
@ -138,7 +157,10 @@ public final class CharBuffer extends Writer {
@Override
public void write(final char[] bb, final int of, final int le) {
if (this.buffer.length > this.maximumLength) return;
if (this.buffer.length > this.maximumLength) {
this.overflow = true;
return;
}
if (this.offset + this.length + le > this.buffer.length) grow(this.offset + this.length + le);
System.arraycopy(bb, of, this.buffer, this.offset + this.length, le);
this.length += le;

@ -0,0 +1,362 @@
// GenericXMLParserTest.java
// ---------------------------
// Copyright 2017 by luccioman; https://github.com/luccioman
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.document.parser;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.Collection;
import org.junit.Before;
import org.junit.Test;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.document.Document;
import net.yacy.document.VocabularyScraper;
/**
* Unit tests for the {@link GenericXMLParser} class
*
* @author luccioman
*
*/
public class GenericXMLParserTest {

    /** Example test tag including non-ascii characters */
    private static final String UMLAUT_TEXT_TAG = "<text>In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen</text>";

    /** Word with non-ascii characters expected in the text parsed from the umlaut samples */
    private static final String UMLAUT_WORD = "Maßkrügen";

    /** XML declaration without any encoding pseudo-attribute */
    private static final String XML_DECLARATION_NO_ENCODING = "<?xml version=\"1.0\"?>";

    /** Location used for in-memory test documents */
    private static final String TEST_FILE_URL = "http://localhost/testfile.xml";

    private GenericXMLParser parser;

    @Before
    public void setUp() {
        this.parser = new GenericXMLParser();
    }

    /**
     * Parses in-memory XML content, encoded as UTF-8, with the tested parser instance.
     *
     * @param xml
     *            the XML content to parse
     * @param contentTypeHeader
     *            Content-Type header value
     * @return the parser result
     * @throws Exception
     *             when an unexpected error occurred
     */
    private Document[] parseUTF8Content(final String xml, final String contentTypeHeader) throws Exception {
        final String charsetFromHttpHeader = HeaderFramework.getCharacterEncoding(contentTypeHeader);
        final DigestURL location = new DigestURL(TEST_FILE_URL);
        try (InputStream inStream = new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8))) {
            return this.parser.parse(location, contentTypeHeader, charsetFromHttpHeader, new VocabularyScraper(), 0,
                    inStream);
        }
    }

    /**
     * Checks that exactly one document has been produced, holding exactly the three absolute URLs of the XHTML
     * samples.
     *
     * @param documents
     *            the parser result to check
     * @throws Exception
     *             when an unexpected error occurred
     */
    private static void assertSampleAnchorsDetected(final Document[] documents) throws Exception {
        assertEquals(1, documents.length);
        final Collection<AnchorURL> detectedAnchors = documents[0].getAnchors();
        assertNotNull(detectedAnchors);
        assertEquals(3, detectedAnchors.size());
        final String[] expectedURLs = { "http://www.w3.org/1999/xhtml", "http://yacy.net", "http://forum.yacy.de" };
        for (final String expectedURL : expectedURLs) {
            assertTrue(detectedAnchors.contains(new AnchorURL(expectedURL)));
        }
    }

    /**
     * Unit test for the GenericXMLParser.parse() function with some small XML
     * test files.
     *
     * @throws Exception
     *             when an unexpected error occurred
     */
    @Test
    public void testParse() throws Exception {
        final File folder = new File("test" + File.separator + "parsertest" + File.separator);
        final String[] fileNames = { "umlaute_dc_xml_iso.xml", "umlaute_dc_xml_utf8.xml" };

        for (final String fileName : fileNames) {
            try (FileInputStream inStream = new FileInputStream(new File(folder, fileName))) {
                final DigestURL location = new DigestURL("http://localhost/" + fileName);
                final Document[] documents = this.parser.parse(location, "text/xml", null, new VocabularyScraper(),
                        0, inStream);
                assertNotNull("Parser result must not be null for file " + fileName, documents);
                final String parsedText = documents[0].getTextString();
                assertNotNull("Parsed text must not be empty for file " + fileName, parsedText);
                assertTrue("Parsed text must contain test word with umlaut char in file " + fileName,
                        documents[0].getTextString().contains("Maßkrügen"));
            }
        }
    }

    /**
     * Parses encoded XML bytes, then checks the detected charset and the extracted text.
     *
     * @param parser
     *            generic xml parser instance. Must not be null.
     * @param encodedXML
     *            xml encoded bytes to test
     * @param contentTypeHeader
     *            Content-Type header value
     * @param expectedCharset
     *            expected character set name to be detected
     * @param expectedContainedText
     *            expected text to be contained in the parsed text
     * @throws Exception
     *             when an unexpected error occurred
     */
    private void testCharsetDetection(final GenericXMLParser parser, final byte[] encodedXML,
            final String contentTypeHeader, final String expectedCharset, final String expectedContainedText)
            throws Exception {
        try (InputStream inStream = new ByteArrayInputStream(encodedXML)) {
            final String charsetFromHttpHeader = HeaderFramework.getCharacterEncoding(contentTypeHeader);
            final DigestURL location = new DigestURL(TEST_FILE_URL);
            final Document[] documents = parser.parse(location, contentTypeHeader, charsetFromHttpHeader,
                    new VocabularyScraper(), 0, inStream);
            final Document document = documents[0];
            assertEquals(expectedCharset, document.getCharset());
            assertNotNull(document.getTextString());
            assertTrue(document.getTextString().contains(expectedContainedText));
        }
    }

    /**
     * Test UTF-8 charset detection
     *
     * @see RFC 7303 "UTF-8 Charset" example
     *      (https://tools.ietf.org/html/rfc7303#section-8.1)
     *
     * @throws Exception
     *             when an unexpected error occurred
     */
    @Test
    public void testParseUTF8Charset() throws Exception {
        final String contentType = "application/xml; charset=utf-8";
        final String utf8 = StandardCharsets.UTF_8.name();

        /* UTF-8 charset provided both in Content-Type HTTP header and in XML declaration */
        final String withEncoding = "<?xml version=\"1.0\" encoding=\"utf-8\"?>" + UMLAUT_TEXT_TAG;
        testCharsetDetection(this.parser, withEncoding.getBytes(StandardCharsets.UTF_8), contentType, utf8,
                UMLAUT_WORD);

        /* Charset provided in Content-Type HTTP header but omitted in XML declaration */
        final String withoutEncoding = XML_DECLARATION_NO_ENCODING + UMLAUT_TEXT_TAG;
        testCharsetDetection(this.parser, withoutEncoding.getBytes(StandardCharsets.UTF_8), contentType, utf8,
                UMLAUT_WORD);
    }

    /**
     * Test UTF-16 charset detection
     *
     * @see RFC 7303 "UTF-16 Charset" and
     *      "Omitted Charset and 16-Bit MIME Entity" examples
     *      (https://tools.ietf.org/html/rfc7303#section-8.2 and
     *      https://tools.ietf.org/html/rfc7303#section-8.4)
     *
     * @throws Exception
     *             when an unexpected error occurred
     */
    @Test
    public void testParseUTF16Charset() throws Exception {
        /* Encoding with StandardCharsets.UTF_16 prepends a BOM (Byte Order Mark) to both samples */
        final byte[] declaredUTF16 = ("<?xml version=\"1.0\" encoding=\"utf-16\"?>" + UMLAUT_TEXT_TAG)
                .getBytes(StandardCharsets.UTF_16);
        final byte[] undeclaredUTF16 = (XML_DECLARATION_NO_ENCODING + UMLAUT_TEXT_TAG)
                .getBytes(StandardCharsets.UTF_16);
        final String headerWithCharset = "application/xml; charset=utf-16";
        final String headerWithoutCharset = "application/xml";

        /* UTF-16 charset provided both in Content-Type HTTP header and in XML declaration with BOM */
        testCharsetDetection(this.parser, declaredUTF16, headerWithCharset, StandardCharsets.UTF_16.name(),
                UMLAUT_WORD);

        /* UTF-16 charset provided in Content-Type HTTP header but omitted in XML declaration having only BOM */
        testCharsetDetection(this.parser, undeclaredUTF16, headerWithCharset, StandardCharsets.UTF_16BE.name(),
                UMLAUT_WORD);

        /* Charset is omitted in Content-Type HTTP header, but provided in the XML declaration with BOM */
        testCharsetDetection(this.parser, declaredUTF16, headerWithoutCharset, StandardCharsets.UTF_16.name(),
                UMLAUT_WORD);

        /* Charset is omitted in both Content-Type HTTP header and XML declaration with BOM */
        testCharsetDetection(this.parser, undeclaredUTF16, headerWithoutCharset, StandardCharsets.UTF_16BE.name(),
                UMLAUT_WORD);
    }

    /**
     * Test ISO-8859-1 charset detection
     *
     * @see RFC 7303 "Omitted Charset and 8-Bit MIME Entity" example
     *      (https://tools.ietf.org/html/rfc7303#section-8.3)
     *
     * @throws Exception
     *             when an unexpected error occurred
     */
    @Test
    public void testParseISO_8859_1Charset() throws Exception {
        /* ISO-8859-1 charset provided only in XML declaration without BOM (Byte Order Mark) */
        final String xml = "<?xml version=\"1.0\" encoding=\"iso-8859-1\"?>" + UMLAUT_TEXT_TAG;
        testCharsetDetection(this.parser, xml.getBytes(StandardCharsets.ISO_8859_1), "application/xml",
                StandardCharsets.ISO_8859_1.name(), UMLAUT_WORD);
    }

    /**
     * Test charset detection when the character encoding is omitted in
     * Content-Type header, and content has a XML declaration with no encoding
     * declaration
     *
     * @see RFC 7303 "Omitted Charset, No Internal Encoding Declaration" example
     *      (https://tools.ietf.org/html/rfc7303#section-8.5)
     *
     * @throws Exception
     *             when an unexpected error occurred
     */
    @Test
    public void testParseOmittedCharsetNoInternalEncoding() throws Exception {
        final String utf8 = StandardCharsets.UTF_8.name();

        /* XML encoded as UTF-8 without BOM (Byte Order Mark) */
        final String plainXML = XML_DECLARATION_NO_ENCODING + UMLAUT_TEXT_TAG;
        testCharsetDetection(this.parser, plainXML.getBytes(StandardCharsets.UTF_8), "application/xml", utf8,
                UMLAUT_WORD);

        /* XML encoded as ASCII, with non ascii chars encoded as entities */
        final String entitiesXML = XML_DECLARATION_NO_ENCODING
                + "<text>In M&#x000FC;nchen steht ein Hofbr&#x000E4;uhaus, dort gibt es Bier in Ma&#x000DF;kr&#x000FC;gen</text>";
        testCharsetDetection(this.parser, entitiesXML.getBytes(StandardCharsets.US_ASCII), "application/xml", utf8,
                UMLAUT_WORD);
    }

    /**
     * Test UTF-16BE charset detection
     *
     * @see RFC 7303 "UTF-16BE Charset" example
     *      (https://tools.ietf.org/html/rfc7303#section-8.6)
     *
     * @throws Exception
     *             when an unexpected error occurred
     */
    @Test
    public void testParseUTF8_16BECharset() throws Exception {
        /* UTF-16BE charset provided both in Content-Type HTTP header and in XML declaration, without BOM (Byte Order Mark) */
        final String xml = "<?xml version='1.0' encoding='utf-16be'?>" + UMLAUT_TEXT_TAG;
        testCharsetDetection(this.parser, xml.getBytes(StandardCharsets.UTF_16BE),
                "application/xml; charset=utf-16be", StandardCharsets.UTF_16BE.name(), UMLAUT_WORD);
    }

    /**
     * Test absolute URLs detection in XML elements attributes.
     *
     * @throws Exception
     *             when an unexpected error occurred
     */
    @Test
    public void testParseAttributeURLs() throws Exception {
        final String xhtml = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>"
                + "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">"
                + "<html xmlns=\"http://www.w3.org/1999/xhtml\">"
                + "<head>"
                + "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />"
                + "<title>XHTML attributes URLs test</title>"
                + "</head>"
                + "<body>"
                + "Here are YaCy<a href=\"http://yacy.net\">home page</a> and <a href=\"http://forum.yacy.de\">International Forum</a>."
                + "And this is a relative link to a <a href=\"/document.html\">sub document</a>."
                + "</body>"
                + "</html>";

        assertSampleAnchorsDetected(parseUTF8Content(xhtml, "text/xhtml"));
    }

    /**
     * Test absolute URLs detection in XML elements text.
     *
     * @throws Exception
     *             when an unexpected error occurred
     */
    @Test
    public void testParseContentURLs() throws Exception {
        final String xhtml = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>"
                + "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">"
                + "<html xmlns=\"http://www.w3.org/1999/xhtml\">"
                + "<head>"
                + "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\" />"
                + "<title>XHTML content URLs test</title>"
                + "</head>"
                + "<body>"
                + "Here are some YaCy links:"
                + "<dl>"
                + "<dt>Home page</dt>"
                + "<dd>http://yacy.net</dd>"
                + "<dt>International Forum</dt>"
                + "<dd>http://forum.yacy.de</dd>"
                + "</dl>"
                + "And this is a mention to a relative link : /document.html "
                + "</body>"
                + "</html>";

        assertSampleAnchorsDetected(parseUTF8Content(xhtml, "text/xhtml"));
    }

    /**
     * Test parsing well-formed XML fragment (no XML declaration, no DTD or schema)
     *
     * @throws Exception
     *             when an unexpected error occurred
     */
    @Test
    public void testParseXMLFragment() throws Exception {
        final String fragment = "<root><node><subNode1>Node content1</subNode1><subNode2>Node content2</subNode2></node></root>";

        final Document[] documents = parseUTF8Content(fragment, "text/xml");

        assertEquals(1, documents.length);
        assertEquals("Node content1 Node content2", documents[0].getTextString());
    }
}

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="iso-8859-1"?>
<!DOCTYPE rdf:RDF SYSTEM "http://dublincore.org/2000/12/01-dcmes-xml-dtd.dtd">
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:dc="http://purl.org/dc/elements/1.1/">
<rdf:Description>
<dc:description>In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen</dc:description>
</rdf:Description>
</rdf:RDF>

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE rdf:RDF SYSTEM "http://dublincore.org/2000/12/01-dcmes-xml-dtd.dtd">
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:dc="http://purl.org/dc/elements/1.1/">
<rdf:Description>
<dc:description>In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen</dc:description>
</rdf:Description>
</rdf:RDF>
Loading…
Cancel
Save