diff --git a/source/net/yacy/document/parser/html/CharacterCoding.java b/source/net/yacy/document/parser/html/CharacterCoding.java
index 0f0bda721..6c3ed1efd 100644
--- a/source/net/yacy/document/parser/html/CharacterCoding.java
+++ b/source/net/yacy/document/parser/html/CharacterCoding.java
@@ -27,182 +27,227 @@ package net.yacy.document.parser.html;
import java.util.HashMap;
import java.util.Map;
-public class CharacterCoding {
+/**
+ * Contains methods to convert between Unicode and XML/HTML encoding.
+ */
+public final class CharacterCoding {
- private static final char amp_unicode = "\u0026".charAt(0);
- private static final String amp_html = "&";
- private static final String space_html = " ";
- private static final String[] mapping4xml = {
- "\"",""", //quotation mark
- "\u003C","<", //less than
- "\u003E",">", //greater than
+ /** Ampersand character in unicode encoding. */
+ private static final char AMP_UNICODE = "\u0026".charAt(0);
+ /** Ampersand character in HTML encoding. */
+ private static final String AMP_HTML = "&";
+ /** Space character in HTML encoding. */
+ private static final String SPACE_HTML = " ";
+ /** Special characters which have to be mapped for XML. */
+ private static final String[] MAPPING4XML = {
+ "\"", """, //quotation mark
+ "\u003C", "<", //less than
+ "\u003E", ">", //greater than
- private static final String[] mapping4html = {
- "\\", "\", // Backslash
- "\u005E","^", // Caret
- "\u0060","`", // Accent Grave `
- "\u007B","{", // {
- "\u007C","|", // |
- "\u007D","}", // }
- "\u007E","~", // ~
+ /** Special characters which have to be mapped for HTML. */
+ private static final String[] MAPPING4HTML = {
+ "\\", "\", // Backslash
+ "\u005E", "^", // Caret
+ "\u0060", "`", // Accent Grave `
+ "\u007B", "{", // {
+ "\u007C", "|", // |
+ "\u007D", "}", // }
+ "\u007E", "~", // ~
- "\u0082","",
- "\u0083","",
- "\u0084","",
- "\u0085","
- "\u0086","",
- "\u0087","",
- "\u0088","",
- "\u0089","",
- "\u008A","",
- "\u008B","",
- "\u008C","",
- "\u008D","",
- "\u008E","",
+ "\u0082", "",
+ "\u0083", "",
+ "\u0084", "",
+ "\u0085", "
+ "\u0086", "",
+ "\u0087", "",
+ "\u0088", "",
+ "\u0089", "",
+ "\u008A", "",
+ "\u008B", "",
+ "\u008C", "",
+ "\u008D", "",
+ "\u008E", "",
- "\u0091","",
- "\u0092","",
- "\u0093","",
- "\u0094","",
- "\u0095","",
- "\u0096","",
- "\u0097","",
- "\u0098","",
- "\u0099","",
- "\u009A","",
- "\u009B","",
- "\u009C","",
- "\u009D","",
- "\u009E","",
- "\u009F","",
+ "\u0091", "",
+ "\u0092", "",
+ "\u0093", "",
+ "\u0094", "",
+ "\u0095", "",
+ "\u0096", "",
+ "\u0097", "",
+ "\u0098", "",
+ "\u0099", "",
+ "\u009A", "",
+ "\u009B", "",
+ "\u009C", "",
+ "\u009D", "",
+ "\u009E", "",
+ "\u009F", "",
- "\u00A1","¡", //inverted (spanish) exclamation mark
- "\u00A2","¢", //cent
- "\u00A3","£", //pound
- "\u00A4","¤", //currency
- "\u00A5","¥", //yen
- "\u00A6","¦", //broken vertical bar
- "\u00A7","§", //section sign
- "\u00A8","¨", //diaeresis (umlaut)
- "\u00A9","©", //copyright sign
- "\u00AA","ª", //feminine ordinal indicator
- "\u00AB","«", //left-pointing double angle quotation mark
- "\u00AC","¬", //not sign
- "\u00AD","", //soft hyphen
- "\u00AE","®", //registered sign
- "\u00AF","¯", //macron
- "\u00B0","°", //degree sign
- "\u00B1","±", //plus-minus sign
- "\u00B2","²", //superscript two
- "\u00B3","³", //superscript three
- "\u00B4","´", //acute accent
- "\u00B5","µ", //micro sign
- "\u00B6","¶", //paragraph sign
- "\u00B7","·", //middle dot
- "\u00B8","¸", //cedilla
- "\u00B9","¹", //superscript one
- "\u00BA","º", //masculine ordinal indicator
- "\u00BB","»", //right-pointing double angle quotation mark
- "\u00BC","¼", //fraction 1/4
- "\u00BD","½", //fraction 1/2
- "\u00BE","¾", //fraction 3/4
- "\u00BF","¿", //inverted (spanisch) questionmark
- "\u00C0","À",
- "\u00C1","Á",
- "\u00C2","Â",
- "\u00C3","Ã",
- "\u00C4","Ä",
- "\u00C5","Å",
- "\u00C6","Æ",
- "\u00C7","Ç",
- "\u00C8","È",
- "\u00C9","É",
- "\u00CA","Ê",
- "\u00CB","Ë",
- "\u00CC","Ì",
- "\u00CD","Í",
- "\u00CE","Î",
- "\u00CF","Ï",
- "\u00D0","Ð",
- "\u00D1","Ñ",
- "\u00D2","Ò",
- "\u00D3","Ó",
- "\u00D4","Ô",
- "\u00D5","Õ",
- "\u00D6","Ö",
- "\u00D7","×",
- "\u00D8","Ø",
- "\u00D9","Ù",
- "\u00DA","Ú",
- "\u00DB","Û",
- "\u00DC","Ü",
- "\u00DD","Ý",
- "\u00DE","Þ",
- "\u00DF","ß",
- "\u00E0","à",
- "\u00E1","á",
- "\u00E2","â",
- "\u00E3","ã",
- "\u00E4","ä",
- "\u00E5","å",
- "\u00E6","æ",
- "\u00E7","ç",
- "\u00E8","è",
- "\u00E9","é",
- "\u00EA","ê",
- "\u00EB","ë",
- "\u00EC","ì",
- "\u00ED","í",
- "\u00EE","î",
- "\u00EF","ï",
- "\u00F0","ð",
- "\u00F1","ñ",
- "\u00F2","ò",
- "\u00F3","ó",
- "\u00F4","ô",
- "\u00F5","õ",
- "\u00F6","ö",
- "\u00F7","÷",
- "\u00F8","ø",
- "\u00F9","ù",
- "\u00FA","ú",
- "\u00FB","û",
- "\u00FC","ü",
- "\u00FD","ý",
- "\u00FE","þ",
- "\u00FF","ÿ"
+ "\u00A1", "¡", //inverted (Spanish) exclamation mark
+ "\u00A2", "¢", //cent
+ "\u00A3", "£", //pound
+ "\u00A4", "¤", //currency
+ "\u00A5", "¥", //yen
+ "\u00A6", "¦", //broken vertical bar
+ "\u00A7", "§", //section sign
+ "\u00A8", "¨", //diaeresis (umlaut)
+ "\u00A9", "©", //copyright sign
+ "\u00AA", "ª", //feminine ordinal indicator
+ "\u00AB", "«", //left-pointing double angle quotation mark
+ "\u00AC", "¬", //not sign
+ "\u00AD", "", //soft hyphen
+ "\u00AE", "®", //registered sign
+ "\u00AF", "¯", //macron
+ "\u00B0", "°", //degree sign
+ "\u00B1", "±", //plus-minus sign
+ "\u00B2", "²", //superscript two
+ "\u00B3", "³", //superscript three
+ "\u00B4", "´", //acute accent
+ "\u00B5", "µ", //micro sign
+ "\u00B6", "¶", //paragraph sign
+ "\u00B7", "·", //middle dot
+ "\u00B8", "¸", //cedilla
+ "\u00B9", "¹", //superscript one
+ "\u00BA", "º", //masculine ordinal indicator
+ "\u00BB", "»", //right-pointing double angle quotation mark
+ "\u00BC", "¼", //fraction 1/4
+ "\u00BD", "½", //fraction 1/2
+ "\u00BE", "¾", //fraction 3/4
+ "\u00BF", "¿", //inverted (Spanish) question mark
+ "\u00C0", "À",
+ "\u00C1", "Á",
+ "\u00C2", "Â",
+ "\u00C3", "Ã",
+ "\u00C4", "Ä",
+ "\u00C5", "Å",
+ "\u00C6", "Æ",
+ "\u00C7", "Ç",
+ "\u00C8", "È",
+ "\u00C9", "É",
+ "\u00CA", "Ê",
+ "\u00CB", "Ë",
+ "\u00CC", "Ì",
+ "\u00CD", "Í",
+ "\u00CE", "Î",
+ "\u00CF", "Ï",
+ "\u00D0", "Ð",
+ "\u00D1", "Ñ",
+ "\u00D2", "Ò",
+ "\u00D3", "Ó",
+ "\u00D4", "Ô",
+ "\u00D5", "Õ",
+ "\u00D6", "Ö",
+ "\u00D7", "×",
+ "\u00D8", "Ø",
+ "\u00D9", "Ù",
+ "\u00DA", "Ú",
+ "\u00DB", "Û",
+ "\u00DC", "Ü",
+ "\u00DD", "Ý",
+ "\u00DE", "Þ",
+ "\u00DF", "ß",
+ "\u00E0", "à",
+ "\u00E1", "á",
+ "\u00E2", "â",
+ "\u00E3", "ã",
+ "\u00E4", "ä",
+ "\u00E5", "å",
+ "\u00E6", "æ",
+ "\u00E7", "ç",
+ "\u00E8", "è",
+ "\u00E9", "é",
+ "\u00EA", "ê",
+ "\u00EB", "ë",
+ "\u00EC", "ì",
+ "\u00ED", "í",
+ "\u00EE", "î",
+ "\u00EF", "ï",
+ "\u00F0", "ð",
+ "\u00F1", "ñ",
+ "\u00F2", "ò",
+ "\u00F3", "ó",
+ "\u00F4", "ô",
+ "\u00F5", "õ",
+ "\u00F6", "ö",
+ "\u00F7", "÷",
+ "\u00F8", "ø",
+ "\u00F9", "ù",
+ "\u00FA", "ú",
+ "\u00FB", "û",
+ "\u00FC", "ü",
+ "\u00FD", "ý",
+ "\u00FE", "þ",
+ "\u00FF", "ÿ"
- private final static Map html2unicode4xml = new HashMap(mapping4xml.length * 2);
- private final static Map html2unicode4html = new HashMap(mapping4html.length * 2);
- private final static Map unicode2html4xml = new HashMap(mapping4xml.length * 2);
- private final static Map unicode2html4html = new HashMap(mapping4html.length * 2);
+ /** Mapping for XML to unicode. */
+ private static final Map HTML2UNICODE4XML =
+ new HashMap(MAPPING4XML.length * 2);
+ /** Mapping for HTML to unicode. */
+ private static final Map HTML2UNICODE4HTML =
+ new HashMap(MAPPING4HTML.length * 2);
+ /** Mapping for unicode to XML. */
+ private static final Map UNICODE2HTML4XML =
+ new HashMap(MAPPING4XML.length * 2);
+ /** Mapping for unicode to HTML. */
+ private static final Map UNICODE2HTML4HTML =
+ new HashMap(MAPPING4HTML.length * 2);
static {
Character c;
- for (int i = 0; i < mapping4html.length; i += 2) {
- c = Character.valueOf(mapping4html[i].charAt(0));
- html2unicode4html.put(mapping4html[i + 1], c);
- unicode2html4html.put(c, mapping4html[i + 1]);
+ for (int i = 0; i < MAPPING4HTML.length; i += 2) {
+ c = Character.valueOf(MAPPING4HTML[i].charAt(0));
- for (int i = 0; i < mapping4xml.length; i += 2) {
- c = Character.valueOf(mapping4xml[i].charAt(0));
- html2unicode4xml.put(mapping4xml[i + 1], c);
- unicode2html4xml.put(c, mapping4xml[i + 1]);
+ for (int i = 0; i < MAPPING4XML.length; i += 2) {
+ c = Character.valueOf(MAPPING4XML[i].charAt(0));
- public static String unicode2xml(final String text, boolean amp) {
+ /** Private constructor to avoid instantiation of utility
+ * class with only static methods.
+ */
+ private CharacterCoding() { }
+ /**
+ * Replaces characters which have special representation in XML.
+ * @see #MAPPING4XML
+ * @param text text with characters to replace
+ * @param amp true if ampersands shall be replaced, else false
+ * @return text with replaced characters
+ */
+ public static String unicode2xml(final String text, final boolean amp) {
return unicode2html(text, amp, false);
- public static String unicode2html(final String text, boolean amp) {
+ /**
+ * Replaces characters which have special representation in HTML.
+ * @see #MAPPING4HTML
+ * @param text text with character to replace
+ * @param amp true if ampersands shall be replaced, else false
+ * @return text with replaced characters
+ */
+ public static String unicode2html(final String text, final boolean amp) {
return unicode2html(text, amp, true);
- private static String unicode2html(final String text, boolean amp, boolean html) {
+ /**
+ * Replaces characters which have special representation in HTML or XML.
+ * @param text text with characters to replace
+ * @param amp true if ampersands shall be replaced, else false
+ * @param html true if characters shall be replaced for embedding in
+ * HTML, false for XML (far more characters are replaced for HTML,
+ * compare {@link #MAPPING4HTML} with {@link #MAPPING4XML})
+ * @return text with replaced characters
+ */
+ private static String unicode2html(
+ final String text, final boolean amp, final boolean html) {
if (text == null) return null;
final StringBuilder sb = new StringBuilder(text.length() * 12 / 10);
int textpos = 0;
@@ -211,17 +256,17 @@ public class CharacterCoding {
while (textpos < text.length()) {
// find a (forward) mapping
c = text.charAt(textpos);
- if (amp && c == amp_unicode) {
- sb.append(amp_html);
+ if (amp && c == AMP_UNICODE) {
+ sb.append(AMP_HTML);
- if ((r = unicode2html4xml.get(c)) != null) {
+ if ((r = UNICODE2HTML4XML.get(c)) != null) {
- if (html && (r = unicode2html4html.get(c)) != null) {
+ if (html && (r = UNICODE2HTML4HTML.get(c)) != null) {
@@ -231,7 +276,12 @@ public class CharacterCoding {
return sb.toString();
+ /**
+ * Replaces HTML-encoded characters with unicode representation.
+ * @param text text with characters to replace
+ * @return text with replaced characters
+ */
public static String html2unicode(final String text) {
if (text == null) return null;
int p = 0, p1, q;
@@ -246,7 +296,9 @@ public class CharacterCoding {
sb.append(text, p, p1);
p = p1;
- if (p >= text.length()) break;
+ if (p >= text.length()) {
+ break;
+ }
q = text.indexOf(';', p);
if (q < 0) {
// if there is now no semicolon, then this will also fail when another ampersand is found afterwards
@@ -256,19 +308,19 @@ public class CharacterCoding {
s = text.substring(p, q + 1);
p = q + 1;
- if (s.equals(amp_html)) {
- sb.append(amp_unicode);
+ if (s.equals(AMP_HTML)) {
+ sb.append(AMP_UNICODE);
- if (s.equals(space_html)) {
+ if (s.equals(SPACE_HTML)) {
sb.append(" ");
- if ((r = html2unicode4xml.get(s)) != null) {
+ if ((r = HTML2UNICODE4XML.get(s)) != null) {
- if ((r = html2unicode4html.get(s)) != null) {
+ if ((r = HTML2UNICODE4HTML.get(s)) != null) {
@@ -279,9 +331,9 @@ public class CharacterCoding {
String ucs = s.substring(2, s.length() - 1);
try {
- int uc = Integer.parseInt(ucs);
- sb.append(new char[] {(char) uc});
- } catch (NumberFormatException e) {}
+ int uc = Integer.parseInt(ucs);
+ sb.append(new char[] {(char) uc});
+ } catch (NumberFormatException e) { }
// the entity is unknown, skip it
@@ -289,13 +341,20 @@ public class CharacterCoding {
return sb.toString();
+ /**
+ * Test method. Ignore it if you don't need it.
+ * @param args will be ignored
+ */
public static void main(final String[] args) {
- final String text = "Test-Text mit & um zyklische ü & Ersetzungen auszuschliessen";
+ final String text =
+ "Test-Text mit & um zyklische ü & Ersetzungen auszuschliessen";
final String txet = unicode2html(text, true);
- if (html2unicode(txet).equals(text)) System.out.println("correct");
+ if (html2unicode(txet).equals(text)) {
+ System.out.println("correct");
+ }
final String text2 = "encodeUnicode2xml: & \" < >";
System.out.println(unicode2xml(text2, true));