|
|
|
@ -27,182 +27,227 @@ package net.yacy.document.parser.html;
|
|
|
|
|
import java.util.HashMap;
|
|
|
|
|
import java.util.Map;
|
|
|
|
|
|
|
|
|
|
public class CharacterCoding {
|
|
|
|
|
/**
|
|
|
|
|
* Contains methods to convert between Unicode and XML/HTML encoding.
|
|
|
|
|
*/
|
|
|
|
|
public final class CharacterCoding {
|
|
|
|
|
|
|
|
|
|
private static final char amp_unicode = "\u0026".charAt(0);
|
|
|
|
|
private static final String amp_html = "&";
|
|
|
|
|
private static final String space_html = " ";
|
|
|
|
|
/** Ampersand character in unicode encoding. */
|
|
|
|
|
private static final char AMP_UNICODE = "\u0026".charAt(0);
|
|
|
|
|
/** Ampersand character in HTML encoding. */
|
|
|
|
|
private static final String AMP_HTML = "&";
|
|
|
|
|
/** Space character in HTML encoding. */
|
|
|
|
|
private static final String SPACE_HTML = " ";
|
|
|
|
|
|
|
|
|
|
private static final String[] mapping4xml = {
|
|
|
|
|
"\"","&quot;", //quotation mark
|
|
|
|
|
"\u003C","&lt;", //less than
|
|
|
|
|
"\u003E","&gt;", //greater than
|
|
|
|
|
/** Special characters which have to be mapped for XML. */
|
|
|
|
|
private static final String[] MAPPING4XML = {
|
|
|
|
|
"\"", """, //quotation mark
|
|
|
|
|
"\u003C", "<", //less than
|
|
|
|
|
"\u003E", ">", //greater than
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
private static final String[] mapping4html = {
|
|
|
|
|
/** Special characters which have to be mapped for HTML. */
|
|
|
|
|
private static final String[] MAPPING4HTML = {
|
|
|
|
|
"\\", "&#092;", // Backslash
|
|
|
|
|
"\u005E","^", // Caret
|
|
|
|
|
"\u005E", "^", // Caret
|
|
|
|
|
|
|
|
|
|
"\u0060","`", // Accent Grave `
|
|
|
|
|
"\u007B","{", // {
|
|
|
|
|
"\u007C","|", // |
|
|
|
|
|
"\u007D","}", // }
|
|
|
|
|
"\u007E","~", // ~
|
|
|
|
|
"\u0060", "`", // Accent Grave `
|
|
|
|
|
"\u007B", "{", // {
|
|
|
|
|
"\u007C", "|", // |
|
|
|
|
|
"\u007D", "}", // }
|
|
|
|
|
"\u007E", "~", // ~
|
|
|
|
|
|
|
|
|
|
"\u0082","‚",
|
|
|
|
|
"\u0083","ƒ",
|
|
|
|
|
"\u0084","„",
|
|
|
|
|
"\u0085","…",
|
|
|
|
|
"\u0086","†",
|
|
|
|
|
"\u0087","‡",
|
|
|
|
|
"\u0088","ˆ",
|
|
|
|
|
"\u0089","‰",
|
|
|
|
|
"\u008A","Š",
|
|
|
|
|
"\u008B","‹",
|
|
|
|
|
"\u008C","Œ",
|
|
|
|
|
"\u008D","",
|
|
|
|
|
"\u008E","Ž",
|
|
|
|
|
"\u0082", "‚",
|
|
|
|
|
"\u0083", "ƒ",
|
|
|
|
|
"\u0084", "„",
|
|
|
|
|
"\u0085", "…",
|
|
|
|
|
"\u0086", "†",
|
|
|
|
|
"\u0087", "‡",
|
|
|
|
|
"\u0088", "ˆ",
|
|
|
|
|
"\u0089", "‰",
|
|
|
|
|
"\u008A", "Š",
|
|
|
|
|
"\u008B", "‹",
|
|
|
|
|
"\u008C", "Œ",
|
|
|
|
|
"\u008D", "",
|
|
|
|
|
"\u008E", "Ž",
|
|
|
|
|
|
|
|
|
|
"\u0091","‘",
|
|
|
|
|
"\u0092","’",
|
|
|
|
|
"\u0093","“",
|
|
|
|
|
"\u0094","”",
|
|
|
|
|
"\u0095","•",
|
|
|
|
|
"\u0096","–",
|
|
|
|
|
"\u0097","—",
|
|
|
|
|
"\u0098","˜",
|
|
|
|
|
"\u0099","™",
|
|
|
|
|
"\u009A","š",
|
|
|
|
|
"\u009B","›",
|
|
|
|
|
"\u009C","œ",
|
|
|
|
|
"\u009D","",
|
|
|
|
|
"\u009E","ž",
|
|
|
|
|
"\u009F","Ÿ",
|
|
|
|
|
"\u0091", "‘",
|
|
|
|
|
"\u0092", "’",
|
|
|
|
|
"\u0093", "“",
|
|
|
|
|
"\u0094", "”",
|
|
|
|
|
"\u0095", "•",
|
|
|
|
|
"\u0096", "–",
|
|
|
|
|
"\u0097", "—",
|
|
|
|
|
"\u0098", "˜",
|
|
|
|
|
"\u0099", "™",
|
|
|
|
|
"\u009A", "š",
|
|
|
|
|
"\u009B", "›",
|
|
|
|
|
"\u009C", "œ",
|
|
|
|
|
"\u009D", "",
|
|
|
|
|
"\u009E", "ž",
|
|
|
|
|
"\u009F", "Ÿ",
|
|
|
|
|
|
|
|
|
|
"\u00A1","¡", //inverted (spanish) exclamation mark
|
|
|
|
|
"\u00A2","¢", //cent
|
|
|
|
|
"\u00A3","£", //pound
|
|
|
|
|
"\u00A4","¤", //currency
|
|
|
|
|
"\u00A5","¥", //yen
|
|
|
|
|
"\u00A6","¦", //broken vertical bar
|
|
|
|
|
"\u00A7","§", //section sign
|
|
|
|
|
"\u00A8","¨", //diaeresis (umlaut)
|
|
|
|
|
"\u00A9","©", //copyright sign
|
|
|
|
|
"\u00AA","ª", //feminine ordinal indicator
|
|
|
|
|
"\u00AB","«", //left-pointing double angle quotation mark
|
|
|
|
|
"\u00AC","¬", //not sign
|
|
|
|
|
"\u00AD","­", //soft hyphen
|
|
|
|
|
"\u00AE","®", //registered sign
|
|
|
|
|
"\u00AF","¯", //macron
|
|
|
|
|
"\u00B0","°", //degree sign
|
|
|
|
|
"\u00B1","±", //plus-minus sign
|
|
|
|
|
"\u00B2","²", //superscript two
|
|
|
|
|
"\u00B3","³", //superscript three
|
|
|
|
|
"\u00B4","´", //acute accent
|
|
|
|
|
"\u00B5","µ", //micro sign
|
|
|
|
|
"\u00B6","¶", //paragraph sign
|
|
|
|
|
"\u00B7","·", //middle dot
|
|
|
|
|
"\u00B8","¸", //cedilla
|
|
|
|
|
"\u00B9","¹", //superscript one
|
|
|
|
|
"\u00BA","º", //masculine ordinal indicator
|
|
|
|
|
"\u00BB","»", //right-pointing double angle quotation mark
|
|
|
|
|
"\u00BC","¼", //fraction 1/4
|
|
|
|
|
"\u00BD","½", //fraction 1/2
|
|
|
|
|
"\u00BE","¾", //fraction 3/4
|
|
|
|
|
"\u00BF","¿", //inverted (Spanish) question mark
|
|
|
|
|
"\u00C0","À",
|
|
|
|
|
"\u00C1","Á",
|
|
|
|
|
"\u00C2","Â",
|
|
|
|
|
"\u00C3","Ã",
|
|
|
|
|
"\u00C4","Ä",
|
|
|
|
|
"\u00C5","Å",
|
|
|
|
|
"\u00C6","Æ",
|
|
|
|
|
"\u00C7","Ç",
|
|
|
|
|
"\u00C8","È",
|
|
|
|
|
"\u00C9","É",
|
|
|
|
|
"\u00CA","Ê",
|
|
|
|
|
"\u00CB","Ë",
|
|
|
|
|
"\u00CC","Ì",
|
|
|
|
|
"\u00CD","Í",
|
|
|
|
|
"\u00CE","Î",
|
|
|
|
|
"\u00CF","Ï",
|
|
|
|
|
"\u00D0","Ð",
|
|
|
|
|
"\u00D1","Ñ",
|
|
|
|
|
"\u00D2","Ò",
|
|
|
|
|
"\u00D3","Ó",
|
|
|
|
|
"\u00D4","Ô",
|
|
|
|
|
"\u00D5","Õ",
|
|
|
|
|
"\u00D6","Ö",
|
|
|
|
|
"\u00D7","×",
|
|
|
|
|
"\u00D8","Ø",
|
|
|
|
|
"\u00D9","Ù",
|
|
|
|
|
"\u00DA","Ú",
|
|
|
|
|
"\u00DB","Û",
|
|
|
|
|
"\u00DC","Ü",
|
|
|
|
|
"\u00DD","Ý",
|
|
|
|
|
"\u00DE","Þ",
|
|
|
|
|
"\u00DF","ß",
|
|
|
|
|
"\u00E0","à",
|
|
|
|
|
"\u00E1","á",
|
|
|
|
|
"\u00E2","â",
|
|
|
|
|
"\u00E3","ã",
|
|
|
|
|
"\u00E4","ä",
|
|
|
|
|
"\u00E5","å",
|
|
|
|
|
"\u00E6","æ",
|
|
|
|
|
"\u00E7","ç",
|
|
|
|
|
"\u00E8","è",
|
|
|
|
|
"\u00E9","é",
|
|
|
|
|
"\u00EA","ê",
|
|
|
|
|
"\u00EB","ë",
|
|
|
|
|
"\u00EC","ì",
|
|
|
|
|
"\u00ED","í",
|
|
|
|
|
"\u00EE","î",
|
|
|
|
|
"\u00EF","ï",
|
|
|
|
|
"\u00F0","ð",
|
|
|
|
|
"\u00F1","ñ",
|
|
|
|
|
"\u00F2","ò",
|
|
|
|
|
"\u00F3","ó",
|
|
|
|
|
"\u00F4","ô",
|
|
|
|
|
"\u00F5","õ",
|
|
|
|
|
"\u00F6","ö",
|
|
|
|
|
"\u00F7","÷",
|
|
|
|
|
"\u00F8","ø",
|
|
|
|
|
"\u00F9","ù",
|
|
|
|
|
"\u00FA","ú",
|
|
|
|
|
"\u00FB","û",
|
|
|
|
|
"\u00FC","ü",
|
|
|
|
|
"\u00FD","ý",
|
|
|
|
|
"\u00FE","þ",
|
|
|
|
|
"\u00FF","ÿ"
|
|
|
|
|
"\u00A1", "¡", //inverted (spanish) exclamation mark
|
|
|
|
|
"\u00A2", "¢", //cent
|
|
|
|
|
"\u00A3", "£", //pound
|
|
|
|
|
"\u00A4", "¤", //currency
|
|
|
|
|
"\u00A5", "¥", //yen
|
|
|
|
|
"\u00A6", "¦", //broken vertical bar
|
|
|
|
|
"\u00A7", "§", //section sign
|
|
|
|
|
"\u00A8", "¨", //diaeresis (umlaut)
|
|
|
|
|
"\u00A9", "©", //copyright sign
|
|
|
|
|
"\u00AA", "ª", //feminine ordinal indicator
|
|
|
|
|
"\u00AB", "«", //left-pointing double angle quotation mark
|
|
|
|
|
"\u00AC", "¬", //not sign
|
|
|
|
|
"\u00AD", "­", //soft hyphen
|
|
|
|
|
"\u00AE", "®", //registered sign
|
|
|
|
|
"\u00AF", "¯", //macron
|
|
|
|
|
"\u00B0", "°", //degree sign
|
|
|
|
|
"\u00B1", "±", //plus-minus sign
|
|
|
|
|
"\u00B2", "²", //superscript two
|
|
|
|
|
"\u00B3", "³", //superscript three
|
|
|
|
|
"\u00B4", "´", //acute accent
|
|
|
|
|
"\u00B5", "µ", //micro sign
|
|
|
|
|
"\u00B6", "¶", //paragraph sign
|
|
|
|
|
"\u00B7", "·", //middle dot
|
|
|
|
|
"\u00B8", "¸", //cedilla
|
|
|
|
|
"\u00B9", "¹", //superscript one
|
|
|
|
|
"\u00BA", "º", //masculine ordinal indicator
|
|
|
|
|
"\u00BB", "»", //right-pointing double angle quotation mark
|
|
|
|
|
"\u00BC", "¼", //fraction 1/4
|
|
|
|
|
"\u00BD", "½", //fraction 1/2
|
|
|
|
|
"\u00BE", "¾", //fraction 3/4
|
|
|
|
|
"\u00BF", "¿", //inverted (Spanish) question mark
|
|
|
|
|
"\u00C0", "À",
|
|
|
|
|
"\u00C1", "Á",
|
|
|
|
|
"\u00C2", "Â",
|
|
|
|
|
"\u00C3", "Ã",
|
|
|
|
|
"\u00C4", "Ä",
|
|
|
|
|
"\u00C5", "Å",
|
|
|
|
|
"\u00C6", "Æ",
|
|
|
|
|
"\u00C7", "Ç",
|
|
|
|
|
"\u00C8", "È",
|
|
|
|
|
"\u00C9", "É",
|
|
|
|
|
"\u00CA", "Ê",
|
|
|
|
|
"\u00CB", "Ë",
|
|
|
|
|
"\u00CC", "Ì",
|
|
|
|
|
"\u00CD", "Í",
|
|
|
|
|
"\u00CE", "Î",
|
|
|
|
|
"\u00CF", "Ï",
|
|
|
|
|
"\u00D0", "Ð",
|
|
|
|
|
"\u00D1", "Ñ",
|
|
|
|
|
"\u00D2", "Ò",
|
|
|
|
|
"\u00D3", "Ó",
|
|
|
|
|
"\u00D4", "Ô",
|
|
|
|
|
"\u00D5", "Õ",
|
|
|
|
|
"\u00D6", "Ö",
|
|
|
|
|
"\u00D7", "×",
|
|
|
|
|
"\u00D8", "Ø",
|
|
|
|
|
"\u00D9", "Ù",
|
|
|
|
|
"\u00DA", "Ú",
|
|
|
|
|
"\u00DB", "Û",
|
|
|
|
|
"\u00DC", "Ü",
|
|
|
|
|
"\u00DD", "Ý",
|
|
|
|
|
"\u00DE", "Þ",
|
|
|
|
|
"\u00DF", "ß",
|
|
|
|
|
"\u00E0", "à",
|
|
|
|
|
"\u00E1", "á",
|
|
|
|
|
"\u00E2", "â",
|
|
|
|
|
"\u00E3", "ã",
|
|
|
|
|
"\u00E4", "ä",
|
|
|
|
|
"\u00E5", "å",
|
|
|
|
|
"\u00E6", "æ",
|
|
|
|
|
"\u00E7", "ç",
|
|
|
|
|
"\u00E8", "è",
|
|
|
|
|
"\u00E9", "é",
|
|
|
|
|
"\u00EA", "ê",
|
|
|
|
|
"\u00EB", "ë",
|
|
|
|
|
"\u00EC", "ì",
|
|
|
|
|
"\u00ED", "í",
|
|
|
|
|
"\u00EE", "î",
|
|
|
|
|
"\u00EF", "ï",
|
|
|
|
|
"\u00F0", "ð",
|
|
|
|
|
"\u00F1", "ñ",
|
|
|
|
|
"\u00F2", "ò",
|
|
|
|
|
"\u00F3", "ó",
|
|
|
|
|
"\u00F4", "ô",
|
|
|
|
|
"\u00F5", "õ",
|
|
|
|
|
"\u00F6", "ö",
|
|
|
|
|
"\u00F7", "÷",
|
|
|
|
|
"\u00F8", "ø",
|
|
|
|
|
"\u00F9", "ù",
|
|
|
|
|
"\u00FA", "ú",
|
|
|
|
|
"\u00FB", "û",
|
|
|
|
|
"\u00FC", "ü",
|
|
|
|
|
"\u00FD", "ý",
|
|
|
|
|
"\u00FE", "þ",
|
|
|
|
|
"\u00FF", "ÿ"
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
private final static Map<String, Character> html2unicode4xml = new HashMap<String, Character>(mapping4xml.length * 2);
|
|
|
|
|
private final static Map<String, Character> html2unicode4html = new HashMap<String, Character>(mapping4html.length * 2);
|
|
|
|
|
private final static Map<Character, String> unicode2html4xml = new HashMap<Character, String>(mapping4xml.length * 2);
|
|
|
|
|
private final static Map<Character, String> unicode2html4html = new HashMap<Character, String>(mapping4html.length * 2);
|
|
|
|
|
/** Mapping for XML to unicode. */
|
|
|
|
|
private static final Map<String, Character> HTML2UNICODE4XML =
|
|
|
|
|
new HashMap<String, Character>(MAPPING4XML.length * 2);
|
|
|
|
|
/** Mapping for HTML to unicode. */
|
|
|
|
|
private static final Map<String, Character> HTML2UNICODE4HTML =
|
|
|
|
|
new HashMap<String, Character>(MAPPING4HTML.length * 2);
|
|
|
|
|
/** Mapping for unicode to XML. */
|
|
|
|
|
private static final Map<Character, String> UNICODE2HTML4XML =
|
|
|
|
|
new HashMap<Character, String>(MAPPING4XML.length * 2);
|
|
|
|
|
/** Mapping for unicode to HTML. */
|
|
|
|
|
private static final Map<Character, String> UNICODE2HTML4HTML =
|
|
|
|
|
new HashMap<Character, String>(MAPPING4HTML.length * 2);
|
|
|
|
|
static {
|
|
|
|
|
Character c;
|
|
|
|
|
for (int i = 0; i < mapping4html.length; i += 2) {
|
|
|
|
|
c = Character.valueOf(mapping4html[i].charAt(0));
|
|
|
|
|
html2unicode4html.put(mapping4html[i + 1], c);
|
|
|
|
|
unicode2html4html.put(c, mapping4html[i + 1]);
|
|
|
|
|
for (int i = 0; i < MAPPING4HTML.length; i += 2) {
|
|
|
|
|
c = Character.valueOf(MAPPING4HTML[i].charAt(0));
|
|
|
|
|
HTML2UNICODE4HTML.put(MAPPING4HTML[i + 1], c);
|
|
|
|
|
UNICODE2HTML4HTML.put(c, MAPPING4HTML[i + 1]);
|
|
|
|
|
}
|
|
|
|
|
for (int i = 0; i < mapping4xml.length; i += 2) {
|
|
|
|
|
c = Character.valueOf(mapping4xml[i].charAt(0));
|
|
|
|
|
html2unicode4xml.put(mapping4xml[i + 1], c);
|
|
|
|
|
unicode2html4xml.put(c, mapping4xml[i + 1]);
|
|
|
|
|
for (int i = 0; i < MAPPING4XML.length; i += 2) {
|
|
|
|
|
c = Character.valueOf(MAPPING4XML[i].charAt(0));
|
|
|
|
|
HTML2UNICODE4XML.put(MAPPING4XML[i + 1], c);
|
|
|
|
|
UNICODE2HTML4XML.put(c, MAPPING4XML[i + 1]);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public static String unicode2xml(final String text, boolean amp) {
|
|
|
|
|
/**
 * Hidden constructor: this is a utility class with only static
 * methods, so it is never instantiated.
 */
private CharacterCoding() {
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Replaces characters which have special representation in XML.
|
|
|
|
|
* @see #MAPPING4XML
|
|
|
|
|
* @param text text with character to replace
|
|
|
|
|
* @param amp true if ampersands shall be replaced, else false
|
|
|
|
|
* @return text with replaced characters
|
|
|
|
|
*/
|
|
|
|
|
public static String unicode2xml(final String text, final boolean amp) {
|
|
|
|
|
return unicode2html(text, amp, false);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public static String unicode2html(final String text, boolean amp) {
|
|
|
|
|
/**
|
|
|
|
|
* Replaces characters which have special representation in HTML.
|
|
|
|
|
* @see #MAPPING4HTML
|
|
|
|
|
* @param text text with character to replace
|
|
|
|
|
* @param amp true if ampersands shall be replaced, else false
|
|
|
|
|
* @return text with replaced characters
|
|
|
|
|
*/
|
|
|
|
|
public static String unicode2html(final String text, final boolean amp) {
|
|
|
|
|
return unicode2html(text, amp, true);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private static String unicode2html(final String text, boolean amp, boolean html) {
|
|
|
|
|
/**
|
|
|
|
|
* Replaces characters which have special representation in HTML or XML.
|
|
|
|
|
* @param text text with characters to replace
|
|
|
|
|
* @param amp true if ampersands shall be replaced, else false
|
|
|
|
|
* @param html true if characters shall be replaced for embedding in
|
|
|
|
|
* HTML, false for XML (far more characters are replaced for HTML,
|
|
|
|
|
* compare {@link #MAPPING4HTML} with {@link #MAPPING4XML})
|
|
|
|
|
* @return text with replaced characters
|
|
|
|
|
*/
|
|
|
|
|
private static String unicode2html(
|
|
|
|
|
final String text, final boolean amp, final boolean html) {
|
|
|
|
|
if (text == null) return null;
|
|
|
|
|
final StringBuilder sb = new StringBuilder(text.length() * 12 / 10);
|
|
|
|
|
int textpos = 0;
|
|
|
|
@ -211,17 +256,17 @@ public class CharacterCoding {
|
|
|
|
|
while (textpos < text.length()) {
|
|
|
|
|
// find a (forward) mapping
|
|
|
|
|
c = text.charAt(textpos);
|
|
|
|
|
if (amp && c == amp_unicode) {
|
|
|
|
|
sb.append(amp_html);
|
|
|
|
|
if (amp && c == AMP_UNICODE) {
|
|
|
|
|
sb.append(AMP_HTML);
|
|
|
|
|
textpos++;
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
if ((r = unicode2html4xml.get(c)) != null) {
|
|
|
|
|
if ((r = UNICODE2HTML4XML.get(c)) != null) {
|
|
|
|
|
sb.append(r);
|
|
|
|
|
textpos++;
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
if (html && (r = unicode2html4html.get(c)) != null) {
|
|
|
|
|
if (html && (r = UNICODE2HTML4HTML.get(c)) != null) {
|
|
|
|
|
sb.append(r);
|
|
|
|
|
textpos++;
|
|
|
|
|
continue;
|
|
|
|
@ -232,6 +277,11 @@ public class CharacterCoding {
|
|
|
|
|
return sb.toString();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Replaces HTML-encoded characters with unicode representation.
|
|
|
|
|
* @param text text with characters to replace
|
|
|
|
|
* @return text with replaced characters
|
|
|
|
|
*/
|
|
|
|
|
public static String html2unicode(final String text) {
|
|
|
|
|
if (text == null) return null;
|
|
|
|
|
int p = 0, p1, q;
|
|
|
|
@ -246,7 +296,9 @@ public class CharacterCoding {
|
|
|
|
|
}
|
|
|
|
|
sb.append(text, p, p1);
|
|
|
|
|
p = p1;
|
|
|
|
|
if (p >= text.length()) break;
|
|
|
|
|
if (p >= text.length()) {
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
q = text.indexOf(';', p);
|
|
|
|
|
if (q < 0) {
|
|
|
|
|
// if there is now no semicolon, then this will also fail when another ampersand is found afterwards
|
|
|
|
@ -256,19 +308,19 @@ public class CharacterCoding {
|
|
|
|
|
}
|
|
|
|
|
s = text.substring(p, q + 1);
|
|
|
|
|
p = q + 1;
|
|
|
|
|
if (s.equals(amp_html)) {
|
|
|
|
|
sb.append(amp_unicode);
|
|
|
|
|
if (s.equals(AMP_HTML)) {
|
|
|
|
|
sb.append(AMP_UNICODE);
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
if (s.equals(space_html)) {
|
|
|
|
|
if (s.equals(SPACE_HTML)) {
|
|
|
|
|
sb.append(" ");
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
if ((r = html2unicode4xml.get(s)) != null) {
|
|
|
|
|
if ((r = HTML2UNICODE4XML.get(s)) != null) {
|
|
|
|
|
sb.append(r.charValue());
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
if ((r = html2unicode4html.get(s)) != null) {
|
|
|
|
|
if ((r = HTML2UNICODE4HTML.get(s)) != null) {
|
|
|
|
|
sb.append(r);
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
@ -281,7 +333,7 @@ public class CharacterCoding {
|
|
|
|
|
try {
|
|
|
|
|
int uc = Integer.parseInt(ucs);
|
|
|
|
|
sb.append(new char[] {(char) uc});
|
|
|
|
|
} catch (NumberFormatException e) {}
|
|
|
|
|
} catch (NumberFormatException e) { }
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
// the entity is unknown, skip it
|
|
|
|
@ -289,12 +341,19 @@ public class CharacterCoding {
|
|
|
|
|
return sb.toString();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Test method. Ignore it if you don't need it.
|
|
|
|
|
* @param args will be ignored
|
|
|
|
|
*/
|
|
|
|
|
public static void main(final String[] args) {
|
|
|
|
|
final String text = "Test-Text mit & um zyklische ü & Ersetzungen auszuschliessen";
|
|
|
|
|
final String text =
|
|
|
|
|
"Test-Text mit & um zyklische ü & Ersetzungen auszuschliessen";
|
|
|
|
|
final String txet = unicode2html(text, true);
|
|
|
|
|
System.out.println(txet);
|
|
|
|
|
System.out.println(html2unicode(txet));
|
|
|
|
|
if (html2unicode(txet).equals(text)) System.out.println("correct");
|
|
|
|
|
if (html2unicode(txet).equals(text)) {
|
|
|
|
|
System.out.println("correct");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
final String text2 = "encodeUnicode2xml: & \" < >";
|
|
|
|
|
System.out.println(text2);
|
|
|
|
|