*) added comments

*) minor refactoring

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7971 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
low012 14 years ago
parent 6b22865dbc
commit 277b454a62

@ -27,182 +27,227 @@ package net.yacy.document.parser.html;
import java.util.HashMap;
import java.util.Map;
public class CharacterCoding {
/**
* Contains methods to convert between Unicode and XML/HTML encoding.
*/
public final class CharacterCoding {
private static final char amp_unicode = "\u0026".charAt(0);
private static final String amp_html = "&";
private static final String space_html = " ";
private static final String[] mapping4xml = {
"\"",""", //quotation mark
"\u003C","<", //less than
"\u003E",">", //greater than
/** Ampersand character in unicode encoding. */
private static final char AMP_UNICODE = '\u0026';
/**
 * Ampersand written as an entity. It must be the complete entity
 * including the trailing semicolon: html2unicode compares
 * semicolon-terminated substrings against this value, and unicode2html
 * appends it in place of a bare ampersand.
 */
private static final String AMP_HTML = "&amp;";
/**
 * Non-breaking space entity (including the trailing semicolon);
 * html2unicode replaces it with a plain space character.
 */
private static final String SPACE_HTML = "&nbsp;";
/** Special characters which have to be mapped for XML. */
private static final String[] MAPPING4XML = {
"\"", """, //quotation mark
"\u003C", "<", //less than
"\u003E", ">", //greater than
};
private static final String[] mapping4html = {
"\\", "\", // Backslash
"\u005E","^", // Caret
"\u0060","`", // Accent Grave `
"\u007B","{", // {
"\u007C","|", // |
"\u007D","}", // }
"\u007E","~", // ~
/** Special characters which have to be mapped for HTML. */
private static final String[] MAPPING4HTML = {
"\\", "\", // Backslash
"\u005E", "^", // Caret
"\u0060", "`", // Accent Grave `
"\u007B", "{", // {
"\u007C", "|", // |
"\u007D", "}", // }
"\u007E", "~", // ~
"\u0082","‚",
"\u0083","ƒ",
"\u0084","„",
"\u0085","…",
"\u0086","†",
"\u0087","‡",
"\u0088","ˆ",
"\u0089","‰",
"\u008A","Š",
"\u008B","‹",
"\u008C","Œ",
"\u008D","",
"\u008E","Ž",
"\u0082", "‚",
"\u0083", "ƒ",
"\u0084", "„",
"\u0085", "…",
"\u0086", "†",
"\u0087", "‡",
"\u0088", "ˆ",
"\u0089", "‰",
"\u008A", "Š",
"\u008B", "‹",
"\u008C", "Œ",
"\u008D", "",
"\u008E", "Ž",
"\u0091","‘",
"\u0092","’",
"\u0093","“",
"\u0094","”",
"\u0095","•",
"\u0096","–",
"\u0097","—",
"\u0098","˜",
"\u0099","™",
"\u009A","š",
"\u009B","›",
"\u009C","œ",
"\u009D","",
"\u009E","ž",
"\u009F","Ÿ",
"\u0091", "‘",
"\u0092", "’",
"\u0093", "“",
"\u0094", "”",
"\u0095", "•",
"\u0096", "–",
"\u0097", "—",
"\u0098", "˜",
"\u0099", "™",
"\u009A", "š",
"\u009B", "›",
"\u009C", "œ",
"\u009D", "",
"\u009E", "ž",
"\u009F", "Ÿ",
"\u00A1","¡", //inverted (spanish) exclamation mark
"\u00A2","¢", //cent
"\u00A3","£", //pound
"\u00A4","¤", //currency
"\u00A5","¥", //yen
"\u00A6","¦", //broken vertical bar
"\u00A7","§", //section sign
"\u00A8","¨", //diaeresis (umlaut)
"\u00A9","©", //copyright sign
"\u00AA","ª", //feminine ordinal indicator
"\u00AB","«", //left-pointing double angle quotation mark
"\u00AC","¬", //not sign
"\u00AD","­", //soft hyphen
"\u00AE","®", //registered sign
"\u00AF","¯", //macron
"\u00B0","°", //degree sign
"\u00B1","±", //plus-minus sign
"\u00B2","²", //superscript two
"\u00B3","³", //superscript three
"\u00B4","´", //acute accent
"\u00B5","µ", //micro sign
"\u00B6","¶", //paragraph sign
"\u00B7","·", //middle dot
"\u00B8","¸", //cedilla
"\u00B9","¹", //superscript one
"\u00BA","º", //masculine ordinal indicator
"\u00BB","»", //right-pointing double angle quotation mark
"\u00BC","¼", //fraction 1/4
"\u00BD","½", //fraction 1/2
"\u00BE","¾", //fraction 3/4
"\u00BF","¿", //inverted (spanisch) questionmark
"\u00C0","À",
"\u00C1","Á",
"\u00C2","Â",
"\u00C3","Ã",
"\u00C4","Ä",
"\u00C5","Å",
"\u00C6","Æ",
"\u00C7","Ç",
"\u00C8","È",
"\u00C9","É",
"\u00CA","Ê",
"\u00CB","Ë",
"\u00CC","Ì",
"\u00CD","Í",
"\u00CE","Î",
"\u00CF","Ï",
"\u00D0","Ð",
"\u00D1","Ñ",
"\u00D2","Ò",
"\u00D3","Ó",
"\u00D4","Ô",
"\u00D5","Õ",
"\u00D6","Ö",
"\u00D7","×",
"\u00D8","Ø",
"\u00D9","Ù",
"\u00DA","Ú",
"\u00DB","Û",
"\u00DC","Ü",
"\u00DD","Ý",
"\u00DE","Þ",
"\u00DF","ß",
"\u00E0","à",
"\u00E1","á",
"\u00E2","â",
"\u00E3","ã",
"\u00E4","ä",
"\u00E5","å",
"\u00E6","æ",
"\u00E7","ç",
"\u00E8","è",
"\u00E9","é",
"\u00EA","ê",
"\u00EB","ë",
"\u00EC","ì",
"\u00ED","í",
"\u00EE","î",
"\u00EF","ï",
"\u00F0","ð",
"\u00F1","ñ",
"\u00F2","ò",
"\u00F3","ó",
"\u00F4","ô",
"\u00F5","õ",
"\u00F6","ö",
"\u00F7","÷",
"\u00F8","ø",
"\u00F9","ù",
"\u00FA","ú",
"\u00FB","û",
"\u00FC","ü",
"\u00FD","ý",
"\u00FE","þ",
"\u00FF","ÿ"
"\u00A1", "¡", //inverted (spanish) exclamation mark
"\u00A2", "¢", //cent
"\u00A3", "£", //pound
"\u00A4", "¤", //currency
"\u00A5", "¥", //yen
"\u00A6", "¦", //broken vertical bar
"\u00A7", "§", //section sign
"\u00A8", "¨", //diaeresis (umlaut)
"\u00A9", "©", //copyright sign
"\u00AA", "ª", //feminine ordinal indicator
"\u00AB", "«", //left-pointing double angle quotation mark
"\u00AC", "¬", //not sign
"\u00AD", "­", //soft hyphen
"\u00AE", "®", //registered sign
"\u00AF", "¯", //macron
"\u00B0", "°", //degree sign
"\u00B1", "±", //plus-minus sign
"\u00B2", "²", //superscript two
"\u00B3", "³", //superscript three
"\u00B4", "´", //acute accent
"\u00B5", "µ", //micro sign
"\u00B6", "¶", //paragraph sign
"\u00B7", "·", //middle dot
"\u00B8", "¸", //cedilla
"\u00B9", "¹", //superscript one
"\u00BA", "º", //masculine ordinal indicator
"\u00BB", "»", //right-pointing double angle quotation mark
"\u00BC", "¼", //fraction 1/4
"\u00BD", "½", //fraction 1/2
"\u00BE", "¾", //fraction 3/4
"\u00BF", "¿", //inverted (spanisch) questionmark
"\u00C0", "À",
"\u00C1", "Á",
"\u00C2", "Â",
"\u00C3", "Ã",
"\u00C4", "Ä",
"\u00C5", "Å",
"\u00C6", "Æ",
"\u00C7", "Ç",
"\u00C8", "È",
"\u00C9", "É",
"\u00CA", "Ê",
"\u00CB", "Ë",
"\u00CC", "Ì",
"\u00CD", "Í",
"\u00CE", "Î",
"\u00CF", "Ï",
"\u00D0", "Ð",
"\u00D1", "Ñ",
"\u00D2", "Ò",
"\u00D3", "Ó",
"\u00D4", "Ô",
"\u00D5", "Õ",
"\u00D6", "Ö",
"\u00D7", "×",
"\u00D8", "Ø",
"\u00D9", "Ù",
"\u00DA", "Ú",
"\u00DB", "Û",
"\u00DC", "Ü",
"\u00DD", "Ý",
"\u00DE", "Þ",
"\u00DF", "ß",
"\u00E0", "à",
"\u00E1", "á",
"\u00E2", "â",
"\u00E3", "ã",
"\u00E4", "ä",
"\u00E5", "å",
"\u00E6", "æ",
"\u00E7", "ç",
"\u00E8", "è",
"\u00E9", "é",
"\u00EA", "ê",
"\u00EB", "ë",
"\u00EC", "ì",
"\u00ED", "í",
"\u00EE", "î",
"\u00EF", "ï",
"\u00F0", "ð",
"\u00F1", "ñ",
"\u00F2", "ò",
"\u00F3", "ó",
"\u00F4", "ô",
"\u00F5", "õ",
"\u00F6", "ö",
"\u00F7", "÷",
"\u00F8", "ø",
"\u00F9", "ù",
"\u00FA", "ú",
"\u00FB", "û",
"\u00FC", "ü",
"\u00FD", "ý",
"\u00FE", "þ",
"\u00FF", "ÿ"
};
private final static Map<String, Character> html2unicode4xml = new HashMap<String, Character>(mapping4xml.length * 2);
private final static Map<String, Character> html2unicode4html = new HashMap<String, Character>(mapping4html.length * 2);
private final static Map<Character, String> unicode2html4xml = new HashMap<Character, String>(mapping4xml.length * 2);
private final static Map<Character, String> unicode2html4html = new HashMap<Character, String>(mapping4html.length * 2);
/**
 * Lookup from XML entity string to unicode character. Filled by the static
 * initializer from the odd (entity) and even (character) entries of
 * {@link #MAPPING4XML}; the initial capacity is twice the array length.
 */
private static final Map<String, Character> HTML2UNICODE4XML =
new HashMap<String, Character>(MAPPING4XML.length * 2);
/**
 * Lookup from HTML entity string to unicode character. Filled by the static
 * initializer from the odd (entity) and even (character) entries of
 * {@link #MAPPING4HTML}; the initial capacity is twice the array length.
 */
private static final Map<String, Character> HTML2UNICODE4HTML =
new HashMap<String, Character>(MAPPING4HTML.length * 2);
/**
 * Lookup from unicode character to XML entity string; the reverse direction
 * of {@link #HTML2UNICODE4XML}, filled in the same loop of the static
 * initializer.
 */
private static final Map<Character, String> UNICODE2HTML4XML =
new HashMap<Character, String>(MAPPING4XML.length * 2);
/**
 * Lookup from unicode character to HTML entity string; the reverse direction
 * of {@link #HTML2UNICODE4HTML}, filled in the same loop of the static
 * initializer.
 */
private static final Map<Character, String> UNICODE2HTML4HTML =
new HashMap<Character, String>(MAPPING4HTML.length * 2);
static {
Character c;
for (int i = 0; i < mapping4html.length; i += 2) {
c = Character.valueOf(mapping4html[i].charAt(0));
html2unicode4html.put(mapping4html[i + 1], c);
unicode2html4html.put(c, mapping4html[i + 1]);
for (int i = 0; i < MAPPING4HTML.length; i += 2) {
c = Character.valueOf(MAPPING4HTML[i].charAt(0));
HTML2UNICODE4HTML.put(MAPPING4HTML[i + 1], c);
UNICODE2HTML4HTML.put(c, MAPPING4HTML[i + 1]);
}
for (int i = 0; i < mapping4xml.length; i += 2) {
c = Character.valueOf(mapping4xml[i].charAt(0));
html2unicode4xml.put(mapping4xml[i + 1], c);
unicode2html4xml.put(c, mapping4xml[i + 1]);
for (int i = 0; i < MAPPING4XML.length; i += 2) {
c = Character.valueOf(MAPPING4XML[i].charAt(0));
HTML2UNICODE4XML.put(MAPPING4XML[i + 1], c);
UNICODE2HTML4XML.put(c, MAPPING4XML[i + 1]);
}
}
public static String unicode2xml(final String text, boolean amp) {
/**
 * Not meant to be instantiated: this utility class only offers
 * static methods, so the constructor is private.
 */
private CharacterCoding() {
    // intentionally empty
}
/**
 * Encodes the characters of a text which need a special representation
 * in XML. Delegates to the three-argument unicode2html with the HTML
 * flag switched off, so only the XML mapping is applied.
 * @see #MAPPING4XML
 * @param text text containing the characters to replace, may be null
 * @param amp true if ampersands shall be replaced as well, else false
 * @return the text with replaced characters, or null if text was null
 */
public static String unicode2xml(final String text, final boolean amp) {
    final boolean forHtml = false;
    final String encoded = unicode2html(text, amp, forHtml);
    return encoded;
}
public static String unicode2html(final String text, boolean amp) {
/**
 * Encodes the characters of a text which need a special representation
 * in HTML. Delegates to the three-argument unicode2html with the HTML
 * flag switched on, so the HTML mapping is applied in addition to the
 * XML mapping.
 * @see #MAPPING4HTML
 * @param text text containing the characters to replace, may be null
 * @param amp true if ampersands shall be replaced as well, else false
 * @return the text with replaced characters, or null if text was null
 */
public static String unicode2html(final String text, final boolean amp) {
    final boolean forHtml = true;
    final String encoded = unicode2html(text, amp, forHtml);
    return encoded;
}
private static String unicode2html(final String text, boolean amp, boolean html) {
/**
* Replaces characters which have special representation in HTML or XML.
* @param text text with character to replace
* @param amp true if ampersands shall be replaced, else false
* @param html true if characters shall be replaced for embedding in
* HTML, false for XML (far more characters are replaced for HTML,
* compare {@link #MAPPING4HTML} with {@link #MAPPING4XML}
* @return text with replaced characters
*/
private static String unicode2html(
final String text, final boolean amp, final boolean html) {
if (text == null) return null;
final StringBuilder sb = new StringBuilder(text.length() * 12 / 10);
int textpos = 0;
@ -211,17 +256,17 @@ public class CharacterCoding {
while (textpos < text.length()) {
// find a (forward) mapping
c = text.charAt(textpos);
if (amp && c == amp_unicode) {
sb.append(amp_html);
if (amp && c == AMP_UNICODE) {
sb.append(AMP_HTML);
textpos++;
continue;
}
if ((r = unicode2html4xml.get(c)) != null) {
if ((r = UNICODE2HTML4XML.get(c)) != null) {
sb.append(r);
textpos++;
continue;
}
if (html && (r = unicode2html4html.get(c)) != null) {
if (html && (r = UNICODE2HTML4HTML.get(c)) != null) {
sb.append(r);
textpos++;
continue;
@ -231,7 +276,12 @@ public class CharacterCoding {
}
return sb.toString();
}
/**
* Replaces HTML-encoded characters with unicode representation.
* @param text text with character to replace
* @return text with replaced characters
*/
public static String html2unicode(final String text) {
if (text == null) return null;
int p = 0, p1, q;
@ -246,7 +296,9 @@ public class CharacterCoding {
}
sb.append(text, p, p1);
p = p1;
if (p >= text.length()) break;
if (p >= text.length()) {
break;
}
q = text.indexOf(';', p);
if (q < 0) {
// if there is now no semicolon, then this will also fail when another ampersand is found afterwards
@ -256,19 +308,19 @@ public class CharacterCoding {
}
s = text.substring(p, q + 1);
p = q + 1;
if (s.equals(amp_html)) {
sb.append(amp_unicode);
if (s.equals(AMP_HTML)) {
sb.append(AMP_UNICODE);
continue;
}
if (s.equals(space_html)) {
if (s.equals(SPACE_HTML)) {
sb.append(" ");
continue;
}
if ((r = html2unicode4xml.get(s)) != null) {
if ((r = HTML2UNICODE4XML.get(s)) != null) {
sb.append(r.charValue());
continue;
}
if ((r = html2unicode4html.get(s)) != null) {
if ((r = HTML2UNICODE4HTML.get(s)) != null) {
sb.append(r);
continue;
}
@ -279,9 +331,9 @@ public class CharacterCoding {
}
String ucs = s.substring(2, s.length() - 1);
try {
int uc = Integer.parseInt(ucs);
sb.append(new char[] {(char) uc});
} catch (NumberFormatException e) {}
int uc = Integer.parseInt(ucs);
sb.append(new char[] {(char) uc});
} catch (NumberFormatException e) { }
continue;
}
// the entity is unknown, skip it
@ -289,13 +341,20 @@ public class CharacterCoding {
return sb.toString();
}
/**
* Test method. Ignore it if you don't need it.
* @param args will be ignored
*/
public static void main(final String[] args) {
final String text = "Test-Text mit & um zyklische &uuml; &amp; Ersetzungen auszuschliessen";
final String text =
"Test-Text mit & um zyklische &uuml; &amp; Ersetzungen auszuschliessen";
final String txet = unicode2html(text, true);
System.out.println(txet);
System.out.println(html2unicode(txet));
if (html2unicode(txet).equals(text)) System.out.println("correct");
if (html2unicode(txet).equals(text)) {
System.out.println("correct");
}
final String text2 = "encodeUnicode2xml: & \" < >";
System.out.println(text2);
System.out.println(unicode2xml(text2, true));

Loading…
Cancel
Save