From 277b454a62edcbc19834bdcc7a5e5f88398e5e02 Mon Sep 17 00:00:00 2001 From: low012 Date: Sun, 25 Sep 2011 13:16:52 +0000 Subject: [PATCH] *) added comments *) minor refactoring git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7971 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- .../document/parser/html/CharacterCoding.java | 413 ++++++++++-------- 1 file changed, 236 insertions(+), 177 deletions(-) diff --git a/source/net/yacy/document/parser/html/CharacterCoding.java b/source/net/yacy/document/parser/html/CharacterCoding.java index 0f0bda721..6c3ed1efd 100644 --- a/source/net/yacy/document/parser/html/CharacterCoding.java +++ b/source/net/yacy/document/parser/html/CharacterCoding.java @@ -27,182 +27,227 @@ package net.yacy.document.parser.html; import java.util.HashMap; import java.util.Map; -public class CharacterCoding { +/** + * Contains methods to convert between Unicode and XML/HTML encoding. + */ +public final class CharacterCoding { - private static final char amp_unicode = "\u0026".charAt(0); - private static final String amp_html = "&"; - private static final String space_html = " "; - - private static final String[] mapping4xml = { - "\"",""", //quotation mark - "\u003C","<", //less than - "\u003E",">", //greater than + /** Ampersand character in unicode encoding. */ + private static final char AMP_UNICODE = "\u0026".charAt(0); + /** Ampersand character in HTML encoding. */ + private static final String AMP_HTML = "&"; + /** Space character in HTML encoding. */ + private static final String SPACE_HTML = " "; + + /** Special characters which have to be mapped for XML. */ + private static final String[] MAPPING4XML = { + "\"", """, //quotation mark + "\u003C", "<", //less than + "\u003E", ">", //greater than }; - - private static final String[] mapping4html = { - "\\", "\", // Backslash - "\u005E","^", // Caret - "\u0060","`", // Accent Grave ` - "\u007B","{", // { - "\u007C","|", // | - "\u007D","}", // } - "\u007E","~", // ~ + /** Special characters which have to be mapped for HTML. */ + private static final String[] MAPPING4HTML = { + "\\", "\", // Backslash + "\u005E", "^", // Caret + + "\u0060", "`", // Accent Grave ` + "\u007B", "{", // { + "\u007C", "|", // | + "\u007D", "}", // } + "\u007E", "~", // ~ - "\u0082","‚", - "\u0083","ƒ", - "\u0084","„", - "\u0085","…", - "\u0086","†", - "\u0087","‡", - "\u0088","ˆ", - "\u0089","‰", - "\u008A","Š", - "\u008B","‹", - "\u008C","Œ", - "\u008D","", - "\u008E","Ž", + "\u0082", "‚", + "\u0083", "ƒ", + "\u0084", "„", + "\u0085", "…", + "\u0086", "†", + "\u0087", "‡", + "\u0088", "ˆ", + "\u0089", "‰", + "\u008A", "Š", + "\u008B", "‹", + "\u008C", "Œ", + "\u008D", "", + "\u008E", "Ž", - "\u0091","‘", - "\u0092","’", - "\u0093","“", - "\u0094","”", - "\u0095","•", - "\u0096","–", - "\u0097","—", - "\u0098","˜", - "\u0099","™", - "\u009A","š", - "\u009B","›", - "\u009C","œ", - "\u009D","", - "\u009E","ž", - "\u009F","Ÿ", + "\u0091", "‘", + "\u0092", "’", + "\u0093", "“", + "\u0094", "”", + "\u0095", "•", + "\u0096", "–", + "\u0097", "—", + "\u0098", "˜", + "\u0099", "™", + "\u009A", "š", + "\u009B", "›", + "\u009C", "œ", + "\u009D", "", + "\u009E", "ž", + "\u009F", "Ÿ", - "\u00A1","¡", //inverted (spanish) exclamation mark - "\u00A2","¢", //cent - "\u00A3","£", //pound - "\u00A4","¤", //currency - "\u00A5","¥", //yen - "\u00A6","¦", //broken vertical bar - "\u00A7","§", //section sign - "\u00A8","¨", //diaeresis (umlaut) - "\u00A9","©", //copyright sign - "\u00AA","ª", //feminine ordinal indicator - "\u00AB","«", //left-pointing double angle quotation mark - "\u00AC","¬", //not sign - "\u00AD","­", //soft hyphen - "\u00AE","®", //registered sign - "\u00AF","¯", //macron - "\u00B0","°", //degree sign - "\u00B1","±", //plus-minus sign - "\u00B2","²", //superscript two - "\u00B3","³", //superscript three - "\u00B4","´", //acute accent - "\u00B5","µ", //micro sign - "\u00B6","¶", //paragraph sign - "\u00B7","·", //middle dot - "\u00B8","¸", //cedilla - "\u00B9","¹", //superscript one - "\u00BA","º", //masculine ordinal indicator - "\u00BB","»", //right-pointing double angle quotation mark - "\u00BC","¼", //fraction 1/4 - "\u00BD","½", //fraction 1/2 - "\u00BE","¾", //fraction 3/4 - "\u00BF","¿", //inverted (spanisch) questionmark - "\u00C0","À", - "\u00C1","Á", - "\u00C2","Â", - "\u00C3","Ã", - "\u00C4","Ä", - "\u00C5","Å", - "\u00C6","Æ", - "\u00C7","Ç", - "\u00C8","È", - "\u00C9","É", - "\u00CA","Ê", - "\u00CB","Ë", - "\u00CC","Ì", - "\u00CD","Í", - "\u00CE","Î", - "\u00CF","Ï", - "\u00D0","Ð", - "\u00D1","Ñ", - "\u00D2","Ò", - "\u00D3","Ó", - "\u00D4","Ô", - "\u00D5","Õ", - "\u00D6","Ö", - "\u00D7","×", - "\u00D8","Ø", - "\u00D9","Ù", - "\u00DA","Ú", - "\u00DB","Û", - "\u00DC","Ü", - "\u00DD","Ý", - "\u00DE","Þ", - "\u00DF","ß", - "\u00E0","à", - "\u00E1","á", - "\u00E2","â", - "\u00E3","ã", - "\u00E4","ä", - "\u00E5","å", - "\u00E6","æ", - "\u00E7","ç", - "\u00E8","è", - "\u00E9","é", - "\u00EA","ê", - "\u00EB","ë", - "\u00EC","ì", - "\u00ED","í", - "\u00EE","î", - "\u00EF","ï", - "\u00F0","ð", - "\u00F1","ñ", - "\u00F2","ò", - "\u00F3","ó", - "\u00F4","ô", - "\u00F5","õ", - "\u00F6","ö", - "\u00F7","÷", - "\u00F8","ø", - "\u00F9","ù", - "\u00FA","ú", - "\u00FB","û", - "\u00FC","ü", - "\u00FD","ý", - "\u00FE","þ", - "\u00FF","ÿ" + "\u00A1", "¡", //inverted (spanish) exclamation mark + "\u00A2", "¢", //cent + "\u00A3", "£", //pound + "\u00A4", "¤", //currency + "\u00A5", "¥", //yen + "\u00A6", "¦", //broken vertical bar + "\u00A7", "§", //section sign + "\u00A8", "¨", //diaeresis (umlaut) + "\u00A9", "©", //copyright sign + "\u00AA", "ª", //feminine ordinal indicator + "\u00AB", "«", //left-pointing double angle quotation mark + "\u00AC", "¬", //not sign + "\u00AD", "­", //soft hyphen + "\u00AE", "®", //registered sign + "\u00AF", "¯", //macron + "\u00B0", "°", //degree sign + "\u00B1", "±", //plus-minus sign + "\u00B2", "²", //superscript two + "\u00B3", "³", //superscript three + "\u00B4", "´", //acute accent + "\u00B5", "µ", //micro sign + "\u00B6", "¶", //paragraph sign + "\u00B7", "·", //middle dot + "\u00B8", "¸", //cedilla + "\u00B9", "¹", //superscript one + "\u00BA", "º", //masculine ordinal indicator + "\u00BB", "»", //right-pointing double angle quotation mark + "\u00BC", "¼", //fraction 1/4 + "\u00BD", "½", //fraction 1/2 + "\u00BE", "¾", //fraction 3/4 + "\u00BF", "¿", //inverted (spanisch) questionmark + "\u00C0", "À", + "\u00C1", "Á", + "\u00C2", "Â", + "\u00C3", "Ã", + "\u00C4", "Ä", + "\u00C5", "Å", + "\u00C6", "Æ", + "\u00C7", "Ç", + "\u00C8", "È", + "\u00C9", "É", + "\u00CA", "Ê", + "\u00CB", "Ë", + "\u00CC", "Ì", + "\u00CD", "Í", + "\u00CE", "Î", + "\u00CF", "Ï", + "\u00D0", "Ð", + "\u00D1", "Ñ", + "\u00D2", "Ò", + "\u00D3", "Ó", + "\u00D4", "Ô", + "\u00D5", "Õ", + "\u00D6", "Ö", + "\u00D7", "×", + "\u00D8", "Ø", + "\u00D9", "Ù", + "\u00DA", "Ú", + "\u00DB", "Û", + "\u00DC", "Ü", + "\u00DD", "Ý", + "\u00DE", "Þ", + "\u00DF", "ß", + "\u00E0", "à", + "\u00E1", "á", + "\u00E2", "â", + "\u00E3", "ã", + "\u00E4", "ä", + "\u00E5", "å", + "\u00E6", "æ", + "\u00E7", "ç", + "\u00E8", "è", + "\u00E9", "é", + "\u00EA", "ê", + "\u00EB", "ë", + "\u00EC", "ì", + "\u00ED", "í", + "\u00EE", "î", + "\u00EF", "ï", + "\u00F0", "ð", + "\u00F1", "ñ", + "\u00F2", "ò", + "\u00F3", "ó", + "\u00F4", "ô", + "\u00F5", "õ", + "\u00F6", "ö", + "\u00F7", "÷", + "\u00F8", "ø", + "\u00F9", "ù", + "\u00FA", "ú", + "\u00FB", "û", + "\u00FC", "ü", + "\u00FD", "ý", + "\u00FE", "þ", + "\u00FF", "ÿ" }; - - private final static Map html2unicode4xml = new HashMap(mapping4xml.length * 2); - private final static Map html2unicode4html = new HashMap(mapping4html.length * 2); - private final static Map unicode2html4xml = new HashMap(mapping4xml.length * 2); - private final static Map unicode2html4html = new HashMap(mapping4html.length * 2); + + /** Mapping for XML to unicode. */ + private static final Map HTML2UNICODE4XML = + new HashMap(MAPPING4XML.length * 2); + /** Mapping for HTML to unicode. */ + private static final Map HTML2UNICODE4HTML = + new HashMap(MAPPING4HTML.length * 2); + /** Mapping for unicode to XML. */ + private static final Map UNICODE2HTML4XML = + new HashMap(MAPPING4XML.length * 2); + /** Mapping for unicode to HTML. */ + private static final Map UNICODE2HTML4HTML = + new HashMap(MAPPING4HTML.length * 2); static { Character c; - for (int i = 0; i < mapping4html.length; i += 2) { - c = Character.valueOf(mapping4html[i].charAt(0)); - html2unicode4html.put(mapping4html[i + 1], c); - unicode2html4html.put(c, mapping4html[i + 1]); + for (int i = 0; i < MAPPING4HTML.length; i += 2) { + c = Character.valueOf(MAPPING4HTML[i].charAt(0)); + HTML2UNICODE4HTML.put(MAPPING4HTML[i + 1], c); + UNICODE2HTML4HTML.put(c, MAPPING4HTML[i + 1]); } - for (int i = 0; i < mapping4xml.length; i += 2) { - c = Character.valueOf(mapping4xml[i].charAt(0)); - html2unicode4xml.put(mapping4xml[i + 1], c); - unicode2html4xml.put(c, mapping4xml[i + 1]); + for (int i = 0; i < MAPPING4XML.length; i += 2) { + c = Character.valueOf(MAPPING4XML[i].charAt(0)); + HTML2UNICODE4XML.put(MAPPING4XML[i + 1], c); + UNICODE2HTML4XML.put(c, MAPPING4XML[i + 1]); } } - - public static String unicode2xml(final String text, boolean amp) { + + /** Private constructor to avoid instantiation of utility + * class with only static methods. + */ + private CharacterCoding() { } + + /** + * Replaces characters which have special representation in XML. + * @see #MAPPING4XML + * @param text text with character to replace + * @param amp true if ampersands shall be replaced, else false + * @return text with replaced characters + */ + public static String unicode2xml(final String text, final boolean amp) { return unicode2html(text, amp, false); } - - public static String unicode2html(final String text, boolean amp) { + + /** + * Replaces characters which have special representation in HTML. + * @see #MAPPING4HTML + * @param text text with character to replace + * @param amp true if ampersands shall be replaced, else false + * @return text with replaced characters + */ + public static String unicode2html(final String text, final boolean amp) { return unicode2html(text, amp, true); } - - private static String unicode2html(final String text, boolean amp, boolean html) { + + /** + * Replaces characters which have special representation in HTML or XML. + * @param text text with character to replace + * @param amp true if ampersands shall be replaced, else false + * @param html true if characters shall be replaced for embedding in + * HTML, false for XML (far more characters are replaced for HTML, + * compare {@link #MAPPING4HTML} with {@link #MAPPING4XML} + * @return text with replaced characters + */ + private static String unicode2html( + final String text, final boolean amp, final boolean html) { if (text == null) return null; final StringBuilder sb = new StringBuilder(text.length() * 12 / 10); int textpos = 0; @@ -211,17 +256,17 @@ public class CharacterCoding { while (textpos < text.length()) { // find a (forward) mapping c = text.charAt(textpos); - if (amp && c == amp_unicode) { - sb.append(amp_html); + if (amp && c == AMP_UNICODE) { + sb.append(AMP_HTML); textpos++; continue; } - if ((r = unicode2html4xml.get(c)) != null) { + if ((r = UNICODE2HTML4XML.get(c)) != null) { sb.append(r); textpos++; continue; } - if (html && (r = unicode2html4html.get(c)) != null) { + if (html && (r = UNICODE2HTML4HTML.get(c)) != null) { sb.append(r); textpos++; continue; @@ -231,7 +276,12 @@ public class CharacterCoding { } return sb.toString(); } - + + /** + * Replaces HTML-encoded characters with unicode representation. + * @param text text with character to replace + * @return text with replaced characters + */ public static String html2unicode(final String text) { if (text == null) return null; int p = 0, p1, q; @@ -246,7 +296,9 @@ public class CharacterCoding { } sb.append(text, p, p1); p = p1; - if (p >= text.length()) break; + if (p >= text.length()) { + break; + } q = text.indexOf(';', p); if (q < 0) { // if there is now no semicolon, then this will also fail when another ampersand is found afterwards @@ -256,19 +308,19 @@ public class CharacterCoding { } s = text.substring(p, q + 1); p = q + 1; - if (s.equals(amp_html)) { - sb.append(amp_unicode); + if (s.equals(AMP_HTML)) { + sb.append(AMP_UNICODE); continue; } - if (s.equals(space_html)) { + if (s.equals(SPACE_HTML)) { sb.append(" "); continue; } - if ((r = html2unicode4xml.get(s)) != null) { + if ((r = HTML2UNICODE4XML.get(s)) != null) { sb.append(r.charValue()); continue; } - if ((r = html2unicode4html.get(s)) != null) { + if ((r = HTML2UNICODE4HTML.get(s)) != null) { sb.append(r); continue; } @@ -279,9 +331,9 @@ public class CharacterCoding { } String ucs = s.substring(2, s.length() - 1); try { - int uc = Integer.parseInt(ucs); - sb.append(new char[] {(char) uc}); - } catch (NumberFormatException e) {} + int uc = Integer.parseInt(ucs); + sb.append(new char[] {(char) uc}); + } catch (NumberFormatException e) { } continue; } // the entity is unknown, skip it @@ -289,13 +341,20 @@ public class CharacterCoding { return sb.toString(); } + /** + * Test method. Ignore it if you don't need it. + * @param args will be ignored + */ public static void main(final String[] args) { - final String text = "Test-Text mit & um zyklische ü & Ersetzungen auszuschliessen"; + final String text = + "Test-Text mit & um zyklische ü & Ersetzungen auszuschliessen"; final String txet = unicode2html(text, true); System.out.println(txet); System.out.println(html2unicode(txet)); - if (html2unicode(txet).equals(text)) System.out.println("correct"); - + if (html2unicode(txet).equals(text)) { + System.out.println("correct"); + } + final String text2 = "encodeUnicode2xml: & \" < >"; System.out.println(text2); System.out.println(unicode2xml(text2, true));