// CharacterCoding.java
// ----------------------------------
// (C) 22.10.2008 by Michael Peter Christen; mc@yacy.net
// first published on http://yacy.net
// Frankfurt, Germany, 2008
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

package net.yacy.document.parser.html;

import java.util.HashMap;

/**
 * Converts between plain unicode text and text that uses HTML/XML character
 * entity references.
 *
 * <p>Two encoding levels are offered: the XML level escapes only the quotation
 * mark and the angle brackets, the HTML level additionally escapes the
 * characters listed in the HTML mapping table below (mostly ISO 8859-1
 * characters). Escaping of the ampersand itself is optional on both levels.
 *
 * <p>All methods are static and keep no mutable state after class
 * initialization; the lookup tables are only written inside the static
 * initializer.
 */
public class CharacterCoding {

    /** The ampersand (U+0026); handled separately because escaping it is optional. */
    private static final char AMP_UNICODE = '&';
    private static final String AMP_HTML = "&amp;";
    /** Non-breaking space entity; decoded to a plain space, never produced by the encoder. */
    private static final String SPACE_HTML = "&nbsp;";

    /** Pairs of (character, entity) that are escaped for XML and for HTML output. */
    private static final String[] MAPPING4XML = {
        "\"",     "&quot;",    // quotation mark
        "\u003C", "&lt;",      // less than
        "\u003E", "&gt;",      // greater than
    };

    /** Pairs of (character, entity) that are escaped for HTML output only. */
    private static final String[] MAPPING4HTML = {
        "\\",     "&#092;",    // Backslash
        "\u005E", "&#094;",    // Caret

        "\u0060", "&#096;",    // Accent Grave `
        "\u007B", "&#123;",    // {
        "\u007C", "&#124;",    // |
        "\u007D", "&#125;",    // }
        "\u007E", "&#126;",    // ~

        "\u0082", "&#130;",
        "\u0083", "&#131;",
        "\u0084", "&#132;",
        "\u0085", "&#133;",
        "\u0086", "&#134;",
        "\u0087", "&#135;",
        "\u0088", "&#136;",
        "\u0089", "&#137;",
        "\u008A", "&#138;",
        "\u008B", "&#139;",
        "\u008C", "&#140;",
        "\u008D", "&#141;",
        "\u008E", "&#142;",

        "\u0091", "&#145;",
        "\u0092", "&#146;",
        "\u0093", "&#147;",
        "\u0094", "&#148;",
        "\u0095", "&#149;",
        "\u0096", "&#150;",
        "\u0097", "&#151;",
        "\u0098", "&#152;",
        "\u0099", "&#153;",
        "\u009A", "&#154;",
        "\u009B", "&#155;",
        "\u009C", "&#156;",
        "\u009D", "&#157;",
        "\u009E", "&#158;",
        "\u009F", "&#159;",

        "\u00A1", "&iexcl;",   // inverted (spanish) exclamation mark
        "\u00A2", "&cent;",    // cent
        "\u00A3", "&pound;",   // pound
        "\u00A4", "&curren;",  // currency
        "\u00A5", "&yen;",     // yen
        "\u00A6", "&brvbar;",  // broken vertical bar
        "\u00A7", "&sect;",    // section sign
        "\u00A8", "&uml;",     // diaeresis (umlaut)
        "\u00A9", "&copy;",    // copyright sign
        "\u00AA", "&ordf;",    // feminine ordinal indicator
        "\u00AB", "&laquo;",   // left-pointing double angle quotation mark
        "\u00AC", "&not;",     // not sign
        "\u00AD", "&shy;",     // soft hyphen
        "\u00AE", "&reg;",     // registered sign
        "\u00AF", "&macr;",    // macron
        "\u00B0", "&deg;",     // degree sign
        "\u00B1", "&plusmn;",  // plus-minus sign
        "\u00B2", "&sup2;",    // superscript two
        "\u00B3", "&sup3;",    // superscript three
        "\u00B4", "&acute;",   // acute accent
        "\u00B5", "&micro;",   // micro sign
        "\u00B6", "&para;",    // paragraph sign
        "\u00B7", "&middot;",  // middle dot
        "\u00B8", "&cedil;",   // cedilla
        "\u00B9", "&sup1;",    // superscript one
        "\u00BA", "&ordm;",    // masculine ordinal indicator
        "\u00BB", "&raquo;",   // right-pointing double angle quotation mark
        "\u00BC", "&frac14;",  // fraction 1/4
        "\u00BD", "&frac12;",  // fraction 1/2
        "\u00BE", "&frac34;",  // fraction 3/4
        "\u00BF", "&iquest;",  // inverted (spanish) question mark
        "\u00C0", "&Agrave;",
        "\u00C1", "&Aacute;",
        "\u00C2", "&Acirc;",
        "\u00C3", "&Atilde;",
        "\u00C4", "&Auml;",
        "\u00C5", "&Aring;",
        "\u00C6", "&AElig;",
        "\u00C7", "&Ccedil;",
        "\u00C8", "&Egrave;",
        "\u00C9", "&Eacute;",
        "\u00CA", "&Ecirc;",
        "\u00CB", "&Euml;",
        "\u00CC", "&Igrave;",
        "\u00CD", "&Iacute;",
        "\u00CE", "&Icirc;",
        "\u00CF", "&Iuml;",
        "\u00D0", "&ETH;",
        "\u00D1", "&Ntilde;",
        "\u00D2", "&Ograve;",
        "\u00D3", "&Oacute;",
        "\u00D4", "&Ocirc;",
        "\u00D5", "&Otilde;",
        "\u00D6", "&Ouml;",
        "\u00D7", "&times;",
        "\u00D8", "&Oslash;",
        "\u00D9", "&Ugrave;",
        "\u00DA", "&Uacute;",
        "\u00DB", "&Ucirc;",
        "\u00DC", "&Uuml;",
        "\u00DD", "&Yacute;",
        "\u00DE", "&THORN;",
        "\u00DF", "&szlig;",
        "\u00E0", "&agrave;",
        "\u00E1", "&aacute;",
        "\u00E2", "&acirc;",
        "\u00E3", "&atilde;",
        "\u00E4", "&auml;",
        "\u00E5", "&aring;",
        "\u00E6", "&aelig;",
        "\u00E7", "&ccedil;",
        "\u00E8", "&egrave;",
        "\u00E9", "&eacute;",
        "\u00EA", "&ecirc;",
        "\u00EB", "&euml;",
        "\u00EC", "&igrave;",
        "\u00ED", "&iacute;",
        "\u00EE", "&icirc;",
        "\u00EF", "&iuml;",
        "\u00F0", "&eth;",
        "\u00F1", "&ntilde;",
        "\u00F2", "&ograve;",
        "\u00F3", "&oacute;",
        "\u00F4", "&ocirc;",
        "\u00F5", "&otilde;",
        "\u00F6", "&ouml;",
        "\u00F7", "&divide;",
        "\u00F8", "&oslash;",
        "\u00F9", "&ugrave;",
        "\u00FA", "&uacute;",
        "\u00FB", "&ucirc;",
        "\u00FC", "&uuml;",
        "\u00FD", "&yacute;",
        "\u00FE", "&thorn;",
        "\u00FF", "&yuml;"
    };

    // lookup tables in both directions, filled once from the pair arrays above
    private static final HashMap<String, Character> HTML2UNICODE4XML = new HashMap<String, Character>();
    private static final HashMap<String, Character> HTML2UNICODE4HTML = new HashMap<String, Character>();
    private static final HashMap<Character, String> UNICODE2HTML4XML = new HashMap<Character, String>();
    private static final HashMap<Character, String> UNICODE2HTML4HTML = new HashMap<Character, String>();

    static {
        register(MAPPING4HTML, HTML2UNICODE4HTML, UNICODE2HTML4HTML);
        register(MAPPING4XML, HTML2UNICODE4XML, UNICODE2HTML4XML);
    }

    /**
     * Fills a decode and an encode table from a flat array of
     * (character, entity) pairs.
     *
     * @param pairs  even-length array; element 2k holds the character, 2k+1 its entity
     * @param decode receives entity -> character
     * @param encode receives character -> entity
     */
    private static void register(
            final String[] pairs,
            final HashMap<String, Character> decode,
            final HashMap<Character, String> encode) {
        for (int i = 0; i + 1 < pairs.length; i += 2) {
            final Character c = Character.valueOf(pairs[i].charAt(0));
            final String entity = pairs[i + 1];
            decode.put(entity, c);
            encode.put(c, entity);
        }
    }

    /**
     * Escapes the quotation mark and the angle brackets for use in XML.
     *
     * @param text the text to encode, may be {@code null}
     * @param amp  if {@code true}, every ampersand is escaped as well
     * @return the encoded text, or {@code null} if {@code text} is {@code null}
     */
    public static String unicode2xml(final String text, boolean amp) {
        return unicode2html(text, amp, false);
    }

    /**
     * Escapes the XML characters plus all characters of the HTML mapping table.
     *
     * @param text the text to encode, may be {@code null}
     * @param amp  if {@code true}, every ampersand is escaped as well
     * @return the encoded text, or {@code null} if {@code text} is {@code null}
     */
    public static String unicode2html(final String text, boolean amp) {
        return unicode2html(text, amp, true);
    }

    /**
     * Shared encoder behind {@link #unicode2xml} and the public
     * {@code unicode2html}.
     *
     * @param text the text to encode, may be {@code null}
     * @param amp  whether the ampersand is escaped
     * @param html whether the HTML-only table is applied in addition to the XML table
     * @return the encoded text, or {@code null} if {@code text} is {@code null}
     */
    private static String unicode2html(final String text, boolean amp, boolean html) {
        if (text == null) {
            return null;
        }
        // entities are longer than the characters they replace; reserve 20% extra
        final StringBuilder sb = new StringBuilder(text.length() * 12 / 10);
        for (int i = 0; i < text.length(); i++) {
            final char c = text.charAt(i);
            if (amp && c == AMP_UNICODE) {
                sb.append(AMP_HTML);
                continue;
            }
            final Character key = Character.valueOf(c);
            String entity = UNICODE2HTML4XML.get(key);
            if (entity == null && html) {
                entity = UNICODE2HTML4HTML.get(key);
            }
            if (entity == null) {
                sb.append(c);
            } else {
                sb.append(entity);
            }
        }
        return sb.toString();
    }

    /**
     * Replaces character entity references by the characters they denote.
     *
     * <p>Known named entities, the ampersand entity, the non-breaking space
     * entity (decoded to a plain space) and decimal/hexadecimal numeric
     * references are decoded. A well-formed reference that is unknown or
     * whose number cannot be parsed is removed from the output. An ampersand
     * that does not start something shaped like a reference (no terminating
     * semicolon, or other characters than ASCII letters/digits in between)
     * is copied to the output unchanged.
     *
     * @param text the text to decode, may be {@code null}
     * @return the decoded text, or {@code null} if {@code text} is {@code null}
     */
    public static String html2unicode(final String text) {
        if (text == null) {
            return null;
        }
        final int len = text.length();
        final StringBuilder sb = new StringBuilder(len);
        int p = 0;
        int semi = -1; // position of the next ';', cached across literal ampersands
        while (p < len) {
            final int amp = text.indexOf(AMP_UNICODE, p);
            if (amp < 0) {
                sb.append(text, p, len);
                break;
            }
            sb.append(text, p, amp);
            if (semi < amp) {
                semi = text.indexOf(';', amp);
                if (semi < 0) {
                    // without any further semicolon nothing behind this point can be
                    // a reference; keep the remainder, including the ampersand
                    sb.append(text, amp, len);
                    break;
                }
            }
            if (!isEntityShaped(text, amp + 1, semi)) {
                // a literal ampersand in running text: keep it and rescan behind it
                sb.append(AMP_UNICODE);
                p = amp + 1;
                continue;
            }
            final String entity = text.substring(amp, semi + 1);
            p = semi + 1;
            if (entity.equals(AMP_HTML)) {
                sb.append(AMP_UNICODE);
                continue;
            }
            if (entity.equals(SPACE_HTML)) {
                sb.append(' ');
                continue;
            }
            Character decoded = HTML2UNICODE4XML.get(entity);
            if (decoded == null) {
                decoded = HTML2UNICODE4HTML.get(entity);
            }
            if (decoded != null) {
                sb.append(decoded.charValue());
                continue;
            }
            if (entity.charAt(1) == '#') {
                appendNumericReference(sb, entity);
            }
            // any other entity is unknown and skipped
        }
        return sb.toString();
    }

    /**
     * Tells whether {@code text[from, to)} could be the body of a character
     * reference: non-empty, ASCII letters and digits only, optionally led by
     * a single {@code #}.
     */
    private static boolean isEntityShaped(final String text, final int from, final int to) {
        if (from >= to) {
            return false;
        }
        for (int i = from; i < to; i++) {
            final char c = text.charAt(i);
            final boolean alnum = (c >= '0' && c <= '9') || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
            if (!alnum && !(c == '#' && i == from)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Decodes a numeric reference and appends the resulting code point.
     *
     * @param sb     the output buffer
     * @param entity the complete reference, starting with an ampersand and
     *               a {@code #} and ending with a semicolon
     */
    private static void appendNumericReference(final StringBuilder sb, final String entity) {
        final int end = entity.length() - 1; // index of the closing ';'
        final char marker = entity.charAt(2);
        final boolean hex = marker == 'x' || marker == 'X';
        try {
            final int codePoint = hex
                    ? Integer.parseInt(entity.substring(3, end), 16)
                    : Integer.parseInt(entity.substring(2, end), 10);
            // appendCodePoint writes a surrogate pair for supplementary characters
            // instead of truncating the value to 16 bit
            if (Character.isValidCodePoint(codePoint)) {
                sb.appendCodePoint(codePoint);
            }
        } catch (final NumberFormatException ignored) {
            // not a number or out of int range: dropped like an unknown named entity
        }
    }

    /**
     * Small manual demonstration; prints encoded and decoded sample texts.
     *
     * @param args unused
     */
    public static void main(final String[] args) {
        final String text = "Test-Text mit & um zyklische &uuml; &amp; Ersetzungen auszuschliessen";
        final String txet = unicode2html(text, true);
        System.out.println(txet);
        System.out.println(html2unicode(txet));
        if (html2unicode(txet).equals(text)) {
            System.out.println("correct");
        }

        final String text2 = "encodeUnicode2xml: & \" < >";
        System.out.println(text2);
        System.out.println(unicode2xml(text2, true));

        final String text3 = "space&nbsp;t&auml;st";
        System.out.println(text3);
        System.out.println(html2unicode(text3));
    }
}