Replace hardcoded html/xml entities with a file, support decoding all defined HTML entities

pull/405/head
jfhs 4 years ago
parent f8cbaeef93
commit 2135d259e3

File diff suppressed because it is too large Load Diff

@ -24,7 +24,20 @@
package net.yacy.document.parser.html; package net.yacy.document.parser.html;
import net.yacy.search.Switchboard;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.HashMap; import java.util.HashMap;
import java.util.Iterator;
import java.util.Map; import java.util.Map;
import java.util.regex.Pattern; import java.util.regex.Pattern;
@ -44,171 +57,187 @@ public final class CharacterCoding {
/** Special characters which have to be mapped for XML. */ /** Special characters which have to be mapped for XML. */
private static final String[] MAPPING4XML = { private static final String[] MAPPING4XML = {
"\"", """, //quotation mark "\"", """, //quotation mark
"\u003C", "<", //less than "\u003C", "<", //less than
"\u003E", ">", //greater than "\u003E", ">", //greater than
}; };
/** Special characters which have to be mapped for HTML. */ /** Special characters which have to be mapped for HTML. */
private static final String[] MAPPING4HTML = { private static final String[] MAPPING4HTML = {
"\\", "\", // Backslash "\\", "\", // Backslash
"\u005E", "^", // Caret "\u005E", "^", // Caret
"\u0060", "`", // Accent Grave ` "\u0060", "`", // Accent Grave `
"\u007B", "{", // { "\u007B", "{", // {
"\u007C", "|", // | "\u007C", "|", // |
"\u007D", "}", // } "\u007D", "}", // }
"\u007E", "~", // ~ "\u007E", "~", // ~
"\u0082", "‚", "\u0082", "‚",
"\u0083", "ƒ", "\u0083", "ƒ",
"\u0084", "„", "\u0084", "„",
"\u0085", "…", "\u0085", "…",
"\u0086", "†", "\u0086", "†",
"\u0087", "‡", "\u0087", "‡",
"\u0088", "ˆ", "\u0088", "ˆ",
"\u0089", "‰", "\u0089", "‰",
"\u008A", "Š", "\u008A", "Š",
"\u008B", "‹", "\u008B", "‹",
"\u008C", "Œ", "\u008C", "Œ",
"\u008D", "", "\u008D", "",
"\u008E", "Ž", "\u008E", "Ž",
"\u0091", "‘", "\u0091", "‘",
"\u0092", "’", "\u0092", "’",
"\u0093", "“", "\u0093", "“",
"\u0094", "”", "\u0094", "”",
"\u0095", "•", "\u0095", "•",
"\u0096", "–", "\u0096", "–",
"\u0097", "—", "\u0097", "—",
"\u0098", "˜", "\u0098", "˜",
"\u0099", "™", "\u0099", "™",
"\u009A", "š", "\u009A", "š",
"\u009B", "›", "\u009B", "›",
"\u009C", "œ", "\u009C", "œ",
"\u009D", "", "\u009D", "",
"\u009E", "ž", "\u009E", "ž",
"\u009F", "Ÿ", "\u009F", "Ÿ",
"\u00A1", "¡", //inverted (spanish) exclamation mark "\u00A1", "¡", //inverted (spanish) exclamation mark
"\u00A2", "¢", //cent "\u00A2", "¢", //cent
"\u00A3", "£", //pound "\u00A3", "£", //pound
"\u00A4", "¤", //currency "\u00A4", "¤", //currency
"\u00A5", "¥", //yen "\u00A5", "¥", //yen
"\u00A6", "¦", //broken vertical bar "\u00A6", "¦", //broken vertical bar
"\u00A7", "§", //section sign "\u00A7", "§", //section sign
"\u00A8", "¨", //diaeresis (umlaut) "\u00A8", "¨", //diaeresis (umlaut)
"\u00A9", "©", //copyright sign "\u00A9", "©", //copyright sign
"\u00AA", "ª", //feminine ordinal indicator "\u00AA", "ª", //feminine ordinal indicator
"\u00AB", "«", //left-pointing double angle quotation mark "\u00AB", "«", //left-pointing double angle quotation mark
"\u00AC", "¬", //not sign "\u00AC", "¬", //not sign
"\u00AD", "­", //soft hyphen "\u00AD", "­", //soft hyphen
"\u00AE", "®", //registered sign "\u00AE", "®", //registered sign
"\u00AF", "¯", //macron "\u00AF", "¯", //macron
"\u00B0", "°", //degree sign "\u00B0", "°", //degree sign
"\u00B1", "±", //plus-minus sign "\u00B1", "±", //plus-minus sign
"\u00B2", "²", //superscript two "\u00B2", "²", //superscript two
"\u00B3", "³", //superscript three "\u00B3", "³", //superscript three
"\u00B4", "´", //acute accent "\u00B4", "´", //acute accent
"\u00B5", "µ", //micro sign "\u00B5", "µ", //micro sign
"\u00B6", "¶", //paragraph sign "\u00B6", "¶", //paragraph sign
"\u00B7", "·", //middle dot "\u00B7", "·", //middle dot
"\u00B8", "¸", //cedilla "\u00B8", "¸", //cedilla
"\u00B9", "¹", //superscript one "\u00B9", "¹", //superscript one
"\u00BA", "º", //masculine ordinal indicator "\u00BA", "º", //masculine ordinal indicator
"\u00BB", "»", //right-pointing double angle quotation mark "\u00BB", "»", //right-pointing double angle quotation mark
"\u00BC", "¼", //fraction 1/4 "\u00BC", "¼", //fraction 1/4
"\u00BD", "½", //fraction 1/2 "\u00BD", "½", //fraction 1/2
"\u00BE", "¾", //fraction 3/4 "\u00BE", "¾", //fraction 3/4
"\u00BF", "¿", //inverted (spanisch) questionmark "\u00BF", "¿", //inverted (spanisch) questionmark
"\u00C0", "À", "\u00C0", "À",
"\u00C1", "Á", "\u00C1", "Á",
"\u00C2", "Â", "\u00C2", "Â",
"\u00C3", "Ã", "\u00C3", "Ã",
"\u00C4", "Ä", "\u00C4", "Ä",
"\u00C5", "Å", "\u00C5", "Å",
"\u00C6", "Æ", "\u00C6", "Æ",
"\u00C7", "Ç", "\u00C7", "Ç",
"\u00C8", "È", "\u00C8", "È",
"\u00C9", "É", "\u00C9", "É",
"\u00CA", "Ê", "\u00CA", "Ê",
"\u00CB", "Ë", "\u00CB", "Ë",
"\u00CC", "Ì", "\u00CC", "Ì",
"\u00CD", "Í", "\u00CD", "Í",
"\u00CE", "Î", "\u00CE", "Î",
"\u00CF", "Ï", "\u00CF", "Ï",
"\u00D0", "Ð", "\u00D0", "Ð",
"\u00D1", "Ñ", "\u00D1", "Ñ",
"\u00D2", "Ò", "\u00D2", "Ò",
"\u00D3", "Ó", "\u00D3", "Ó",
"\u00D4", "Ô", "\u00D4", "Ô",
"\u00D5", "Õ", "\u00D5", "Õ",
"\u00D6", "Ö", "\u00D6", "Ö",
"\u00D7", "×", "\u00D7", "×",
"\u00D8", "Ø", "\u00D8", "Ø",
"\u00D9", "Ù", "\u00D9", "Ù",
"\u00DA", "Ú", "\u00DA", "Ú",
"\u00DB", "Û", "\u00DB", "Û",
"\u00DC", "Ü", "\u00DC", "Ü",
"\u00DD", "Ý", "\u00DD", "Ý",
"\u00DE", "Þ", "\u00DE", "Þ",
"\u00DF", "ß", "\u00DF", "ß",
"\u00E0", "à", "\u00E0", "à",
"\u00E1", "á", "\u00E1", "á",
"\u00E2", "â", "\u00E2", "â",
"\u00E3", "ã", "\u00E3", "ã",
"\u00E4", "ä", "\u00E4", "ä",
"\u00E5", "å", "\u00E5", "å",
"\u00E6", "æ", "\u00E6", "æ",
"\u00E7", "ç", "\u00E7", "ç",
"\u00E8", "è", "\u00E8", "è",
"\u00E9", "é", "\u00E9", "é",
"\u00EA", "ê", "\u00EA", "ê",
"\u00EB", "ë", "\u00EB", "ë",
"\u00EC", "ì", "\u00EC", "ì",
"\u00ED", "í", "\u00ED", "í",
"\u00EE", "î", "\u00EE", "î",
"\u00EF", "ï", "\u00EF", "ï",
"\u00F0", "ð", "\u00F0", "ð",
"\u00F1", "ñ", "\u00F1", "ñ",
"\u00F2", "ò", "\u00F2", "ò",
"\u00F3", "ó", "\u00F3", "ó",
"\u00F4", "ô", "\u00F4", "ô",
"\u00F5", "õ", "\u00F5", "õ",
"\u00F6", "ö", "\u00F6", "ö",
"\u00F7", "÷", "\u00F7", "÷",
"\u00F8", "ø", "\u00F8", "ø",
"\u00F9", "ù", "\u00F9", "ù",
"\u00FA", "ú", "\u00FA", "ú",
"\u00FB", "û", "\u00FB", "û",
"\u00FC", "ü", "\u00FC", "ü",
"\u00FD", "ý", "\u00FD", "ý",
"\u00FE", "þ", "\u00FE", "þ",
"\u00FF", "ÿ" "\u00FF", "ÿ"
}; };
/** Mapping for XML to unicode. */ /** Mapping for XML to unicode. */
private static final Map<String, Character> HTML2UNICODE4XML = private static final Map<String, String> HTML2UNICODE4XML =
new HashMap<String, Character>(MAPPING4XML.length * 2); new HashMap<String, String>();
/** Mapping for HTML to unicode. */ /** Mapping for HTML to unicode. */
private static final Map<String, Character> HTML2UNICODE4HTML = private static final Map<String, String> HTML2UNICODE4HTML =
new HashMap<String, Character>(MAPPING4HTML.length * 2); new HashMap<String, String>();
/** Mapping for unicode to XML. */ /** Mapping for unicode to XML. */
private static final Map<Character, String> UNICODE2HTML4XML = private static final Map<Character, String> UNICODE2HTML4XML =
new HashMap<Character, String>(MAPPING4XML.length * 2); new HashMap<Character, String>(MAPPING4XML.length * 2);
/** Mapping for unicode to HTML. */ /** Mapping for unicode to HTML. */
private static final Map<Character, String> UNICODE2HTML4HTML = private static final Map<Character, String> UNICODE2HTML4HTML =
new HashMap<Character, String>(MAPPING4HTML.length * 2); new HashMap<Character, String>(MAPPING4XML.length * 2);
/**
 * Copies entity definitions from a JSON object into a lookup map.
 * Every key of <code>entities</code> is taken as the entity name; its value is read as a
 * nested JSON object whose "characters" member is stored in the map under that name.
 * An existing map entry with the same name is overwritten.
 * @param entities JSON object mapping entity names to objects with a "characters" member
 * @param entityToChar map receiving the entity name to character(s) pairs
 * @throws JSONException propagated from the org.json accessors, e.g. when an entry is not
 * a JSON object or has no "characters" member; entries stored before the failure remain
 */
static void parseJsonEntities(JSONObject entities, Map<String, String> entityToChar) throws JSONException {
    final Iterator<String> names = entities.keys();
    while (names.hasNext()) {
        final String name = names.next();
        final JSONObject definition = entities.getJSONObject(name);
        entityToChar.put(name, definition.getString("characters"));
    }
}
// Static initializer: fills the entity lookup tables when the class is loaded.
static { static {
// Read <appPath>/defaults/htmlEntities.json as UTF-8; when Switchboard.getSwitchboard()
// returns null, the path is resolved relative to "." instead of the application path.
try {
byte[] encoded = Files.readAllBytes(Paths.get(Switchboard.getSwitchboard() != null ? Switchboard.getSwitchboard().appPath.getAbsolutePath() : ".", "defaults", "htmlEntities.json"));
JSONObject json = new JSONObject(new String(encoded, StandardCharsets.UTF_8));
// The "xml" section feeds the XML decode map.
parseJsonEntities(json.getJSONObject("xml"), HTML2UNICODE4XML);
// "html4" and "html5" both feed the same HTML decode map; "html5" is loaded second, so an
// entity name present in both sections ends up with its "html5" value.
parseJsonEntities(json.getJSONObject("html4"), HTML2UNICODE4HTML);
parseJsonEntities(json.getJSONObject("html5"), HTML2UNICODE4HTML);
} catch (IOException | JSONException e) {
// A missing or malformed file does not abort class initialization: the stack trace is
// printed and the decode maps keep only the entries stored before the failure.
e.printStackTrace();
}
Character c; Character c;
// Register each (character, entity) pair of MAPPING4HTML in the unicode-to-HTML map.
for (int i = 0; i < MAPPING4HTML.length; i += 2) { for (int i = 0; i < MAPPING4HTML.length; i += 2) {
c = Character.valueOf(MAPPING4HTML[i].charAt(0)); c = MAPPING4HTML[i].charAt(0);
HTML2UNICODE4HTML.put(MAPPING4HTML[i + 1], c);
UNICODE2HTML4HTML.put(c, MAPPING4HTML[i + 1]); UNICODE2HTML4HTML.put(c, MAPPING4HTML[i + 1]);
} }
// Register each (character, entity) pair of MAPPING4XML in the unicode-to-XML map.
for (int i = 0; i < MAPPING4XML.length; i += 2) { for (int i = 0; i < MAPPING4XML.length; i += 2) {
c = Character.valueOf(MAPPING4XML[i].charAt(0)); c = MAPPING4XML[i].charAt(0);
HTML2UNICODE4XML.put(MAPPING4XML[i + 1], c);
UNICODE2HTML4XML.put(c, MAPPING4XML[i + 1]); UNICODE2HTML4XML.put(c, MAPPING4XML[i + 1]);
} }
} }
@ -220,7 +249,6 @@ public final class CharacterCoding {
/** /**
* Replaces characters which have special representation in XML. * Replaces characters which have special representation in XML.
* @see #MAPPING4XML
* @param text text with character to replace * @param text text with character to replace
* @param amp true if ampersands shall be replaced, else false * @param amp true if ampersands shall be replaced, else false
* @return text with replaced characters * @return text with replaced characters
@ -231,7 +259,6 @@ public final class CharacterCoding {
/** /**
* Replaces characters which have special representation in HTML. * Replaces characters which have special representation in HTML.
* @see #MAPPING4HTML
* @param text text with character to replace * @param text text with character to replace
* @param amp true if ampersands shall be replaced, else false * @param amp true if ampersands shall be replaced, else false
* @return text with replaced characters * @return text with replaced characters
@ -246,7 +273,7 @@ public final class CharacterCoding {
* @param amp true if ampersands shall be replaced, else false * @param amp true if ampersands shall be replaced, else false
* @param html true if characters shall be replaced for embedding in * @param html true if characters shall be replaced for embedding in
* HTML, false for XML (far more characters are replaced for HTML, * HTML, false for XML (far more characters are replaced for HTML,
* compare {@link #MAPPING4HTML} with {@link #MAPPING4XML} * see defaults/htmlEntities.json
* @return text with replaced characters * @return text with replaced characters
*/ */
private static String unicode2html( private static String unicode2html(
@ -291,7 +318,7 @@ public final class CharacterCoding {
int p = 0, p1, q; int p = 0, p1, q;
final StringBuilder sb = new StringBuilder(text.length()); final StringBuilder sb = new StringBuilder(text.length());
String s; String s;
Character r; String r;
while (p < text.length()) { while (p < text.length()) {
p1 = text.indexOf('&', p); p1 = text.indexOf('&', p);
if (p1 < 0) { if (p1 < 0) {
@ -328,7 +355,7 @@ public final class CharacterCoding {
continue; continue;
} }
if ((r = HTML2UNICODE4XML.get(s)) != null) { if ((r = HTML2UNICODE4XML.get(s)) != null) {
sb.append(r.charValue()); sb.append(r);
continue; continue;
} }
if ((r = HTML2UNICODE4HTML.get(s)) != null) { if ((r = HTML2UNICODE4HTML.get(s)) != null) {

Loading…
Cancel
Save