Replace hardcoded html/xml entities with a file, support decoding all defined HTML entities

pull/405/head
jfhs 3 years ago
parent f8cbaeef93
commit 2135d259e3

File diff suppressed because it is too large Load Diff

@ -24,7 +24,20 @@
package net.yacy.document.parser.html;
import net.yacy.search.Switchboard;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.regex.Pattern;
@ -44,171 +57,187 @@ public final class CharacterCoding {
/** Special characters which have to be mapped for XML. */
private static final String[] MAPPING4XML = {
"\"", """, //quotation mark
"\u003C", "<", //less than
"\u003E", ">", //greater than
"\"", """, //quotation mark
"\u003C", "<", //less than
"\u003E", ">", //greater than
};
/** Special characters which have to be mapped for HTML. */
private static final String[] MAPPING4HTML = {
"\\", "\", // Backslash
"\u005E", "^", // Caret
"\\", "\", // Backslash
"\u005E", "^", // Caret
"\u0060", "`", // Accent Grave `
"\u007B", "{", // {
"\u007C", "|", // |
"\u007D", "}", // }
"\u007E", "~", // ~
"\u0060", "`", // Accent Grave `
"\u007B", "{", // {
"\u007C", "|", // |
"\u007D", "}", // }
"\u007E", "~", // ~
"\u0082", "‚",
"\u0083", "ƒ",
"\u0084", "„",
"\u0085", "…",
"\u0086", "†",
"\u0087", "‡",
"\u0088", "ˆ",
"\u0089", "‰",
"\u008A", "Š",
"\u008B", "‹",
"\u008C", "Œ",
"\u008D", "",
"\u008E", "Ž",
"\u0082", "‚",
"\u0083", "ƒ",
"\u0084", "„",
"\u0085", "…",
"\u0086", "†",
"\u0087", "‡",
"\u0088", "ˆ",
"\u0089", "‰",
"\u008A", "Š",
"\u008B", "‹",
"\u008C", "Œ",
"\u008D", "",
"\u008E", "Ž",
"\u0091", "‘",
"\u0092", "’",
"\u0093", "“",
"\u0094", "”",
"\u0095", "•",
"\u0096", "–",
"\u0097", "—",
"\u0098", "˜",
"\u0099", "™",
"\u009A", "š",
"\u009B", "›",
"\u009C", "œ",
"\u009D", "",
"\u009E", "ž",
"\u009F", "Ÿ",
"\u0091", "‘",
"\u0092", "’",
"\u0093", "“",
"\u0094", "”",
"\u0095", "•",
"\u0096", "–",
"\u0097", "—",
"\u0098", "˜",
"\u0099", "™",
"\u009A", "š",
"\u009B", "›",
"\u009C", "œ",
"\u009D", "",
"\u009E", "ž",
"\u009F", "Ÿ",
"\u00A1", "¡", //inverted (spanish) exclamation mark
"\u00A2", "¢", //cent
"\u00A3", "£", //pound
"\u00A4", "¤", //currency
"\u00A5", "¥", //yen
"\u00A6", "¦", //broken vertical bar
"\u00A7", "§", //section sign
"\u00A8", "¨", //diaeresis (umlaut)
"\u00A9", "©", //copyright sign
"\u00AA", "ª", //feminine ordinal indicator
"\u00AB", "«", //left-pointing double angle quotation mark
"\u00AC", "¬", //not sign
"\u00AD", "­", //soft hyphen
"\u00AE", "®", //registered sign
"\u00AF", "¯", //macron
"\u00B0", "°", //degree sign
"\u00B1", "±", //plus-minus sign
"\u00B2", "²", //superscript two
"\u00B3", "³", //superscript three
"\u00B4", "´", //acute accent
"\u00B5", "µ", //micro sign
"\u00B6", "¶", //paragraph sign
"\u00B7", "·", //middle dot
"\u00B8", "¸", //cedilla
"\u00B9", "¹", //superscript one
"\u00BA", "º", //masculine ordinal indicator
"\u00BB", "»", //right-pointing double angle quotation mark
"\u00BC", "¼", //fraction 1/4
"\u00BD", "½", //fraction 1/2
"\u00BE", "¾", //fraction 3/4
"\u00BF", "¿", //inverted (spanisch) questionmark
"\u00C0", "À",
"\u00C1", "Á",
"\u00C2", "Â",
"\u00C3", "Ã",
"\u00C4", "Ä",
"\u00C5", "Å",
"\u00C6", "Æ",
"\u00C7", "Ç",
"\u00C8", "È",
"\u00C9", "É",
"\u00CA", "Ê",
"\u00CB", "Ë",
"\u00CC", "Ì",
"\u00CD", "Í",
"\u00CE", "Î",
"\u00CF", "Ï",
"\u00D0", "Ð",
"\u00D1", "Ñ",
"\u00D2", "Ò",
"\u00D3", "Ó",
"\u00D4", "Ô",
"\u00D5", "Õ",
"\u00D6", "Ö",
"\u00D7", "×",
"\u00D8", "Ø",
"\u00D9", "Ù",
"\u00DA", "Ú",
"\u00DB", "Û",
"\u00DC", "Ü",
"\u00DD", "Ý",
"\u00DE", "Þ",
"\u00DF", "ß",
"\u00E0", "à",
"\u00E1", "á",
"\u00E2", "â",
"\u00E3", "ã",
"\u00E4", "ä",
"\u00E5", "å",
"\u00E6", "æ",
"\u00E7", "ç",
"\u00E8", "è",
"\u00E9", "é",
"\u00EA", "ê",
"\u00EB", "ë",
"\u00EC", "ì",
"\u00ED", "í",
"\u00EE", "î",
"\u00EF", "ï",
"\u00F0", "ð",
"\u00F1", "ñ",
"\u00F2", "ò",
"\u00F3", "ó",
"\u00F4", "ô",
"\u00F5", "õ",
"\u00F6", "ö",
"\u00F7", "÷",
"\u00F8", "ø",
"\u00F9", "ù",
"\u00FA", "ú",
"\u00FB", "û",
"\u00FC", "ü",
"\u00FD", "ý",
"\u00FE", "þ",
"\u00FF", "ÿ"
"\u00A1", "¡", //inverted (spanish) exclamation mark
"\u00A2", "¢", //cent
"\u00A3", "£", //pound
"\u00A4", "¤", //currency
"\u00A5", "¥", //yen
"\u00A6", "¦", //broken vertical bar
"\u00A7", "§", //section sign
"\u00A8", "¨", //diaeresis (umlaut)
"\u00A9", "©", //copyright sign
"\u00AA", "ª", //feminine ordinal indicator
"\u00AB", "«", //left-pointing double angle quotation mark
"\u00AC", "¬", //not sign
"\u00AD", "­", //soft hyphen
"\u00AE", "®", //registered sign
"\u00AF", "¯", //macron
"\u00B0", "°", //degree sign
"\u00B1", "±", //plus-minus sign
"\u00B2", "²", //superscript two
"\u00B3", "³", //superscript three
"\u00B4", "´", //acute accent
"\u00B5", "µ", //micro sign
"\u00B6", "¶", //paragraph sign
"\u00B7", "·", //middle dot
"\u00B8", "¸", //cedilla
"\u00B9", "¹", //superscript one
"\u00BA", "º", //masculine ordinal indicator
"\u00BB", "»", //right-pointing double angle quotation mark
"\u00BC", "¼", //fraction 1/4
"\u00BD", "½", //fraction 1/2
"\u00BE", "¾", //fraction 3/4
"\u00BF", "¿", //inverted (spanisch) questionmark
"\u00C0", "À",
"\u00C1", "Á",
"\u00C2", "Â",
"\u00C3", "Ã",
"\u00C4", "Ä",
"\u00C5", "Å",
"\u00C6", "Æ",
"\u00C7", "Ç",
"\u00C8", "È",
"\u00C9", "É",
"\u00CA", "Ê",
"\u00CB", "Ë",
"\u00CC", "Ì",
"\u00CD", "Í",
"\u00CE", "Î",
"\u00CF", "Ï",
"\u00D0", "Ð",
"\u00D1", "Ñ",
"\u00D2", "Ò",
"\u00D3", "Ó",
"\u00D4", "Ô",
"\u00D5", "Õ",
"\u00D6", "Ö",
"\u00D7", "×",
"\u00D8", "Ø",
"\u00D9", "Ù",
"\u00DA", "Ú",
"\u00DB", "Û",
"\u00DC", "Ü",
"\u00DD", "Ý",
"\u00DE", "Þ",
"\u00DF", "ß",
"\u00E0", "à",
"\u00E1", "á",
"\u00E2", "â",
"\u00E3", "ã",
"\u00E4", "ä",
"\u00E5", "å",
"\u00E6", "æ",
"\u00E7", "ç",
"\u00E8", "è",
"\u00E9", "é",
"\u00EA", "ê",
"\u00EB", "ë",
"\u00EC", "ì",
"\u00ED", "í",
"\u00EE", "î",
"\u00EF", "ï",
"\u00F0", "ð",
"\u00F1", "ñ",
"\u00F2", "ò",
"\u00F3", "ó",
"\u00F4", "ô",
"\u00F5", "õ",
"\u00F6", "ö",
"\u00F7", "÷",
"\u00F8", "ø",
"\u00F9", "ù",
"\u00FA", "ú",
"\u00FB", "û",
"\u00FC", "ü",
"\u00FD", "ý",
"\u00FE", "þ",
"\u00FF", "ÿ"
};
/** Mapping for XML to unicode. */
private static final Map<String, Character> HTML2UNICODE4XML =
new HashMap<String, Character>(MAPPING4XML.length * 2);
private static final Map<String, String> HTML2UNICODE4XML =
new HashMap<String, String>();
/** Mapping for HTML to unicode. */
private static final Map<String, Character> HTML2UNICODE4HTML =
new HashMap<String, Character>(MAPPING4HTML.length * 2);
private static final Map<String, String> HTML2UNICODE4HTML =
new HashMap<String, String>();
/** Mapping for unicode to XML. */
private static final Map<Character, String> UNICODE2HTML4XML =
new HashMap<Character, String>(MAPPING4XML.length * 2);
/** Mapping for unicode to HTML. */
private static final Map<Character, String> UNICODE2HTML4HTML =
new HashMap<Character, String>(MAPPING4HTML.length * 2);
new HashMap<Character, String>(MAPPING4XML.length * 2);
/**
 * Copies every entity definition of a JSON object into a lookup map.
 * Each key of {@code entities} must map to a JSON object which has a
 * string member named "characters"; that string becomes the map value.
 * @param entities JSON object with the entity names as keys
 * @param entityToChar target map, receives entity name -> characters
 * @throws JSONException if a key does not map to a JSON object or the
 * object has no "characters" string
 */
static void parseJsonEntities(JSONObject entities, Map<String, String> entityToChar) throws JSONException {
    final Iterator<String> names = entities.keys();
    while (names.hasNext()) {
        final String name = names.next();
        final JSONObject definition = entities.getJSONObject(name);
        entityToChar.put(name, definition.getString("characters"));
    }
}
static {
    // Decoding tables (entity -> characters) are read from
    // <application path>/defaults/htmlEntities.json; the working directory
    // is used as application path when no Switchboard instance exists.
    final Switchboard switchboard = Switchboard.getSwitchboard();
    final String appRoot = switchboard != null ? switchboard.appPath.getAbsolutePath() : ".";
    try {
        final byte[] encoded = Files.readAllBytes(Paths.get(appRoot, "defaults", "htmlEntities.json"));
        final JSONObject json = new JSONObject(new String(encoded, StandardCharsets.UTF_8));
        parseJsonEntities(json.getJSONObject("xml"), HTML2UNICODE4XML);
        // html4 and html5 definitions share one map, html5 is loaded last
        // and therefore wins for entities defined in both sections
        parseJsonEntities(json.getJSONObject("html4"), HTML2UNICODE4HTML);
        parseJsonEntities(json.getJSONObject("html5"), HTML2UNICODE4HTML);
    } catch (IOException | JSONException e) {
        // best effort: class initialization continues, so the encoding
        // tables below are still filled, but the decoding tables may be
        // empty or incomplete
        System.err.println("CharacterCoding: cannot load defaults/htmlEntities.json below '" + appRoot + "'");
        e.printStackTrace();
    }

    // Encoding tables (character -> entity) come from the built-in arrays,
    // which hold the character at even and its entity at odd positions.
    for (int i = 0; i < MAPPING4HTML.length; i += 2) {
        UNICODE2HTML4HTML.put(MAPPING4HTML[i].charAt(0), MAPPING4HTML[i + 1]);
    }
    for (int i = 0; i < MAPPING4XML.length; i += 2) {
        UNICODE2HTML4XML.put(MAPPING4XML[i].charAt(0), MAPPING4XML[i + 1]);
    }
}
@ -220,7 +249,6 @@ public final class CharacterCoding {
/**
* Replaces characters which have special representation in XML.
* @see #MAPPING4XML
* @param text text with character to replace
* @param amp true if ampersands shall be replaced, else false
* @return text with replaced characters
@ -231,7 +259,6 @@ public final class CharacterCoding {
/**
* Replaces characters which have special representation in HTML.
* @see #MAPPING4HTML
* @param text text with character to replace
* @param amp true if ampersands shall be replaced, else false
* @return text with replaced characters
@ -246,7 +273,7 @@ public final class CharacterCoding {
* @param amp true if ampersands shall be replaced, else false
* @param html true if characters shall be replaced for embedding in
* HTML, false for XML (far more characters are replaced for HTML,
* compare {@link #MAPPING4HTML} with {@link #MAPPING4XML}); the tables
* used for decoding entities are loaded from defaults/htmlEntities.json
* @return text with replaced characters
*/
private static String unicode2html(
@ -291,7 +318,7 @@ public final class CharacterCoding {
int p = 0, p1, q;
final StringBuilder sb = new StringBuilder(text.length());
String s;
Character r;
String r;
while (p < text.length()) {
p1 = text.indexOf('&', p);
if (p1 < 0) {
@ -328,7 +355,7 @@ public final class CharacterCoding {
continue;
}
if ((r = HTML2UNICODE4XML.get(s)) != null) {
sb.append(r.charValue());
sb.append(r);
continue;
}
if ((r = HTML2UNICODE4HTML.get(s)) != null) {

Loading…
Cancel
Save