Merge pull request #405 from jfhs/jfhs/support-all-html-entities

Improve HTML entities support
pull/408/head
Michael Christen 4 years ago committed by GitHub
commit 42ea2a1c6f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

File diff suppressed because it is too large Load Diff

@ -24,7 +24,20 @@
package net.yacy.document.parser.html; package net.yacy.document.parser.html;
import net.yacy.search.Switchboard;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.HashMap; import java.util.HashMap;
import java.util.Iterator;
import java.util.Map; import java.util.Map;
import java.util.regex.Pattern; import java.util.regex.Pattern;
@ -44,171 +57,187 @@ public final class CharacterCoding {
/** Special characters which have to be mapped for XML. */ /** Special characters which have to be mapped for XML. */
private static final String[] MAPPING4XML = { private static final String[] MAPPING4XML = {
"\"", """, //quotation mark "\"", """, //quotation mark
"\u003C", "<", //less than "\u003C", "<", //less than
"\u003E", ">", //greater than "\u003E", ">", //greater than
}; };
/** Special characters which have to be mapped for HTML. */ /** Special characters which have to be mapped for HTML. */
private static final String[] MAPPING4HTML = { private static final String[] MAPPING4HTML = {
"\\", "\", // Backslash "\\", "\", // Backslash
"\u005E", "^", // Caret "\u005E", "^", // Caret
"\u0060", "`", // Accent Grave ` "\u0060", "`", // Accent Grave `
"\u007B", "{", // { "\u007B", "{", // {
"\u007C", "|", // | "\u007C", "|", // |
"\u007D", "}", // } "\u007D", "}", // }
"\u007E", "~", // ~ "\u007E", "~", // ~
"\u0082", "‚", "\u0082", "‚",
"\u0083", "ƒ", "\u0083", "ƒ",
"\u0084", "„", "\u0084", "„",
"\u0085", "…", "\u0085", "…",
"\u0086", "†", "\u0086", "†",
"\u0087", "‡", "\u0087", "‡",
"\u0088", "ˆ", "\u0088", "ˆ",
"\u0089", "‰", "\u0089", "‰",
"\u008A", "Š", "\u008A", "Š",
"\u008B", "‹", "\u008B", "‹",
"\u008C", "Œ", "\u008C", "Œ",
"\u008D", "", "\u008D", "",
"\u008E", "Ž", "\u008E", "Ž",
"\u0091", "‘", "\u0091", "‘",
"\u0092", "’", "\u0092", "’",
"\u0093", "“", "\u0093", "“",
"\u0094", "”", "\u0094", "”",
"\u0095", "•", "\u0095", "•",
"\u0096", "–", "\u0096", "–",
"\u0097", "—", "\u0097", "—",
"\u0098", "˜", "\u0098", "˜",
"\u0099", "™", "\u0099", "™",
"\u009A", "š", "\u009A", "š",
"\u009B", "›", "\u009B", "›",
"\u009C", "œ", "\u009C", "œ",
"\u009D", "", "\u009D", "",
"\u009E", "ž", "\u009E", "ž",
"\u009F", "Ÿ", "\u009F", "Ÿ",
"\u00A1", "¡", //inverted (spanish) exclamation mark "\u00A1", "¡", //inverted (spanish) exclamation mark
"\u00A2", "¢", //cent "\u00A2", "¢", //cent
"\u00A3", "£", //pound "\u00A3", "£", //pound
"\u00A4", "¤", //currency "\u00A4", "¤", //currency
"\u00A5", "¥", //yen "\u00A5", "¥", //yen
"\u00A6", "¦", //broken vertical bar "\u00A6", "¦", //broken vertical bar
"\u00A7", "§", //section sign "\u00A7", "§", //section sign
"\u00A8", "¨", //diaeresis (umlaut) "\u00A8", "¨", //diaeresis (umlaut)
"\u00A9", "©", //copyright sign "\u00A9", "©", //copyright sign
"\u00AA", "ª", //feminine ordinal indicator "\u00AA", "ª", //feminine ordinal indicator
"\u00AB", "«", //left-pointing double angle quotation mark "\u00AB", "«", //left-pointing double angle quotation mark
"\u00AC", "¬", //not sign "\u00AC", "¬", //not sign
"\u00AD", "­", //soft hyphen "\u00AD", "­", //soft hyphen
"\u00AE", "®", //registered sign "\u00AE", "®", //registered sign
"\u00AF", "¯", //macron "\u00AF", "¯", //macron
"\u00B0", "°", //degree sign "\u00B0", "°", //degree sign
"\u00B1", "±", //plus-minus sign "\u00B1", "±", //plus-minus sign
"\u00B2", "²", //superscript two "\u00B2", "²", //superscript two
"\u00B3", "³", //superscript three "\u00B3", "³", //superscript three
"\u00B4", "´", //acute accent "\u00B4", "´", //acute accent
"\u00B5", "µ", //micro sign "\u00B5", "µ", //micro sign
"\u00B6", "¶", //paragraph sign "\u00B6", "¶", //paragraph sign
"\u00B7", "·", //middle dot "\u00B7", "·", //middle dot
"\u00B8", "¸", //cedilla "\u00B8", "¸", //cedilla
"\u00B9", "¹", //superscript one "\u00B9", "¹", //superscript one
"\u00BA", "º", //masculine ordinal indicator "\u00BA", "º", //masculine ordinal indicator
"\u00BB", "»", //right-pointing double angle quotation mark "\u00BB", "»", //right-pointing double angle quotation mark
"\u00BC", "¼", //fraction 1/4 "\u00BC", "¼", //fraction 1/4
"\u00BD", "½", //fraction 1/2 "\u00BD", "½", //fraction 1/2
"\u00BE", "¾", //fraction 3/4 "\u00BE", "¾", //fraction 3/4
"\u00BF", "¿", //inverted (spanisch) questionmark "\u00BF", "¿", //inverted (spanisch) questionmark
"\u00C0", "À", "\u00C0", "À",
"\u00C1", "Á", "\u00C1", "Á",
"\u00C2", "Â", "\u00C2", "Â",
"\u00C3", "Ã", "\u00C3", "Ã",
"\u00C4", "Ä", "\u00C4", "Ä",
"\u00C5", "Å", "\u00C5", "Å",
"\u00C6", "Æ", "\u00C6", "Æ",
"\u00C7", "Ç", "\u00C7", "Ç",
"\u00C8", "È", "\u00C8", "È",
"\u00C9", "É", "\u00C9", "É",
"\u00CA", "Ê", "\u00CA", "Ê",
"\u00CB", "Ë", "\u00CB", "Ë",
"\u00CC", "Ì", "\u00CC", "Ì",
"\u00CD", "Í", "\u00CD", "Í",
"\u00CE", "Î", "\u00CE", "Î",
"\u00CF", "Ï", "\u00CF", "Ï",
"\u00D0", "Ð", "\u00D0", "Ð",
"\u00D1", "Ñ", "\u00D1", "Ñ",
"\u00D2", "Ò", "\u00D2", "Ò",
"\u00D3", "Ó", "\u00D3", "Ó",
"\u00D4", "Ô", "\u00D4", "Ô",
"\u00D5", "Õ", "\u00D5", "Õ",
"\u00D6", "Ö", "\u00D6", "Ö",
"\u00D7", "×", "\u00D7", "×",
"\u00D8", "Ø", "\u00D8", "Ø",
"\u00D9", "Ù", "\u00D9", "Ù",
"\u00DA", "Ú", "\u00DA", "Ú",
"\u00DB", "Û", "\u00DB", "Û",
"\u00DC", "Ü", "\u00DC", "Ü",
"\u00DD", "Ý", "\u00DD", "Ý",
"\u00DE", "Þ", "\u00DE", "Þ",
"\u00DF", "ß", "\u00DF", "ß",
"\u00E0", "à", "\u00E0", "à",
"\u00E1", "á", "\u00E1", "á",
"\u00E2", "â", "\u00E2", "â",
"\u00E3", "ã", "\u00E3", "ã",
"\u00E4", "ä", "\u00E4", "ä",
"\u00E5", "å", "\u00E5", "å",
"\u00E6", "æ", "\u00E6", "æ",
"\u00E7", "ç", "\u00E7", "ç",
"\u00E8", "è", "\u00E8", "è",
"\u00E9", "é", "\u00E9", "é",
"\u00EA", "ê", "\u00EA", "ê",
"\u00EB", "ë", "\u00EB", "ë",
"\u00EC", "ì", "\u00EC", "ì",
"\u00ED", "í", "\u00ED", "í",
"\u00EE", "î", "\u00EE", "î",
"\u00EF", "ï", "\u00EF", "ï",
"\u00F0", "ð", "\u00F0", "ð",
"\u00F1", "ñ", "\u00F1", "ñ",
"\u00F2", "ò", "\u00F2", "ò",
"\u00F3", "ó", "\u00F3", "ó",
"\u00F4", "ô", "\u00F4", "ô",
"\u00F5", "õ", "\u00F5", "õ",
"\u00F6", "ö", "\u00F6", "ö",
"\u00F7", "÷", "\u00F7", "÷",
"\u00F8", "ø", "\u00F8", "ø",
"\u00F9", "ù", "\u00F9", "ù",
"\u00FA", "ú", "\u00FA", "ú",
"\u00FB", "û", "\u00FB", "û",
"\u00FC", "ü", "\u00FC", "ü",
"\u00FD", "ý", "\u00FD", "ý",
"\u00FE", "þ", "\u00FE", "þ",
"\u00FF", "ÿ" "\u00FF", "ÿ"
}; };
/** Mapping for XML to unicode. */ /** Mapping for XML to unicode. */
private static final Map<String, Character> HTML2UNICODE4XML = private static final Map<String, String> HTML2UNICODE4XML =
new HashMap<String, Character>(MAPPING4XML.length * 2); new HashMap<String, String>();
/** Mapping for HTML to unicode. */ /** Mapping for HTML to unicode. */
private static final Map<String, Character> HTML2UNICODE4HTML = private static final Map<String, String> HTML2UNICODE4HTML =
new HashMap<String, Character>(MAPPING4HTML.length * 2); new HashMap<String, String>();
/** Mapping for unicode to XML. */ /** Mapping for unicode to XML. */
private static final Map<Character, String> UNICODE2HTML4XML = private static final Map<Character, String> UNICODE2HTML4XML =
new HashMap<Character, String>(MAPPING4XML.length * 2); new HashMap<Character, String>(MAPPING4XML.length * 2);
/** Mapping for unicode to HTML. */ /** Mapping for unicode to HTML. */
private static final Map<Character, String> UNICODE2HTML4HTML = private static final Map<Character, String> UNICODE2HTML4HTML =
new HashMap<Character, String>(MAPPING4HTML.length * 2); new HashMap<Character, String>(MAPPING4XML.length * 2);
/**
 * Fills a lookup map from a JSON table of entity definitions.
 * For every key of {@code entities}, the nested JSON object stored under
 * that key is read and the value of its {@code "characters"} member is put
 * into {@code entityToChar} under the same key. Existing map entries with
 * the same key are overwritten.
 *
 * @param entities JSON object whose keys are entity names and whose values
 *        are JSON objects carrying a {@code "characters"} string
 * @param entityToChar target map receiving entity name to replacement text
 * @throws JSONException propagated from the JSON accessors when an entry
 *         cannot be read as described above
 */
static void parseJsonEntities(JSONObject entities, Map<String, String> entityToChar) throws JSONException {
    final Iterator<String> names = entities.keys();
    while (names.hasNext()) {
        final String name = names.next();
        final JSONObject definition = entities.getJSONObject(name);
        final String replacement = definition.getString("characters");
        entityToChar.put(name, replacement);
    }
}
static { static {
try {
byte[] encoded = Files.readAllBytes(Paths.get(Switchboard.getSwitchboard() != null ? Switchboard.getSwitchboard().appPath.getAbsolutePath() : ".", "defaults", "htmlEntities.json"));
JSONObject json = new JSONObject(new String(encoded, StandardCharsets.UTF_8));
parseJsonEntities(json.getJSONObject("xml"), HTML2UNICODE4XML);
parseJsonEntities(json.getJSONObject("html4"), HTML2UNICODE4HTML);
parseJsonEntities(json.getJSONObject("html5"), HTML2UNICODE4HTML);
} catch (IOException | JSONException e) {
e.printStackTrace();
}
Character c; Character c;
for (int i = 0; i < MAPPING4HTML.length; i += 2) { for (int i = 0; i < MAPPING4HTML.length; i += 2) {
c = Character.valueOf(MAPPING4HTML[i].charAt(0)); c = MAPPING4HTML[i].charAt(0);
HTML2UNICODE4HTML.put(MAPPING4HTML[i + 1], c);
UNICODE2HTML4HTML.put(c, MAPPING4HTML[i + 1]); UNICODE2HTML4HTML.put(c, MAPPING4HTML[i + 1]);
} }
for (int i = 0; i < MAPPING4XML.length; i += 2) { for (int i = 0; i < MAPPING4XML.length; i += 2) {
c = Character.valueOf(MAPPING4XML[i].charAt(0)); c = MAPPING4XML[i].charAt(0);
HTML2UNICODE4XML.put(MAPPING4XML[i + 1], c);
UNICODE2HTML4XML.put(c, MAPPING4XML[i + 1]); UNICODE2HTML4XML.put(c, MAPPING4XML[i + 1]);
} }
} }
@ -220,7 +249,6 @@ public final class CharacterCoding {
/** /**
* Replaces characters which have special representation in XML. * Replaces characters which have special representation in XML.
* @see #MAPPING4XML
* @param text text with character to replace * @param text text with character to replace
* @param amp true if ampersands shall be replaced, else false * @param amp true if ampersands shall be replaced, else false
* @return text with replaced characters * @return text with replaced characters
@ -231,7 +259,6 @@ public final class CharacterCoding {
/** /**
* Replaces characters which have special representation in HTML. * Replaces characters which have special representation in HTML.
* @see #MAPPING4HTML
* @param text text with character to replace * @param text text with character to replace
* @param amp true if ampersands shall be replaced, else false * @param amp true if ampersands shall be replaced, else false
* @return text with replaced characters * @return text with replaced characters
@ -246,7 +273,7 @@ public final class CharacterCoding {
* @param amp true if ampersands shall be replaced, else false * @param amp true if ampersands shall be replaced, else false
* @param html true if characters shall be replaced for embedding in * @param html true if characters shall be replaced for embedding in
* HTML, false for XML (far more characters are replaced for HTML, * HTML, false for XML (far more characters are replaced for HTML,
* compare {@link #MAPPING4HTML} with {@link #MAPPING4XML} * see defaults/htmlEntities.json
* @return text with replaced characters * @return text with replaced characters
*/ */
private static String unicode2html( private static String unicode2html(
@ -291,7 +318,7 @@ public final class CharacterCoding {
int p = 0, p1, q; int p = 0, p1, q;
final StringBuilder sb = new StringBuilder(text.length()); final StringBuilder sb = new StringBuilder(text.length());
String s; String s;
Character r; String r;
while (p < text.length()) { while (p < text.length()) {
p1 = text.indexOf('&', p); p1 = text.indexOf('&', p);
if (p1 < 0) { if (p1 < 0) {
@ -328,7 +355,7 @@ public final class CharacterCoding {
continue; continue;
} }
if ((r = HTML2UNICODE4XML.get(s)) != null) { if ((r = HTML2UNICODE4XML.get(s)) != null) {
sb.append(r.charValue()); sb.append(r);
continue; continue;
} }
if ((r = HTML2UNICODE4HTML.get(s)) != null) { if ((r = HTML2UNICODE4HTML.get(s)) != null) {

@ -769,18 +769,18 @@ public class ContentScraper extends AbstractScraper implements Scraper {
final String content = tag.opts.getProperty("content", EMPTY_STRING); final String content = tag.opts.getProperty("content", EMPTY_STRING);
String name = tag.opts.getProperty("name", EMPTY_STRING); String name = tag.opts.getProperty("name", EMPTY_STRING);
if (name.length() > 0) { if (name.length() > 0) {
this.metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content)); this.metas.put(name.toLowerCase(), content);
if (name.toLowerCase().equals("generator")) { if (name.toLowerCase().equals("generator")) {
this.evaluationScores.match(Element.metagenerator, content); this.evaluationScores.match(Element.metagenerator, content);
} }
} }
name = tag.opts.getProperty("http-equiv", EMPTY_STRING); name = tag.opts.getProperty("http-equiv", EMPTY_STRING);
if (name.length() > 0) { if (name.length() > 0) {
this.metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content)); this.metas.put(name.toLowerCase(), content);
} }
name = tag.opts.getProperty("property", EMPTY_STRING); name = tag.opts.getProperty("property", EMPTY_STRING);
if (name.length() > 0) { if (name.length() > 0) {
this.metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content)); this.metas.put(name.toLowerCase(), content);
} }
} else if (tag.name.equalsIgnoreCase("area")) { } else if (tag.name.equalsIgnoreCase("area")) {
final String areatitle = cleanLine(tag.opts.getProperty("title", EMPTY_STRING)); final String areatitle = cleanLine(tag.opts.getProperty("title", EMPTY_STRING));
@ -904,7 +904,6 @@ public class ContentScraper extends AbstractScraper implements Scraper {
// System.out.println("ScrapeTag1: tag.tagname=" + tag.tagname + ", opts=" + tag.opts.toString() + ", text=" + UTF8.String(text)); // System.out.println("ScrapeTag1: tag.tagname=" + tag.tagname + ", opts=" + tag.opts.toString() + ", text=" + UTF8.String(text));
if (tag.name.equalsIgnoreCase("a") && tag.content.length() < 2048) { if (tag.name.equalsIgnoreCase("a") && tag.content.length() < 2048) {
String href = tag.opts.getProperty("href", EMPTY_STRING); String href = tag.opts.getProperty("href", EMPTY_STRING);
href = CharacterCoding.html2unicode(href);
AnchorURL url; AnchorURL url;
if ((href.length() > 0) && ((url = absolutePath(href)) != null)) { if ((href.length() > 0) && ((url = absolutePath(href)) != null)) {
if (followDenied()) { if (followDenied()) {

@ -32,6 +32,7 @@ import java.io.Writer;
import java.util.Properties; import java.util.Properties;
import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.document.encoding.UTF8;
import net.yacy.document.parser.html.CharacterCoding;
public final class CharBuffer extends Writer { public final class CharBuffer extends Writer {
@ -444,6 +445,7 @@ public final class CharBuffer extends Writer {
while ((pos < this.length) && (this.buffer[pos] <= 32)) pos++; while ((pos < this.length) && (this.buffer[pos] <= 32)) pos++;
// doublequotes are obligatory. However, we want to be fuzzy if they // doublequotes are obligatory. However, we want to be fuzzy if they
// are omitted // are omitted
String value = null;
if (pos >= this.length) { if (pos >= this.length) {
// error case: input ended too early // error case: input ended too early
break; break;
@ -453,7 +455,7 @@ public final class CharBuffer extends Writer {
start = pos; start = pos;
while ((pos < this.length) && (this.buffer[pos] != doublequote)) pos++; while ((pos < this.length) && (this.buffer[pos] != doublequote)) pos++;
if (pos >= this.length) break; // this is the case if we found no parent doublequote if (pos >= this.length) break; // this is the case if we found no parent doublequote
p.setProperty(key, new String(this.buffer, start, pos - start).trim()); value = new String(this.buffer, start, pos - start).trim();
pos++; pos++;
} else if (this.buffer[pos] == singlequote) { } else if (this.buffer[pos] == singlequote) {
// search next singlequote // search next singlequote
@ -461,14 +463,15 @@ public final class CharBuffer extends Writer {
start = pos; start = pos;
while ((pos < this.length) && (this.buffer[pos] != singlequote)) pos++; while ((pos < this.length) && (this.buffer[pos] != singlequote)) pos++;
if (pos >= this.length) break; // this is the case if we found no parent singlequote if (pos >= this.length) break; // this is the case if we found no parent singlequote
p.setProperty(key, new String(this.buffer, start, pos - start).trim()); value = new String(this.buffer, start, pos - start).trim();
pos++; pos++;
} else { } else {
// search next whitespace // search next whitespace
start = pos; start = pos;
while ((pos < this.length) && (this.buffer[pos] > 32)) pos++; while ((pos < this.length) && (this.buffer[pos] > 32)) pos++;
p.setProperty(key, new String(this.buffer, start, pos - start).trim()); value = new String(this.buffer, start, pos - start).trim();
} }
p.setProperty(key, CharacterCoding.html2unicode(value));
// pos should point now to a whitespace: eat up spaces // pos should point now to a whitespace: eat up spaces
while ((pos < this.length) && (this.buffer[pos] <= 32)) pos++; while ((pos < this.length) && (this.buffer[pos] <= 32)) pos++;
// go on with next loop // go on with next loop

Loading…
Cancel
Save