Merge pull request #405 from jfhs/jfhs/support-all-html-entities

Improve HTML entities support
pull/408/head
Michael Christen 4 years ago committed by GitHub
commit 42ea2a1c6f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

File diff suppressed because it is too large Load Diff

@ -24,7 +24,20 @@
package net.yacy.document.parser.html; package net.yacy.document.parser.html;
import net.yacy.search.Switchboard;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.HashMap; import java.util.HashMap;
import java.util.Iterator;
import java.util.Map; import java.util.Map;
import java.util.regex.Pattern; import java.util.regex.Pattern;
@ -188,27 +201,43 @@ public final class CharacterCoding {
}; };
/** Mapping for XML to unicode. */ /** Mapping for XML to unicode. */
private static final Map<String, Character> HTML2UNICODE4XML = private static final Map<String, String> HTML2UNICODE4XML =
new HashMap<String, Character>(MAPPING4XML.length * 2); new HashMap<String, String>();
/** Mapping for HTML to unicode. */ /** Mapping for HTML to unicode. */
private static final Map<String, Character> HTML2UNICODE4HTML = private static final Map<String, String> HTML2UNICODE4HTML =
new HashMap<String, Character>(MAPPING4HTML.length * 2); new HashMap<String, String>();
/** Mapping for unicode to XML. */ /** Mapping for unicode to XML. */
private static final Map<Character, String> UNICODE2HTML4XML = private static final Map<Character, String> UNICODE2HTML4XML =
new HashMap<Character, String>(MAPPING4XML.length * 2); new HashMap<Character, String>(MAPPING4XML.length * 2);
/** Mapping for unicode to HTML. */ /** Mapping for unicode to HTML. */
private static final Map<Character, String> UNICODE2HTML4HTML = private static final Map<Character, String> UNICODE2HTML4HTML =
new HashMap<Character, String>(MAPPING4HTML.length * 2); new HashMap<Character, String>(MAPPING4XML.length * 2);
static void parseJsonEntities(JSONObject entities, Map<String, String> entityToChar) throws JSONException {
for (Iterator<String> it = entities.keys(); it.hasNext(); ) {
String entity = it.next();
String c = entities.getJSONObject(entity).getString("characters");
entityToChar.put(entity, c);
}
}
static { static {
try {
byte[] encoded = Files.readAllBytes(Paths.get(Switchboard.getSwitchboard() != null ? Switchboard.getSwitchboard().appPath.getAbsolutePath() : ".", "defaults", "htmlEntities.json"));
JSONObject json = new JSONObject(new String(encoded, StandardCharsets.UTF_8));
parseJsonEntities(json.getJSONObject("xml"), HTML2UNICODE4XML);
parseJsonEntities(json.getJSONObject("html4"), HTML2UNICODE4HTML);
parseJsonEntities(json.getJSONObject("html5"), HTML2UNICODE4HTML);
} catch (IOException | JSONException e) {
e.printStackTrace();
}
Character c; Character c;
for (int i = 0; i < MAPPING4HTML.length; i += 2) { for (int i = 0; i < MAPPING4HTML.length; i += 2) {
c = Character.valueOf(MAPPING4HTML[i].charAt(0)); c = MAPPING4HTML[i].charAt(0);
HTML2UNICODE4HTML.put(MAPPING4HTML[i + 1], c);
UNICODE2HTML4HTML.put(c, MAPPING4HTML[i + 1]); UNICODE2HTML4HTML.put(c, MAPPING4HTML[i + 1]);
} }
for (int i = 0; i < MAPPING4XML.length; i += 2) { for (int i = 0; i < MAPPING4XML.length; i += 2) {
c = Character.valueOf(MAPPING4XML[i].charAt(0)); c = MAPPING4XML[i].charAt(0);
HTML2UNICODE4XML.put(MAPPING4XML[i + 1], c);
UNICODE2HTML4XML.put(c, MAPPING4XML[i + 1]); UNICODE2HTML4XML.put(c, MAPPING4XML[i + 1]);
} }
} }
@ -220,7 +249,6 @@ public final class CharacterCoding {
/** /**
* Replaces characters which have special representation in XML. * Replaces characters which have special representation in XML.
* @see #MAPPING4XML
* @param text text with character to replace * @param text text with character to replace
* @param amp true if ampersands shall be replaced, else false * @param amp true if ampersands shall be replaced, else false
* @return text with replaced characters * @return text with replaced characters
@ -231,7 +259,6 @@ public final class CharacterCoding {
/** /**
* Replaces characters which have special representation in HTML. * Replaces characters which have special representation in HTML.
* @see #MAPPING4HTML
* @param text text with character to replace * @param text text with character to replace
* @param amp true if ampersands shall be replaced, else false * @param amp true if ampersands shall be replaced, else false
* @return text with replaced characters * @return text with replaced characters
@ -246,7 +273,7 @@ public final class CharacterCoding {
* @param amp true if ampersands shall be replaced, else false * @param amp true if ampersands shall be replaced, else false
* @param html true if characters shall be replaced for embedding in * @param html true if characters shall be replaced for embedding in
* HTML, false for XML (far more characters are replaced for HTML, * HTML, false for XML (far more characters are replaced for HTML,
* compare {@link #MAPPING4HTML} with {@link #MAPPING4XML} * see defaults/htmlEntities.json
* @return text with replaced characters * @return text with replaced characters
*/ */
private static String unicode2html( private static String unicode2html(
@ -291,7 +318,7 @@ public final class CharacterCoding {
int p = 0, p1, q; int p = 0, p1, q;
final StringBuilder sb = new StringBuilder(text.length()); final StringBuilder sb = new StringBuilder(text.length());
String s; String s;
Character r; String r;
while (p < text.length()) { while (p < text.length()) {
p1 = text.indexOf('&', p); p1 = text.indexOf('&', p);
if (p1 < 0) { if (p1 < 0) {
@ -328,7 +355,7 @@ public final class CharacterCoding {
continue; continue;
} }
if ((r = HTML2UNICODE4XML.get(s)) != null) { if ((r = HTML2UNICODE4XML.get(s)) != null) {
sb.append(r.charValue()); sb.append(r);
continue; continue;
} }
if ((r = HTML2UNICODE4HTML.get(s)) != null) { if ((r = HTML2UNICODE4HTML.get(s)) != null) {

@ -769,18 +769,18 @@ public class ContentScraper extends AbstractScraper implements Scraper {
final String content = tag.opts.getProperty("content", EMPTY_STRING); final String content = tag.opts.getProperty("content", EMPTY_STRING);
String name = tag.opts.getProperty("name", EMPTY_STRING); String name = tag.opts.getProperty("name", EMPTY_STRING);
if (name.length() > 0) { if (name.length() > 0) {
this.metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content)); this.metas.put(name.toLowerCase(), content);
if (name.toLowerCase().equals("generator")) { if (name.toLowerCase().equals("generator")) {
this.evaluationScores.match(Element.metagenerator, content); this.evaluationScores.match(Element.metagenerator, content);
} }
} }
name = tag.opts.getProperty("http-equiv", EMPTY_STRING); name = tag.opts.getProperty("http-equiv", EMPTY_STRING);
if (name.length() > 0) { if (name.length() > 0) {
this.metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content)); this.metas.put(name.toLowerCase(), content);
} }
name = tag.opts.getProperty("property", EMPTY_STRING); name = tag.opts.getProperty("property", EMPTY_STRING);
if (name.length() > 0) { if (name.length() > 0) {
this.metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content)); this.metas.put(name.toLowerCase(), content);
} }
} else if (tag.name.equalsIgnoreCase("area")) { } else if (tag.name.equalsIgnoreCase("area")) {
final String areatitle = cleanLine(tag.opts.getProperty("title", EMPTY_STRING)); final String areatitle = cleanLine(tag.opts.getProperty("title", EMPTY_STRING));
@ -904,7 +904,6 @@ public class ContentScraper extends AbstractScraper implements Scraper {
// System.out.println("ScrapeTag1: tag.tagname=" + tag.tagname + ", opts=" + tag.opts.toString() + ", text=" + UTF8.String(text)); // System.out.println("ScrapeTag1: tag.tagname=" + tag.tagname + ", opts=" + tag.opts.toString() + ", text=" + UTF8.String(text));
if (tag.name.equalsIgnoreCase("a") && tag.content.length() < 2048) { if (tag.name.equalsIgnoreCase("a") && tag.content.length() < 2048) {
String href = tag.opts.getProperty("href", EMPTY_STRING); String href = tag.opts.getProperty("href", EMPTY_STRING);
href = CharacterCoding.html2unicode(href);
AnchorURL url; AnchorURL url;
if ((href.length() > 0) && ((url = absolutePath(href)) != null)) { if ((href.length() > 0) && ((url = absolutePath(href)) != null)) {
if (followDenied()) { if (followDenied()) {

@ -32,6 +32,7 @@ import java.io.Writer;
import java.util.Properties; import java.util.Properties;
import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.document.encoding.UTF8;
import net.yacy.document.parser.html.CharacterCoding;
public final class CharBuffer extends Writer { public final class CharBuffer extends Writer {
@ -444,6 +445,7 @@ public final class CharBuffer extends Writer {
while ((pos < this.length) && (this.buffer[pos] <= 32)) pos++; while ((pos < this.length) && (this.buffer[pos] <= 32)) pos++;
// doublequotes are obligatory. However, we want to be fuzzy if they // doublequotes are obligatory. However, we want to be fuzzy if they
// are ommittet // are ommittet
String value = null;
if (pos >= this.length) { if (pos >= this.length) {
// error case: input ended too early // error case: input ended too early
break; break;
@ -453,7 +455,7 @@ public final class CharBuffer extends Writer {
start = pos; start = pos;
while ((pos < this.length) && (this.buffer[pos] != doublequote)) pos++; while ((pos < this.length) && (this.buffer[pos] != doublequote)) pos++;
if (pos >= this.length) break; // this is the case if we found no parent doublequote if (pos >= this.length) break; // this is the case if we found no parent doublequote
p.setProperty(key, new String(this.buffer, start, pos - start).trim()); value = new String(this.buffer, start, pos - start).trim();
pos++; pos++;
} else if (this.buffer[pos] == singlequote) { } else if (this.buffer[pos] == singlequote) {
// search next singlequote // search next singlequote
@ -461,14 +463,15 @@ public final class CharBuffer extends Writer {
start = pos; start = pos;
while ((pos < this.length) && (this.buffer[pos] != singlequote)) pos++; while ((pos < this.length) && (this.buffer[pos] != singlequote)) pos++;
if (pos >= this.length) break; // this is the case if we found no parent singlequote if (pos >= this.length) break; // this is the case if we found no parent singlequote
p.setProperty(key, new String(this.buffer, start, pos - start).trim()); value = new String(this.buffer, start, pos - start).trim();
pos++; pos++;
} else { } else {
// search next whitespace // search next whitespace
start = pos; start = pos;
while ((pos < this.length) && (this.buffer[pos] > 32)) pos++; while ((pos < this.length) && (this.buffer[pos] > 32)) pos++;
p.setProperty(key, new String(this.buffer, start, pos - start).trim()); value = new String(this.buffer, start, pos - start).trim();
} }
p.setProperty(key, CharacterCoding.html2unicode(value));
// pos should point now to a whitespace: eat up spaces // pos should point now to a whitespace: eat up spaces
while ((pos < this.length) && (this.buffer[pos] <= 32)) pos++; while ((pos < this.length) && (this.buffer[pos] <= 32)) pos++;
// go on with next loop // go on with next loop

Loading…
Cancel
Save