|
|
@ -24,7 +24,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
package net.yacy.document.parser.html;
|
|
|
|
package net.yacy.document.parser.html;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import net.yacy.search.Switchboard;
|
|
|
|
|
|
|
|
import org.json.JSONArray;
|
|
|
|
|
|
|
|
import org.json.JSONException;
|
|
|
|
|
|
|
|
import org.json.JSONObject;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import java.io.BufferedReader;
|
|
|
|
|
|
|
|
import java.io.File;
|
|
|
|
|
|
|
|
import java.io.IOException;
|
|
|
|
|
|
|
|
import java.nio.charset.Charset;
|
|
|
|
|
|
|
|
import java.nio.charset.StandardCharsets;
|
|
|
|
|
|
|
|
import java.nio.file.Files;
|
|
|
|
|
|
|
|
import java.nio.file.Paths;
|
|
|
|
import java.util.HashMap;
|
|
|
|
import java.util.HashMap;
|
|
|
|
|
|
|
|
import java.util.Iterator;
|
|
|
|
import java.util.Map;
|
|
|
|
import java.util.Map;
|
|
|
|
import java.util.regex.Pattern;
|
|
|
|
import java.util.regex.Pattern;
|
|
|
|
|
|
|
|
|
|
|
@ -188,27 +201,43 @@ public final class CharacterCoding {
|
|
|
|
};
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
/** Mapping for XML to unicode. */
|
|
|
|
/** Mapping for XML to unicode. */
|
|
|
|
private static final Map<String, Character> HTML2UNICODE4XML =
|
|
|
|
private static final Map<String, String> HTML2UNICODE4XML =
|
|
|
|
new HashMap<String, Character>(MAPPING4XML.length * 2);
|
|
|
|
new HashMap<String, String>();
|
|
|
|
/** Mapping for HTML to unicode. */
|
|
|
|
/** Mapping for HTML to unicode. */
|
|
|
|
private static final Map<String, Character> HTML2UNICODE4HTML =
|
|
|
|
private static final Map<String, String> HTML2UNICODE4HTML =
|
|
|
|
new HashMap<String, Character>(MAPPING4HTML.length * 2);
|
|
|
|
new HashMap<String, String>();
|
|
|
|
/** Mapping for unicode to XML. */
|
|
|
|
/** Mapping for unicode to XML. */
|
|
|
|
private static final Map<Character, String> UNICODE2HTML4XML =
|
|
|
|
private static final Map<Character, String> UNICODE2HTML4XML =
|
|
|
|
new HashMap<Character, String>(MAPPING4XML.length * 2);
|
|
|
|
new HashMap<Character, String>(MAPPING4XML.length * 2);
|
|
|
|
/** Mapping for unicode to HTML. */
|
|
|
|
/** Mapping for unicode to HTML. */
|
|
|
|
private static final Map<Character, String> UNICODE2HTML4HTML =
|
|
|
|
private static final Map<Character, String> UNICODE2HTML4HTML =
|
|
|
|
new HashMap<Character, String>(MAPPING4HTML.length * 2);
|
|
|
|
new HashMap<Character, String>(MAPPING4XML.length * 2);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
static void parseJsonEntities(JSONObject entities, Map<String, String> entityToChar) throws JSONException {
|
|
|
|
|
|
|
|
for (Iterator<String> it = entities.keys(); it.hasNext(); ) {
|
|
|
|
|
|
|
|
String entity = it.next();
|
|
|
|
|
|
|
|
String c = entities.getJSONObject(entity).getString("characters");
|
|
|
|
|
|
|
|
entityToChar.put(entity, c);
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
static {
|
|
|
|
static {
|
|
|
|
|
|
|
|
try {
|
|
|
|
|
|
|
|
byte[] encoded = Files.readAllBytes(Paths.get(Switchboard.getSwitchboard() != null ? Switchboard.getSwitchboard().appPath.getAbsolutePath() : ".", "defaults", "htmlEntities.json"));
|
|
|
|
|
|
|
|
JSONObject json = new JSONObject(new String(encoded, StandardCharsets.UTF_8));
|
|
|
|
|
|
|
|
parseJsonEntities(json.getJSONObject("xml"), HTML2UNICODE4XML);
|
|
|
|
|
|
|
|
parseJsonEntities(json.getJSONObject("html4"), HTML2UNICODE4HTML);
|
|
|
|
|
|
|
|
parseJsonEntities(json.getJSONObject("html5"), HTML2UNICODE4HTML);
|
|
|
|
|
|
|
|
} catch (IOException | JSONException e) {
|
|
|
|
|
|
|
|
e.printStackTrace();
|
|
|
|
|
|
|
|
}
|
|
|
|
Character c;
|
|
|
|
Character c;
|
|
|
|
for (int i = 0; i < MAPPING4HTML.length; i += 2) {
|
|
|
|
for (int i = 0; i < MAPPING4HTML.length; i += 2) {
|
|
|
|
c = Character.valueOf(MAPPING4HTML[i].charAt(0));
|
|
|
|
c = MAPPING4HTML[i].charAt(0);
|
|
|
|
HTML2UNICODE4HTML.put(MAPPING4HTML[i + 1], c);
|
|
|
|
|
|
|
|
UNICODE2HTML4HTML.put(c, MAPPING4HTML[i + 1]);
|
|
|
|
UNICODE2HTML4HTML.put(c, MAPPING4HTML[i + 1]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
for (int i = 0; i < MAPPING4XML.length; i += 2) {
|
|
|
|
for (int i = 0; i < MAPPING4XML.length; i += 2) {
|
|
|
|
c = Character.valueOf(MAPPING4XML[i].charAt(0));
|
|
|
|
c = MAPPING4XML[i].charAt(0);
|
|
|
|
HTML2UNICODE4XML.put(MAPPING4XML[i + 1], c);
|
|
|
|
|
|
|
|
UNICODE2HTML4XML.put(c, MAPPING4XML[i + 1]);
|
|
|
|
UNICODE2HTML4XML.put(c, MAPPING4XML[i + 1]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
@ -220,7 +249,6 @@ public final class CharacterCoding {
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
/**
|
|
|
|
* Replaces characters which have special representation in XML.
|
|
|
|
* Replaces characters which have special representation in XML.
|
|
|
|
* @see #MAPPING4XML
|
|
|
|
|
|
|
|
* @param text text with character to replace
|
|
|
|
* @param text text with character to replace
|
|
|
|
* @param amp true if ampersands shall be replaced, else false
|
|
|
|
* @param amp true if ampersands shall be replaced, else false
|
|
|
|
* @return text with replaced characters
|
|
|
|
* @return text with replaced characters
|
|
|
@ -231,7 +259,6 @@ public final class CharacterCoding {
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
/**
|
|
|
|
* Replaces characters which have special representation in HTML.
|
|
|
|
* Replaces characters which have special representation in HTML.
|
|
|
|
* @see #MAPPING4HTML
|
|
|
|
|
|
|
|
* @param text text with character to replace
|
|
|
|
* @param text text with character to replace
|
|
|
|
* @param amp true if ampersands shall be replaced, else false
|
|
|
|
* @param amp true if ampersands shall be replaced, else false
|
|
|
|
* @return text with replaced characters
|
|
|
|
* @return text with replaced characters
|
|
|
@ -246,7 +273,7 @@ public final class CharacterCoding {
|
|
|
|
* @param amp true if ampersands shall be replaced, else false
|
|
|
|
* @param amp true if ampersands shall be replaced, else false
|
|
|
|
* @param html true if characters shall be replaced for embedding in
|
|
|
|
* @param html true if characters shall be replaced for embedding in
|
|
|
|
* HTML, false for XML (far more characters are replaced for HTML,
|
|
|
|
* HTML, false for XML (far more characters are replaced for HTML,
|
|
|
|
* compare {@link #MAPPING4HTML} with {@link #MAPPING4XML}
|
|
|
|
* see defaults/htmlEntities.json
|
|
|
|
* @return text with replaced characters
|
|
|
|
* @return text with replaced characters
|
|
|
|
*/
|
|
|
|
*/
|
|
|
|
private static String unicode2html(
|
|
|
|
private static String unicode2html(
|
|
|
@ -291,7 +318,7 @@ public final class CharacterCoding {
|
|
|
|
int p = 0, p1, q;
|
|
|
|
int p = 0, p1, q;
|
|
|
|
final StringBuilder sb = new StringBuilder(text.length());
|
|
|
|
final StringBuilder sb = new StringBuilder(text.length());
|
|
|
|
String s;
|
|
|
|
String s;
|
|
|
|
Character r;
|
|
|
|
String r;
|
|
|
|
while (p < text.length()) {
|
|
|
|
while (p < text.length()) {
|
|
|
|
p1 = text.indexOf('&', p);
|
|
|
|
p1 = text.indexOf('&', p);
|
|
|
|
if (p1 < 0) {
|
|
|
|
if (p1 < 0) {
|
|
|
@ -328,7 +355,7 @@ public final class CharacterCoding {
|
|
|
|
continue;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if ((r = HTML2UNICODE4XML.get(s)) != null) {
|
|
|
|
if ((r = HTML2UNICODE4XML.get(s)) != null) {
|
|
|
|
sb.append(r.charValue());
|
|
|
|
sb.append(r);
|
|
|
|
continue;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if ((r = HTML2UNICODE4HTML.get(s)) != null) {
|
|
|
|
if ((r = HTML2UNICODE4HTML.get(s)) != null) {
|
|
|
|