// htmlTools.java
// -----------------------
// (C) by Michael Peter Christen; mc@yacy.net,
// (C) by Jan Sandbrink (NN), Franz Brausse (FB, karlchenofhell),
// (C) by Bjoern 'fuchs' Krombholz (fuchsi)
// first published on http://www.yacy.net
//
// $LastChangedDate: $
// $LastChangedRevision: $
// $LastChangedBy: $
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

package de.anomic.data;

public class htmlTools {

    /**
     * Replaces characters in a string with their HTML entities according to the
     * internal mapping table.
     *
     * @param text a string that possibly contains special characters, may be <code>null</code>
     * @param includingAmpersand if <code>false</code> ampersands are not encoded
     * @param forXML if <code>true</code> then only ampersand, quotation mark, less-than
     *        and greater-than will be transcoded
     * @return the string with every mapped character replaced by its entity, or
     *         <code>null</code> if <code>text</code> is <code>null</code>
     */
    public static String encodeUnicode2html(final String text, final boolean includingAmpersand, final boolean forXML) {
        if (text == null) {
            return null;
        }

        // the ampersand is the first pair of the table; starting at index 2 skips it
        final int spos = (includingAmpersand ? 0 : 2);
        // if (forXML), then only encode ampersand, quotation mark, less than and
        // greater than, which are the first 4 pairs (8 array slots) of the mapping table
        final int epos = (forXML ? 8 : mapping.length);

        return encode(text, mapping, spos, epos);
    }

    /**
     * Like {@link #encodeUnicode2html(String, boolean, boolean)} with <code>forXML = false</code>.
     *
     * @param text a string that possibly contains special characters, may be <code>null</code>
     * @param includingAmpersand if <code>false</code> ampersands are not encoded
     * @return the encoded string, or <code>null</code> if <code>text</code> is <code>null</code>
     */
    public static String encodeUnicode2html(final String text, final boolean includingAmpersand) {
        return encodeUnicode2html(text, includingAmpersand, false);
    }

    /**
     * Replaces the special characters ampersand, quotation mark and less than/greater than
     * by the escaping entities allowed in XML documents.
     *
     * Like {@link #encodeUnicode2html(String, boolean, boolean)} with
     * <code>includingAmpersand = true</code> and <code>forXML = true</code>.
     *
     * @param text the original String, may be <code>null</code>
     * @return the encoded String, or <code>null</code> if <code>text</code> is <code>null</code>
     */
    public static String encodeUnicode2xml(final String text) {
        return encodeUnicode2html(text, true, true);
    }

    /**
     * Generic method that replaces occurrences of the characters defined in the map
     * array with their corresponding replacement.
     *
     * The map is a flat array of pairs: the element at an even index holds the character
     * to look for (only its first char is compared), the following element holds the
     * replacement. The first matching pair in the range wins.
     *
     * @param text The String to process, may be <code>null</code>.
     * @param map An array defining the entity mapping.
     * @param spos It is possible to use a subset of the map only. This parameter defines the
     *        (inclusive) starting index in the map array.
     * @param epos The (exclusive) ending index, see above.
     * @return A copy of the original String with all characters defined in the selected
     *         map range replaced, or <code>null</code> if <code>text</code> is <code>null</code>.
     */
    public static String encode(final String text, final String[] map, final int spos, final int epos) {
        if (text == null) {
            return null;
        }
        final StringBuilder sb = new StringBuilder(text.length());
        for (int textpos = 0; textpos < text.length(); textpos++) {
            final char c = text.charAt(textpos);
            // find a (forward) mapping
            int hit = -1;
            for (int i = spos; i < epos; i += 2) {
                if (map[i].charAt(0) == c) {
                    hit = i;
                    break;
                }
            }
            if (hit >= 0) {
                sb.append(map[hit + 1]);
            } else {
                sb.append(c);
            }
        }
        return sb.toString();
    }

    /**
     * Reverses {@link #encodeUnicode2html(String, boolean)}: every entity of the mapping
     * table found in the text is replaced by the character it stands for. The text is
     * scanned once from left to right, so the result of a replacement is never decoded
     * a second time (<code>&amp;amp;uuml;</code> becomes <code>&amp;uuml;</code>, not the umlaut).
     *
     * @param text the String possibly containing HTML entities, may be <code>null</code>
     * @return the decoded String, or <code>null</code> if <code>text</code> is <code>null</code>
     */
    public static String decodeHtml2Unicode(final String text) {
        if (text == null) {
            return null;
        }
        final StringBuilder sb = new StringBuilder(text.length());
        int pos = 0;
        search: while (pos < text.length()) {
            // find a reverse mapping. TODO: replace matching with hashtable(s)
            for (int i = 0; i < mapping.length; i += 2) {
                final String entity = mapping[i + 1];
                // startsWith is bounds-safe: it yields false if the entity would run past the end
                if (text.startsWith(entity, pos)) {
                    sb.append(mapping[i]);
                    pos += entity.length();
                    continue search;
                }
            }
            // no entity starts here, copy the character unchanged
            sb.append(text.charAt(pos));
            pos++;
        }
        return sb.toString();
    }

    // This array contains codes (see http://mindprod.com/jgloss/unicode.html for details)
    // that will be replaced. To add new codes or patterns, just put them at the end
    // of the list. Codes or patterns in this list can not be escaped with [= or <pre>
    //
    // Layout: flat list of pairs { character, entity }. Every entity must be non-empty,
    // otherwise decodeHtml2Unicode would match it without advancing and never terminate.
    private static final String[] mapping = {
        // Ampersands _have_ to be replaced first. If they were replaced later,
        // other replaced characters containing ampersands would get messed up.
        "\u0026","&amp;",      //ampersand
        "\"","&quot;",         //quotation mark
        "\u003C","&lt;",       //less than
        "\u003E","&gt;",       //greater than
        "\\",    "&#092;",     // Backslash
        "\u005E","&#094;",     // Caret

        "\u0060","&#096;",     // Accent Grave `
        "\u007B","&#123;",     // {
        "\u007C","&#124;",     // |
        "\u007D","&#125;",     // }
        "\u007E","&#126;",     // ~

        "\u0082","&#130;",
        "\u0083","&#131;",
        "\u0084","&#132;",
        "\u0085","&#133;",
        "\u0086","&#134;",
        "\u0087","&#135;",
        "\u0088","&#136;",
        "\u0089","&#137;",
        "\u008A","&#138;",
        "\u008B","&#139;",
        "\u008C","&#140;",
        "\u008D","&#141;",
        "\u008E","&#142;",

        "\u0091","&#145;",
        "\u0092","&#146;",
        "\u0093","&#147;",
        "\u0094","&#148;",
        "\u0095","&#149;",
        "\u0096","&#150;",
        "\u0097","&#151;",
        "\u0098","&#152;",
        "\u0099","&#153;",
        "\u009A","&#154;",
        "\u009B","&#155;",
        "\u009C","&#156;",
        "\u009D","&#157;",
        "\u009E","&#158;",
        "\u009F","&#159;",

        "\u00A1","&iexcl;",    //inverted (spanish) exclamation mark
        "\u00A2","&cent;",     //cent
        "\u00A3","&pound;",    //pound
        "\u00A4","&curren;",   //currency
        "\u00A5","&yen;",      //yen
        "\u00A6","&brvbar;",   //broken vertical bar
        "\u00A7","&sect;",     //section sign
        "\u00A8","&uml;",      //diaeresis (umlaut)
        "\u00A9","&copy;",     //copyright sign
        "\u00AA","&ordf;",     //feminine ordinal indicator
        "\u00AB","&laquo;",    //left-pointing double angle quotation mark
        "\u00AC","&not;",      //not sign
        "\u00AD","&shy;",      //soft hyphen
        "\u00AE","&reg;",      //registered sign
        "\u00AF","&macr;",     //macron
        "\u00B0","&deg;",      //degree sign
        "\u00B1","&plusmn;",   //plus-minus sign
        "\u00B2","&sup2;",     //superscript two
        "\u00B3","&sup3;",     //superscript three
        "\u00B4","&acute;",    //acute accent
        "\u00B5","&micro;",    //micro sign
        "\u00B6","&para;",     //paragraph sign
        "\u00B7","&middot;",   //middle dot
        "\u00B8","&cedil;",    //cedilla
        "\u00B9","&sup1;",     //superscript one
        "\u00BA","&ordm;",     //masculine ordinal indicator
        "\u00BB","&raquo;",    //right-pointing double angle quotation mark
        "\u00BC","&frac14;",   //fraction 1/4
        "\u00BD","&frac12;",   //fraction 1/2
        "\u00BE","&frac34;",   //fraction 3/4
        "\u00BF","&iquest;",   //inverted (spanish) question mark
        "\u00C0","&Agrave;",
        "\u00C1","&Aacute;",
        "\u00C2","&Acirc;",
        "\u00C3","&Atilde;",
        "\u00C4","&Auml;",
        "\u00C5","&Aring;",
        "\u00C6","&AElig;",
        "\u00C7","&Ccedil;",
        "\u00C8","&Egrave;",
        "\u00C9","&Eacute;",
        "\u00CA","&Ecirc;",
        "\u00CB","&Euml;",
        "\u00CC","&Igrave;",
        "\u00CD","&Iacute;",
        "\u00CE","&Icirc;",
        "\u00CF","&Iuml;",
        "\u00D0","&ETH;",
        "\u00D1","&Ntilde;",
        "\u00D2","&Ograve;",
        "\u00D3","&Oacute;",
        "\u00D4","&Ocirc;",
        "\u00D5","&Otilde;",
        "\u00D6","&Ouml;",
        "\u00D7","&times;",
        "\u00D8","&Oslash;",
        "\u00D9","&Ugrave;",
        "\u00DA","&Uacute;",
        "\u00DB","&Ucirc;",
        "\u00DC","&Uuml;",
        "\u00DD","&Yacute;",
        "\u00DE","&THORN;",
        "\u00DF","&szlig;",
        "\u00E0","&agrave;",
        "\u00E1","&aacute;",
        "\u00E2","&acirc;",
        "\u00E3","&atilde;",
        "\u00E4","&auml;",
        "\u00E5","&aring;",
        "\u00E6","&aelig;",
        "\u00E7","&ccedil;",
        "\u00E8","&egrave;",
        "\u00E9","&eacute;",
        "\u00EA","&ecirc;",
        "\u00EB","&euml;",
        "\u00EC","&igrave;",
        "\u00ED","&iacute;",
        "\u00EE","&icirc;",
        "\u00EF","&iuml;",
        "\u00F0","&eth;",
        "\u00F1","&ntilde;",
        "\u00F2","&ograve;",
        "\u00F3","&oacute;",
        "\u00F4","&ocirc;",
        "\u00F5","&otilde;",
        "\u00F6","&ouml;",
        "\u00F7","&divide;",
        "\u00F8","&oslash;",
        "\u00F9","&ugrave;",
        "\u00FA","&uacute;",
        "\u00FB","&ucirc;",
        "\u00FC","&uuml;",
        "\u00FD","&yacute;",
        "\u00FE","&thorn;",
        "\u00FF","&yuml;"
    };

    /**
     * Small self-test: encodes a text that already contains entity-like sequences,
     * decodes it again and prints "correct" if the round trip restored the input,
     * i.e. no cyclic (double) replacement took place.
     *
     * @param args ignored
     */
    public static void main(final String[] args) {
        final String text = "Test-Text mit & um zyklische &uuml; &amp; Ersetzungen auszuschliessen";
        final String txet = encodeUnicode2html(text, true);
        System.out.println(txet);
        System.out.println(decodeHtml2Unicode(txet));
        if (decodeHtml2Unicode(txet).equals(text)) {
            System.out.println("correct");
        }
    }
}
|