// htmlTools.java
// -----------------------
// (C) by Michael Peter Christen; mc@anomic.de,
// (C) by Jan Sandbrink (NN), Franz Brausse (FB, karlchenofhell),
// (C) by Bjoern 'fuchs' Krombholz (fuchsi)
// first published on http://www.yacy.net
//
// $LastChangedDate: $
// $LastChangedRevision: $
// $LastChangedBy: $
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this softare or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follows this copyright notice here, but changes must not be
// done inside the copyright notive above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
package de.anomic.data;
/**
 * Static helpers that translate between plain Unicode text and text in which
 * special characters are written as HTML/XML character entities.
 *
 * All conversions are driven by the {@link #mapping} table, which lists pairs
 * of (character, entity).
 */
public class htmlTools {

    /** Index of the first table slot after the ampersand pair. */
    private static final int AFTER_AMPERSAND = 2;

    /** End index (exclusive) of the four pairs relevant for XML: ampersand, quote, less-than, greater-than. */
    private static final int XML_SUBSET_END = 8;

    /**
     * Replaces special characters in a string by their HTML entities.
     *
     * @param text a string that possibly contains special characters, may be null
     * @param includingAmpersand if false, ampersands are left as they are
     * @param forXML if true, only ampersand, quotation mark, less-than and
     *        greater-than are transcoded
     * @return the string with every mapped character replaced by its entity,
     *         or null if text is null
     */
    public static String encodeUnicode2html(String text, boolean includingAmpersand, boolean forXML) {
        if (text == null) {
            return null;
        }
        // The ampersand pair occupies slots 0 and 1; starting at 2 skips it.
        final int spos = includingAmpersand ? 0 : AFTER_AMPERSAND;
        // The XML-relevant characters are the first four pairs of the table.
        final int epos = forXML ? XML_SUBSET_END : mapping.length;
        return encode(text, mapping, spos, epos);
    }

    /**
     * Like {@link #encodeUnicode2html(String, boolean, boolean)} with forXML = false.
     *
     * @param text a string that possibly contains special characters, may be null
     * @param includingAmpersand if false, ampersands are left as they are
     * @return the encoded string, or null if text is null
     */
    public static String encodeUnicode2html(String text, boolean includingAmpersand) {
        return encodeUnicode2html(text, includingAmpersand, false);
    }

    /**
     * Replaces ampersand, quotation mark, less-than and greater-than by the
     * escaping entities allowed in XML documents.
     *
     * Like {@link #encodeUnicode2html(String, boolean, boolean)} with
     * includingAmpersand = true and forXML = true.
     *
     * @param text the original String, may be null
     * @return the encoded String, or null if text is null
     */
    public static String encodeUnicode2xml(String text) {
        return encodeUnicode2html(text, true, true);
    }

    /**
     * Generic method that replaces occurrences of the characters defined in
     * the map array with their corresponding replacement strings.
     *
     * The map is a flat list of pairs: an even index holds the key and the
     * following odd index holds its replacement. Only the first character of
     * a key is compared. The text is processed in a single pass, so inserted
     * replacements are never examined again.
     *
     * @param text the String to process, may be null
     * @param map an array defining the mapping as (key, replacement) pairs
     * @param spos start index (inclusive) of the part of the map to use;
     *        expected to be even so that it points at a key
     * @param epos end index (exclusive) of the part of the map to use
     * @return a copy of the original String with all mapped characters
     *         replaced, or null if text is null
     */
    public static String encode(String text, final String[] map, int spos, int epos) {
        if (text == null) {
            return null;
        }
        final StringBuilder sb = new StringBuilder(text.length());
        for (int textpos = 0; textpos < text.length(); textpos++) {
            final char c = text.charAt(textpos);
            // Find the first pair in the selected range whose key starts with c.
            int hit = -1;
            for (int i = spos; i < epos; i += 2) {
                if (c == map[i].charAt(0)) {
                    hit = i;
                    break;
                }
            }
            if (hit < 0) {
                sb.append(c);
            } else {
                sb.append(map[hit + 1]);
            }
        }
        return sb.toString();
    }

    /**
     * Replaces every entity listed in the mapping table by the character it
     * stands for. Text that matches no entity is copied unchanged. The input
     * is processed in a single pass, so a decoded ampersand is never combined
     * with the characters that follow it into a new entity.
     *
     * @param text the String to decode, may be null
     * @return the decoded String, or null if text is null
     */
    public static String decodeHtml2Unicode(String text) {
        if (text == null) {
            return null;
        }
        final StringBuilder sb = new StringBuilder(text.length());
        int pos = 0;
        while (pos < text.length()) {
            final int hit = findEntityAt(text, pos);
            if (hit < 0) {
                // no entity starts here: copy one character
                sb.append(text.charAt(pos));
                pos++;
            } else {
                sb.append(mapping[hit]);
                pos += mapping[hit + 1].length();
            }
        }
        return sb.toString();
    }

    /**
     * Finds the first pair of the mapping table whose entity occurs in text
     * at exactly the given position.
     *
     * @param text the text being decoded
     * @param pos the position in text to test
     * @return the index of the matching pair's character slot, or -1 if no
     *         entity starts at pos
     */
    private static int findEntityAt(String text, int pos) {
        // TODO: replace the linear scan with a hashtable lookup
        for (int i = 0; i < mapping.length; i += 2) {
            final String entity = mapping[i + 1];
            // An empty entity would match everywhere without consuming input.
            if (entity.length() > 0 && text.startsWith(entity, pos)) {
                return i;
            }
        }
        return -1;
    }

    // This array contains codes (see http://mindprod.com/jgloss/unicode.html for details)
    // that will be replaced. To add new codes or patterns, just put them at the end
    // of the list.
    // NOTE(review): the original comment ended with a truncated remark that entries
    // of this list can not be escaped with "[=" markup; confirm against the wiki code.
    //
    // The order of the first four pairs must not change: encodeUnicode2html selects
    // sub-ranges of this table by index (slots 0-1 = ampersand, slots 0-7 = XML subset).
    private static final String[] mapping = {
        "\u0026", "&amp;",    // ampersand
        "\"",     "&quot;",   // quotation mark
        "\u003C", "&lt;",     // less than
        "\u003E", "&gt;",     // greater than
        "\\",     "&#092;",   // backslash
        "\u005E", "&#094;",   // caret

        "\u0060", "&#096;",   // accent grave
        "\u007B", "&#123;",   // left curly brace
        "\u007C", "&#124;",   // vertical bar
        "\u007D", "&#125;",   // right curly brace
        "\u007E", "&#126;",   // tilde

        "\u0082", "&#130;",
        "\u0083", "&#131;",
        "\u0084", "&#132;",
        "\u0085", "&#133;",
        "\u0086", "&#134;",
        "\u0087", "&#135;",
        "\u0088", "&#136;",
        "\u0089", "&#137;",
        "\u008A", "&#138;",
        "\u008B", "&#139;",
        "\u008C", "&#140;",
        "\u008D", "&#141;",
        "\u008E", "&#142;",

        "\u0091", "&#145;",
        "\u0092", "&#146;",
        "\u0093", "&#147;",
        "\u0094", "&#148;",
        "\u0095", "&#149;",
        "\u0096", "&#150;",
        "\u0097", "&#151;",
        "\u0098", "&#152;",
        "\u0099", "&#153;",
        "\u009A", "&#154;",
        "\u009B", "&#155;",
        "\u009C", "&#156;",
        "\u009D", "&#157;",
        "\u009E", "&#158;",
        "\u009F", "&#159;",

        "\u00A1", "&iexcl;",  // inverted (spanish) exclamation mark
        "\u00A2", "&cent;",   // cent
        "\u00A3", "&pound;",  // pound
        "\u00A4", "&curren;", // currency
        "\u00A5", "&yen;",    // yen
        "\u00A6", "&brvbar;", // broken vertical bar
        "\u00A7", "&sect;",   // section sign
        "\u00A8", "&uml;",    // diaeresis (umlaut)
        "\u00A9", "&copy;",   // copyright sign
        "\u00AA", "&ordf;",   // feminine ordinal indicator
        "\u00AB", "&laquo;",  // left-pointing double angle quotation mark
        "\u00AC", "&not;",    // not sign
        "\u00AD", "&shy;",    // soft hyphen
        "\u00AE", "&reg;",    // registered sign
        "\u00AF", "&macr;",   // macron
        "\u00B0", "&deg;",    // degree sign
        "\u00B1", "&plusmn;", // plus-minus sign
        "\u00B2", "&sup2;",   // superscript two
        "\u00B3", "&sup3;",   // superscript three
        "\u00B4", "&acute;",  // acute accent
        "\u00B5", "&micro;",  // micro sign
        "\u00B6", "&para;",   // paragraph sign
        "\u00B7", "&middot;", // middle dot
        "\u00B8", "&cedil;",  // cedilla
        "\u00B9", "&sup1;",   // superscript one
        "\u00BA", "&ordm;",   // masculine ordinal indicator
        "\u00BB", "&raquo;",  // right-pointing double angle quotation mark
        "\u00BC", "&frac14;", // fraction 1/4
        "\u00BD", "&frac12;", // fraction 1/2
        "\u00BE", "&frac34;", // fraction 3/4
        "\u00BF", "&iquest;", // inverted (spanish) question mark

        "\u00C0", "&Agrave;",
        "\u00C1", "&Aacute;",
        "\u00C2", "&Acirc;",
        "\u00C3", "&Atilde;",
        "\u00C4", "&Auml;",
        "\u00C5", "&Aring;",
        "\u00C6", "&AElig;",
        "\u00C7", "&Ccedil;",
        "\u00C8", "&Egrave;",
        "\u00C9", "&Eacute;",
        "\u00CA", "&Ecirc;",
        "\u00CB", "&Euml;",
        "\u00CC", "&Igrave;",
        "\u00CD", "&Iacute;",
        "\u00CE", "&Icirc;",
        "\u00CF", "&Iuml;",
        "\u00D0", "&ETH;",
        "\u00D1", "&Ntilde;",
        "\u00D2", "&Ograve;",
        "\u00D3", "&Oacute;",
        "\u00D4", "&Ocirc;",
        "\u00D5", "&Otilde;",
        "\u00D6", "&Ouml;",
        "\u00D7", "&times;",
        "\u00D8", "&Oslash;",
        "\u00D9", "&Ugrave;",
        "\u00DA", "&Uacute;",
        "\u00DB", "&Ucirc;",
        "\u00DC", "&Uuml;",
        "\u00DD", "&Yacute;",
        "\u00DE", "&THORN;",
        "\u00DF", "&szlig;",
        "\u00E0", "&agrave;",
        "\u00E1", "&aacute;",
        "\u00E2", "&acirc;",
        "\u00E3", "&atilde;",
        "\u00E4", "&auml;",
        "\u00E5", "&aring;",
        "\u00E6", "&aelig;",
        "\u00E7", "&ccedil;",
        "\u00E8", "&egrave;",
        "\u00E9", "&eacute;",
        "\u00EA", "&ecirc;",
        "\u00EB", "&euml;",
        "\u00EC", "&igrave;",
        "\u00ED", "&iacute;",
        "\u00EE", "&icirc;",
        "\u00EF", "&iuml;",
        "\u00F0", "&eth;",
        "\u00F1", "&ntilde;",
        "\u00F2", "&ograve;",
        "\u00F3", "&oacute;",
        "\u00F4", "&ocirc;",
        "\u00F5", "&otilde;",
        "\u00F6", "&ouml;",
        "\u00F7", "&divide;",
        "\u00F8", "&oslash;",
        "\u00F9", "&ugrave;",
        "\u00FA", "&uacute;",
        "\u00FB", "&ucirc;",
        "\u00FC", "&uuml;",
        "\u00FD", "&yacute;",
        "\u00FE", "&thorn;",
        "\u00FF", "&yuml;"
    };

    /**
     * Small self-test: encodes a sample text, decodes it again and prints
     * "correct" if the round trip reproduces the input.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // The sample deliberately contains a bare ampersand and literal entity
        // text so that cyclic replacements would show up as a mismatch.
        String text = "Test-Text mit & um zyklische &uuml; &amp; Ersetzungen auszuschliessen";
        String txet = encodeUnicode2html(text, true);
        System.out.println(txet);
        System.out.println(decodeHtml2Unicode(txet));
        if (decodeHtml2Unicode(txet).equals(text)) {
            System.out.println("correct");
        }
    }
}