*) Second try: replaced replaceHTML again. There should be no problem this time.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1359 6c8d7289-2bf4-0310-a012-ef5d649a1542
20 years ago · 7d5af75d11
parent d45b1162e2
commit 7d5af75d11
1 changed files with 127 additions and 14 deletions
--- a/source/de/anomic/data/wikiCode.java
+++ b/source/de/anomic/data/wikiCode.java
@ -42,7 +42,7 @@
 // Contributions and changes to the program code must be marked as such.

 // Contains contributions from Alexander Schier [AS]
-// and Marc Nause [MN]
+// Franz Brausse [FB] and Marc Nause [MN]

 package de.anomic.data;

@ -109,21 +109,134 @@ public class wikiCode {
        }
    }

-    public static String replaceHTML(String result) {
-        if (result == null) return null;
-        int p0;
+    //The following method has been submitted by [FB] (added and a few changes by MN)
+    /** Encodes special characters of a string as HTML entities.
+      * Otherwise they might cause ugly output on some systems.
+      * This code is also important to avoid XSS attacks.
+      *
+      * The pattern/replacement pairs are taken from <code>htmlentities</code> and applied
+      * one after another in table order. For each pair, all non-overlapping occurrences
+      * of the pattern are replaced from left to right; inserted replacement text is not
+      * searched again for the same pattern.
+      *
+      * @param text a string that possibly contains special characters, may be <code>null</code>
+      * @return the string with every pattern from the table replaced by its entity,
+      *         or <code>null</code> if <code>text</code> was <code>null</code>
+      */
+    public static String replaceHTML(String text) {
+        if (text == null) { return null; }
+        // "x + 1 < length" skips a trailing pattern that has no replacement partner
+        // instead of running past the end of the table.
+        for (int x = 0; x + 1 < htmlentities.length; x += 2) {
+            final String pattern = htmlentities[x];
+            final String replacement = htmlentities[x + 1];
+            // An empty pattern matches at every position and would never be consumed.
+            if (pattern.length() == 0) { continue; }
+            int p = text.indexOf(pattern);
+            // Nothing to do for this pair: keep the string as it is, no copy needed.
+            if (p < 0) { continue; }
+            // Build the result in one pass instead of re-creating the whole string
+            // for every single match.
+            final StringBuffer encoded = new StringBuffer(text.length() + 16);
+            int from = 0;
+            while (p >= 0) {
+                encoded.append(text.substring(from, p)).append(replacement);
+                from = p + pattern.length();
+                p = text.indexOf(pattern, from);
+            }
+            encoded.append(text.substring(from));
+            text = encoded.toString();
+        }
+        return text;
+    }

-        // Ampersands have to be replaced first. If they were replaced later,
+    //This array contains codes (see http://mindprod.com/jgloss/unicode.html for details) and
+    //patterns that will be replaced. To add new codes or patterns, just put them at the end
+    //of the list.
+    public static String[] htmlentities={
+        // Flat list of pairs: search pattern at even index, its replacement at the following odd index.
+        // Ampersands _have_ to be replaced first. If they were replaced later,
         // other replaced characters containing ampersands would get messed up.
-        p0 = 0; while ((p0 = result.indexOf("&", p0)) >= 0) {result = result.substring(0, p0) + "&amp;" + result.substring(p0 + 1); p0++;}
-        p0 = 0; while ((p0 = result.indexOf('"', p0)) >= 0) result = result.substring(0, p0) + "&quot;" + result.substring(p0 + 1);
-        p0 = 0; while ((p0 = result.indexOf("<", p0)) >= 0) result = result.substring(0, p0) + "&lt;" + result.substring(p0 + 1);
-        p0 = 0; while ((p0 = result.indexOf(">", p0)) >= 0) result = result.substring(0, p0) + "&gt;" + result.substring(p0 + 1);
-        //p0 = 0; while ((p0 = result.indexOf("*", p0)) >= 0) result = result.substring(0, p0) + "&#149;" + result.substring(p0 + 1);
-        p0 = 0; while ((p0 = result.indexOf("(C)", p0)) >= 0) result = result.substring(0, p0) + "&copy;" + result.substring(p0 + 3);
-
-    return result;
-    }
+        "\u0026","&amp;",      //ampersand
+        // The quotation mark is written as \" on purpose: a unicode escape for U+0022 is
+        // translated before the literal is parsed and would terminate the string early.
+        "\"","&quot;",         //quotation mark
+        "\u003c","&lt;",       //less than
+        "\u003e","&gt;",       //greater than
+        "\u00a1","&iexcl;",    //inverted (spanish) exclamation mark
+        "\u00a2","&cent;",     //cent
+        "\u00a3","&pound;",    //pound
+        "\u00a4","&curren;",   //currency
+        "\u00a5","&yen;",      //yen
+        "\u00a6","&brvbar;",   //broken vertical bar
+        "\u00a7","&sect;",     //section sign
+        "\u00a8","&uml;",      //diaeresis (umlaut)
+        "\u00a9","&copy;",     //copyright sign
+        "\u00aa","&ordf;",     //feminine ordinal indicator
+        "\u00ab","&laquo;",    //left-pointing double angle quotation mark
+        "\u00ac","&not;",      //not sign
+        "\u00ad","&shy;",      //soft hyphen
+        "\u00ae","&reg;",      //registered sign
+        "\u00af","&macr;",     //macron
+        "\u00b0","&deg;",      //degree sign
+        "\u00b1","&plusmn;",   //plus-minus sign
+        "\u00b2","&sup2;",     //superscript two
+        "\u00b3","&sup3;",     //superscript three
+        "\u00b4","&acute;",    //acute accent
+        "\u00b5","&micro;",    //micro sign
+        "\u00b6","&para;",     //paragraph sign
+        "\u00b7","&middot;",   //middle dot
+        "\u00b8","&cedil;",    //cedilla
+        "\u00b9","&sup1;",     //superscript one
+        "\u00ba","&ordm;",     //masculine ordinal indicator
+        "\u00bb","&raquo;",    //right-pointing double angle quotation mark
+        "\u00bc","&frac14;",   //fraction 1/4
+        "\u00bd","&frac12;",   //fraction 1/2
+        "\u00be","&frac34;",   //fraction 3/4
+        "\u00bf","&iquest;",   //inverted (spanish) question mark
+        "\u00c0","&Agrave;",
+        "\u00c1","&Aacute;",
+        "\u00c2","&Acirc;",
+        "\u00c3","&Atilde;",
+        "\u00c4","&Auml;",
+        "\u00c5","&Aring;",
+        "\u00c6","&AElig;",
+        "\u00c7","&Ccedil;",
+        "\u00c8","&Egrave;",
+        "\u00c9","&Eacute;",
+        "\u00ca","&Ecirc;",
+        "\u00cb","&Euml;",
+        "\u00cc","&Igrave;",
+        "\u00cd","&Iacute;",
+        "\u00ce","&Icirc;",
+        "\u00cf","&Iuml;",
+        "\u00d0","&ETH;",
+        "\u00d1","&Ntilde;",
+        "\u00d2","&Ograve;",
+        "\u00d3","&Oacute;",
+        "\u00d4","&Ocirc;",
+        "\u00d5","&Otilde;",
+        "\u00d6","&Ouml;",
+        "\u00d7","&times;",
+        "\u00d8","&Oslash;",
+        "\u00d9","&Ugrave;",
+        "\u00da","&Uacute;",
+        "\u00db","&Ucirc;",
+        "\u00dc","&Uuml;",
+        "\u00dd","&Yacute;",
+        "\u00de","&THORN;",
+        "\u00df","&szlig;",
+        "\u00e0","&agrave;",
+        "\u00e1","&aacute;",
+        "\u00e2","&acirc;",
+        "\u00e3","&atilde;",
+        "\u00e4","&auml;",
+        "\u00e5","&aring;",
+        "\u00e6","&aelig;",
+        "\u00e7","&ccedil;",
+        "\u00e8","&egrave;",
+        "\u00e9","&eacute;",
+        "\u00ea","&ecirc;",
+        "\u00eb","&euml;",
+        "\u00ec","&igrave;",
+        "\u00ed","&iacute;",
+        "\u00ee","&icirc;",
+        "\u00ef","&iuml;",
+        "\u00f0","&eth;",
+        // U+00F1..U+00FF: each code point may appear only once as a pattern, because an
+        // earlier row has already consumed every occurrence by the time a later row runs.
+        "\u00f1","&ntilde;",
+        "\u00f2","&ograve;",
+        "\u00f3","&oacute;",
+        "\u00f4","&ocirc;",
+        "\u00f5","&otilde;",
+        "\u00f6","&ouml;",
+        "\u00f7","&divide;",
+        "\u00f8","&oslash;",
+        "\u00f9","&ugrave;",
+        "\u00fa","&uacute;",
+        "\u00fb","&ucirc;",
+        "\u00fc","&uuml;",
+        "\u00fd","&yacute;",
+        "\u00fe","&thorn;",
+        "\u00ff","&yuml;",
+        "(C)","&copy;"
+    };
+    //end contrib [FB] and [MN]
+

    /** Replaces wiki tags with HTML tags.
      *