- reverted parseArg(String) to use a byte-array to handle correct UTF-8 parsing

- arguments aren't passed html-escaped to the servlets anymore, bug-fix for http://www.yacy-forum.de/viewtopic.php?p=30573 git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3321 6c8d7289-2bf4-0310-a012-ef5d649a1542
18 years ago · e68cdeeeb3
parent e00e850a98
commit e68cdeeeb3
3 changed files with 66 additions and 33 deletions
--- a/htroot/index.html
+++ b/htroot/index.html
@ -17,7 +17,7 @@
      #[promoteSearchPageGreeting]#
    </h2>
  
-    <form class="search" action="yacysearch.html" method="get" name="searchform" accept-charset="ascii">
+    <form class="search" action="yacysearch.html" method="get" name="searchform" accept-charset="UTF-8">
      <fieldset class="maininput">
        <input type="hidden" name="display" value="#[display]#" />
        <input name="search" id="search" type="text" size="52" maxlength="80" value="#[former]#" />
@ -28,13 +28,13 @@
        <input type="radio" id="audio" name="contentdom" value="audio" #(contentdomCheckAudio)#::checked="checked"#(/contentdomCheckAudio)# /><label for="audio">Audio</label>&nbsp;&nbsp;
        <input type="radio" id="video" name="contentdom" value="video" #(contentdomCheckVideo)#::checked="checked"#(/contentdomCheckVideo)# /><label for="video">Video</label>&nbsp;&nbsp;
        <input type="radio" id="app" name="contentdom" value="app" #(contentdomCheckApp)#::checked="checked"#(/contentdomCheckApp)# /><label for="app">Applications</label>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
-      #(searchoptions)#
+      #(searchoptions)#<!-- default values are hard-coded
        <input type="hidden" name="count" value="10" />
        <input type="hidden" name="resource" value="global" />
        <input type="hidden" name="time" value="6" />
        <input type="hidden" name="urlmaskfilter" value=".*" />
        <input type="hidden" name="prefermaskfilter" value="" />
-        <input type="hidden" name="indexof" value="off" />
+        <input type="hidden" name="indexof" value="off" />-->
      </fieldset>
      <p><a href="/index.html?searchoptions=1&amp;display=#[display]#" onclick="this.href='/index.html?searchoptions=1&amp;display=#[display]#&amp;handover='+document.searchform.search.value">more options...</a></p>
      ::
--- a/htroot/yacysearch.java
+++ b/htroot/yacysearch.java
@ -98,6 +98,7 @@ public class yacysearch {
        // case if no values are requested
        final String referer = (String) header.get("Referer");
        String querystring = (post == null) ? "" : post.get("search", "").trim();
+        
        if ((post == null) || (env == null) || (querystring.length() == 0)) {

            // save referrer
@ -147,6 +148,7 @@ public class yacysearch {

        // collect search attributes
        int maxDistance = Integer.MAX_VALUE;
+        
        if ((querystring.length() > 2) && (querystring.charAt(0) == '"') && (querystring.charAt(querystring.length() - 1) == '"')) {
            querystring = querystring.substring(1, querystring.length() - 1).trim();
            maxDistance = 1;
@ -459,7 +461,7 @@ public class yacysearch {
            prop.put("type_results_" + i + "_authorized", (authenticated) ? 1 : 0);

        prop.putASIS("promoteSearchPageGreeting", promoteSearchPageGreeting);
-        prop.put("former", wikiCode.replaceXMLEntities(post.get("search", "")));
+        prop.put("former", post.get("search", ""));
        prop.put("count", count);
        prop.put("order", order);
        prop.put("resource", (global) ? "global" : "local");
--- a/source/de/anomic/http/httpd.java
+++ b/source/de/anomic/http/httpd.java
@ -50,9 +50,13 @@ import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;
 import java.io.PrintStream;
+import java.io.UnsupportedEncodingException;
 import java.lang.reflect.Constructor;
 import java.net.InetAddress;
 import java.net.MalformedURLException;
+import java.net.URLDecoder;
+import java.net.URLEncoder;
+import java.nio.charset.Charset;
 import java.util.Arrays;
 import java.util.Date;
 import java.util.HashMap;
@ -747,7 +751,7 @@ public final class httpd implements serverHandler {
            sep = argsString.indexOf("&");
            if ((eqp <= 0) || (sep <= 0)) break;
            // resulting equations are inserted into the property args with leading '&'
-            args.put(parseArg(argsString.substring(0, eqp)), parseArg(argsString.substring(eqp + 1, sep)));
+            args.putASIS(parseArg(argsString.substring(0, eqp)), parseArg(argsString.substring(eqp + 1, sep)));
            argsString = argsString.substring(sep + 1);
            argc++;
        }
@ -755,45 +759,72 @@ public final class httpd implements serverHandler {
        return argc;
    }
    
-    // 16.12.2006 by [FB]: added UTF-character decoding
+    /**
+     * <p>This method basically does the same as {@link URLDecoder#decode(String, String) URLDecoder.decode(s, "UTF-8")}
+     * would do, except that it is more lenient towards current browser implementations, as they do not
+     * always comply with the standards.</p>
+     * <p>The following replacements are performed on the input-<code>String</code>:</p>
+     * <ul>
+     * <li>'<code>+</code>'-characters are replaced by space</li>
+     * <li>'<code>%HH</code>'-entities (several subsequent ones in the case of encoded unicode-chars) are replaced by their
+     * respective <code>char</code>-representation</li>
+     * <li>'<code>%uHHHH</code>'-entities (sent by IE although rejected by the W3C) are meant to be replaced by their
+     * respective <code>char</code>-representation (TODO: the conversion from UTF-16 to UTF-8 is still missing)</li>
+     * <li><strong>TODO</strong>: <code>chars</code> already encoded in UTF-8 are url-encoded and re-decoded due to internal restrictions,
+     * which slows down this method unnecessarily</li>
+     * </ul>
+     * 
+     * @param s the URL-encoded <code>String</code> to decode, note that the encoding used to URL-encode the original
+     * <code>String</code> has to be UTF-8 (i.e. the "<code>accept-charset</code>"-property of HTML
+     * <code>&lt;form&gt;</code>-elements)
+     * @return the decoded Java-<code>String</code> represented by the input or <code>null</code>
+     * if the UTF-8 encoding is not supported
+     */
    private static String parseArg(String s) {
-        // this parses a given value-string from a http property
-        // we replace all "+" by spaces
-        // and resolve %-escapes with two- / four-digit hex attributes
        int pos = 0;
-        CharArrayWriter baos = new CharArrayWriter(s.length());
+        ByteArrayOutputStream baos = new ByteArrayOutputStream(s.length());
+        
        while (pos < s.length()) {
            if (s.charAt(pos) == '+') {
                baos.write(' ');
                pos++;
            } else if (s.charAt(pos) == '%') {
-            	// start change by [FB]: added UTF-escapes
-            	if (s.charAt(pos + 1) == 'u') {				// UTF-16 escape
-            		baos.write(Integer.parseInt(s.substring(pos + 2, pos + 6), 16));
-            		pos += 6;
-            	} else {									// normal escape
-                    // currently one escape is treated as one char, which only works if
-                    // formulars accept-charset=ascii are used. This is wrong if
-                    // formulars are set to accept-charset=UTF-8, then many escaped bytes
-                    // are sent, but they belong together to one char
-                    
-                    // TODO: UTF-8 escapes: i.e. "%C3%A4" should be 'ä', but is now "Ã¤"
-                    // - parse out subsequent escapes
-                    // - check which of these 'belong together', see http://de.wikipedia.org/wiki/UTF-8
-                    // - write the UTF-8 escapes together as one char
-                    
-                    // XXX: implement an own UTF-8 converter as described above or
-                    // revert this method to use bytes and let Java do the UTF-8-parsing?
-                    // i.e. new String(result[], "UTF-8");
-                	baos.write(Integer.parseInt(s.substring(pos + 1, pos + 3), 16));
-            		pos += 3;
-            	}
-            	// end change by [FB]
+                try {
+                    if (s.length() >= pos + 6 && (s.charAt(pos + 1) == 'u' || s.charAt(pos + 1) == 'U')) {
+                        // non-standard encoding of IE for unicode-chars
+                        int bh = Integer.parseInt(s.substring(pos + 2, pos + 4), 16);
+                        int bl = Integer.parseInt(s.substring(pos + 4, pos + 6), 16);
+                        // TODO: needs conversion from UTF-16 to UTF-8
+                        baos.write(bh);
+                        baos.write(bl);
+                        pos += 6;
+                    } else if (s.length() >= pos + 3) {
+                        baos.write(Integer.parseInt(s.substring(pos + 1, pos + 3), 16));
+                        pos += 3;
+                    } else {
+                        baos.write(s.charAt(pos++));
+                    }
+                } catch (NumberFormatException e) {
+                    baos.write(s.charAt(pos++));
+                }
+            } else if (s.charAt(pos) > 127) {
+                // Unicode chars sent by client, see http://www.w3.org/International/O-URL-code.html
+                try {
+                    // don't write anything but url-encode the unicode char
+                    s = s.substring(0, pos) + URLEncoder.encode(s.substring(pos, pos + 1), "UTF-8") + s.substring(pos + 1); 
+                } catch (UnsupportedEncodingException e) { return null; }
            } else {
                baos.write(s.charAt(pos++));
            }
        }
-        return decodeHtmlEntities(baos.toString());
+        
+        try {
+            return new String(baos.toByteArray(), "UTF-8");
+        } catch (UnsupportedEncodingException e) { return null; }
+    }
+    
+    public static void main(String[] args) {
+        System.out.println(Charset.availableCharsets().toString().replaceAll(" ", "\n"));
    }
    
    // 06.01.2007: decode HTML entities by [FB]