- reverted parseArg(String) to use a byte-array to handle correct UTF-8 parsing

- arguments aren't passed html-escaped to the servlets anymore, bug-fix for http://www.yacy-forum.de/viewtopic.php?p=30573

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3321 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
karlchenofhell 18 years ago
parent e00e850a98
commit e68cdeeeb3

@ -17,7 +17,7 @@
#[promoteSearchPageGreeting]#
</h2>
<form class="search" action="yacysearch.html" method="get" name="searchform" accept-charset="ascii">
<form class="search" action="yacysearch.html" method="get" name="searchform" accept-charset="UTF-8">
<fieldset class="maininput">
<input type="hidden" name="display" value="#[display]#" />
<input name="search" id="search" type="text" size="52" maxlength="80" value="#[former]#" />
@ -28,13 +28,13 @@
<input type="radio" id="audio" name="contentdom" value="audio" #(contentdomCheckAudio)#::checked="checked"#(/contentdomCheckAudio)# /><label for="audio">Audio</label>&nbsp;&nbsp;
<input type="radio" id="video" name="contentdom" value="video" #(contentdomCheckVideo)#::checked="checked"#(/contentdomCheckVideo)# /><label for="video">Video</label>&nbsp;&nbsp;
<input type="radio" id="app" name="contentdom" value="app" #(contentdomCheckApp)#::checked="checked"#(/contentdomCheckApp)# /><label for="app">Applications</label>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
#(searchoptions)#
#(searchoptions)#<!-- default values are hard-coded
<input type="hidden" name="count" value="10" />
<input type="hidden" name="resource" value="global" />
<input type="hidden" name="time" value="6" />
<input type="hidden" name="urlmaskfilter" value=".*" />
<input type="hidden" name="prefermaskfilter" value="" />
<input type="hidden" name="indexof" value="off" />
<input type="hidden" name="indexof" value="off" />-->
</fieldset>
<p><a href="/index.html?searchoptions=1&amp;display=#[display]#" onclick="this.href='/index.html?searchoptions=1&amp;display=#[display]#&amp;handover='+document.searchform.search.value">more options...</a></p>
::

@ -98,6 +98,7 @@ public class yacysearch {
// case if no values are requested
final String referer = (String) header.get("Referer");
String querystring = (post == null) ? "" : post.get("search", "").trim();
if ((post == null) || (env == null) || (querystring.length() == 0)) {
// save referrer
@ -147,6 +148,7 @@ public class yacysearch {
// collect search attributes
int maxDistance = Integer.MAX_VALUE;
if ((querystring.length() > 2) && (querystring.charAt(0) == '"') && (querystring.charAt(querystring.length() - 1) == '"')) {
querystring = querystring.substring(1, querystring.length() - 1).trim();
maxDistance = 1;
@ -459,7 +461,7 @@ public class yacysearch {
prop.put("type_results_" + i + "_authorized", (authenticated) ? 1 : 0);
prop.putASIS("promoteSearchPageGreeting", promoteSearchPageGreeting);
prop.put("former", wikiCode.replaceXMLEntities(post.get("search", "")));
prop.put("former", post.get("search", ""));
prop.put("count", count);
prop.put("order", order);
prop.put("resource", (global) ? "global" : "local");

@ -50,9 +50,13 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.PrintStream;
import java.io.UnsupportedEncodingException;
import java.lang.reflect.Constructor;
import java.net.InetAddress;
import java.net.MalformedURLException;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
@ -747,7 +751,7 @@ public final class httpd implements serverHandler {
sep = argsString.indexOf("&");
if ((eqp <= 0) || (sep <= 0)) break;
// resulting equations are inserted into the property args with leading '&'
args.put(parseArg(argsString.substring(0, eqp)), parseArg(argsString.substring(eqp + 1, sep)));
args.putASIS(parseArg(argsString.substring(0, eqp)), parseArg(argsString.substring(eqp + 1, sep)));
argsString = argsString.substring(sep + 1);
argc++;
}
@ -755,45 +759,72 @@ public final class httpd implements serverHandler {
return argc;
}
// 16.12.2006 by [FB]: added UTF-character decoding
/**
 * <p>Decodes one URL-encoded request argument (a name or a value) into a Java <code>String</code>.</p>
 * <p>This does basically the same as {@link URLDecoder#decode(String, String) URLDecoder.decode(s, "UTF-8")},
 * but is more lenient, because browsers do not always comply with the standards.</p>
 * <p>The following replacements are performed on the input:</p>
 * <ul>
 * <li>'<code>+</code>' is replaced by a space</li>
 * <li>runs of subsequent '<code>%HH</code>' escapes are collected as bytes and decoded together as UTF-8,
 *     so that multi-byte sequences such as <code>%C3%A4</code> yield a single character</li>
 * <li>'<code>%uHHHH</code>' escapes (sent by IE although rejected by the W3C) are replaced by the UTF-16
 *     code unit they denote; two such escapes forming a surrogate pair yield one supplementary character</li>
 * <li>characters which were sent without being escaped (including non-ASCII ones) are copied unchanged</li>
 * <li>a '<code>%</code>' which does not start a well-formed escape is copied literally</li>
 * </ul>
 * <p>Byte sequences which are not valid UTF-8 are replaced by the Unicode replacement character.</p>
 *
 * @param s the URL-encoded <code>String</code> to decode, must not be <code>null</code>; the encoding used to
 *          URL-encode the original text has to be UTF-8 (i.e. the "<code>accept-charset</code>"-property of
 *          HTML <code>&lt;form&gt;</code>-elements)
 * @return the decoded <code>String</code>, never <code>null</code>
 */
private static String parseArg(String s) {
    final int len = s.length();
    final StringBuffer out = new StringBuffer(len);
    // bytes of consecutive %HH escapes; they are decoded in one go so that multi-byte
    // UTF-8 sequences are not torn apart
    final ByteArrayOutputStream pending = new ByteArrayOutputStream();
    int pos = 0;
    while (pos < len) {
        final char c = s.charAt(pos);
        if (c == '%') {
            if (pos + 6 <= len && (s.charAt(pos + 1) == 'u' || s.charAt(pos + 1) == 'U')) {
                // non-standard IE escape carrying one UTF-16 code unit
                final int unit = parseArgHex(s, pos + 2, pos + 6);
                if (unit >= 0) {
                    parseArgFlush(pending, out);
                    out.append((char) unit);
                    pos += 6;
                    continue;
                }
            } else if (pos + 3 <= len) {
                final int octet = parseArgHex(s, pos + 1, pos + 3);
                if (octet >= 0) {
                    pending.write(octet);
                    pos += 3;
                    continue;
                }
            }
            // truncated or malformed escape: keep the '%' and go on with the next char
            parseArgFlush(pending, out);
            out.append('%');
            pos++;
        } else {
            parseArgFlush(pending, out);
            out.append(c == '+' ? ' ' : c);
            pos++;
        }
    }
    parseArgFlush(pending, out);
    return out.toString();
}

/**
 * Reads the hexadecimal number in <code>s[from, to)</code>. Only the ASCII digits
 * <code>0-9</code>, <code>a-f</code> and <code>A-F</code> are accepted; in particular no sign.
 *
 * @return the parsed value or <code>-1</code> if a character is not a hex digit
 */
private static int parseArgHex(String s, int from, int to) {
    int value = 0;
    for (int i = from; i < to; i++) {
        final char c = s.charAt(i);
        final int digit;
        if (c >= '0' && c <= '9') {
            digit = c - '0';
        } else if (c >= 'a' && c <= 'f') {
            digit = c - 'a' + 10;
        } else if (c >= 'A' && c <= 'F') {
            digit = c - 'A' + 10;
        } else {
            return -1;
        }
        value = (value << 4) | digit;
    }
    return value;
}

/**
 * Decodes the collected escape bytes as UTF-8, appends the result to <code>out</code> and
 * empties <code>pending</code>. Does nothing if no bytes are pending.
 */
private static void parseArgFlush(ByteArrayOutputStream pending, StringBuffer out) {
    if (pending.size() == 0) {
        return;
    }
    // Charset.decode substitutes malformed input instead of throwing
    out.append(Charset.forName("UTF-8").decode(java.nio.ByteBuffer.wrap(pending.toByteArray())).toString());
    pending.reset();
}
/**
 * Debugging aid: prints the charsets known to this JVM to standard output,
 * with every space of the map's string form turned into a line break.
 *
 * @param args ignored
 */
public static void main(String[] args) {
    final String charsets = Charset.availableCharsets().toString();
    System.out.println(charsets.replace(' ', '\n'));
}
// 06.01.2007: decode HTML entities by [FB]

Loading…
Cancel
Save