Made the "tld:" modifier case-insensitive and IDN compliant.

This allows typing internationalized top-level domains with non-ASCII
characters as the value of the tld: modifier.
pull/149/head
luccioman 7 years ago
parent a4494d6e01
commit f9cba827c0

@ -29,6 +29,7 @@
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.IDN;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
@ -36,6 +37,7 @@ import java.util.Collection;
import java.util.ConcurrentModificationException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.SortedSet;
import java.util.TreeSet;
@ -497,18 +499,34 @@ public class yacysearch {
modifier.add("/heuristic");
}
final int tldp = querystring.indexOf("tld:", 0);
final String tldModifierPrefix = "tld:";
final int tldp = querystring.indexOf(tldModifierPrefix, 0);
if (tldp >= 0) {
int ftb = querystring.indexOf(' ', tldp);
if (ftb == -1) ftb = querystring.length();
tld = querystring.substring(tldp + 4, ftb);
querystring = querystring.replace("tld:" + tld, "");
modifier.add("tld:" + tld);
if (ftb == -1) {
ftb = querystring.length();
}
tld = querystring.substring(tldp + tldModifierPrefix.length(), ftb);
querystring = querystring.replace(tldModifierPrefix + tld, "");
modifier.add(tldModifierPrefix + tld);
while ( tld.length() > 0 && tld.charAt(0) == '.' ) {
tld = tld.substring(1);
}
if (tld.length() == 0) tld = null;
if (tld.length() == 0) {
tld = null;
} else {
try {
/* Convert to the same lower case ASCII Compatible Encoding that is used in normalized URLs */
tld = IDN.toASCII(tld, 0);
} catch(final IllegalArgumentException e){
ConcurrentLog.warn("LOCAL_SEARCH", "Failed to convert tld modifier value " + tld + "to ASCII Compatible Encoding (ACE)", e);
}
/* Domain name in an URL is case insensitive : convert now modifier to lower case for further processing over normalized URLs */
tld = tld.toLowerCase(Locale.ROOT);
}
}
if (urlmask == null || urlmask.isEmpty()) urlmask = ".*"; //if no urlmask was given
// read the language from the language-restrict option 'lr'

Loading…
Cancel
Save