Do locale-independent case conversion on hosts, schemes, and file extensions.

Required for proper operation when the default system locale is Turkish,
as the dotless and dotted i characters have specific case conversion rules
in this language.
pull/154/head
luccioman 7 years ago
parent 1c4803e40a
commit 5db1c9155a

@ -2,6 +2,7 @@ import java.io.IOException;
import java.net.MalformedURLException;
import java.util.EnumMap;
import java.util.Iterator;
import java.util.Locale;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.protocol.ClientIdentification;
@ -38,7 +39,7 @@ public class get_metadata {
String url = post.get(YMarkEntry.BOOKMARK.URL.key(),YMarkEntry.BOOKMARK.URL.deflt());
boolean hasProtocol = false;
for (final YMarkTables.PROTOCOLS p : YMarkTables.PROTOCOLS.values()) {
if(url.toLowerCase().startsWith(p.protocol())) {
if(url.toLowerCase(Locale.ROOT).startsWith(p.protocol())) {
hasProtocol = true;
break;
}

@ -30,6 +30,7 @@ import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
@ -253,7 +254,7 @@ public class OpensearchResponseWriter implements QueryResponseWriter {
writer.write("<media:content medium=\"image\" url=\"");
XML.escapeCharData(imageurl, writer); writer.write("\"/>\n");
} else {
if (url != null && Response.docTypeExt(MultiProtocolURL.getFileExtension(url.getFile()).toLowerCase()) == Response.DT_IMAGE) {
if (url != null && Response.docTypeExt(MultiProtocolURL.getFileExtension(url.getFile()).toLowerCase(Locale.ROOT)) == Response.DT_IMAGE) {
writer.write("<media:content medium=\"image\" url=\"");
XML.escapeCharData(url.toNormalform(true), writer); writer.write("\"/>\n");
}

@ -28,6 +28,7 @@ import java.util.Date;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import net.yacy.cora.document.id.MultiProtocolURL;
@ -220,7 +221,7 @@ public class YJsonResponseWriter implements QueryResponseWriter {
String imageurl = images_protocol.get(0) + "://" + images_stub.get(0);
solitaireTag(writer, "image", imageurl);
} else {
if (url != null && Response.docTypeExt(MultiProtocolURL.getFileExtension(url.getFile()).toLowerCase()) == Response.DT_IMAGE) {
if (url != null && Response.docTypeExt(MultiProtocolURL.getFileExtension(url.getFile()).toLowerCase(Locale.ROOT)) == Response.DT_IMAGE) {
solitaireTag(writer, "image", url.toNormalform(true));
}
}

@ -32,6 +32,7 @@ import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
@ -717,7 +718,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
String host = url.getHost();
if (host == null) continue;
if (host.startsWith("www.")) host = host.substring(4);
filter.append(Pattern.quote(host.toLowerCase())).append(".*|");
filter.append(Pattern.quote(host.toLowerCase(Locale.ROOT))).append(".*|");
}
filter.setCharAt(filter.length() - 1, ')');
return filter.toString();
@ -746,7 +747,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
if (host.startsWith("www.")) host = host.substring(4);
String protocol = url.getProtocol();
if ("http".equals(protocol) || "https".equals(protocol)) protocol = "https?+";
return new StringBuilder(host.length() + 20).append(protocol).append("://(www.)?").append(Pattern.quote(host.toLowerCase())).append(url.getPath()).append(".*").toString();
return new StringBuilder(host.length() + 20).append(protocol).append("://(www.)?").append(Pattern.quote(host.toLowerCase(Locale.ROOT))).append(url.getPath()).append(".*").toString();
}
public boolean isPushCrawlProfile() {

@ -27,6 +27,7 @@ package net.yacy.crawler.retrieval;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Locale;
import org.apache.http.HttpStatus;
import org.apache.http.StatusLine;
@ -119,7 +120,7 @@ public final class HTTPLoader {
port = (ssl) ? 443 : 80;
// check if url is in blacklist
final String hostlow = host.toLowerCase();
final String hostlow = host.toLowerCase(Locale.ROOT);
if (blacklistType != null && Switchboard.urlBlacklist.isListed(blacklistType, hostlow, path)) {
this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_LOAD_CONTEXT,
"url in blacklist", -1);
@ -337,7 +338,7 @@ public final class HTTPLoader {
if (port < 0) port = (ssl) ? 443 : 80;
// check if url is in blacklist
final String hostlow = host.toLowerCase();
final String hostlow = host.toLowerCase(Locale.ROOT);
if (blacklistType != null && Switchboard.urlBlacklist.isListed(blacklistType, hostlow, path)) {
this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), profile, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.$");
@ -466,7 +467,7 @@ public final class HTTPLoader {
if (port < 0) port = (ssl) ? 443 : 80;
// check if url is in blacklist
final String hostlow = host.toLowerCase();
final String hostlow = host.toLowerCase(Locale.ROOT);
if (Switchboard.urlBlacklist != null && Switchboard.urlBlacklist.isListed(BlacklistType.CRAWLER, hostlow, path)) {
throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.");
}

@ -33,6 +33,7 @@ import java.util.Date;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import net.yacy.cora.document.encoding.ASCII;
@ -62,7 +63,7 @@ public class RobotsTxtEntry {
private String info; // this is filled if robots disallowed access; then the reason is noted there;
protected RobotsTxtEntry(final String hostName, final Map<String, byte[]> mem) {
this.hostName = hostName.toLowerCase();
this.hostName = hostName.toLowerCase(Locale.ROOT);
this.mem = mem;
this.info = "";
@ -100,7 +101,7 @@ public class RobotsTxtEntry {
) {
if (theURL == null) throw new IllegalArgumentException("The url is missing");
this.hostName = RobotsTxt.getHostPort(theURL).toLowerCase();
this.hostName = RobotsTxt.getHostPort(theURL).toLowerCase(Locale.ROOT);
this.allowPathList = new LinkedList<String>();
this.denyPathList = new LinkedList<String>();
this.sitemapList = new LinkedList<String>();

@ -34,6 +34,7 @@ import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
@ -982,7 +983,7 @@ public final class SeedDB implements AlternativeDomainNames {
host = host.substring(p + 1); // if ever, the double-dots are here but do not harm
}
// identify domain
final String domain = host.substring(0, host.length() - 5).toLowerCase();
final String domain = host.substring(0, host.length() - 5).toLowerCase(Locale.ROOT);
seed = lookupByName(domain);
if (seed == null) return null;
if (this.mySeed == null) initMySeed();

@ -38,6 +38,7 @@ import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
@ -70,7 +71,7 @@ public class Blacklist {
@Override
public final String toString () {
return super.toString().toLowerCase();
return super.toString().toLowerCase(Locale.ROOT);
}
}
@ -367,7 +368,7 @@ public class Blacklist {
// avoid PatternSyntaxException e
final String h = ((!isMatchable(safeHost) && !safeHost.isEmpty() && safeHost.charAt(0) == '*')
? "." + safeHost : safeHost).toLowerCase();
? "." + safeHost : safeHost).toLowerCase(Locale.ROOT);
if (!p.isEmpty() && p.charAt(0) == '*') {
p = "." + p;
}
@ -436,7 +437,7 @@ public class Blacklist {
String p = (!path.isEmpty() && path.charAt(0) == '/') ? path.substring(1) : path;
// avoid PatternSyntaxException e
String h = ((!isMatchable(host) && !host.isEmpty() && host.charAt(0) == '*') ? "." + host : host).toLowerCase();
String h = ((!isMatchable(host) && !host.isEmpty() && host.charAt(0) == '*') ? "." + host : host).toLowerCase(Locale.ROOT);
h = Punycode.isBasic(h) ? h : MultiProtocolURL.toPunycode(h);
@ -516,7 +517,7 @@ public class Blacklist {
final Map<String, Set<Pattern>> blacklistMap = getBlacklistMap(blacklistType, isMatchable(host));
// avoid PatternSyntaxException e
final String h = ((!isMatchable(host) && !host.isEmpty() && host.charAt(0) == '*') ? "." + host : host).toLowerCase();
final String h = ((!isMatchable(host) && !host.isEmpty() && host.charAt(0) == '*') ? "." + host : host).toLowerCase(Locale.ROOT);
final Set<Pattern> hostList = blacklistMap.get(h);
if (hostList != null) {
@ -549,7 +550,7 @@ public class Blacklist {
HandleSet urlHashCache = getCacheUrlHashsSet(blacklistType);
if (urlHashCache == null) {
urlHashCache = new RowHandleSet(Word.commonHashLength, Word.commonHashOrder, 0);
if (isListed(blacklistType, url.getHost().toLowerCase(), url.getFile())) {
if (isListed(blacklistType, url.getHost().toLowerCase(Locale.ROOT), url.getFile())) {
try {
urlHashCache.put(url.hash());
} catch (final SpaceExceededException e) {
@ -559,7 +560,7 @@ public class Blacklist {
}
}
if (!urlHashCache.has(url.hash())) {
final boolean temp = isListed(blacklistType, url.getHost().toLowerCase(), url.getFile());
final boolean temp = isListed(blacklistType, url.getHost().toLowerCase(Locale.ROOT), url.getFile());
if (temp) {
try {
urlHashCache.put(url.hash());

@ -4,6 +4,7 @@ import java.io.BufferedReader;
import java.io.IOException;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
@ -77,7 +78,7 @@ public class FilterEngine {
assert entry != null;
int pos; // position between domain and path
if((pos = entry.indexOf('/')) > 0) {
String host = entry.substring(0, pos).trim().toLowerCase();
String host = entry.substring(0, pos).trim().toLowerCase(Locale.ROOT);
final String path = entry.substring(pos + 1).trim();
// avoid PatternSyntaxException e
@ -123,7 +124,7 @@ public class FilterEngine {
return e.containsAll(type);
}
// Cache Miss
return isListed(url.getHost().toLowerCase(), url.getFile());
return isListed(url.getHost().toLowerCase(Locale.ROOT), url.getFile());
}
public static boolean isMatchable (final String host) {

@ -34,6 +34,7 @@ import java.util.Arrays;
import java.util.Date;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Semaphore;
@ -94,7 +95,7 @@ public final class LoaderDispatcher {
public boolean isSupportedProtocol(final String protocol) {
if ((protocol == null) || (protocol.isEmpty())) return false;
return this.supportedProtocols.contains(protocol.trim().toLowerCase());
return this.supportedProtocols.contains(protocol.trim().toLowerCase(Locale.ROOT));
}
@SuppressWarnings("unchecked")
@ -208,7 +209,7 @@ public final class LoaderDispatcher {
final CrawlProfile crawlProfile = request.profileHandle() == null ? null : this.sb.crawler.get(UTF8.getBytes(request.profileHandle()));
// check if url is in blacklist
if (blacklistType != null && host != null && Switchboard.urlBlacklist.isListed(blacklistType, host.toLowerCase(), url.getFile())) {
if (blacklistType != null && host != null && Switchboard.urlBlacklist.isListed(blacklistType, host.toLowerCase(Locale.ROOT), url.getFile())) {
this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), crawlProfile, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
throw new IOException("DISPATCHER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.$");
}
@ -362,7 +363,7 @@ public final class LoaderDispatcher {
final CrawlProfile crawlProfile = request.profileHandle() == null ? null : this.sb.crawler.get(UTF8.getBytes(request.profileHandle()));
// check if url is in blacklist
if (blacklistType != null && host != null && Switchboard.urlBlacklist.isListed(blacklistType, host.toLowerCase(), url.getFile())) {
if (blacklistType != null && host != null && Switchboard.urlBlacklist.isListed(blacklistType, host.toLowerCase(Locale.ROOT), url.getFile())) {
this.sb.crawlQueues.errorURL.push(request.url(), request.depth(), crawlProfile, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
throw new IOException("DISPATCHER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.$");
}

@ -29,6 +29,7 @@ import java.util.ArrayList;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.SortedMap;
import java.util.SortedSet;
@ -257,7 +258,7 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaS
*/
private static boolean isUrlBlacklisted (final BlacklistType blacklistType, final DigestURL url) {
final boolean isBlacklisted = Switchboard.urlBlacklist.isListed(blacklistType, url.getHost().toLowerCase(), url.getFile());
final boolean isBlacklisted = Switchboard.urlBlacklist.isListed(blacklistType, url.getHost().toLowerCase(Locale.ROOT), url.getFile());
if (isBlacklisted) {

@ -62,6 +62,7 @@ import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Locale;
import java.util.logging.FileHandler;
import java.util.logging.Level;
import java.util.logging.LogManager;
@ -314,7 +315,7 @@ public final class HTTPDProxyHandler {
// handle outgoing cookies
handleOutgoingCookies(requestHeader, url.getHost(), ip);
prepareRequestHeader(conProp, requestHeader, url.getHost().toLowerCase());
prepareRequestHeader(conProp, requestHeader, url.getHost().toLowerCase(Locale.ROOT));
final ResponseHeader cachedResponseHeader = Cache.getResponseHeader(url.hash());
// why are files unzipped upon arrival? why not zip all files in cache?
@ -1042,7 +1043,7 @@ public final class HTTPDProxyHandler {
int orgHostPort = orgurl.getPort();
String orgHostName = orgurl.getHost();
if (orgHostName == null) orgHostName = "unknown";
orgHostName = orgHostName.toLowerCase();
orgHostName = orgHostName.toLowerCase(Locale.ROOT);
String orgHostPath = orgurl.getPath(); if (orgHostPath == null) orgHostPath = "";
String orgHostArgs = orgurl.getSearchpart();; if (orgHostArgs == null) orgHostArgs = "";
if (orgHostArgs.length() > 0) orgHostArgs = "?" + orgHostArgs;

Loading…
Cancel
Save