diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java index 193dee4be..05f2b3ec0 100644 --- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java +++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java @@ -1,4 +1,4 @@ -// htmlFilterContentScraper.java +// htmlFilterContentScraper.java // ----------------------------- // (C) by Michael Peter Christen; mc@anomic.de // first published on http://www.anomic.de @@ -43,6 +43,9 @@ package de.anomic.htmlFilter; +import de.anomic.server.logging.serverLog; +import de.anomic.server.serverByteBuffer; + import java.net.MalformedURLException; import java.net.URL; import java.text.Collator; @@ -52,11 +55,9 @@ import java.util.List; import java.util.Locale; import java.util.Map; import java.util.Properties; -import java.util.TreeSet; import java.util.regex.Matcher; import java.util.regex.Pattern; -import de.anomic.server.logging.serverLog; -import de.anomic.server.serverByteBuffer; +import java.util.TreeSet; public class htmlFilterContentScraper extends htmlFilterAbstractScraper implements htmlFilterScraper { @@ -170,10 +171,10 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen } else if (url.getProtocol().equals("https")) { if (url.getPort() < 0 || url.getPort() == 443) { defaultPort = true; } } - String path = url.getFile(); + String path = url.getFile().toLowerCase(); // (this is different from previous normal forms where a '/' must not appear in root paths; here it must appear. Makes everything easier.) - if (path.length() == 0 || path.charAt(0) != '/') path = "/" + path; + if (path.length() == 0 || path.charAt(0) != '/') { path = "/" + path; } Pattern pathPattern = Pattern.compile("(/[^/\\.]+/)[.]{2}(?=/)|/\\.(?=/)|/(?=/)"); Matcher matcher = pathPattern.matcher(path); @@ -182,7 +183,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen matcher.reset(path); } - if (defaultPort) return url.getProtocol() + "://" + url.getHost() + path; + if (defaultPort) { return url.getProtocol() + "://" + url.getHost().toLowerCase() + path; } return url.getProtocol() + "://" + url.getHost().toLowerCase() + ":" + url.getPort() + path; }