diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java index 371bedd81..4e118eb6c 100644 --- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java +++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java @@ -53,7 +53,6 @@ import java.util.Properties; import java.util.TreeSet; import java.util.regex.Matcher; import java.util.regex.Pattern; - import de.anomic.server.logging.serverLog; import de.anomic.server.serverByteBuffer; @@ -150,10 +149,10 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen return us; } */ - + public static String urlNormalform(URL url) { boolean defaultPort = false; - //serverLog.logFinest("htmlFilter", "urlNormalform: '" + url.toString() + "'"); + // serverLog.logFinest("htmlFilter", "urlNormalform: '" + url.toString() + "'"); if (url.getProtocol().equals("http")) { if (url.getPort() < 0 || url.getPort() == 80) { defaultPort = true; } } else if (url.getProtocol().equals("ftp")) { @@ -162,25 +161,23 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen if (url.getPort() < 0 || url.getPort() == 443) { defaultPort = true; } } String path = url.getFile(); - if ((path.length() == 0) || (path.charAt(0) != '/')) path = "/" + path; + // (this is different from previous normal forms where a '/' must not appear in root paths; here it must appear. Makes everything easier.) - int cpos = path.indexOf("#"); - if (cpos >= 0) path = path.substring(0, cpos); - - Pattern pathPattern = Pattern.compile("(/[^/\\.]+/)(?