diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java index 96ef66877..06ea067d8 100644 --- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java +++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java @@ -160,11 +160,11 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen } else if (url.getProtocol().equals("https")) { if (url.getPort() < 0 || url.getPort() == 443) { defaultPort = true; } } - String path = url.getFile(); + String path = url.getPath(); // (this is different from previous normal forms where a '/' must not appear in root paths; here it must appear. Makes everything easier.) if (path.length() == 0 || path.charAt(0) != '/') path = "/" + path; - + Pattern pathPattern = Pattern.compile("(/[^/\\.]+/)[.]{2}(?=/)|/\\.(?=/)|/(?=/)"); Matcher matcher = pathPattern.matcher(path); while (matcher.find()) { @@ -172,6 +172,9 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen matcher.reset(path); } + String query = url.getQuery().replaceAll("[\"\\/:*?<>|]", "_"); + if (query != null) { path = path.concat("_").concat(query); } + if (defaultPort) return url.getProtocol() + "://" + url.getHost() + path; return url.getProtocol() + "://" + url.getHost() + ":" + url.getPort() + path; }