|
|
@ -160,7 +160,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
|
|
|
|
} else if (url.getProtocol().equals("https")) {
|
|
|
|
} else if (url.getProtocol().equals("https")) {
|
|
|
|
if (url.getPort() < 0 || url.getPort() == 443) { defaultPort = true; }
|
|
|
|
if (url.getPort() < 0 || url.getPort() == 443) { defaultPort = true; }
|
|
|
|
}
|
|
|
|
}
|
|
|
|
String path = url.getFile();
|
|
|
|
String path = url.getPath();
|
|
|
|
|
|
|
|
|
|
|
|
// (this is different from previous normal forms where a '/' must not appear in root paths; here it must appear. Makes everything easier.)
|
|
|
|
// (this is different from previous normal forms where a '/' must not appear in root paths; here it must appear. Makes everything easier.)
|
|
|
|
if (path.length() == 0 || path.charAt(0) != '/') path = "/" + path;
|
|
|
|
if (path.length() == 0 || path.charAt(0) != '/') path = "/" + path;
|
|
|
@ -172,6 +172,9 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
|
|
|
|
matcher.reset(path);
|
|
|
|
matcher.reset(path);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
String query = url.getQuery().replaceAll("[\"\\/:*?<>|]", "_");
|
|
|
|
|
|
|
|
if (query != null) { path = path.concat("_").concat(query); }
|
|
|
|
|
|
|
|
|
|
|
|
if (defaultPort) return url.getProtocol() + "://" + url.getHost() + path;
|
|
|
|
if (defaultPort) return url.getProtocol() + "://" + url.getHost() + path;
|
|
|
|
return url.getProtocol() + "://" + url.getHost() + ":" + url.getPort() + path;
|
|
|
|
return url.getProtocol() + "://" + url.getHost() + ":" + url.getPort() + path;
|
|
|
|
}
|
|
|
|
}
|
|
|
|