new urlNormalform version

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1040 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
borg-0300 20 years ago
parent c86d801b0f
commit 795f488222

@ -108,16 +108,16 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
if (url == null) return null; if (url == null) return null;
return urlNormalform(url.toString()); return urlNormalform(url.toString());
} }
/*
public static String urlNormalform(String us) { public static String urlNormalform(String us) {
serverLog.logFiner("htmlFilter", "urlNormalform: IN=" + us);
if (us == null) { return null; } if (us == null) { return null; }
if (us.length() == 0) { return null; } if (us.length() == 0) { return null; }
/* TODO: what about serverLog.logFiner("htmlFilter", "urlNormalform: IN=" + us);
* - case insensitive domain names
* - chars that should be escaped in URLs // TODO: what about
*/ // - case insensitive domain names
// - chars that should be escaped in URLs
// cutting of everything behind # // cutting of everything behind #
int cpos = us.indexOf("#"); int cpos = us.indexOf("#");
@ -149,6 +149,40 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
serverLog.logFine("htmlFilter", "urlNormalform: OUT=" + us); serverLog.logFine("htmlFilter", "urlNormalform: OUT=" + us);
return us; return us;
} }
*/
/**
 * Builds a normalized string form of the given URL.
 *
 * The input is parsed with java.net.URL and reassembled as
 * protocol://host[:port][file], where
 * - the port is left out when none was given, or when it equals the
 *   well-known port of the protocol (http: 80, ftp: 21, https: 443),
 * - a file part that is exactly "/" is left out,
 * - the reference part (everything behind '#') is not included, because
 *   URL.getFile() only delivers path and query.
 *
 * @param us the URL string to normalize; may be null
 * @return the normalized URL string, or null if us is null, empty or
 *         cannot be parsed as a URL (a parse failure is logged as severe)
 */
public static String urlNormalform(String us) {
    if (us == null) { return null; }
    if (us.length() == 0) { return null; }
    serverLog.logFinest("htmlFilter", "urlNormalform: '" + us + "'");
    try {
        final URL url = new URL(us);
        final String protocol = url.getProtocol();
        final int port = url.getPort();

        // URL.getPort() returns -1 if the URL carries no port. This must be
        // handled for every protocol, not only for the three listed below;
        // otherwise the result would contain a bogus ":-1" (e.g. for file: URLs).
        boolean defaultPort = (port < 0);
        if (!defaultPort) {
            if (protocol.equals("http")) {
                defaultPort = (port == 80);
            } else if (protocol.equals("ftp")) {
                defaultPort = (port == 21);
            } else if (protocol.equals("https")) {
                defaultPort = (port == 443);
            }
        }

        String normalform = protocol + "://" + url.getHost();
        if (!defaultPort) {
            normalform = normalform + ":" + port;
        }
        // a bare root path is dropped so that "http://host/" and "http://host"
        // end up as the same string
        final String file = url.getFile();
        if (!file.equals("/")) {
            normalform = normalform + file;
        }
        return normalform;
    } catch (MalformedURLException e) {
        serverLog.logSevere("urlNormalform", e.toString());
    }
    return null;
}
private String absolutePath(String relativePath) { private String absolutePath(String relativePath) {
try { try {

Loading…
Cancel
Save