From 795f4882222d4127c5427b00ed7da67d326bcb41 Mon Sep 17 00:00:00 2001 From: borg-0300 Date: Sun, 6 Nov 2005 22:35:56 +0000 Subject: [PATCH] new urlNormalform version git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1040 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- .../htmlFilter/htmlFilterContentScraper.java | 46 ++++++++++++++++--- 1 file changed, 40 insertions(+), 6 deletions(-) diff --git a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java index 1f728e5de..612711807 100644 --- a/source/de/anomic/htmlFilter/htmlFilterContentScraper.java +++ b/source/de/anomic/htmlFilter/htmlFilterContentScraper.java @@ -108,16 +108,16 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen if (url == null) return null; return urlNormalform(url.toString()); } - +/* public static String urlNormalform(String us) { - serverLog.logFiner("htmlFilter", "urlNormalform: IN=" + us); if (us == null) { return null; } if (us.length() == 0) { return null; } - /* TODO: what about - * - case insensitive domain names - * - chars that should be escaped in URLs - */ + serverLog.logFiner("htmlFilter", "urlNormalform: IN=" + us); + + // TODO: what about + // - case insensitive domain names + // - chars that should be escaped in URLs // cutting of everything behind # int cpos = us.indexOf("#"); @@ -149,6 +149,40 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen serverLog.logFine("htmlFilter", "urlNormalform: OUT=" + us); return us; } + */ + + public static String urlNormalform(String us) { + if (us == null) { return null; } + if (us.length() == 0) { return null; } + serverLog.logFinest("htmlFilter", "urlNormalform: '" + us + "'"); + try { + final URL url = new URL(us); + boolean defaultPort = false; + if (url.getProtocol().equals("http")) { + if (url.getPort() < 0 || url.getPort() == 80) { defaultPort = true; } + } else if (url.getProtocol().equals("ftp")) { + if (url.getPort() < 0 || url.getPort() == 21) { defaultPort = true; } + } else if (url.getProtocol().equals("https")) { + if (url.getPort() < 0 || url.getPort() == 443) { defaultPort = true; } + } + if (defaultPort) { + if (url.getFile().equals("/")) { + return url.getProtocol() + "://" + url.getHost(); + } else { + return url.getProtocol() + "://" + url.getHost() + url.getFile(); + } + } else { + if (url.getFile().equals("/")) { + return url.getProtocol() + "://" + url.getHost() + ":" + url.getPort(); + } else { + return url.getProtocol() + "://" + url.getHost() + ":" + url.getPort() + url.getFile(); + } + } + } catch (MalformedURLException e) { + serverLog.logSevere("urlNormalform", e.toString()); + } + return null; + } private String absolutePath(String relativePath) { try {