Added a regular expression alternative for '//' (collapses doubled slashes in the URL path);

* http://www.yacy-forum.de/viewtopic.php?t=1666
Removed the code that stripped the '#' fragment from the path, because url.getFile() never contains the ref;

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1259 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
borg-0300 19 years ago
parent 4c824cacba
commit 51433a121f

@ -53,7 +53,6 @@ import java.util.Properties;
import java.util.TreeSet; import java.util.TreeSet;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import de.anomic.server.logging.serverLog; import de.anomic.server.logging.serverLog;
import de.anomic.server.serverByteBuffer; import de.anomic.server.serverByteBuffer;
@ -150,10 +149,10 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
return us; return us;
} }
*/ */
public static String urlNormalform(URL url) { public static String urlNormalform(URL url) {
boolean defaultPort = false; boolean defaultPort = false;
//serverLog.logFinest("htmlFilter", "urlNormalform: '" + url.toString() + "'"); // serverLog.logFinest("htmlFilter", "urlNormalform: '" + url.toString() + "'");
if (url.getProtocol().equals("http")) { if (url.getProtocol().equals("http")) {
if (url.getPort() < 0 || url.getPort() == 80) { defaultPort = true; } if (url.getPort() < 0 || url.getPort() == 80) { defaultPort = true; }
} else if (url.getProtocol().equals("ftp")) { } else if (url.getProtocol().equals("ftp")) {
@ -162,25 +161,23 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
if (url.getPort() < 0 || url.getPort() == 443) { defaultPort = true; } if (url.getPort() < 0 || url.getPort() == 443) { defaultPort = true; }
} }
String path = url.getFile(); String path = url.getFile();
if ((path.length() == 0) || (path.charAt(0) != '/')) path = "/" + path;
// (this is different from previous normal forms where a '/' must not appear in root paths; here it must appear. Makes everything easier.) // (this is different from previous normal forms where a '/' must not appear in root paths; here it must appear. Makes everything easier.)
int cpos = path.indexOf("#"); if ((path.length() == 0) || (path.charAt(0) != '/')) path = "/" + path;
if (cpos >= 0) path = path.substring(0, cpos);
Pattern pathPattern = Pattern.compile("(/[^/\\.]+/)(?<!/[.]{2}/)[.]{2}(?=/)|/\\.(?=/)|/(?=/)");
Pattern pathPattern = Pattern.compile("(/[^/\\.]+/)(?<!/[.]{2}/)[.]{2}(?=/)|/\\.(?=/)");
Matcher matcher = pathPattern.matcher(path); Matcher matcher = pathPattern.matcher(path);
while (matcher.find()) { while (matcher.find()) {
path = matcher.replaceAll(""); path = matcher.replaceAll("");
matcher.reset(path); matcher.reset(path);
} }
if (defaultPort) return url.getProtocol() + "://" + url.getHost() + path; if (defaultPort) return url.getProtocol() + "://" + url.getHost() + path;
return url.getProtocol() + "://" + url.getHost() + ":" + url.getPort() + path; return url.getProtocol() + "://" + url.getHost() + ":" + url.getPort() + path;
} }
public static String urlNormalform(URL baseURL, String us) { public static String urlNormalform(URL baseURL, String us) {
if (us == null) { return null; } if (us == null || us.length() == 0) { return null; }
if (us.length() == 0) { return null; }
try { try {
if (baseURL == null) return urlNormalform(new URL(us)); if (baseURL == null) return urlNormalform(new URL(us));
return urlNormalform(new URL(baseURL, us)); return urlNormalform(new URL(baseURL, us));
@ -263,14 +260,13 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
System.out.println("TEXT :" + new String(content.getBytes())); System.out.println("TEXT :" + new String(content.getBytes()));
} }
public static void main(String[] args) { /*
/* public static void main(String[] args) {
try { try {
htmlFilterContentScraper scraper = new htmlFilterContentScraper(new URL("http://localhost")); htmlFilterContentScraper scraper = new htmlFilterContentScraper(new URL("http://localhost"));
scraper.scrapeText(test.getBytes()); scraper.scrapeText(test.getBytes());
System.out.println(new String(scraper.getText())); System.out.println(new String(scraper.getText()));
} catch (MalformedURLException e) {} } catch (MalformedURLException e) {}
*/
} }
*/
} }
Loading…
Cancel
Save