luc 9 years ago
commit b4cdacee76

@ -204,7 +204,7 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
int p = url.indexOf("://"); int p = url.indexOf("://");
if (p < 0) { if (p < 0) {
if (url.startsWith("mailto:")) { if (url.length() > 7 && url.substring(0,7).equalsIgnoreCase("mailto:")) {
p = 6; p = 6;
} else { } else {
url = "http://" + url; url = "http://" + url;

@ -491,6 +491,9 @@ dc_rights
return this.lat; return this.lat;
} }
/**
* sorts all links (anchors) into individual collections
*/
private void resortLinks() { private void resortLinks() {
if (this.resorted) return; if (this.resorted) return;
synchronized (this) { synchronized (this) {
@ -513,6 +516,14 @@ dc_rights
} }
for (final AnchorURL url: this.anchors) { for (final AnchorURL url: this.anchors) {
if (url == null) continue; if (url == null) continue;
u = url.toNormalform(true);
final String name = url.getNameProperty();
// check mailto scheme first (not supposed to get into in/outboundlinks or hyperlinks -> crawler can't process)
if (url.getProtocol().equals("mailto")) {
this.emaillinks.put(u.substring(7), name); // TODO: check why key as string instead of Disgest/AnchorURL
continue;
}
final boolean noindex = url.getRelProperty().toLowerCase().indexOf("noindex",0) >= 0; final boolean noindex = url.getRelProperty().toLowerCase().indexOf("noindex",0) >= 0;
final boolean nofollow = url.getRelProperty().toLowerCase().indexOf("nofollow",0) >= 0; final boolean nofollow = url.getRelProperty().toLowerCase().indexOf("nofollow",0) >= 0;
if ((thishost == null && url.getHost() == null) || if ((thishost == null && url.getHost() == null) ||
@ -523,31 +534,24 @@ dc_rights
} else { } else {
this.outboundlinks.put(url, "anchor" + (noindex ? " noindex" : "") + (nofollow ? " nofollow" : "")); this.outboundlinks.put(url, "anchor" + (noindex ? " noindex" : "") + (nofollow ? " nofollow" : ""));
} }
u = url.toNormalform(true); extpos = u.lastIndexOf('.');
final String name = url.getNameProperty(); if (extpos > 0) {
if (u.startsWith("mailto:")) { if (((qpos = u.indexOf('?')) >= 0) && (qpos > extpos)) {
this.emaillinks.put(u.substring(7), name); ext = u.substring(extpos + 1, qpos).toLowerCase();
} else { } else {
extpos = u.lastIndexOf('.'); ext = u.substring(extpos + 1).toLowerCase();
if (extpos > 0) { }
if (((qpos = u.indexOf('?')) >= 0) && (qpos > extpos)) { if (Classification.isMediaExtension(ext)) {
ext = u.substring(extpos + 1, qpos).toLowerCase(); // this is not a normal anchor, its a media link
} else { if (Classification.isImageExtension(ext)) { // TODO: guess on a-tag href extension (may not be correct)
ext = u.substring(extpos + 1).toLowerCase(); collectedImages.put(url, new ImageEntry(url, name, -1, -1, -1));
} } else if (Classification.isAudioExtension(ext)) this.audiolinks.put(url, name);
if (Classification.isMediaExtension(ext)) { else if (Classification.isVideoExtension(ext)) this.videolinks.put(url, name);
// this is not a normal anchor, its a media link else if (Classification.isApplicationExtension(ext)) this.applinks.put(url, name);
if (Classification.isImageExtension(ext)) {
collectedImages.put(url, new ImageEntry(url, name, -1, -1, -1));
}
else if (Classification.isAudioExtension(ext)) this.audiolinks.put(url, name);
else if (Classification.isVideoExtension(ext)) this.videolinks.put(url, name);
else if (Classification.isApplicationExtension(ext)) this.applinks.put(url, name);
}
} }
// in any case we consider this as a link and let the parser decide if that link can be followed
this.hyperlinks.put(url, name);
} }
// in any case we consider this as a link and let the parser decide if that link can be followed
this.hyperlinks.put(url, name);
} }
// add image links that we collected from the anchors to the image map // add image links that we collected from the anchors to the image map

@ -144,6 +144,28 @@ public class MultiProtocolURLTest {
} }
} }
/**
 * Test getProtocol().
 *
 * Builds a MultiProtocolURL from each test url string and checks that
 * getProtocol() returns the expected protocol name. The expected values are
 * all lower case, while several of the input strings use upper or mixed case
 * for the scheme; one input ("/file.com") has no scheme at all and is
 * expected to yield "file".
 *
 * @throws MalformedURLException if constructing a MultiProtocolURL from one
 *         of the test url strings fails
 */
@Test
public void testGetProtocol() throws MalformedURLException {
    // key = url string to test, value = expected protocol
    final Map<String, String> testurls = new HashMap<String, String>();
    testurls.put("http://host.com", "http");
    testurls.put("HTTPS://host.com", "https");
    testurls.put("Ftp://host.com", "ftp");
    testurls.put("SMB://host.com", "smb");
    testurls.put("/file.com", "file");
    testurls.put("file://host.com/file.com", "file");
    testurls.put("MailTo:Abc@host.com", "mailto");

    for (final Map.Entry<String, String> testcase : testurls.entrySet()) {
        final String txt = testcase.getKey();
        final String expectedProtocol = testcase.getValue();
        final MultiProtocolURL url = new MultiProtocolURL(txt);
        // JUnit argument order is (message, expected, actual); keeping it
        // makes a failure report name the right value as "expected"
        assertEquals("test " + txt, expectedProtocol, url.getProtocol());
    }
}
/** /**
* Test of toNormalform method, of class MultiProtocolURL. * Test of toNormalform method, of class MultiProtocolURL.
*/ */

Loading…
Cancel
Save