Improve detection of referenced links by taking both the http and https link protocols into account
+ correct query start detection of commit f89d4eb51d
pull/65/head
reger 9 years ago
parent aed44e31ca
commit 27163af0e1

@ -219,8 +219,8 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
if (q < 0) { // check for www.test.com?searchpart
q = url.indexOf("?", p + 3);
} else { // check that '/' was not in searchpart (example http://test.com?data=1/2/3)
if (url.lastIndexOf("?",q) >= 0) {
q = url.lastIndexOf("?",q);
if (url.lastIndexOf("?", q) >= 0) {
q = url.indexOf("?", p + 3);
}
}
int r;

@ -638,10 +638,13 @@ dc_rights
return v;
}
/**
* Finds all links that are embedded as a reference inside another URL.
*
* @param links either a Set of AnchorURL, Strings (with URLs) or htmlFilterImageEntries
* @return map with the contained URLs as keys and "ref" as the value
*/
private static Map<AnchorURL, String> allReflinks(final Collection<?> links) {
// links is either a Set of Strings (with urls) or
// htmlFilterImageEntries
// we find all links that are part of a reference inside a url
final Map<AnchorURL, String> v = new HashMap<AnchorURL, String>();
final Iterator<?> i = links.iterator();
Object o;
@ -663,7 +666,9 @@ dc_rights
continue loop;
}
u = url.toNormalform(true);
if ((pos = u.toLowerCase().indexOf("http://", 7)) > 0) {
// find start of a referenced http url
if ((pos = u.toLowerCase().indexOf("http://", 7)) > 0) { // 7 = skip the protocol part of the source url
i.remove();
u = u.substring(pos);
while ((pos = u.toLowerCase().indexOf("http://", 7)) > 0)
@ -673,16 +678,30 @@ dc_rights
v.put(url, "ref");
continue loop;
}
if ((pos = u.toLowerCase().indexOf("/www.", 7)) > 0) {
// find start of a referenced https url
if ((pos = u.toLowerCase().indexOf("https://", 7)) > 0) { // 7 = skip the protocol part of the source url
i.remove();
u = "http:/" + u.substring(pos);
while ((pos = u.toLowerCase().indexOf("/www.", 7)) > 0)
u = "http:/" + u.substring(pos);
u = u.substring(pos);
while ((pos = u.toLowerCase().indexOf("https://", 7)) > 0)
u = u.substring(pos);
url = new AnchorURL(u);
if (!(v.containsKey(url)))
v.put(url, "ref");
continue loop;
}
if ((pos = u.toLowerCase().indexOf("/www.", 11)) > 0) { // 11 = skip protocol part + www of source url "http://www."
i.remove();
u = url.getProtocol()+":/" + u.substring(pos);
while ((pos = u.toLowerCase().indexOf("/www.", 11)) > 0)
u = url.getProtocol()+":/" + u.substring(pos);
AnchorURL addurl = new AnchorURL(u);
if (!(v.containsKey(addurl)))
v.put(addurl, "ref");
continue loop;
}
} catch (final MalformedURLException e) {
}
return v;

Loading…
Cancel
Save