* improve link rewriting in proxy-url

* only rewrite links that are in the current search domain

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7765 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
f1ori 14 years ago
parent 7fea51ecee
commit 900dacbf97

@ -507,6 +507,8 @@ proxyYacyOnly=false
# enable proxy via url (/proxy.html?url=http://yacy.net)
proxyURL=false
proxyURL.access=127.0.0.1,0:0:0:0:0:0:0:1
# which urls to rewrite to /proxy.html?url=x (values: all, domainlist)
proxyURL.rewriteURLs=domainlist
# From the 'IndexCreate' menu point you can also define a crawling start point.
# The crawling works the same way as the prefetch, but it is possible to

@ -81,6 +81,7 @@ import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPOutputStream;
@ -1374,24 +1375,49 @@ public final class HTTPDFileHandler {
}
String sbuffer = buffer.toString();
// urls of form href="http://domain.com/path"
sbuffer = sbuffer.replaceAll("(href|src)=\"http([^\"]+)\"", "$1=\"/proxy.html?url=http$2\"");
sbuffer = sbuffer.replaceAll("(href|src)='http([^']+)'", "$1='/proxy.html?url=http$2'");
sbuffer = sbuffer.replaceAll("url\\('http([^']+)'\\)", "url('/proxy.html?url=http$1')");
sbuffer = sbuffer.replaceAll("url\\(http([^\\)]+)\\)'", "url(/proxy.html?url=http$1)");
// urls of form href="/absolute/path/to/linked/page"
sbuffer = sbuffer.replaceAll("(href|src)=\"/([^:\"]+)\"", "$1=\"/proxy.html?url=http://"+proxyurl.getHost()+"/$2\"");
sbuffer = sbuffer.replaceAll("(href|src)='/([^:']+)'", "$1='/proxy.html?url=http://"+proxyurl.getHost()+"/$2'");
sbuffer = sbuffer.replaceAll("url\\('/([^:']+)'\\)", "url('/proxy.html?url=http://"+proxyurl.getHost()+"/$1')");
sbuffer = sbuffer.replaceAll("url\\(/([^:\\)]+)\\)", "url(/proxy.html?url=http://"+proxyurl.getHost()+"/$1)");
// urls of form href="relative/path"
sbuffer = sbuffer.replaceAll("(href|src)=\"([^:\"]+)\"", "$1=\"/proxy.html?url=http://"+proxyurl.getHost()+directory+"/$2\"");
sbuffer = sbuffer.replaceAll("(href|src)='([^:']+)'", "$1='/proxy.html?url=http://"+proxyurl.getHost()+directory+"/$2'");
sbuffer = sbuffer.replaceAll("url\\('([^:']+)'\\)", "url\\('/proxy.html?url=http://"+proxyurl.getHost()+directory+"/$1')");
sbuffer = sbuffer.replaceAll("url\\(([^:\\)]+)\\)", "url\\(/proxy.html?url=http://"+proxyurl.getHost()+directory+"/$1)");
byte[] sbb = UTF8.getBytes(sbuffer);
Pattern p = Pattern.compile("(href=\"|src=\")([^\"]+)|(href='|src=')([^']+)|(url\\(')([^']+)|(url\\(\")([^\"]+)|(url\\()([^\\)]+)");
Matcher m = p.matcher(buffer.toString());
StringBuffer result = new StringBuffer();
while (m.find()) {
String init = null;
if(m.group(1) != null) init = m.group(1);
if(m.group(3) != null) init = m.group(3);
if(m.group(5) != null) init = m.group(5);
if(m.group(7) != null) init = m.group(7);
if(m.group(9) != null) init = m.group(9);
String url = null;
if(m.group(2) != null) url = m.group(2);
if(m.group(4) != null) url = m.group(4);
if(m.group(6) != null) url = m.group(6);
if(m.group(8) != null) url = m.group(8);
if(m.group(10) != null) url = m.group(10);
if (url.startsWith("data:") || url.startsWith("#")) {
m.appendReplacement(result, init + url);
} else if (url.startsWith("http")) {
// absolute url of form href="http://domain.com/path"
if (sb.getConfig("proxyURL.rewriteURLs", "all").equals("domainlist")) {
if (sb.crawlStacker.urlInAcceptedDomain(new DigestURI(url)) != null) {
continue;
}
}
m.appendReplacement(result, init + "/proxy.html?url=" + url);
} else if (url.startsWith("/")) {
// absolute path of form href="/absolute/path/to/linked/page"
m.appendReplacement(result, init + "/proxy.html?url=http://" + proxyurl.getHost() + url);
} else {
// relative path of form href="relative/path"
m.appendReplacement(result, init + "/proxy.html?url=http://" + proxyurl.getHost() + directory + "/" + url);
}
}
m.appendTail(result);
byte[] sbb = UTF8.getBytes(result.toString());
if (outgoingHeader.containsKey(HeaderFramework.TRANSFER_ENCODING)) {
HTTPDemon.sendRespondHeader(conProp, out, httpVersion, httpStatus, outgoingHeader);

Loading…
Cancel
Save