* improve link rewriting in proxy-url

* only rewrite links that are in the current search domain

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7765 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
f1ori 14 years ago
parent 7fea51ecee
commit 900dacbf97

@ -507,6 +507,8 @@ proxyYacyOnly=false
# enable proxy via url (/proxy.html?url=http://yacy.net) # enable proxy via url (/proxy.html?url=http://yacy.net)
proxyURL=false proxyURL=false
proxyURL.access=127.0.0.1,0:0:0:0:0:0:0:1 proxyURL.access=127.0.0.1,0:0:0:0:0:0:0:1
# which urls to rewrite to /proxy.html?url=x (values: all, domainlist)
proxyURL.rewriteURLs=domainlist
# From the 'IndexCreate' menu point you can also define a crawling start point. # From the 'IndexCreate' menu point you can also define a crawling start point.
# The crawling works the same way as the prefetch, but it is possible to # The crawling works the same way as the prefetch, but it is possible to

@ -81,6 +81,7 @@ import java.util.Iterator;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import java.util.zip.GZIPOutputStream; import java.util.zip.GZIPOutputStream;
@ -1375,23 +1376,48 @@ public final class HTTPDFileHandler {
String sbuffer = buffer.toString(); String sbuffer = buffer.toString();
// urls of form href="http://domain.com/path" Pattern p = Pattern.compile("(href=\"|src=\")([^\"]+)|(href='|src=')([^']+)|(url\\(')([^']+)|(url\\(\")([^\"]+)|(url\\()([^\\)]+)");
sbuffer = sbuffer.replaceAll("(href|src)=\"http([^\"]+)\"", "$1=\"/proxy.html?url=http$2\""); Matcher m = p.matcher(buffer.toString());
sbuffer = sbuffer.replaceAll("(href|src)='http([^']+)'", "$1='/proxy.html?url=http$2'"); StringBuffer result = new StringBuffer();
sbuffer = sbuffer.replaceAll("url\\('http([^']+)'\\)", "url('/proxy.html?url=http$1')"); while (m.find()) {
sbuffer = sbuffer.replaceAll("url\\(http([^\\)]+)\\)'", "url(/proxy.html?url=http$1)"); String init = null;
// urls of form href="/absolute/path/to/linked/page" if(m.group(1) != null) init = m.group(1);
sbuffer = sbuffer.replaceAll("(href|src)=\"/([^:\"]+)\"", "$1=\"/proxy.html?url=http://"+proxyurl.getHost()+"/$2\""); if(m.group(3) != null) init = m.group(3);
sbuffer = sbuffer.replaceAll("(href|src)='/([^:']+)'", "$1='/proxy.html?url=http://"+proxyurl.getHost()+"/$2'"); if(m.group(5) != null) init = m.group(5);
sbuffer = sbuffer.replaceAll("url\\('/([^:']+)'\\)", "url('/proxy.html?url=http://"+proxyurl.getHost()+"/$1')"); if(m.group(7) != null) init = m.group(7);
sbuffer = sbuffer.replaceAll("url\\(/([^:\\)]+)\\)", "url(/proxy.html?url=http://"+proxyurl.getHost()+"/$1)"); if(m.group(9) != null) init = m.group(9);
// urls of form href="relative/path" String url = null;
sbuffer = sbuffer.replaceAll("(href|src)=\"([^:\"]+)\"", "$1=\"/proxy.html?url=http://"+proxyurl.getHost()+directory+"/$2\""); if(m.group(2) != null) url = m.group(2);
sbuffer = sbuffer.replaceAll("(href|src)='([^:']+)'", "$1='/proxy.html?url=http://"+proxyurl.getHost()+directory+"/$2'"); if(m.group(4) != null) url = m.group(4);
sbuffer = sbuffer.replaceAll("url\\('([^:']+)'\\)", "url\\('/proxy.html?url=http://"+proxyurl.getHost()+directory+"/$1')"); if(m.group(6) != null) url = m.group(6);
sbuffer = sbuffer.replaceAll("url\\(([^:\\)]+)\\)", "url\\(/proxy.html?url=http://"+proxyurl.getHost()+directory+"/$1)"); if(m.group(8) != null) url = m.group(8);
if(m.group(10) != null) url = m.group(10);
byte[] sbb = UTF8.getBytes(sbuffer); if (url.startsWith("data:") || url.startsWith("#")) {
m.appendReplacement(result, init + url);
} else if (url.startsWith("http")) {
// absolute url of form href="http://domain.com/path"
if (sb.getConfig("proxyURL.rewriteURLs", "all").equals("domainlist")) {
if (sb.crawlStacker.urlInAcceptedDomain(new DigestURI(url)) != null) {
continue;
}
}
m.appendReplacement(result, init + "/proxy.html?url=" + url);
} else if (url.startsWith("/")) {
// absolute path of form href="/absolute/path/to/linked/page"
m.appendReplacement(result, init + "/proxy.html?url=http://" + proxyurl.getHost() + url);
} else {
// relative path of form href="relative/path"
m.appendReplacement(result, init + "/proxy.html?url=http://" + proxyurl.getHost() + directory + "/" + url);
}
}
m.appendTail(result);
byte[] sbb = UTF8.getBytes(result.toString());
if (outgoingHeader.containsKey(HeaderFramework.TRANSFER_ENCODING)) { if (outgoingHeader.containsKey(HeaderFramework.TRANSFER_ENCODING)) {
HTTPDemon.sendRespondHeader(conProp, out, httpVersion, httpStatus, outgoingHeader); HTTPDemon.sendRespondHeader(conProp, out, httpVersion, httpStatus, outgoingHeader);

Loading…
Cancel
Save