From 900dacbf972e15595d3118991ab033d6e573fcae Mon Sep 17 00:00:00 2001 From: f1ori Date: Wed, 1 Jun 2011 13:27:04 +0000 Subject: [PATCH] * improve link rewriting in proxy-url * only rewrites links, which are in current search domain git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7765 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- defaults/yacy.init | 2 + .../anomic/http/server/HTTPDFileHandler.java | 60 +++++++++++++------ 2 files changed, 45 insertions(+), 17 deletions(-) diff --git a/defaults/yacy.init b/defaults/yacy.init index 461596933..1562dad83 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -507,6 +507,8 @@ proxyYacyOnly=false # enable proxy via url (/proxy.html?url=http://yacy.net) proxyURL=false proxyURL.access=127.0.0.1,0:0:0:0:0:0:0:1 +# which urls to rewrite to /proxy.html?url=x (values: all, domainlist) +proxyURL.rewriteURLs=domainlist # From the 'IndexCreate' menu point you can also define a crawling start point. # The crawling works the same way as the prefetch, but it is possible to diff --git a/source/de/anomic/http/server/HTTPDFileHandler.java b/source/de/anomic/http/server/HTTPDFileHandler.java index 31a04a3f3..611aa47ef 100644 --- a/source/de/anomic/http/server/HTTPDFileHandler.java +++ b/source/de/anomic/http/server/HTTPDFileHandler.java @@ -81,6 +81,7 @@ import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; +import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.zip.GZIPOutputStream; @@ -1374,24 +1375,49 @@ public final class HTTPDFileHandler { } String sbuffer = buffer.toString(); - - // urls of form href="http://domain.com/path" - sbuffer = sbuffer.replaceAll("(href|src)=\"http([^\"]+)\"", "$1=\"/proxy.html?url=http$2\""); - sbuffer = sbuffer.replaceAll("(href|src)='http([^']+)'", "$1='/proxy.html?url=http$2'"); - sbuffer = sbuffer.replaceAll("url\\('http([^']+)'\\)", "url('/proxy.html?url=http$1')"); - sbuffer = 
sbuffer.replaceAll("url\\(http([^\\)]+)\\)'", "url(/proxy.html?url=http$1)"); - // urls of form href="/absolute/path/to/linked/page" - sbuffer = sbuffer.replaceAll("(href|src)=\"/([^:\"]+)\"", "$1=\"/proxy.html?url=http://"+proxyurl.getHost()+"/$2\""); - sbuffer = sbuffer.replaceAll("(href|src)='/([^:']+)'", "$1='/proxy.html?url=http://"+proxyurl.getHost()+"/$2'"); - sbuffer = sbuffer.replaceAll("url\\('/([^:']+)'\\)", "url('/proxy.html?url=http://"+proxyurl.getHost()+"/$1')"); - sbuffer = sbuffer.replaceAll("url\\(/([^:\\)]+)\\)", "url(/proxy.html?url=http://"+proxyurl.getHost()+"/$1)"); - // urls of form href="relative/path" - sbuffer = sbuffer.replaceAll("(href|src)=\"([^:\"]+)\"", "$1=\"/proxy.html?url=http://"+proxyurl.getHost()+directory+"/$2\""); - sbuffer = sbuffer.replaceAll("(href|src)='([^:']+)'", "$1='/proxy.html?url=http://"+proxyurl.getHost()+directory+"/$2'"); - sbuffer = sbuffer.replaceAll("url\\('([^:']+)'\\)", "url\\('/proxy.html?url=http://"+proxyurl.getHost()+directory+"/$1')"); - sbuffer = sbuffer.replaceAll("url\\(([^:\\)]+)\\)", "url\\(/proxy.html?url=http://"+proxyurl.getHost()+directory+"/$1)"); - byte[] sbb = UTF8.getBytes(sbuffer); + Pattern p = Pattern.compile("(href=\"|src=\")([^\"]+)|(href='|src=')([^']+)|(url\\(')([^']+)|(url\\(\")([^\"]+)|(url\\()([^\\)]+)"); + Matcher m = p.matcher(buffer.toString()); + StringBuffer result = new StringBuffer(); + while (m.find()) { + String init = null; + if(m.group(1) != null) init = m.group(1); + if(m.group(3) != null) init = m.group(3); + if(m.group(5) != null) init = m.group(5); + if(m.group(7) != null) init = m.group(7); + if(m.group(9) != null) init = m.group(9); + String url = null; + if(m.group(2) != null) url = m.group(2); + if(m.group(4) != null) url = m.group(4); + if(m.group(6) != null) url = m.group(6); + if(m.group(8) != null) url = m.group(8); + if(m.group(10) != null) url = m.group(10); + if (url.startsWith("data:") || url.startsWith("#")) { + m.appendReplacement(result, init + url); 
+ + } else if (url.startsWith("http")) { + // absolute url of form href="http://domain.com/path" + if (sb.getConfig("proxyURL.rewriteURLs", "all").equals("domainlist")) { + if (sb.crawlStacker.urlInAcceptedDomain(new DigestURI(url)) != null) { + continue; + } + } + + m.appendReplacement(result, init + "/proxy.html?url=" + url); + + } else if (url.startsWith("/")) { + // absolute path of form href="/absolute/path/to/linked/page" + m.appendReplacement(result, init + "/proxy.html?url=http://" + proxyurl.getHost() + url); + + } else { + // relative path of form href="relative/path" + m.appendReplacement(result, init + "/proxy.html?url=http://" + proxyurl.getHost() + directory + "/" + url); + + } + } + m.appendTail(result); + + byte[] sbb = UTF8.getBytes(result.toString()); if (outgoingHeader.containsKey(HeaderFramework.TRANSFER_ENCODING)) { HTTPDemon.sendRespondHeader(conProp, out, httpVersion, httpStatus, outgoingHeader);