|
|
@ -81,6 +81,7 @@ import java.util.Iterator;
|
|
|
|
import java.util.List;
|
|
|
|
import java.util.List;
|
|
|
|
import java.util.Map;
|
|
|
|
import java.util.Map;
|
|
|
|
import java.util.concurrent.ConcurrentHashMap;
|
|
|
|
import java.util.concurrent.ConcurrentHashMap;
|
|
|
|
|
|
|
|
import java.util.regex.Matcher;
|
|
|
|
import java.util.regex.Pattern;
|
|
|
|
import java.util.regex.Pattern;
|
|
|
|
import java.util.zip.GZIPOutputStream;
|
|
|
|
import java.util.zip.GZIPOutputStream;
|
|
|
|
|
|
|
|
|
|
|
@ -1375,23 +1376,48 @@ public final class HTTPDFileHandler {
|
|
|
|
|
|
|
|
|
|
|
|
String sbuffer = buffer.toString();
|
|
|
|
String sbuffer = buffer.toString();
|
|
|
|
|
|
|
|
|
|
|
|
// urls of form href="http://domain.com/path"
|
|
|
|
Pattern p = Pattern.compile("(href=\"|src=\")([^\"]+)|(href='|src=')([^']+)|(url\\(')([^']+)|(url\\(\")([^\"]+)|(url\\()([^\\)]+)");
|
|
|
|
sbuffer = sbuffer.replaceAll("(href|src)=\"http([^\"]+)\"", "$1=\"/proxy.html?url=http$2\"");
|
|
|
|
Matcher m = p.matcher(buffer.toString());
|
|
|
|
sbuffer = sbuffer.replaceAll("(href|src)='http([^']+)'", "$1='/proxy.html?url=http$2'");
|
|
|
|
StringBuffer result = new StringBuffer();
|
|
|
|
sbuffer = sbuffer.replaceAll("url\\('http([^']+)'\\)", "url('/proxy.html?url=http$1')");
|
|
|
|
while (m.find()) {
|
|
|
|
sbuffer = sbuffer.replaceAll("url\\(http([^\\)]+)\\)'", "url(/proxy.html?url=http$1)");
|
|
|
|
String init = null;
|
|
|
|
// urls of form href="/absolute/path/to/linked/page"
|
|
|
|
if(m.group(1) != null) init = m.group(1);
|
|
|
|
sbuffer = sbuffer.replaceAll("(href|src)=\"/([^:\"]+)\"", "$1=\"/proxy.html?url=http://"+proxyurl.getHost()+"/$2\"");
|
|
|
|
if(m.group(3) != null) init = m.group(3);
|
|
|
|
sbuffer = sbuffer.replaceAll("(href|src)='/([^:']+)'", "$1='/proxy.html?url=http://"+proxyurl.getHost()+"/$2'");
|
|
|
|
if(m.group(5) != null) init = m.group(5);
|
|
|
|
sbuffer = sbuffer.replaceAll("url\\('/([^:']+)'\\)", "url('/proxy.html?url=http://"+proxyurl.getHost()+"/$1')");
|
|
|
|
if(m.group(7) != null) init = m.group(7);
|
|
|
|
sbuffer = sbuffer.replaceAll("url\\(/([^:\\)]+)\\)", "url(/proxy.html?url=http://"+proxyurl.getHost()+"/$1)");
|
|
|
|
if(m.group(9) != null) init = m.group(9);
|
|
|
|
// urls of form href="relative/path"
|
|
|
|
String url = null;
|
|
|
|
sbuffer = sbuffer.replaceAll("(href|src)=\"([^:\"]+)\"", "$1=\"/proxy.html?url=http://"+proxyurl.getHost()+directory+"/$2\"");
|
|
|
|
if(m.group(2) != null) url = m.group(2);
|
|
|
|
sbuffer = sbuffer.replaceAll("(href|src)='([^:']+)'", "$1='/proxy.html?url=http://"+proxyurl.getHost()+directory+"/$2'");
|
|
|
|
if(m.group(4) != null) url = m.group(4);
|
|
|
|
sbuffer = sbuffer.replaceAll("url\\('([^:']+)'\\)", "url\\('/proxy.html?url=http://"+proxyurl.getHost()+directory+"/$1')");
|
|
|
|
if(m.group(6) != null) url = m.group(6);
|
|
|
|
sbuffer = sbuffer.replaceAll("url\\(([^:\\)]+)\\)", "url\\(/proxy.html?url=http://"+proxyurl.getHost()+directory+"/$1)");
|
|
|
|
if(m.group(8) != null) url = m.group(8);
|
|
|
|
|
|
|
|
if(m.group(10) != null) url = m.group(10);
|
|
|
|
byte[] sbb = UTF8.getBytes(sbuffer);
|
|
|
|
if (url.startsWith("data:") || url.startsWith("#")) {
|
|
|
|
|
|
|
|
m.appendReplacement(result, init + url);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
} else if (url.startsWith("http")) {
|
|
|
|
|
|
|
|
// absolute url of form href="http://domain.com/path"
|
|
|
|
|
|
|
|
if (sb.getConfig("proxyURL.rewriteURLs", "all").equals("domainlist")) {
|
|
|
|
|
|
|
|
if (sb.crawlStacker.urlInAcceptedDomain(new DigestURI(url)) != null) {
|
|
|
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
m.appendReplacement(result, init + "/proxy.html?url=" + url);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
} else if (url.startsWith("/")) {
|
|
|
|
|
|
|
|
// absolute path of form href="/absolute/path/to/linked/page"
|
|
|
|
|
|
|
|
m.appendReplacement(result, init + "/proxy.html?url=http://" + proxyurl.getHost() + url);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
} else {
|
|
|
|
|
|
|
|
// relative path of form href="relative/path"
|
|
|
|
|
|
|
|
m.appendReplacement(result, init + "/proxy.html?url=http://" + proxyurl.getHost() + directory + "/" + url);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
m.appendTail(result);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
byte[] sbb = UTF8.getBytes(result.toString());
|
|
|
|
|
|
|
|
|
|
|
|
if (outgoingHeader.containsKey(HeaderFramework.TRANSFER_ENCODING)) {
|
|
|
|
if (outgoingHeader.containsKey(HeaderFramework.TRANSFER_ENCODING)) {
|
|
|
|
HTTPDemon.sendRespondHeader(conProp, out, httpVersion, httpStatus, outgoingHeader);
|
|
|
|
HTTPDemon.sendRespondHeader(conProp, out, httpVersion, httpStatus, outgoingHeader);
|
|
|
|