diff --git a/defaults/web.xml b/defaults/web.xml
index 68e84d995..6c8b409bb 100644
--- a/defaults/web.xml
+++ b/defaults/web.xml
@@ -42,7 +42,11 @@
URLProxyServlet
+
+ net.yacy.http.servlets.UrlProxyServlet
@@ -61,6 +65,7 @@
URLProxyServlet
/proxy.html
+ /proxy
diff --git a/source/net/yacy/http/Jetty8HttpServerImpl.java b/source/net/yacy/http/Jetty8HttpServerImpl.java
index 56c42ef3d..8e9b3c00e 100644
--- a/source/net/yacy/http/Jetty8HttpServerImpl.java
+++ b/source/net/yacy/http/Jetty8HttpServerImpl.java
@@ -44,7 +44,6 @@ import net.yacy.http.servlets.GSAsearchServlet;
import net.yacy.http.servlets.SolrSelectServlet;
import net.yacy.http.servlets.SolrServlet;
import net.yacy.http.servlets.YaCyDefaultServlet;
-import net.yacy.http.servlets.YaCyProxyServlet;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
import net.yacy.utils.PKCS12Tool;
@@ -150,7 +149,7 @@ public class Jetty8HttpServerImpl implements YaCyHttpServer {
htrootContext.addServlet(SolrServlet.class, "/solr/webgraph/admin/luke");
// add proxy?url= servlet
- htrootContext.addServlet(YaCyProxyServlet.class,"/proxy.html");
+ //htrootContext.addServlet(YaCyProxyServlet.class,"/proxy.html");
// add GSA servlet
htrootContext.addServlet(GSAsearchServlet.class,"/gsa/search");
diff --git a/source/net/yacy/http/servlets/UrlProxyServlet.java b/source/net/yacy/http/servlets/UrlProxyServlet.java
new file mode 100644
index 000000000..bc509963d
--- /dev/null
+++ b/source/net/yacy/http/servlets/UrlProxyServlet.java
@@ -0,0 +1,352 @@
+package net.yacy.http.servlets;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.MalformedURLException;
+import java.net.URL;
+import java.net.URLDecoder;
+import java.util.HashMap;
+import java.util.StringTokenizer;
+import javax.servlet.Servlet;
+import javax.servlet.ServletConfig;
+import javax.servlet.ServletException;
+import javax.servlet.ServletRequest;
+import javax.servlet.ServletResponse;
+import javax.servlet.http.HttpServletRequest;
+import javax.servlet.http.HttpServletResponse;
+import net.yacy.cora.document.encoding.UTF8;
+import net.yacy.cora.document.id.DigestURL;
+import net.yacy.cora.protocol.ClientIdentification;
+import net.yacy.cora.protocol.Domains;
+import net.yacy.cora.protocol.HeaderFramework;
+import net.yacy.cora.protocol.RequestHeader;
+import net.yacy.cora.protocol.ResponseHeader;
+import net.yacy.cora.util.ConcurrentLog;
+import net.yacy.http.ProxyHandler;
+import net.yacy.kelondro.util.FileUtils;
+import net.yacy.search.Switchboard;
+import net.yacy.server.http.ChunkedInputStream;
+import net.yacy.server.http.HTTPDProxyHandler;
+import org.eclipse.jetty.continuation.Continuation;
+import org.eclipse.jetty.continuation.ContinuationSupport;
+import org.eclipse.jetty.http.HttpURI;
+import org.eclipse.jetty.servlets.ProxyServlet;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+
+/**
+ * Rewrite of the url-proxy servlet (YaCyProxyServlet "/proxy.html?url=xyz")
+ * using a different url rewrite method (JSoup instead of regex, for more flexibility).
+ * (The problem with regex was that it also modified http header tags, causing problems with some relative link urls
+ * and with included header tags.)
+ *
+ * Design goals of this urlproxy
+ * - option to handle links/urls the owner/user clicked on
+ * - index visited pages on the fly (without having to configure a permanent "transparent" proxy)
+ *
+ * To reach these goals, and in contrast to the "transparent" proxy, we don't want (or need) to route all content
+ * through the proxy (e.g. we are not interested in transporting css etc. but concentrate on searchable content).
+ *
+ * general functionality to implement
+ * 1 - check user access right
+ * 2 - get target url from parameter
+ * 3 - check target url acceptable
+ * 4 - get target url
+ * 5 - index target url
+ * 6 - perform any custom event/treatment (for/on this user clicked url) - not implemented
+ * 7 - modify loaded target content (like rewrite links to get proxied)
+ * 8 - optionally add augmentation / interaction - not implemented
+ * 9 - deliver to client browser
+ *
+ * The rewrite of links can't be perfect, as all kinds of scripting etc. can be involved;
+ * with jsoup only the attributes of the body are modified. This helps to display
+ * the page correctly, but also means that e.g. with forms and javascript menus links will not
+ * point to the original site (instead to the proxy url).
+ *
+ * TODO: instead of running JSoup on top (parsing twice - for indexing & for content rewrite), check the option of joining the parsing steps
+ *
+ * Hint: a browser favorite (bookmarklet) of
+ * javascript: window.location.href = ('http://localhost:9090/proxy.html?url=' + location.href);
+ * will start the urlproxy with the current browser address.
+ */
+public class UrlProxyServlet extends ProxyServlet implements Servlet {
+
+    /**
+     * Initializes the underlying proxy servlet and registers the additional
+     * request header that must not be forwarded to the target server.
+     *
+     * @param config the servlet configuration passed on to the super class
+     * @throws ServletException if the super class initialization fails
+     */
+    @Override
+    public void init(ServletConfig config) throws ServletException {
+        super.init(config);
+
+        // The Host header of the original servlet request refers to this server
+        // (localhost), so it is excluded from proxying. The name must be lower case
+        // because header names are internally converted to lower case.
+        final String hostHeaderName = "host";
+        _DontProxyHeaders.add(hostHeaderName);
+    }
+ /* ------------------------------------------------------------ */
+
+    /**
+     * Handles a url-proxy request. The steps are: check the access rights, read the
+     * target url from the "url=" query parameter, load the target via
+     * HTTPDProxyHandler (get &amp; index), rewrite the anchor links of successful
+     * text responses so that they point back to this servlet, and deliver the
+     * result to the client. Other responses are passed through unmodified.
+     *
+     * @param req the servlet request, cast to HttpServletRequest
+     * @param res the servlet response, cast to HttpServletResponse
+     * @throws ServletException as declared by the servlet contract
+     * @throws IOException on I/O errors while loading the target or writing the response
+     */
+    @Override
+    public void service(ServletRequest req, ServletResponse res) throws ServletException, IOException {
+
+        final HttpServletRequest request = (HttpServletRequest) req;
+        final HttpServletResponse response = (HttpServletResponse) res;
+
+        // 1 - check user access rights
+        if (!Switchboard.getSwitchboard().getConfigBool("proxyURL", false)) {
+            response.sendError(HttpServletResponse.SC_FORBIDDEN,"proxy use not allowed. URL proxy globally switched off (see: Content Semantic -> Augmented Browsing -> URL proxy)");
+            return;
+        }
+
+        final String remoteHost = req.getRemoteHost();
+        if (!Domains.isThisHostIP(remoteHost) && !proxyippatternmatch(remoteHost)) {
+            response.sendError(HttpServletResponse.SC_FORBIDDEN,
+                    "proxy use not granted for IP " + remoteHost + " (see: Content Semantic -> Augmented Browsing -> Restrict URL proxy use filter)");
+            return;
+        }
+
+        if ("CONNECT".equalsIgnoreCase(request.getMethod())) {
+            handleConnect(request, response);
+            return;
+        }
+
+        final Continuation continuation = ContinuationSupport.getContinuation(request);
+        if (!continuation.isInitial()) {
+            response.sendError(HttpServletResponse.SC_GATEWAY_TIMEOUT); // Need better test that isInitial
+            return;
+        }
+
+        // 2 - get target url
+        URL proxyurl = null;
+        final String strARGS = request.getQueryString();
+        if (strARGS == null) {
+            response.sendError(HttpServletResponse.SC_NOT_FOUND,"url parameter missing");
+            return;
+        }
+
+        if (strARGS.startsWith("url=")) {
+            final String strUrl = strARGS.substring(4); // strip "url="
+            try {
+                proxyurl = new URL(strUrl);
+            } catch (final MalformedURLException e) {
+                // the parameter may be url-encoded: retry with the decoded value, but
+                // answer with a client error instead of letting a second failure escape
+                proxyurl = decodeTargetUrl(strUrl);
+                if (proxyurl == null) {
+                    response.sendError(HttpServletResponse.SC_BAD_REQUEST,"url parameter malformed");
+                    return;
+                }
+            }
+        }
+        if (proxyurl == null) {
+            response.sendError(HttpServletResponse.SC_NOT_FOUND,"url parameter missing");
+            return;
+        }
+
+        String hostwithport = proxyurl.getHost();
+        if (proxyurl.getPort() != -1) {
+            hostwithport += ":" + proxyurl.getPort();
+        }
+
+        // 4 - get target url
+        final RequestHeader yacyRequestHeader = ProxyHandler.convertHeaderFromJetty(request);
+        yacyRequestHeader.remove(RequestHeader.KEEP_ALIVE);
+        yacyRequestHeader.remove(HeaderFramework.CONTENT_LENGTH);
+
+        final HashMap<String, Object> prop = new HashMap<String, Object>();
+        prop.put(HeaderFramework.CONNECTION_PROP_HTTP_VER, HeaderFramework.HTTP_VERSION_1_1);
+        prop.put(HeaderFramework.CONNECTION_PROP_HOST, hostwithport);
+        prop.put(HeaderFramework.CONNECTION_PROP_PATH, proxyurl.getPath().replaceAll(" ", "%20"));
+        if (proxyurl.getQuery() != null) prop.put(HeaderFramework.CONNECTION_PROP_ARGS, proxyurl.getQuery());
+        prop.put(HeaderFramework.CONNECTION_PROP_CLIENTIP, Domains.LOCALHOST);
+
+        yacyRequestHeader.put(HeaderFramework.HOST, hostwithport );
+        yacyRequestHeader.put(HeaderFramework.CONNECTION_PROP_PATH, proxyurl.getPath());
+
+        // 4 & 5 get & index target url
+        final ByteArrayOutputStream tmpproxyout = new ByteArrayOutputStream();
+        HTTPDProxyHandler.doGet(prop, yacyRequestHeader, tmpproxyout, ClientIdentification.yacyProxyAgent);
+
+        // reparse header to extract content-length and mimetype
+        final ResponseHeader proxyResponseHeader = new ResponseHeader(200);
+        InputStream proxyout = new ByteArrayInputStream(tmpproxyout.toByteArray());
+        String line = readLine(proxyout);
+        while (line != null && !line.equals("")) {
+            final int p = line.indexOf(':');
+            if (p >= 0) {
+                // store a property
+                proxyResponseHeader.put(line.substring(0, p).trim(), line.substring(p + 1).trim());
+            }
+            line = readLine(proxyout);
+        }
+        if (line == null) {
+            // the stream ended before the empty line that terminates the header
+            response.sendError(HttpServletResponse.SC_INTERNAL_SERVER_ERROR,"Proxy Header missing");
+            return;
+        }
+
+        if (proxyResponseHeader.containsKey(HeaderFramework.LOCATION)) {
+            // rewrite location header so that the redirect is followed through this servlet
+            String location = proxyResponseHeader.get(HeaderFramework.LOCATION);
+            if (!location.startsWith("http")) {
+                // relative redirect: resolve against the requested url so that protocol,
+                // directory and root-relative paths ("/x") are handled correctly
+                try {
+                    location = new URL(proxyurl, location).toString();
+                } catch (final MalformedURLException e) {
+                    location = "http://" + hostwithport + "/" + location;
+                }
+            }
+            response.addHeader(HeaderFramework.LOCATION, request.getServletPath() + "?url=" + location);
+        }
+
+        final int httpStatus = proxyResponseHeader.getStatusCode();
+        final String mimeType = proxyResponseHeader.getContentType();
+        response.setStatus(httpStatus);
+        response.setContentType(mimeType);
+
+        // "text/html" is covered by the "text" prefix check
+        if (httpStatus == 200 && mimeType != null && mimeType.startsWith("text")) {
+            if (proxyResponseHeader.containsKey(HeaderFramework.TRANSFER_ENCODING) && proxyResponseHeader.get(HeaderFramework.TRANSFER_ENCODING).contains("chunked")) {
+                proxyout = new ChunkedInputStream(proxyout);
+            }
+
+            // 7 - modify target content
+            final String servletstub = request.getScheme() + "://" + request.getServerName() + ":" + request.getServerPort() + request.getServletPath() + "?url=";
+            final Document doc;
+            try {
+                // NOTE(review): the charset is fixed to UTF-8 for parsing and for the delivered
+                // bytes, while the content type of the target is passed on unchanged - confirm
+                // this is intended for targets that declare a different charset
+                doc = Jsoup.parse(proxyout, "UTF-8", proxyurl.toString());
+            } catch (Exception eio) {
+                response.sendError(HttpServletResponse.SC_INTERNAL_SERVER_ERROR,"Proxy: parser error on " + proxyurl.toString());
+                return;
+            }
+
+            final Element hd = doc.head();
+            if (hd != null) {
+                // add a base url if not exist (to make sure relative links point to original)
+                final Elements basetags = hd.getElementsByTag("base");
+                if (basetags.isEmpty()) {
+                    final Element newbasetag = hd.prependElement("base");
+                    final String basestr = proxyurl.getProtocol() + "://" + hostwithport + proxyurl.getPath(); //+directory;
+                    newbasetag.attr("href", basestr);
+                }
+            }
+
+            // rewrite all anchor hrefs of the body with an absolute proxy url
+            // (must be absolute because of the base tag in the head)
+            final Element bde = doc.body();
+            if (bde != null) {
+                final Switchboard sb = Switchboard.getSwitchboard();
+                // the configuration does not depend on the link, so read it once
+                final boolean rewriteDomainlistOnly = sb.getConfig("proxyURL.rewriteURLs", "all").equals("domainlist");
+                for (final Element e : bde.getElementsByAttribute("href")) {
+                    if (!e.tagName().equals("a")) {
+                        continue; // only anchor tags are rewritten
+                    }
+                    final String absurl = e.absUrl("href"); // get href attribute as abs url
+                    // an empty result means the href could not be resolved to an absolute url;
+                    // such links stay untouched instead of pointing to the proxy without target
+                    if (absurl.isEmpty() || absurl.startsWith("data:") || absurl.startsWith("#") || absurl.startsWith("mailto:") || absurl.startsWith("javascript:")) {
+                        continue;
+                    }
+                    if (rewriteDomainlistOnly) {
+                        try {
+                            if (sb.crawlStacker.urlInAcceptedDomain(new DigestURL(absurl)) != null) {
+                                continue;
+                            }
+                        } catch (MalformedURLException ex) {
+                            ConcurrentLog.fine("PROXY", "ProxyServlet: malformed url for url-rewirte " + absurl);
+                            continue;
+                        }
+                    }
+                    e.attr("href", servletstub + absurl); // rewrite with abs proxy-url
+                }
+            }
+
+            // 8 - add interaction elements (e.g. proxy exit button to switch back to original url)
+            // TODO: use a template file for
+
+            // 9 - deliver to client
+            final byte[] sbb = UTF8.getBytes(doc.toString());
+
+            // add some proxy-headers to response header
+            final String[] passedHeaders = {
+                HeaderFramework.SERVER, HeaderFramework.DATE, HeaderFramework.LAST_MODIFIED, HeaderFramework.EXPIRES
+            };
+            for (final String headerName : passedHeaders) {
+                if (proxyResponseHeader.containsKey(headerName)) {
+                    response.setHeader(headerName, proxyResponseHeader.get(headerName));
+                }
+            }
+
+            // the content was modified, so the length is taken from the rewritten document
+            response.setIntHeader(HeaderFramework.CONTENT_LENGTH, sbb.length);
+            FileUtils.copy (sbb,response.getOutputStream());
+
+        } else {
+            // pass everything else through unmodified
+            if ((response.getHeader(HeaderFramework.CONTENT_LENGTH) == null) && prop.containsKey(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_SIZE)) {
+                response.setHeader(HeaderFramework.CONTENT_LENGTH, (String) prop.get(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_SIZE));
+            }
+            FileUtils.copy(proxyout, response.getOutputStream());
+        }
+    }
+
+    /**
+     * Parses a url-encoded target url.
+     *
+     * @param encodedUrl the raw value of the url parameter
+     * @return the decoded url, or null if the value cannot be decoded or is not a valid url
+     */
+    private static URL decodeTargetUrl(final String encodedUrl) {
+        try {
+            return new URL(URLDecoder.decode(encodedUrl, UTF8.charset.name()));
+        } catch (final IOException e) {
+            // MalformedURLException (not a url) or UnsupportedEncodingException
+            return null;
+        } catch (final IllegalArgumentException e) {
+            // illegal %-escape sequence in the parameter
+            return null;
+        }
+    }
+
+    /**
+     * Reads one CR-terminated line from the stream and consumes the byte that
+     * follows the CR (expected to be the LF).
+     *
+     * @param in the stream to read from
+     * @return the line content without the line terminator, decoded as UTF-8, or
+     *         null if the end of the stream is reached before the CR or directly after it
+     * @throws IOException if reading from the stream fails
+     */
+    private String readLine(final InputStream in) throws IOException {
+        final ByteArrayOutputStream lineBytes = new ByteArrayOutputStream();
+        int current = in.read();
+        while (current != '\r' && current != -1) {
+            lineBytes.write(current);
+            current = in.read();
+        }
+        // end of stream before the CR; or the byte after the CR (the \n) is missing.
+        // The second read only happens when a CR was found.
+        if (current == -1 || in.read() == -1) {
+            return null;
+        }
+        return lineBytes.toString("UTF-8");
+    }
+
+    /**
+     * Helper for the proxy IP config pattern check. Tests the given client address
+     * against the comma-separated pattern list of the "proxyURL.access" setting.
+     * A configuration value of "*" grants access to everybody.
+     *
+     * Each pattern is evaluated as a regular expression (so '.' matches any
+     * character). A pattern that is not a valid regular expression, like
+     * "*.example.org", is matched literally with '*' as wildcard for anything,
+     * instead of failing the whole request.
+     *
+     * @param key the client address to check
+     * @return true if the address matches one of the configured patterns
+     */
+    private boolean proxyippatternmatch(final String key) {
+        final String cfgippattern = Switchboard.getSwitchboard().getConfig("proxyURL.access", "*");
+        if (cfgippattern.equals("*")) {
+            return true;
+        }
+        if (key == null) {
+            return false; // nothing to match against the restricted list
+        }
+        final StringTokenizer st = new StringTokenizer(cfgippattern, ",");
+        while (st.hasMoreTokens()) {
+            // tolerate blanks around the separators ("127.0.0.1, 192.168.*")
+            final String pattern = st.nextToken().trim();
+            try {
+                if (key.matches(pattern)) {
+                    return true;
+                }
+            } catch (final java.util.regex.PatternSyntaxException e) {
+                // not a valid regular expression: quote the pattern and let only '*' match anything
+                final String wildcardRegex = java.util.regex.Pattern.quote(pattern).replace("*", "\\E.*\\Q");
+                if (key.matches(wildcardRegex)) {
+                    return true;
+                }
+            }
+        }
+        return false;
+    }
+
+    /**
+     * get destination url (from query parameter &url=http://....)
+     * override to prevent calculating destination url from request
+     *
+     * @param request the request whose query string carries the url parameter
+     * @param uri not used
+     * @return destination url from query parameter &url=_destinationurl_, or null
+     *         if the parameter is missing or not a valid url
+     * @throws MalformedURLException declared by the overridden method
+     */
+    @Override
+    protected HttpURI proxyHttpURI(HttpServletRequest request, String uri) throws MalformedURLException {
+        final String strARGS = request.getQueryString();
+        // a request without query string has no url parameter
+        if (strARGS == null || !strARGS.startsWith("url=")) {
+            return null;
+        }
+        final String strUrl = strARGS.substring(4); // strip url=
+        try {
+            final URL newurl = new URL(strUrl);
+            int port = newurl.getPort();
+            if (port < 1) {
+                port = newurl.getDefaultPort();
+            }
+            // NOTE(review): only the path is passed on, a query string of the
+            // destination url is not included - confirm this is intended
+            return proxyHttpURI(newurl.getProtocol(), newurl.getHost(), port, newurl.getPath());
+        } catch (final MalformedURLException e) {
+            ConcurrentLog.fine("PROXY", "url parameter malformed: " + strUrl);
+            return null;
+        }
+    }
+
+    /**
+     * @return a short, human readable description of this servlet
+     */
+    @Override
+    public String getServletInfo() {
+        final String servletInfo = "YaCy Proxy Servlet";
+        return servletInfo;
+    }
+
+}
diff --git a/source/net/yacy/http/servlets/YaCyProxyServlet.java b/source/net/yacy/http/servlets/YaCyProxyServlet.java
index 64542eeae..b1ee93a5a 100644
--- a/source/net/yacy/http/servlets/YaCyProxyServlet.java
+++ b/source/net/yacy/http/servlets/YaCyProxyServlet.java
@@ -143,14 +143,14 @@ public class YaCyProxyServlet extends ProxyServlet implements Servlet {
HTTPDProxyHandler.doGet(prop, yacyRequestHeader, tmpproxyout, ClientIdentification.yacyProxyAgent);
// reparse header to extract content-length and mimetype
- final ResponseHeader outgoingHeader = new ResponseHeader(200); //
+ final ResponseHeader proxyResponseHeader = new ResponseHeader(200); //
final InputStream proxyout = new ByteArrayInputStream(tmpproxyout.toByteArray());
String line = readLine(proxyout);
while (line != null && !line.equals("")) {
int p;
if ((p = line.indexOf(':')) >= 0) {
// store a property
- outgoingHeader.add(line.substring(0, p).trim(), line.substring(p + 1).trim());
+ proxyResponseHeader.add(line.substring(0, p).trim(), line.substring(p + 1).trim());
}
line = readLine(proxyout);
}
@@ -177,11 +177,14 @@ public class YaCyProxyServlet extends ProxyServlet implements Servlet {
response.addHeader(HeaderFramework.LOCATION, location);
}
- final String mimeType = outgoingHeader.getContentType();
+ final String mimeType = proxyResponseHeader.getContentType();
+ response.setContentType(mimeType);
+ response.setStatus(httpStatus);
+
if ((mimeType != null) && (mimeType.startsWith("text/html") || mimeType.startsWith("text"))) {
final StringWriter buffer = new StringWriter();
- if (outgoingHeader.containsKey(HeaderFramework.TRANSFER_ENCODING) && outgoingHeader.get(HeaderFramework.TRANSFER_ENCODING).contains("chunked")) {
+ if (proxyResponseHeader.containsKey(HeaderFramework.TRANSFER_ENCODING) && proxyResponseHeader.get(HeaderFramework.TRANSFER_ENCODING).contains("chunked")) {
FileUtils.copy(new ChunkedInputStream(proxyout), buffer, UTF8.charset);
} else {
FileUtils.copy(proxyout, buffer, UTF8.charset);
@@ -267,29 +270,27 @@ public class YaCyProxyServlet extends ProxyServlet implements Servlet {
byte[] sbb = UTF8.getBytes(result.toString());
// add some proxy-headers to response header
- response.setContentType(outgoingHeader.getContentType());
- if (outgoingHeader.containsKey(HeaderFramework.SERVER)) {
- response.addHeader(HeaderFramework.SERVER, outgoingHeader.get(HeaderFramework.SERVER));
+ response.setContentType(proxyResponseHeader.getContentType());
+ if (proxyResponseHeader.containsKey(HeaderFramework.SERVER)) {
+ response.addHeader(HeaderFramework.SERVER, proxyResponseHeader.get(HeaderFramework.SERVER));
}
- if (outgoingHeader.containsKey(HeaderFramework.DATE)) {
- response.addHeader(HeaderFramework.DATE, outgoingHeader.get(HeaderFramework.DATE));
+ if (proxyResponseHeader.containsKey(HeaderFramework.DATE)) {
+ response.addHeader(HeaderFramework.DATE, proxyResponseHeader.get(HeaderFramework.DATE));
}
- if (outgoingHeader.containsKey(HeaderFramework.LAST_MODIFIED)) {
- response.addHeader(HeaderFramework.LAST_MODIFIED, outgoingHeader.get(HeaderFramework.LAST_MODIFIED));
+ if (proxyResponseHeader.containsKey(HeaderFramework.LAST_MODIFIED)) {
+ response.addHeader(HeaderFramework.LAST_MODIFIED, proxyResponseHeader.get(HeaderFramework.LAST_MODIFIED));
}
- if (outgoingHeader.containsKey(HeaderFramework.EXPIRES)) {
- response.addHeader(HeaderFramework.EXPIRES, outgoingHeader.get(HeaderFramework.EXPIRES));
+ if (proxyResponseHeader.containsKey(HeaderFramework.EXPIRES)) {
+ response.addHeader(HeaderFramework.EXPIRES, proxyResponseHeader.get(HeaderFramework.EXPIRES));
}
- response.setStatus(httpStatus);
- response.addIntHeader(HeaderFramework.CONTENT_LENGTH, sbb.length);
+ response.setIntHeader(HeaderFramework.CONTENT_LENGTH, sbb.length);
response.getOutputStream().write(sbb);
} else {
if ((response.getHeader(HeaderFramework.CONTENT_LENGTH) == null) && prop.containsKey(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_SIZE)) {
response.addHeader(HeaderFramework.CONTENT_LENGTH, (String) prop.get(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_SIZE));
- }
- response.setStatus(httpStatus);
+ }
FileUtils.copy(proxyout, response.getOutputStream());
}
}
diff --git a/source/net/yacy/server/http/HTTPDProxyHandler.java b/source/net/yacy/server/http/HTTPDProxyHandler.java
index d71fefaaa..f1334f417 100644
--- a/source/net/yacy/server/http/HTTPDProxyHandler.java
+++ b/source/net/yacy/server/http/HTTPDProxyHandler.java
@@ -699,7 +699,6 @@ public final class HTTPDProxyHandler {
} finally {
try { respond.flush(); respond.close(); } catch (final Exception e) {}
}
- return;
}
/**