From b12200cafe77e03b5e209ca64013cb7fdc428122 Mon Sep 17 00:00:00 2001 From: reger Date: Sun, 30 Mar 2014 04:04:02 +0200 Subject: [PATCH] alternative UrlProxyServlet (for /proxy.html) using different url rewrite rules - use JSoup parser for selective rewrite of html body tag - tags with src attribute are taken from original location (like css) improving display and are not routed trough the indexer Disadvantage: scripting links will drop out of proxy Setting of the servlet through web.xml exclusivly (in case one would like to quickly switch back to the YaCyProxyServlet, leaving the existing code of YaCyProxyServlet untouched available) --- defaults/web.xml | 5 + .../net/yacy/http/Jetty8HttpServerImpl.java | 3 +- .../yacy/http/servlets/UrlProxyServlet.java | 352 ++++++++++++++++++ .../yacy/http/servlets/YaCyProxyServlet.java | 35 +- .../yacy/server/http/HTTPDProxyHandler.java | 1 - 5 files changed, 376 insertions(+), 20 deletions(-) create mode 100644 source/net/yacy/http/servlets/UrlProxyServlet.java diff --git a/defaults/web.xml b/defaults/web.xml index 68e84d995..6c8b409bb 100644 --- a/defaults/web.xml +++ b/defaults/web.xml @@ -42,7 +42,11 @@ URLProxyServlet + + net.yacy.http.servlets.UrlProxyServlet @@ -61,6 +65,7 @@ URLProxyServlet /proxy.html + /proxy diff --git a/source/net/yacy/http/Jetty8HttpServerImpl.java b/source/net/yacy/http/Jetty8HttpServerImpl.java index 56c42ef3d..8e9b3c00e 100644 --- a/source/net/yacy/http/Jetty8HttpServerImpl.java +++ b/source/net/yacy/http/Jetty8HttpServerImpl.java @@ -44,7 +44,6 @@ import net.yacy.http.servlets.GSAsearchServlet; import net.yacy.http.servlets.SolrSelectServlet; import net.yacy.http.servlets.SolrServlet; import net.yacy.http.servlets.YaCyDefaultServlet; -import net.yacy.http.servlets.YaCyProxyServlet; import net.yacy.search.Switchboard; import net.yacy.search.SwitchboardConstants; import net.yacy.utils.PKCS12Tool; @@ -150,7 +149,7 @@ public class Jetty8HttpServerImpl implements YaCyHttpServer { 
htrootContext.addServlet(SolrServlet.class, "/solr/webgraph/admin/luke"); // add proxy?url= servlet - htrootContext.addServlet(YaCyProxyServlet.class,"/proxy.html"); + //htrootContext.addServlet(YaCyProxyServlet.class,"/proxy.html"); // add GSA servlet htrootContext.addServlet(GSAsearchServlet.class,"/gsa/search"); diff --git a/source/net/yacy/http/servlets/UrlProxyServlet.java b/source/net/yacy/http/servlets/UrlProxyServlet.java new file mode 100644 index 000000000..bc509963d --- /dev/null +++ b/source/net/yacy/http/servlets/UrlProxyServlet.java @@ -0,0 +1,352 @@ +package net.yacy.http.servlets; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.net.MalformedURLException; +import java.net.URL; +import java.net.URLDecoder; +import java.util.HashMap; +import java.util.StringTokenizer; +import javax.servlet.Servlet; +import javax.servlet.ServletConfig; +import javax.servlet.ServletException; +import javax.servlet.ServletRequest; +import javax.servlet.ServletResponse; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; +import net.yacy.cora.document.encoding.UTF8; +import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.protocol.ClientIdentification; +import net.yacy.cora.protocol.Domains; +import net.yacy.cora.protocol.HeaderFramework; +import net.yacy.cora.protocol.RequestHeader; +import net.yacy.cora.protocol.ResponseHeader; +import net.yacy.cora.util.ConcurrentLog; +import net.yacy.http.ProxyHandler; +import net.yacy.kelondro.util.FileUtils; +import net.yacy.search.Switchboard; +import net.yacy.server.http.ChunkedInputStream; +import net.yacy.server.http.HTTPDProxyHandler; +import org.eclipse.jetty.continuation.Continuation; +import org.eclipse.jetty.continuation.ContinuationSupport; +import org.eclipse.jetty.http.HttpURI; +import org.eclipse.jetty.servlets.ProxyServlet; +import org.jsoup.Jsoup; +import 
org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.select.Elements; + +/** + * Rewrite of the url-proxy servlet (YaCyProxyServlet "/proxy.html?url=xyz") + * using a different url rewrite method (using JSoup instead of regex for more flexibility) + * (the problem with regex was that it also modified http header tags, causing problems with some relative link urls + * and with included header tags) + * + * Design goals of this urlproxy + * - option to handle links/urls the owner/user clicked on + * - index visited pages on the fly (without having to configure a permanent "transparent" proxy) + * + * To reach this goal, and in contrast to the "transparent" proxy, we don't want (or need) to route all content + * through the proxy (e.g. we are not interested in transporting css etc. but concentrate on searchable content). + * + * general functionality to implement + * 1 - check user access right + * 2 - get target url from parameter + * 3 - check target url is acceptable + * 4 - get target url + * 5 - index target url + * 6 - perform any custom event/treatment (for/on this user clicked url) - not implemented + * 7 - modify loaded target content (like rewrite links to get proxied) + * 8 - optionally add augmentation / interaction - not implemented + * 9 - deliver to client browser + * + * The rewrite of links can't be perfect, as all kinds of scripting etc. can be involved; + * with jsoup only the attributes of the body are modified. This helps to display + * the page correctly, but it also has the result that e.g. with forms and javascript menus links will not + * point to the original site (instead to the proxy url) + * + * TODO: instead of using JSoup on top (parsing twice - for indexing & content rewrite), check the option of joining the parsing steps + * + * Hint: a browser favorite of + * javascript: window.location.href = ('http://localhost:9090/proxy.html?url=' + location.href); + * will start the urlproxy with the current browser address. 
+ */
+public class UrlProxyServlet extends ProxyServlet implements Servlet {
+
+    /**
+     * Initializes the underlying ProxyServlet and adds "host" to the headers that are not proxied.
+     *
+     * @param config servlet configuration handed in by the container
+     * @throws ServletException if the superclass initialization fails
+     */
+    @Override
+    public void init(ServletConfig config) throws ServletException {
+        super.init(config);
+
+        // must be lower case (header names are internally converted to lower)
+        _DontProxyHeaders.add("host"); // to prevent Host header setting from original servletrequest (which is localhost)
+
+    }
+    /* ------------------------------------------------------------ */
+
+    /**
+     * Handles a url-proxy request: checks the access rights, loads the target given in the
+     * "url=" query parameter through HTTPDProxyHandler, rewrites the anchor links of html
+     * pages so that they point back to this servlet and delivers the result to the client.
+     * Non-html content is passed through unmodified.
+     *
+     * @param req servlet request, cast to HttpServletRequest
+     * @param res servlet response, cast to HttpServletResponse
+     * @throws ServletException declared by the overridden method
+     * @throws IOException on read/write errors
+     */
+    @Override
+    public void service (ServletRequest req, ServletResponse res) throws ServletException, IOException {
+
+        final HttpServletRequest request = (HttpServletRequest) req;
+        final HttpServletResponse response = (HttpServletResponse) res;
+
+        // 1 - check user access rights
+        if (!Switchboard.getSwitchboard().getConfigBool("proxyURL", false)) {
+            response.sendError(HttpServletResponse.SC_FORBIDDEN,"proxy use not allowed. URL proxy globally switched off (see: Content Semantic -> Augmented Browsing -> URL proxy)");
+            return;
+        }
+
+        final String remoteHost = req.getRemoteHost();
+        if (!Domains.isThisHostIP(remoteHost)) {
+            if (!proxyippatternmatch(remoteHost)) {
+                response.sendError(HttpServletResponse.SC_FORBIDDEN,
+                        "proxy use not granted for IP " + remoteHost + " (see: Content Semantic -> Augmented Browsing -> Restrict URL proxy use filter)");
+                return;
+            }
+        }
+
+        if ("CONNECT".equalsIgnoreCase(request.getMethod())) {
+            handleConnect(request, response);
+        } else {
+
+            final Continuation continuation = ContinuationSupport.getContinuation(request);
+
+            if (!continuation.isInitial()) {
+                response.sendError(HttpServletResponse.SC_GATEWAY_TIMEOUT); // Need better test that isInitial
+                return;
+            }
+            // 2 - get target url
+            URL proxyurl = null;
+            String strARGS = request.getQueryString();
+            if (strARGS == null) {
+                response.sendError(HttpServletResponse.SC_NOT_FOUND,"url parameter missing");
+                return;
+            }
+
+            if (strARGS.startsWith("url=")) {
+                final String strUrl = strARGS.substring(4); // strip "url="
+
+                try {
+                    proxyurl = new URL(strUrl);
+                } catch (final MalformedURLException e) {
+                    // the parameter may be url-encoded: retry with the decoded value and
+                    // answer with a client error instead of letting the exception escape
+                    try {
+                        proxyurl = new URL(URLDecoder.decode(strUrl, UTF8.charset.name()));
+                    } catch (final MalformedURLException e2) {
+                        response.sendError(HttpServletResponse.SC_BAD_REQUEST,"url parameter malformed");
+                        return;
+                    } catch (final IllegalArgumentException e2) { // incomplete %-escape in the parameter
+                        response.sendError(HttpServletResponse.SC_BAD_REQUEST,"url parameter malformed");
+                        return;
+                    }
+                }
+            }
+            if (proxyurl == null) {
+                response.sendError(HttpServletResponse.SC_NOT_FOUND,"url parameter missing");
+                return;
+            }
+
+            String hostwithport = proxyurl.getHost();
+            if (proxyurl.getPort() != -1) {
+                hostwithport += ":" + proxyurl.getPort();
+            }
+            // 4 - get target url
+            RequestHeader yacyRequestHeader = ProxyHandler.convertHeaderFromJetty(request);
+            yacyRequestHeader.remove(RequestHeader.KEEP_ALIVE);
+            yacyRequestHeader.remove(HeaderFramework.CONTENT_LENGTH);
+
+            final HashMap<String, Object> prop = new HashMap<String, Object>();
+            prop.put(HeaderFramework.CONNECTION_PROP_HTTP_VER, HeaderFramework.HTTP_VERSION_1_1);
+            prop.put(HeaderFramework.CONNECTION_PROP_HOST, hostwithport);
+            prop.put(HeaderFramework.CONNECTION_PROP_PATH, proxyurl.getPath().replaceAll(" ", "%20"));
+            if (proxyurl.getQuery() != null) prop.put(HeaderFramework.CONNECTION_PROP_ARGS, proxyurl.getQuery());
+            prop.put(HeaderFramework.CONNECTION_PROP_CLIENTIP, Domains.LOCALHOST);
+
+            yacyRequestHeader.put(HeaderFramework.HOST, hostwithport );
+            yacyRequestHeader.put(HeaderFramework.CONNECTION_PROP_PATH, proxyurl.getPath());
+
+            // 4 & 5 get & index target url
+            final ByteArrayOutputStream tmpproxyout = new ByteArrayOutputStream();
+            HTTPDProxyHandler.doGet(prop, yacyRequestHeader, tmpproxyout, ClientIdentification.yacyProxyAgent);
+
+            // reparse header to extract content-length and mimetype
+            final ResponseHeader proxyResponseHeader = new ResponseHeader(200); //
+            InputStream proxyout = new ByteArrayInputStream(tmpproxyout.toByteArray());
+            String line = readLine(proxyout);
+            while (line != null && !line.equals("")) {
+                int p;
+                if ((p = line.indexOf(':')) >= 0) {
+                    // store a property
+                    proxyResponseHeader.put(line.substring(0, p).trim(), line.substring(p + 1).trim());
+                }
+                line = readLine(proxyout);
+            }
+            if (line == null) {
+                response.sendError(HttpServletResponse.SC_INTERNAL_SERVER_ERROR,"Proxy Header missing");
+                return;
+            }
+
+            if (proxyResponseHeader.containsKey(HeaderFramework.LOCATION)) {
+                // rewrite location header
+                String location = proxyResponseHeader.get(HeaderFramework.LOCATION);
+                if (!location.startsWith("http")) {
+                    // resolve a relative redirect against the requested url: keeps the scheme of
+                    // the target and avoids a doubled "/" for locations starting with "/"
+                    try {
+                        location = new URL(proxyurl, location).toString();
+                    } catch (final MalformedURLException e) {
+                        location = "http://" + hostwithport + "/" + location; // former behaviour as fallback
+                    }
+                }
+                location = request.getServletPath() + "?url=" + location;
+                response.addHeader(HeaderFramework.LOCATION, location);
+            }
+
+            // NOTE(review): a status line without ':' is skipped by the header loop above, so the
+            // status presumably stays at the 200 given to the constructor - confirm against ResponseHeader
+            final int httpStatus = proxyResponseHeader.getStatusCode();
+            final String mimeType = proxyResponseHeader.getContentType();
+            response.setStatus(httpStatus);
+            response.setContentType(mimeType);
+
+            // the Transfer-Encoding header is not forwarded to the client, so the chunk framing
+            // has to be removed for rewritten and for passed-through content alike
+            if (proxyResponseHeader.containsKey(HeaderFramework.TRANSFER_ENCODING) && proxyResponseHeader.get(HeaderFramework.TRANSFER_ENCODING).contains("chunked")) {
+                proxyout = new ChunkedInputStream(proxyout);
+            }
+
+            // only html is rewritten: sending other text types (plain text, css, xml) through
+            // the html parser would wrap them in html markup and corrupt them
+            if ((httpStatus == 200) && (mimeType != null) && mimeType.startsWith("text/html")) {
+
+                // 7 - modify target content
+                final String servletstub = request.getScheme() + "://" + request.getServerName() + ":" + request.getServerPort() + request.getServletPath() + "?url=";
+                Document doc;
+                try {
+                    // NOTE(review): the charset is fixed to UTF-8 here and for the delivered bytes,
+                    // while the content type of the origin is passed on unchanged - confirm
+                    doc = Jsoup.parse(proxyout, "UTF-8", proxyurl.toString());
+                } catch (Exception eio) {
+                    response.sendError(HttpServletResponse.SC_INTERNAL_SERVER_ERROR,"Proxy: parser error on " + proxyurl.toString());
+                    return;
+                }
+                Element hd = doc.head();
+                if (hd != null) {
+                    // add a base url if not exist (to make sure relative links point to original)
+                    Elements basetags = hd.getElementsByTag("base");
+                    if (basetags.isEmpty()) {
+                        Element newbasetag = hd.prependElement("base");
+                        String basestr = proxyurl.getProtocol() + "://" + hostwithport + proxyurl.getPath(); //+directory;
+                        newbasetag.attr("href", basestr);
+                    }
+                }
+
+                // rewrite all anchor href with abs proxy url (must be abs because of the base tag in head)
+                final Element bde = doc.body();
+                if (bde != null) { // guarded like doc.head() above
+                    rewriteAnchorLinks(bde, servletstub);
+                }
+
+                // 8 - add interaction elements (e.g. proxy exit button to switch back to original url)
+                // TODO: use a template file for
+                //de.prepend("");
+
+                // 9 - deliver to client
+                byte[] sbb = UTF8.getBytes(doc.toString());
+
+                // add some proxy-headers to response header
+                if (proxyResponseHeader.containsKey(HeaderFramework.SERVER)) {
+                    response.setHeader(HeaderFramework.SERVER, proxyResponseHeader.get(HeaderFramework.SERVER));
+                }
+                if (proxyResponseHeader.containsKey(HeaderFramework.DATE)) {
+                    response.setHeader(HeaderFramework.DATE, proxyResponseHeader.get(HeaderFramework.DATE));
+                }
+                if (proxyResponseHeader.containsKey(HeaderFramework.LAST_MODIFIED)) {
+                    response.setHeader(HeaderFramework.LAST_MODIFIED, proxyResponseHeader.get(HeaderFramework.LAST_MODIFIED));
+                }
+                if (proxyResponseHeader.containsKey(HeaderFramework.EXPIRES)) {
+                    response.setHeader(HeaderFramework.EXPIRES, proxyResponseHeader.get(HeaderFramework.EXPIRES));
+                }
+
+                response.setIntHeader(HeaderFramework.CONTENT_LENGTH, sbb.length);
+                FileUtils.copy (sbb,response.getOutputStream());
+
+            } else {
+                if ((response.getHeader(HeaderFramework.CONTENT_LENGTH) == null) && prop.containsKey(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_SIZE)) {
+                    response.setHeader(HeaderFramework.CONTENT_LENGTH, (String) prop.get(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_SIZE));
+                }
+                FileUtils.copy(proxyout, response.getOutputStream());
+            }
+        }
+    }
+
+    /**
+     * Rewrites the href attribute of all "a" elements below the given body element to
+     * servletstub + absolute target url, so that a click is routed through this servlet.
+     * data:, mailto:, javascript: and fragment-only targets as well as links without a
+     * resolvable absolute url are left untouched.
+     *
+     * @param body the body element of the parsed page
+     * @param servletstub absolute url of this servlet ending with "?url="
+     */
+    private void rewriteAnchorLinks(final Element body, final String servletstub) {
+        final Switchboard sb = Switchboard.getSwitchboard();
+        // read once, the setting does not change while the page is processed
+        final boolean domainlistonly = sb.getConfig("proxyURL.rewriteURLs", "all").equals("domainlist");
+        for (final Element e : body.getElementsByAttribute("href")) {
+            if (!e.tagName().equals("a")) {
+                continue;
+            }
+            final String absurl = e.absUrl("href"); // get href attribute as abs url
+            if (absurl.isEmpty()) {
+                continue; // no absolute url could be built, keep the original attribute
+            }
+            if (absurl.startsWith("data:") || absurl.startsWith("#") || absurl.startsWith("mailto:") || absurl.startsWith("javascript:")) {
+                continue;
+            }
+            if (domainlistonly) {
+                try {
+                    // presumably a non-null result means the url is not in an accepted domain - confirm
+                    if (sb.crawlStacker.urlInAcceptedDomain(new DigestURL(absurl)) != null) {
+                        continue;
+                    }
+                } catch (MalformedURLException ex) {
+                    ConcurrentLog.fine("PROXY", "ProxyServlet: malformed url for url-rewirte " + absurl);
+                    continue;
+                }
+            }
+            e.attr("href", servletstub + absurl); // rewrite with abs proxy-url
+        }
+    }
+
+    /**
+     * Reads one line from the stream. The line ends at the first '\r'; the byte following
+     * it (expected to be '\n') is consumed as well.
+     *
+     * @param in stream positioned at the start of a line
+     * @return the line without line terminator (decoded as UTF-8), or null if the stream
+     *         ended before a complete line terminator was read
+     * @throws IOException on read errors
+     */
+    private String readLine(final InputStream in) throws IOException {
+        final ByteArrayOutputStream buf = new ByteArrayOutputStream();
+        int b;
+        while ((b = in.read()) != '\r' && b != -1) {
+            buf.write(b);
+        }
+        if (b == -1) {
+            return null;
+        }
+        b = in.read(); // read \n
+        if (b == -1) {
+            return null;
+        }
+        return buf.toString("UTF-8");
+    }
+
+    /**
+     * helper for proxy IP config pattern check
+     *
+     * @param key the remote host to test
+     * @return true if the configured "proxyURL.access" value is "*" or one of its
+     *         comma-separated patterns matches the key
+     */
+    private boolean proxyippatternmatch(final String key) {
+        // the cfgippattern is a comma-separated list of patterns
+        // each pattern may contain one wildcard-character '*' which matches anything
+        // (the patterns are evaluated as regular expressions by String.matches)
+        final String cfgippattern = Switchboard.getSwitchboard().getConfig("proxyURL.access", "*");
+        if (cfgippattern.equals("*")) {
+            return true;
+        }
+        final StringTokenizer st = new StringTokenizer(cfgippattern, ",");
+        while (st.hasMoreTokens()) {
+            final String pattern = st.nextToken();
+            try {
+                if (key.matches(pattern)) {
+                    return true;
+                }
+            } catch (final java.util.regex.PatternSyntaxException e) {
+                // a pattern that is no valid expression (e.g. "*.example") must not fail the whole request
+                ConcurrentLog.fine("PROXY", "invalid pattern in proxyURL.access: " + pattern);
+            }
+        }
+        return false;
+    }
+
+    /**
+     * get destination url (from query parameter &url=http://....)
+     * override to prevent calculating destination url from request
+     *
+     * @param request
+     * @param uri not used
+     * @return destination url from query parameter &url=_destinationurl_,
+     *         or null if the parameter is missing or not a valid url
+     * @throws MalformedURLException
+     */
+    @Override
+    protected HttpURI proxyHttpURI(HttpServletRequest request, String uri) throws MalformedURLException {
+        final String strARGS = request.getQueryString();
+        if (strARGS == null || !strARGS.startsWith("url=")) { // query string is null for requests without parameters
+            return null;
+        }
+        final String strUrl = strARGS.substring(4); // strip url=
+
+        try {
+            URL newurl = new URL(strUrl);
+            int port = newurl.getPort();
+            if (port < 1) {
+                port = newurl.getDefaultPort();
+            }
+            return proxyHttpURI(newurl.getProtocol(), newurl.getHost(), port, newurl.getPath());
+        } catch (final MalformedURLException e) {
+            ConcurrentLog.fine("PROXY", "url parameter malformed: " + strUrl);
+        }
+        return null;
+    }
+
+    @Override
+    public String getServletInfo() {
+        return "YaCy Proxy Servlet";
+    }
+
+}
diff --git a/source/net/yacy/http/servlets/YaCyProxyServlet.java b/source/net/yacy/http/servlets/YaCyProxyServlet.java index 64542eeae..b1ee93a5a 100644 --- a/source/net/yacy/http/servlets/YaCyProxyServlet.java +++ b/source/net/yacy/http/servlets/YaCyProxyServlet.java @@ -143,14 +143,14 @@ public class YaCyProxyServlet extends ProxyServlet implements Servlet { HTTPDProxyHandler.doGet(prop, yacyRequestHeader, tmpproxyout, ClientIdentification.yacyProxyAgent); // reparse header to extract content-length and mimetype - final ResponseHeader outgoingHeader = new ResponseHeader(200); // + final ResponseHeader proxyResponseHeader = new ResponseHeader(200); // final InputStream proxyout = new ByteArrayInputStream(tmpproxyout.toByteArray()); String
line = readLine(proxyout); while (line != null && !line.equals("")) { int p; if ((p = line.indexOf(':')) >= 0) { // store a property - outgoingHeader.add(line.substring(0, p).trim(), line.substring(p + 1).trim()); + proxyResponseHeader.add(line.substring(0, p).trim(), line.substring(p + 1).trim()); } line = readLine(proxyout); } @@ -177,11 +177,14 @@ public class YaCyProxyServlet extends ProxyServlet implements Servlet { response.addHeader(HeaderFramework.LOCATION, location); } - final String mimeType = outgoingHeader.getContentType(); + final String mimeType = proxyResponseHeader.getContentType(); + response.setContentType(mimeType); + response.setStatus(httpStatus); + if ((mimeType != null) && (mimeType.startsWith("text/html") || mimeType.startsWith("text"))) { final StringWriter buffer = new StringWriter(); - if (outgoingHeader.containsKey(HeaderFramework.TRANSFER_ENCODING) && outgoingHeader.get(HeaderFramework.TRANSFER_ENCODING).contains("chunked")) { + if (proxyResponseHeader.containsKey(HeaderFramework.TRANSFER_ENCODING) && proxyResponseHeader.get(HeaderFramework.TRANSFER_ENCODING).contains("chunked")) { FileUtils.copy(new ChunkedInputStream(proxyout), buffer, UTF8.charset); } else { FileUtils.copy(proxyout, buffer, UTF8.charset); @@ -267,29 +270,27 @@ public class YaCyProxyServlet extends ProxyServlet implements Servlet { byte[] sbb = UTF8.getBytes(result.toString()); // add some proxy-headers to response header - response.setContentType(outgoingHeader.getContentType()); - if (outgoingHeader.containsKey(HeaderFramework.SERVER)) { - response.addHeader(HeaderFramework.SERVER, outgoingHeader.get(HeaderFramework.SERVER)); + response.setContentType(proxyResponseHeader.getContentType()); + if (proxyResponseHeader.containsKey(HeaderFramework.SERVER)) { + response.addHeader(HeaderFramework.SERVER, proxyResponseHeader.get(HeaderFramework.SERVER)); } - if (outgoingHeader.containsKey(HeaderFramework.DATE)) { - response.addHeader(HeaderFramework.DATE, 
outgoingHeader.get(HeaderFramework.DATE)); + if (proxyResponseHeader.containsKey(HeaderFramework.DATE)) { + response.addHeader(HeaderFramework.DATE, proxyResponseHeader.get(HeaderFramework.DATE)); } - if (outgoingHeader.containsKey(HeaderFramework.LAST_MODIFIED)) { - response.addHeader(HeaderFramework.LAST_MODIFIED, outgoingHeader.get(HeaderFramework.LAST_MODIFIED)); + if (proxyResponseHeader.containsKey(HeaderFramework.LAST_MODIFIED)) { + response.addHeader(HeaderFramework.LAST_MODIFIED, proxyResponseHeader.get(HeaderFramework.LAST_MODIFIED)); } - if (outgoingHeader.containsKey(HeaderFramework.EXPIRES)) { - response.addHeader(HeaderFramework.EXPIRES, outgoingHeader.get(HeaderFramework.EXPIRES)); + if (proxyResponseHeader.containsKey(HeaderFramework.EXPIRES)) { + response.addHeader(HeaderFramework.EXPIRES, proxyResponseHeader.get(HeaderFramework.EXPIRES)); } - response.setStatus(httpStatus); - response.addIntHeader(HeaderFramework.CONTENT_LENGTH, sbb.length); + response.setIntHeader(HeaderFramework.CONTENT_LENGTH, sbb.length); response.getOutputStream().write(sbb); } else { if ((response.getHeader(HeaderFramework.CONTENT_LENGTH) == null) && prop.containsKey(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_SIZE)) { response.addHeader(HeaderFramework.CONTENT_LENGTH, (String) prop.get(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_SIZE)); - } - response.setStatus(httpStatus); + } FileUtils.copy(proxyout, response.getOutputStream()); } } diff --git a/source/net/yacy/server/http/HTTPDProxyHandler.java b/source/net/yacy/server/http/HTTPDProxyHandler.java index d71fefaaa..f1334f417 100644 --- a/source/net/yacy/server/http/HTTPDProxyHandler.java +++ b/source/net/yacy/server/http/HTTPDProxyHandler.java @@ -699,7 +699,6 @@ public final class HTTPDProxyHandler { } finally { try { respond.flush(); respond.close(); } catch (final Exception e) {} } - return; } /**