alternative UrlProxyServlet (for /proxy.html) using different url rewrite rules

- use JSoup parser for selective rewrite of html body <a href=  links only,
instead of regex which rewrites also header href/src links
- this improves display of pages which use header <base> tag
- tags with src attribute are taken from original location (like css) improving display and are not routed through the indexer
Disadvantage: scripting links will drop out of proxy

The servlet is selected exclusively through web.xml (so that one can quickly switch back to the YaCyProxyServlet,
whose existing code is left untouched and available)
pull/1/head
reger 11 years ago
parent 7f29eee9ac
commit b12200cafe

@ -42,7 +42,11 @@
<!-- servlet to provide searchresults via proxy and needed for Augmented Browsing -->
<servlet>
<servlet-name>URLProxyServlet</servlet-name>
<!-- replaced with alternative servlet with new url rewrite rules
you may replace the new with YaCyProxyServlet here, to switch back
<servlet-class>net.yacy.http.servlets.YaCyProxyServlet</servlet-class>
-->
<servlet-class>net.yacy.http.servlets.UrlProxyServlet</servlet-class>
</servlet>
<!-- mappings activated by the application - hardcoded (added here for completeness) -->
@ -61,6 +65,7 @@
<servlet-mapping>
<servlet-name>URLProxyServlet</servlet-name>
<url-pattern>/proxy.html</url-pattern>
<url-pattern>/proxy</url-pattern>
</servlet-mapping>
<servlet-mapping>

@ -44,7 +44,6 @@ import net.yacy.http.servlets.GSAsearchServlet;
import net.yacy.http.servlets.SolrSelectServlet;
import net.yacy.http.servlets.SolrServlet;
import net.yacy.http.servlets.YaCyDefaultServlet;
import net.yacy.http.servlets.YaCyProxyServlet;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
import net.yacy.utils.PKCS12Tool;
@ -150,7 +149,7 @@ public class Jetty8HttpServerImpl implements YaCyHttpServer {
htrootContext.addServlet(SolrServlet.class, "/solr/webgraph/admin/luke");
// add proxy?url= servlet
htrootContext.addServlet(YaCyProxyServlet.class,"/proxy.html");
//htrootContext.addServlet(YaCyProxyServlet.class,"/proxy.html");
// add GSA servlet
htrootContext.addServlet(GSAsearchServlet.class,"/gsa/search");

@ -0,0 +1,352 @@
package net.yacy.http.servlets;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLDecoder;
import java.util.HashMap;
import java.util.StringTokenizer;
import javax.servlet.Servlet;
import javax.servlet.ServletConfig;
import javax.servlet.ServletException;
import javax.servlet.ServletRequest;
import javax.servlet.ServletResponse;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.http.ProxyHandler;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.search.Switchboard;
import net.yacy.server.http.ChunkedInputStream;
import net.yacy.server.http.HTTPDProxyHandler;
import org.eclipse.jetty.continuation.Continuation;
import org.eclipse.jetty.continuation.ContinuationSupport;
import org.eclipse.jetty.http.HttpURI;
import org.eclipse.jetty.servlets.ProxyServlet;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Rewrite of the url-proxy servlet (YaCyProxyServlet, "/proxy.html?url=xyz")
 * using a different url rewrite method (JSoup instead of regular expressions, for more flexibility).
 * The problem with the regex approach was that it also modified tags in the html header, causing
 * problems with some relative link urls and with an included {@code <base>} header tag.
 *
 * Design goals of this urlproxy:
 * - option to handle links/urls the owner/user clicked on
 * - index visited pages on the fly (without having to configure a permanent "transparent" proxy)
 *
 * For this goal, and in contrast to the "transparent" proxy, we don't want (or need) to route all
 * content through the proxy (e.g. we are not interested in transporting css etc.) but concentrate
 * on searchable content.
 *
 * General functionality to implement:
 * 1 - check user access rights
 * 2 - get target url from parameter
 * 3 - check target url is acceptable
 * 4 - get target url
 * 5 - index target url
 * 6 - perform any custom event/treatment (for/on this user clicked url) - not implemented
 * 7 - modify loaded target content (like rewrite links to get proxied)
 * 8 - optionally add augmentation / interaction - not implemented
 * 9 - deliver to client browser
 *
 * The rewrite of links can't be perfect, as all kinds of scripting etc. can be involved.
 * With jsoup only the {@code <a href>} attributes of the body are modified. This helps to display
 * the page correctly, but also means that e.g. links in forms and javascript menus are not
 * rewritten to the proxy url and point to the original site instead.
 *
 * TODO: instead of using JSoup on top (parsing twice - for indexing and for content rewrite) check the option to join the parsing steps
 *
 * Hint: a browser favorite of
 * javascript: window.location.href = ('http://localhost:9090/proxy.html?url=' + location.href);
 * will start the urlproxy with the current browser address.
 */
public class UrlProxyServlet extends ProxyServlet implements Servlet {

    /**
     * Initializes the parent proxy servlet and registers "host" as a header
     * that must not be copied from the incoming request.
     */
    @Override
    public void init(ServletConfig config) throws ServletException {
        super.init(config);
        // must be lower case (header names are internally converted to lower)
        _DontProxyHeaders.add("host"); // to prevent Host header setting from original servletrequest (which is localhost)
    }

    /* ------------------------------------------------------------ */
    /**
     * Handles a "/proxy.html?url=..." request: checks access, loads the target url through
     * HTTPDProxyHandler, rewrites the {@code <a href>} links of text responses so that they
     * point back to this servlet, and delivers the result to the client.
     *
     * @param req the servlet request, cast to HttpServletRequest
     * @param res the servlet response, cast to HttpServletResponse
     * @throws ServletException declared by the servlet contract
     * @throws IOException on I/O problems while loading or delivering the content
     */
    @Override
    public void service(ServletRequest req, ServletResponse res) throws ServletException, IOException {
        final HttpServletRequest request = (HttpServletRequest) req;
        final HttpServletResponse response = (HttpServletResponse) res;

        // 1 - check user access rights
        if (!Switchboard.getSwitchboard().getConfigBool("proxyURL", false)) {
            response.sendError(HttpServletResponse.SC_FORBIDDEN,"proxy use not allowed. URL proxy globally switched off (see: Content Semantic -> Augmented Browsing -> URL proxy)");
            return;
        }
        final String remoteHost = req.getRemoteHost();
        if (!Domains.isThisHostIP(remoteHost) && !proxyippatternmatch(remoteHost)) {
            response.sendError(HttpServletResponse.SC_FORBIDDEN,
                    "proxy use not granted for IP " + remoteHost + " (see: Content Semantic -> Augmented Browsing -> Restrict URL proxy use filter)");
            return;
        }

        if ("CONNECT".equalsIgnoreCase(request.getMethod())) {
            handleConnect(request, response);
            return;
        }

        final Continuation continuation = ContinuationSupport.getContinuation(request);
        if (!continuation.isInitial()) {
            response.sendError(HttpServletResponse.SC_GATEWAY_TIMEOUT); // Need better test that isInitial
            return;
        }

        // 2 - get target url
        final URL proxyurl;
        try {
            proxyurl = targetUrlFromQuery(request.getQueryString());
        } catch (final MalformedURLException e) {
            // a broken url parameter is a client error, not a server failure
            response.sendError(HttpServletResponse.SC_BAD_REQUEST, "url parameter malformed");
            return;
        }
        if (proxyurl == null) {
            response.sendError(HttpServletResponse.SC_NOT_FOUND,"url parameter missing");
            return;
        }

        String hostwithport = proxyurl.getHost();
        if (proxyurl.getPort() != -1) {
            hostwithport += ":" + proxyurl.getPort();
        }

        // 4 - get target url
        final RequestHeader yacyRequestHeader = ProxyHandler.convertHeaderFromJetty(request);
        yacyRequestHeader.remove(RequestHeader.KEEP_ALIVE);
        yacyRequestHeader.remove(HeaderFramework.CONTENT_LENGTH);

        final HashMap<String, Object> prop = new HashMap<String, Object>();
        prop.put(HeaderFramework.CONNECTION_PROP_HTTP_VER, HeaderFramework.HTTP_VERSION_1_1);
        prop.put(HeaderFramework.CONNECTION_PROP_HOST, hostwithport);
        prop.put(HeaderFramework.CONNECTION_PROP_PATH, proxyurl.getPath().replaceAll(" ", "%20"));
        if (proxyurl.getQuery() != null) {
            prop.put(HeaderFramework.CONNECTION_PROP_ARGS, proxyurl.getQuery());
        }
        prop.put(HeaderFramework.CONNECTION_PROP_CLIENTIP, Domains.LOCALHOST);

        yacyRequestHeader.put(HeaderFramework.HOST, hostwithport );
        yacyRequestHeader.put(HeaderFramework.CONNECTION_PROP_PATH, proxyurl.getPath());

        // 4 & 5 get & index target url
        final ByteArrayOutputStream tmpproxyout = new ByteArrayOutputStream();
        HTTPDProxyHandler.doGet(prop, yacyRequestHeader, tmpproxyout, ClientIdentification.yacyProxyAgent);

        // reparse header to extract content-length and mimetype
        // NOTE(review): only "name: value" lines are stored; the status line carries no ':' and is
        // skipped, so the status presumably stays at the constructor value 200 - confirm
        final ResponseHeader proxyResponseHeader = new ResponseHeader(200);
        InputStream proxyout = new ByteArrayInputStream(tmpproxyout.toByteArray());
        String line = readLine(proxyout);
        while (line != null && !line.equals("")) {
            final int p = line.indexOf(':');
            if (p >= 0) {
                // store a property
                proxyResponseHeader.put(line.substring(0, p).trim(), line.substring(p + 1).trim());
            }
            line = readLine(proxyout);
        }
        if (line == null) {
            // stream ended before the empty line that terminates the header
            response.sendError(HttpServletResponse.SC_INTERNAL_SERVER_ERROR,"Proxy Header missing");
            return;
        }

        if (proxyResponseHeader.containsKey(HeaderFramework.LOCATION)) {
            // rewrite location header so that the redirect target is loaded through the proxy, too
            final String location = absoluteLocation(proxyurl, hostwithport, proxyResponseHeader.get(HeaderFramework.LOCATION));
            response.addHeader(HeaderFramework.LOCATION, request.getServletPath() + "?url=" + location);
        }

        final int httpStatus = proxyResponseHeader.getStatusCode();
        final String mimeType = proxyResponseHeader.getContentType();
        response.setStatus(httpStatus);
        response.setContentType(mimeType);

        // "text" also covers "text/html"
        if (httpStatus == 200 && mimeType != null && mimeType.startsWith("text")) {
            if (proxyResponseHeader.containsKey(HeaderFramework.TRANSFER_ENCODING) && proxyResponseHeader.get(HeaderFramework.TRANSFER_ENCODING).contains("chunked")) {
                proxyout = new ChunkedInputStream(proxyout);
            }

            // 7 - modify target content
            final String servletstub = request.getScheme() + "://" + request.getServerName() + ":" + request.getServerPort() + request.getServletPath() + "?url=";
            final Document doc;
            try {
                // NOTE(review): the content is always decoded as UTF-8, whatever charset the
                // target announced - confirm this is intended
                doc = Jsoup.parse(proxyout, "UTF-8", proxyurl.toString());
            } catch (Exception eio) {
                response.sendError(HttpServletResponse.SC_INTERNAL_SERVER_ERROR,"Proxy: parser error on " + proxyurl.toString());
                return;
            }
            rewriteDocument(doc, proxyurl, hostwithport, servletstub);

            // 8 - add interaction elements (e.g. proxy exit button to switch back to original url)
            // TODO: use a template file for
            //de.prepend("<div><form action='" + proxyurl + "'><input type='submit' value='YaCy stop proxy' /></form></div>");

            // 9 - deliver to client
            final byte[] sbb = UTF8.getBytes(doc.toString());

            // add some proxy-headers to response header
            copyHeader(proxyResponseHeader, response, HeaderFramework.SERVER);
            copyHeader(proxyResponseHeader, response, HeaderFramework.DATE);
            copyHeader(proxyResponseHeader, response, HeaderFramework.LAST_MODIFIED);
            copyHeader(proxyResponseHeader, response, HeaderFramework.EXPIRES);

            response.setIntHeader(HeaderFramework.CONTENT_LENGTH, sbb.length);
            FileUtils.copy (sbb,response.getOutputStream());
        } else {
            // everything else is passed on unmodified
            if ((response.getHeader(HeaderFramework.CONTENT_LENGTH) == null) && prop.containsKey(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_SIZE)) {
                response.setHeader(HeaderFramework.CONTENT_LENGTH, (String) prop.get(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_SIZE));
            }
            FileUtils.copy(proxyout, response.getOutputStream());
        }
    }

    /**
     * Extracts the target url from a query string of the form "url=http://...".
     * The value is first used as is; if that is not a valid url it is url-decoded and tried again.
     *
     * @param query the raw query string of the request, may be null
     * @return the target url, or null if the query is null or does not start with "url="
     * @throws MalformedURLException if the value is not a valid url, even after decoding
     * @throws IOException if decoding fails because the encoding is not supported
     */
    private static URL targetUrlFromQuery(final String query) throws IOException {
        if (query == null || !query.startsWith("url=")) {
            return null;
        }
        final String strUrl = query.substring(4); // strip "url="
        try {
            return new URL(strUrl);
        } catch (final MalformedURLException e) {
            try {
                return new URL(URLDecoder.decode(strUrl, UTF8.charset.name()));
            } catch (final IllegalArgumentException iae) {
                // URLDecoder rejects broken %-escapes with an unchecked exception
                final MalformedURLException mue = new MalformedURLException("cannot decode url parameter");
                mue.initCause(iae);
                throw mue;
            }
        }
    }

    /**
     * Makes the value of a Location header absolute.
     * Values starting with "http" are returned unchanged, all others are resolved against the
     * requested url (so "/path", "path" and "?query" keep the scheme and host of the target).
     *
     * @param proxyurl the url that was requested from the target server
     * @param hostwithport host (and optional port) of proxyurl, used for the fallback
     * @param location the Location header value delivered by the target server
     * @return the absolute redirect url
     */
    private static String absoluteLocation(final URL proxyurl, final String hostwithport, final String location) {
        if (location.startsWith("http")) {
            return location;
        }
        try {
            return new URL(proxyurl, location).toString();
        } catch (final MalformedURLException e) {
            // not resolvable: fall back to plain concatenation
            return "http://" + hostwithport + "/" + location;
        }
    }

    /**
     * Prepares a parsed page for delivery through the proxy: adds a {@code <base>} tag to the head
     * if the page has none, and rewrites the href of the {@code <a>} tags in the body to the proxy url.
     *
     * @param doc the parsed page, modified in place
     * @param proxyurl the url the page was loaded from
     * @param hostwithport host (and optional port) of proxyurl
     * @param servletstub absolute url of this servlet ending with "?url="
     */
    private static void rewriteDocument(final Document doc, final URL proxyurl, final String hostwithport, final String servletstub) {
        final Element hd = doc.head();
        if (hd != null) {
            // add a base url if not exist (to make sure relative links point to original)
            final Elements basetags = hd.getElementsByTag("base");
            if (basetags.isEmpty()) {
                final Element newbasetag = hd.prependElement("base");
                final String basestr = proxyurl.getProtocol() + "://" + hostwithport + proxyurl.getPath(); //+directory;
                newbasetag.attr("href", basestr);
            }
        }

        final Element bde = doc.body(); // start with body element to rewrite href links
        if (bde == null) {
            return;
        }
        final Switchboard sb = Switchboard.getSwitchboard();
        // the setting is the same for every link, so it is read once
        final boolean domainlist = sb.getConfig("proxyURL.rewriteURLs", "all").equals("domainlist");

        // rewrite all href with abs proxy url (must be abs because of <base> head tag)
        for (final Element e : bde.getElementsByAttribute("href")) {
            if (!e.tagName().equals("a")) { // only <a> tags
                continue;
            }
            final String absurl = e.absUrl("href"); // get href attribute as abs url
            if (absurl.isEmpty()) {
                // jsoup could not build an absolute url; keep the original href
                continue;
            }
            if (absurl.startsWith("data:") || absurl.startsWith("#") || absurl.startsWith("mailto:") || absurl.startsWith("javascript:")) {
                continue;
            }
            if (domainlist) {
                try {
                    if (sb.crawlStacker.urlInAcceptedDomain(new DigestURL(absurl)) != null) {
                        continue;
                    }
                } catch (MalformedURLException ex) {
                    ConcurrentLog.fine("PROXY", "ProxyServlet: malformed url for url-rewirte " + absurl);
                    continue;
                }
            }
            e.attr("href", servletstub + absurl); // rewrite with abs proxy-url
        }
    }

    /**
     * Copies one header of the target response to the client response, if the target sent it.
     *
     * @param source parsed header of the target response
     * @param response the response to the client
     * @param name the header name
     */
    private static void copyHeader(final ResponseHeader source, final HttpServletResponse response, final String name) {
        if (source.containsKey(name)) {
            response.setHeader(name, source.get(name));
        }
    }

    /**
     * Reads one line that is terminated by "\r\n" from the stream.
     * Everything up to the '\r' is collected; the byte after the '\r' is consumed without a check.
     *
     * @param in the stream to read from
     * @return the line decoded as UTF-8 without the line end, or null if the stream ended
     *         before a complete line end was read
     * @throws IOException if reading from the stream fails
     */
    private String readLine(final InputStream in) throws IOException {
        final ByteArrayOutputStream buf = new ByteArrayOutputStream();
        int b;
        while ((b = in.read()) != '\r' && b != -1) {
            buf.write(b);
        }
        if (b == -1) {
            return null;
        }
        b = in.read(); // read \n
        if (b == -1) {
            return null;
        }
        return buf.toString("UTF-8");
    }

    /**
     * helper for proxy IP config pattern check
     *
     * @param key the remote host to check
     * @return true if the config value "proxyURL.access" is "*" or one of its entries matches key
     */
    private boolean proxyippatternmatch(final String key) {
        // the cfgippattern is a comma-separated list of patterns
        // each pattern is evaluated with String.matches, i.e. as a regular expression
        final String cfgippattern = Switchboard.getSwitchboard().getConfig("proxyURL.access", "*");
        if (cfgippattern.equals("*")) {
            return true;
        }
        if (key == null) {
            return false;
        }
        final StringTokenizer st = new StringTokenizer(cfgippattern, ",");
        while (st.hasMoreTokens()) {
            // blanks around the separator are not part of the pattern
            final String pattern = st.nextToken().trim();
            if (pattern.isEmpty()) {
                continue;
            }
            try {
                if (key.matches(pattern)) {
                    return true;
                }
            } catch (final java.util.regex.PatternSyntaxException e) {
                // an unusable entry must not break the request; it simply grants nothing
                ConcurrentLog.fine("PROXY", "ProxyServlet: invalid pattern in proxyURL.access: " + pattern);
            }
        }
        return false;
    }

    /**
     * get destination url (from query parameter &url=http://....)
     * override to prevent calculating destination url from request
     *
     * @param request the request whose query string carries the url parameter
     * @param uri not used
     * @return destination url from query parameter &url=_destinationurl_,
     *         or null if the parameter is missing or not a valid url
     * @throws MalformedURLException
     */
    @Override
    protected HttpURI proxyHttpURI(HttpServletRequest request, String uri) throws MalformedURLException {
        final String strARGS = request.getQueryString();
        // a request without query string has no url parameter
        if (strARGS != null && strARGS.startsWith("url=")) {
            final String strUrl = strARGS.substring(4); // strip url=
            try {
                final URL newurl = new URL(strUrl);
                int port = newurl.getPort();
                if (port < 1) {
                    port = newurl.getDefaultPort();
                }
                return proxyHttpURI(newurl.getProtocol(), newurl.getHost(), port, newurl.getPath());
            } catch (final MalformedURLException e) {
                ConcurrentLog.fine("PROXY", "url parameter missing");
            }
        }
        return null;
    }

    /**
     * @return a short description of this servlet
     */
    @Override
    public String getServletInfo() {
        return "YaCy Proxy Servlet";
    }
}

@ -143,14 +143,14 @@ public class YaCyProxyServlet extends ProxyServlet implements Servlet {
HTTPDProxyHandler.doGet(prop, yacyRequestHeader, tmpproxyout, ClientIdentification.yacyProxyAgent);
// reparse header to extract content-length and mimetype
final ResponseHeader outgoingHeader = new ResponseHeader(200); //
final ResponseHeader proxyResponseHeader = new ResponseHeader(200); //
final InputStream proxyout = new ByteArrayInputStream(tmpproxyout.toByteArray());
String line = readLine(proxyout);
while (line != null && !line.equals("")) {
int p;
if ((p = line.indexOf(':')) >= 0) {
// store a property
outgoingHeader.add(line.substring(0, p).trim(), line.substring(p + 1).trim());
proxyResponseHeader.add(line.substring(0, p).trim(), line.substring(p + 1).trim());
}
line = readLine(proxyout);
}
@ -177,11 +177,14 @@ public class YaCyProxyServlet extends ProxyServlet implements Servlet {
response.addHeader(HeaderFramework.LOCATION, location);
}
final String mimeType = outgoingHeader.getContentType();
final String mimeType = proxyResponseHeader.getContentType();
response.setContentType(mimeType);
response.setStatus(httpStatus);
if ((mimeType != null) && (mimeType.startsWith("text/html") || mimeType.startsWith("text"))) {
final StringWriter buffer = new StringWriter();
if (outgoingHeader.containsKey(HeaderFramework.TRANSFER_ENCODING) && outgoingHeader.get(HeaderFramework.TRANSFER_ENCODING).contains("chunked")) {
if (proxyResponseHeader.containsKey(HeaderFramework.TRANSFER_ENCODING) && proxyResponseHeader.get(HeaderFramework.TRANSFER_ENCODING).contains("chunked")) {
FileUtils.copy(new ChunkedInputStream(proxyout), buffer, UTF8.charset);
} else {
FileUtils.copy(proxyout, buffer, UTF8.charset);
@ -267,29 +270,27 @@ public class YaCyProxyServlet extends ProxyServlet implements Servlet {
byte[] sbb = UTF8.getBytes(result.toString());
// add some proxy-headers to response header
response.setContentType(outgoingHeader.getContentType());
if (outgoingHeader.containsKey(HeaderFramework.SERVER)) {
response.addHeader(HeaderFramework.SERVER, outgoingHeader.get(HeaderFramework.SERVER));
response.setContentType(proxyResponseHeader.getContentType());
if (proxyResponseHeader.containsKey(HeaderFramework.SERVER)) {
response.addHeader(HeaderFramework.SERVER, proxyResponseHeader.get(HeaderFramework.SERVER));
}
if (outgoingHeader.containsKey(HeaderFramework.DATE)) {
response.addHeader(HeaderFramework.DATE, outgoingHeader.get(HeaderFramework.DATE));
if (proxyResponseHeader.containsKey(HeaderFramework.DATE)) {
response.addHeader(HeaderFramework.DATE, proxyResponseHeader.get(HeaderFramework.DATE));
}
if (outgoingHeader.containsKey(HeaderFramework.LAST_MODIFIED)) {
response.addHeader(HeaderFramework.LAST_MODIFIED, outgoingHeader.get(HeaderFramework.LAST_MODIFIED));
if (proxyResponseHeader.containsKey(HeaderFramework.LAST_MODIFIED)) {
response.addHeader(HeaderFramework.LAST_MODIFIED, proxyResponseHeader.get(HeaderFramework.LAST_MODIFIED));
}
if (outgoingHeader.containsKey(HeaderFramework.EXPIRES)) {
response.addHeader(HeaderFramework.EXPIRES, outgoingHeader.get(HeaderFramework.EXPIRES));
if (proxyResponseHeader.containsKey(HeaderFramework.EXPIRES)) {
response.addHeader(HeaderFramework.EXPIRES, proxyResponseHeader.get(HeaderFramework.EXPIRES));
}
response.setStatus(httpStatus);
response.addIntHeader(HeaderFramework.CONTENT_LENGTH, sbb.length);
response.setIntHeader(HeaderFramework.CONTENT_LENGTH, sbb.length);
response.getOutputStream().write(sbb);
} else {
if ((response.getHeader(HeaderFramework.CONTENT_LENGTH) == null) && prop.containsKey(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_SIZE)) {
response.addHeader(HeaderFramework.CONTENT_LENGTH, (String) prop.get(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_SIZE));
}
response.setStatus(httpStatus);
}
FileUtils.copy(proxyout, response.getOutputStream());
}
}

@ -699,7 +699,6 @@ public final class HTTPDProxyHandler {
} finally {
try { respond.flush(); respond.close(); } catch (final Exception e) {}
}
return;
}
/**

Loading…
Cancel
Save