|
|
|
package net.yacy.http.servlets;
|
|
|
|
|
|
|
|
import java.io.ByteArrayInputStream;
|
|
|
|
import java.io.ByteArrayOutputStream;
|
|
|
|
import java.io.IOException;
|
|
|
|
import java.io.InputStream;
|
|
|
|
import java.net.MalformedURLException;
|
|
|
|
import java.net.URL;
|
|
|
|
import java.net.URLDecoder;
|
|
|
|
import java.nio.charset.StandardCharsets;
|
|
|
|
import java.util.HashMap;
|
|
|
|
import java.util.regex.PatternSyntaxException;
|
|
|
|
import javax.servlet.Servlet;
|
|
|
|
import javax.servlet.ServletConfig;
|
|
|
|
import javax.servlet.ServletException;
|
|
|
|
import javax.servlet.ServletRequest;
|
|
|
|
import javax.servlet.ServletResponse;
|
|
|
|
import javax.servlet.http.HttpServlet;
|
|
|
|
import javax.servlet.http.HttpServletRequest;
|
|
|
|
import javax.servlet.http.HttpServletResponse;
|
|
|
|
import net.yacy.cora.document.encoding.UTF8;
|
|
|
|
import net.yacy.cora.document.id.DigestURL;
|
|
|
|
import net.yacy.cora.protocol.ClientIdentification;
|
|
|
|
import net.yacy.cora.protocol.Domains;
|
|
|
|
import net.yacy.cora.protocol.HeaderFramework;
|
|
|
|
import net.yacy.cora.protocol.RequestHeader;
|
|
|
|
import net.yacy.cora.protocol.ResponseHeader;
|
|
|
|
import net.yacy.cora.util.ConcurrentLog;
|
|
|
|
import net.yacy.kelondro.util.FileUtils;
|
|
|
|
import net.yacy.search.Switchboard;
|
|
|
|
import net.yacy.server.http.ChunkedInputStream;
|
|
|
|
import net.yacy.server.http.HTTPDProxyHandler;
|
|
|
|
import org.eclipse.jetty.continuation.Continuation;
|
|
|
|
import org.eclipse.jetty.continuation.ContinuationSupport;
|
|
|
|
import org.jsoup.Jsoup;
|
|
|
|
import org.jsoup.nodes.Document;
|
|
|
|
import org.jsoup.nodes.Element;
|
|
|
|
import org.jsoup.select.Elements;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Rewrite of the url-proxy servlet (YaCyProxyServlet "/proxy.html?url=xyz")
|
|
|
|
* using a different url rewrite method (JSoup instead of regex, for more flexibility)
|
|
|
|
* (problem with regex was to also modify http header tags, causing problems with some relative link urls
|
|
|
|
* and on included <base> header tag)
|
|
|
|
*
|
|
|
|
* Design goal of this urlproxy
|
|
|
|
* - option to handle links/urls the owner/user clicked on
|
|
|
|
* - index visited pages on the fly (without having to configure a permanent "transparent" proxy)
|
|
|
|
*
|
|
|
|
* To reach this goal, and in contrast to the "transparent" proxy, we don't want (or need) to route all content
|
|
|
|
* through the proxy (e.g. we are not interested in transporting css etc. but concentrate on searchable content).
|
|
|
|
*
|
|
|
|
* general functionality to implement
|
|
|
|
* 1 - check user access right
|
|
|
|
* 2 - get target url from parameter
|
|
|
|
* 3 - check target url acceptable
|
|
|
|
* 4 - get target url
|
|
|
|
* 5 - index target url
|
|
|
|
* 6 - perform any custom event/treatment (for/on this user clicked url) - not implemented
|
|
|
|
* 7 - modify loaded target content (like rewrite links to get proxied)
|
|
|
|
* 8 - optionally add augmentation / interaction - not implemented
|
|
|
|
* 9 - deliver to client browser
|
|
|
|
*
|
|
|
|
* The rewrite of links can't be perfect, as all kinds of scripting etc. can be involved,
|
|
|
|
* with jsoup only the <a href /> attributes of the body are modified. This helps to display
|
|
|
|
* the page correctly, but it also means that e.g. links in forms and javascript menus will
|
|
|
|
* point to the original site (instead of the proxy url)
|
|
|
|
*
|
|
|
|
* TODO: instead of running JSoup on top (parsing twice - for indexing & for content rewrite), check the option of joining the parsing steps
|
|
|
|
*
|
|
|
|
* Hint: a browser favorite of
|
|
|
|
* javascript: window.location.href = ('http://localhost:9090/proxy.html?url=' + location.href);
|
|
|
|
* will start the urlproxy with the current browser address.
|
|
|
|
*
|
|
|
|
* This class is linked to YaCy within jetty using the defaults/web.xml configuration
|
|
|
|
*/
|
|
|
|
public class UrlProxyServlet extends HttpServlet implements Servlet {
|
|
|
|
private static final long serialVersionUID = 4900000000000001121L;
|
|
|
|
private String _stopProxyText = null;
|
|
|
|
|
|
|
|
@Override
|
|
|
|
public void init(ServletConfig config) throws ServletException {
|
|
|
|
super.init(config);
|
|
|
|
|
|
|
|
String tmps = config.getInitParameter("stopProxyText");
|
|
|
|
if (tmps != null) {
|
|
|
|
_stopProxyText = tmps;
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
/* ------------------------------------------------------------ */
|
|
|
|
|
|
|
|
@Override
|
|
|
|
public void service (ServletRequest req, ServletResponse res) throws ServletException, IOException {
|
|
|
|
|
|
|
|
final HttpServletRequest request = (HttpServletRequest) req;
|
|
|
|
final HttpServletResponse response = (HttpServletResponse) res;
|
|
|
|
|
|
|
|
// 1 - check usser access rights
|
|
|
|
if (!Switchboard.getSwitchboard().getConfigBool("proxyURL", false)) {
|
|
|
|
response.sendError(HttpServletResponse.SC_FORBIDDEN,"proxy use not allowed. URL proxy globally switched off (see: Content Semantic -> Augmented Browsing -> URL proxy)");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
final String remoteHost = req.getRemoteHost();
|
|
|
|
if (!Domains.isThisHostIP(remoteHost)) {
|
|
|
|
if (!proxyippatternmatch(remoteHost)) {
|
|
|
|
response.sendError(HttpServletResponse.SC_FORBIDDEN,
|
|
|
|
"proxy use not granted for IP " + remoteHost + " (see: Content Semantic -> Augmented Browsing -> Restrict URL proxy use filter)");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if ("CONNECT".equalsIgnoreCase(request.getMethod())) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
final Continuation continuation = ContinuationSupport.getContinuation(request);
|
|
|
|
|
|
|
|
if (!continuation.isInitial()) {
|
|
|
|
response.sendError(HttpServletResponse.SC_GATEWAY_TIMEOUT); // Need better test that isInitial
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
// 2 - get target url
|
|
|
|
URL proxyurl = null;
|
|
|
|
final String strUrl = request.getParameter("url");
|
|
|
|
if (strUrl == null) {
|
|
|
|
response.sendError(HttpServletResponse.SC_NOT_FOUND,"url parameter missing");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
try {
|
|
|
|
proxyurl = new URL(strUrl);
|
|
|
|
} catch (final MalformedURLException e) {
|
|
|
|
proxyurl = new URL(URLDecoder.decode(strUrl, StandardCharsets.UTF_8.name()));
|
|
|
|
}
|
|
|
|
|
|
|
|
if (proxyurl == null) {
|
|
|
|
response.sendError(HttpServletResponse.SC_NOT_FOUND,"url parameter missing");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
String hostwithport = proxyurl.getHost();
|
|
|
|
if (proxyurl.getPort() != -1) {
|
|
|
|
hostwithport += ":" + proxyurl.getPort();
|
|
|
|
}
|
|
|
|
// 4 - get target url
|
|
|
|
RequestHeader yacyRequestHeader = YaCyDefaultServlet.convertHeaderFromJetty(request);
|
|
|
|
yacyRequestHeader.remove(RequestHeader.KEEP_ALIVE);
|
|
|
|
yacyRequestHeader.remove(HeaderFramework.CONTENT_LENGTH);
|
|
|
|
|
|
|
|
final HashMap<String, Object> prop = new HashMap<String, Object>();
|
|
|
|
prop.put(HeaderFramework.CONNECTION_PROP_HTTP_VER, HeaderFramework.HTTP_VERSION_1_1);
|
|
|
|
prop.put(HeaderFramework.CONNECTION_PROP_PROTOCOL, proxyurl.getProtocol());
|
|
|
|
prop.put(HeaderFramework.CONNECTION_PROP_HOST, hostwithport);
|
|
|
|
prop.put(HeaderFramework.CONNECTION_PROP_PATH, proxyurl.getPath().replaceAll(" ", "%20"));
|
|
|
|
prop.put(HeaderFramework.CONNECTION_PROP_METHOD, request.getMethod()); // only needed for HTTPDeamon errormsg in case of blacklisted url
|
|
|
|
if (proxyurl.getQuery() != null) prop.put(HeaderFramework.CONNECTION_PROP_ARGS, proxyurl.getQuery());
|
|
|
|
prop.put(HeaderFramework.CONNECTION_PROP_CLIENTIP, Domains.LOCALHOST);
|
|
|
|
prop.put(HeaderFramework.CONNECTION_PROP_CLIENT_HTTPSERVLETREQUEST, request);
|
|
|
|
|
|
|
|
yacyRequestHeader.put(HeaderFramework.HOST, hostwithport );
|
|
|
|
yacyRequestHeader.put(HeaderFramework.CONNECTION_PROP_PATH, proxyurl.getPath());
|
|
|
|
|
|
|
|
// 4 & 5 get & index target url
|
|
|
|
final ByteArrayOutputStream tmpproxyout = new ByteArrayOutputStream();
|
|
|
|
HTTPDProxyHandler.doGet(prop, yacyRequestHeader, tmpproxyout, ClientIdentification.yacyProxyAgent);
|
|
|
|
|
|
|
|
// reparse header to extract content-length and mimetype
|
|
|
|
final ResponseHeader proxyResponseHeader = new ResponseHeader(200); //
|
|
|
|
InputStream proxyout = new ByteArrayInputStream(tmpproxyout.toByteArray());
|
|
|
|
String line = readLine(proxyout);
|
|
|
|
while (line != null && !line.equals("")) {
|
|
|
|
int p;
|
|
|
|
if ((p = line.indexOf(':')) >= 0) {
|
|
|
|
// store a property
|
|
|
|
proxyResponseHeader.put(line.substring(0, p).trim(), line.substring(p + 1).trim());
|
|
|
|
}
|
|
|
|
line = readLine(proxyout);
|
|
|
|
}
|
|
|
|
if (line == null) {
|
|
|
|
response.sendError(HttpServletResponse.SC_INTERNAL_SERVER_ERROR,"Proxy Header missing");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (proxyResponseHeader.containsKey(HeaderFramework.LOCATION)) {
|
|
|
|
// rewrite location header
|
|
|
|
String location = proxyResponseHeader.get(HeaderFramework.LOCATION);
|
|
|
|
if (location.startsWith("http")) {
|
|
|
|
location = request.getServletPath() + "?url=" + location;
|
|
|
|
} else {
|
|
|
|
location = request.getServletPath() + "?url=http://" + hostwithport + "/" + location;
|
|
|
|
}
|
|
|
|
response.addHeader(HeaderFramework.LOCATION, location);
|
|
|
|
}
|
|
|
|
|
|
|
|
final int httpStatus = proxyResponseHeader.getStatusCode();
|
|
|
|
final String mimeType = proxyResponseHeader.getContentType();
|
|
|
|
response.setStatus(httpStatus);
|
|
|
|
response.setContentType(mimeType);
|
|
|
|
|
|
|
|
if ((httpStatus < HttpServletResponse.SC_BAD_REQUEST) && (mimeType != null) && mimeType.startsWith("text")) {
|
|
|
|
if (proxyResponseHeader.containsKey(HeaderFramework.TRANSFER_ENCODING) && proxyResponseHeader.get(HeaderFramework.TRANSFER_ENCODING).contains("chunked")) {
|
|
|
|
proxyout = new ChunkedInputStream(proxyout);
|
|
|
|
}
|
|
|
|
|
|
|
|
// 7 - modify target content
|
|
|
|
final String servletstub = request.getScheme() + "://" + request.getServerName() + ":" + request.getServerPort() + request.getServletPath() + "?url=";
|
|
|
|
Document doc;
|
|
|
|
try {
|
|
|
|
doc = Jsoup.parse(proxyout, proxyResponseHeader.getCharacterEncoding(), proxyurl.toString());
|
|
|
|
} catch (IOException eio) {
|
|
|
|
response.sendError(HttpServletResponse.SC_INTERNAL_SERVER_ERROR,"Proxy: parser error on " + proxyurl.toString() +"\n\n"+ eio.getMessage());
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
Element bde = doc.body(); // start with body element to rewrite href links
|
|
|
|
// rewrite all href with abs proxy url (must be abs because of <base> head tag
|
|
|
|
Elements taglist = bde.getElementsByAttribute("href");
|
|
|
|
final Switchboard sb = Switchboard.getSwitchboard();
|
|
|
|
for (Element e : taglist) {
|
|
|
|
if (e.tagName().equals("a")) { // get <a> tag
|
|
|
|
String absurl = e.absUrl("href"); // get href attribut as abs url
|
|
|
|
if (absurl.startsWith("data:") || absurl.startsWith("#") || absurl.startsWith("mailto:") || absurl.startsWith("javascript:")) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (sb.getConfig("proxyURL.rewriteURLs", "all").equals("domainlist")) {
|
|
|
|
try {
|
|
|
|
if (sb.crawlStacker.urlInAcceptedDomain(new DigestURL(absurl)) != null) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
} catch (MalformedURLException ex) {
|
|
|
|
ConcurrentLog.fine("PROXY", "ProxyServlet: malformed url for url-rewirte " + absurl);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
e.attr("href", servletstub + absurl); // rewrite with abs proxy-url
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
Element hd = doc.head();
|
|
|
|
if (hd != null) {
|
|
|
|
// add a base url if not exist (to make sure relative links point to original)
|
|
|
|
Elements basetags = hd.getElementsByTag("base");
|
|
|
|
if (basetags.isEmpty()) {
|
|
|
|
Element newbasetag = hd.prependElement("base");
|
|
|
|
String basestr = proxyurl.getProtocol() + "://" + hostwithport + proxyurl.getPath(); //+directory;
|
|
|
|
newbasetag.attr("href", basestr);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// 8 - add interaction elements (e.g. proxy exit button to switch back to original url)
|
|
|
|
if (_stopProxyText != null) {
|
|
|
|
|
|
|
|
String httpsAlertMsg = "";
|
|
|
|
if (proxyurl.getProtocol().equalsIgnoreCase("https") && !request.getScheme().equalsIgnoreCase("https")) httpsAlertMsg = " - <span style='color:red'>(Warning: secure target viewed over normal http)</span>";
|
|
|
|
|
|
|
|
// use a template file, to allow full servlet functionallity header as iframe included
|
|
|
|
String hdrtemplate = request.getScheme() + "://" + request.getServerName() + ":" + request.getServerPort() + "/proxymsg/urlproxyheader.html?url=" + proxyurl.toString();
|
|
|
|
hdrtemplate = "<iframe src='" + hdrtemplate +"' width='98%' height='50px' >"
|
|
|
|
// alternative for no-frame supporting browser
|
|
|
|
+ "<div width='100%' style='padding:5px; background:white; border-bottom: medium solid lightgrey;'>"
|
|
|
|
+ "<div align='center' style='font-size:11px;'><a style='font-size:11px; color:black;' href='" + proxyurl + "'>" + _stopProxyText + "</a> "
|
|
|
|
+ httpsAlertMsg + "</div></div>"
|
|
|
|
+ "</iframe>";
|
|
|
|
bde.prepend(hdrtemplate); // put as 1st element in body
|
|
|
|
}
|
|
|
|
|
|
|
|
// 9 - deliver to client
|
|
|
|
byte[] sbb;
|
|
|
|
if (doc.charset() == null) {
|
|
|
|
sbb = UTF8.getBytes(doc.toString());
|
|
|
|
response.setCharacterEncoding(StandardCharsets.UTF_8.name());
|
|
|
|
} else { // keep orig charset
|
|
|
|
sbb = doc.toString().getBytes(doc.charset());
|
|
|
|
response.setCharacterEncoding(doc.charset().name());
|
|
|
|
}
|
|
|
|
// add some proxy-headers to response header
|
|
|
|
if (proxyResponseHeader.containsKey(HeaderFramework.SERVER)) {
|
|
|
|
response.setHeader(HeaderFramework.SERVER, proxyResponseHeader.get(HeaderFramework.SERVER));
|
|
|
|
}
|
|
|
|
if (proxyResponseHeader.containsKey(HeaderFramework.DATE)) {
|
|
|
|
response.setHeader(HeaderFramework.DATE, proxyResponseHeader.get(HeaderFramework.DATE));
|
|
|
|
}
|
|
|
|
if (proxyResponseHeader.containsKey(HeaderFramework.LAST_MODIFIED)) {
|
|
|
|
response.setHeader(HeaderFramework.LAST_MODIFIED, proxyResponseHeader.get(HeaderFramework.LAST_MODIFIED));
|
|
|
|
}
|
|
|
|
if (proxyResponseHeader.containsKey(HeaderFramework.EXPIRES)) {
|
|
|
|
response.setHeader(HeaderFramework.EXPIRES, proxyResponseHeader.get(HeaderFramework.EXPIRES));
|
|
|
|
}
|
|
|
|
|
|
|
|
response.setIntHeader(HeaderFramework.CONTENT_LENGTH, sbb.length);
|
|
|
|
response.getOutputStream().write(sbb);
|
|
|
|
|
|
|
|
} else {
|
|
|
|
if (httpStatus >= HttpServletResponse.SC_BAD_REQUEST) {
|
|
|
|
if (HeaderFramework.http1_1.containsKey(Integer.toString(httpStatus))) {
|
|
|
|
//http1_1 includes http1_0 messages
|
|
|
|
final String httpStatusText = HeaderFramework.http1_1.get(Integer.toString(httpStatus));
|
|
|
|
response.sendError(httpStatus, "Site " + proxyurl + " returned with status " + httpStatus + " (" + httpStatusText + ")");
|
|
|
|
} else {
|
|
|
|
response.sendError(httpStatus, "Site " + proxyurl + " returned with status " + httpStatus);
|
|
|
|
}
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
if ((response.getHeader(HeaderFramework.CONTENT_LENGTH) == null) && prop.containsKey(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_SIZE)) {
|
|
|
|
response.setHeader(HeaderFramework.CONTENT_LENGTH, (String) prop.get(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_SIZE));
|
|
|
|
}
|
|
|
|
FileUtils.copy(proxyout, response.getOutputStream());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
private String readLine(final InputStream in) throws IOException {
|
|
|
|
final ByteArrayOutputStream buf = new ByteArrayOutputStream();
|
|
|
|
int b;
|
|
|
|
while ((b = in.read()) != '\r' && b != -1) {
|
|
|
|
buf.write(b);
|
|
|
|
}
|
|
|
|
if (b == -1) {
|
|
|
|
return null;
|
|
|
|
}
|
|
|
|
b = in.read(); // read \n
|
|
|
|
if (b == -1) {
|
|
|
|
return null;
|
|
|
|
}
|
|
|
|
return buf.toString(StandardCharsets.UTF_8.name());
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* helper for proxy IP config pattern check
|
|
|
|
*/
|
|
|
|
private boolean proxyippatternmatch(final String key) {
|
|
|
|
// the cfgippattern is a comma-separated list of patterns
|
|
|
|
// each pattern may contain one wildcard-character '*' which matches anything
|
|
|
|
final String[] cfgippattern = Switchboard.getSwitchboard().getConfigArray("proxyURL.access", "*");
|
|
|
|
if (cfgippattern[0].equals("*")) {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
for (String pattern : cfgippattern) {
|
|
|
|
try {
|
|
|
|
if (key.matches(pattern)) {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
} catch (PatternSyntaxException ex) {
|
|
|
|
ConcurrentLog.warn("PROXY", "wrong ip pattern in url proxy config " + ex.getMessage() );
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
@Override
|
|
|
|
public String getServletInfo() {
|
|
|
|
return "YaCy Proxy Servlet";
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|