From 09c4ee56a786a0927313d352abc314e46ef4d095 Mon Sep 17 00:00:00 2001 From: luccioman Date: Thu, 21 Dec 2017 18:41:32 +0100 Subject: [PATCH] Added optional https support for remote crawl and profile operations --- htroot/ViewProfile.java | 4 +- htroot/rct_p.java | 5 +- source/net/yacy/crawler/data/CrawlQueues.java | 4 +- source/net/yacy/peers/Protocol.java | 200 ++++++++++++------ source/net/yacy/search/Switchboard.java | 14 +- 5 files changed, 147 insertions(+), 80 deletions(-) diff --git a/htroot/ViewProfile.java b/htroot/ViewProfile.java index 965b85fdc..e1b1750b1 100644 --- a/htroot/ViewProfile.java +++ b/htroot/ViewProfile.java @@ -1,4 +1,4 @@ -// ViewProfile_p.java +// ViewProfile.java // ----------------------- // (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany // first published 07.04.2005 on http://yacy.net @@ -113,7 +113,7 @@ public class ViewProfile { } // try to get the profile from remote peer - profile = Protocol.getProfile(seed); + profile = Protocol.getProfile(sb, seed); // if profile did not arrive, say that peer is disconnected if (profile == null) { diff --git a/htroot/rct_p.java b/htroot/rct_p.java index 4fb381ac0..a27e3e27a 100644 --- a/htroot/rct_p.java +++ b/htroot/rct_p.java @@ -38,6 +38,7 @@ import net.yacy.peers.DHTSelection; import net.yacy.peers.Protocol; import net.yacy.peers.Seed; import net.yacy.search.Switchboard; +import net.yacy.search.SwitchboardConstants; import net.yacy.server.serverObjects; import net.yacy.server.serverSwitch; @@ -52,7 +53,9 @@ public class rct_p { if (post.containsKey("retrieve")) { final String peerhash = post.get("peer", null); final Seed seed = (peerhash == null) ? null : sb.peers.getConnected(peerhash); - final RSSFeed feed = (seed == null) ? null : Protocol.queryRemoteCrawlURLs(sb.peers, seed, 20, 60000); + final boolean preferHttps = sb.getConfigBool(SwitchboardConstants.NETWORK_PROTOCOL_HTTPS_PREFERRED, + SwitchboardConstants.NETWORK_PROTOCOL_HTTPS_PREFERRED_DEFAULT); + final RSSFeed feed = (seed == null) ? null : Protocol.queryRemoteCrawlURLs(sb.peers, seed, 20, 60000, preferHttps); if (feed != null) { for (final Hit item: feed) { //System.out.println("URL=" + item.getLink() + ", desc=" + item.getDescription() + ", pubDate=" + item.getPubDate()); diff --git a/source/net/yacy/crawler/data/CrawlQueues.java b/source/net/yacy/crawler/data/CrawlQueues.java index 35c244df1..0425ecc50 100644 --- a/source/net/yacy/crawler/data/CrawlQueues.java +++ b/source/net/yacy/crawler/data/CrawlQueues.java @@ -520,7 +520,9 @@ public class CrawlQueues { } // we know a peer which should provide remote crawl entries. load them now. - final RSSFeed feed = Protocol.queryRemoteCrawlURLs(this.sb.peers, seed, 60, 10000); + final boolean preferHttps = sb.getConfigBool(SwitchboardConstants.NETWORK_PROTOCOL_HTTPS_PREFERRED, + SwitchboardConstants.NETWORK_PROTOCOL_HTTPS_PREFERRED_DEFAULT); + final RSSFeed feed = Protocol.queryRemoteCrawlURLs(this.sb.peers, seed, 60, 10000, preferHttps); if (feed == null || feed.isEmpty()) { // try again and ask another peer return remoteCrawlLoaderJob(); diff --git a/source/net/yacy/peers/Protocol.java b/source/net/yacy/peers/Protocol.java index cd52b023e..92bbd9e96 100644 --- a/source/net/yacy/peers/Protocol.java +++ b/source/net/yacy/peers/Protocol.java @@ -1,4 +1,4 @@ -// yacyClient.java +// Protocol.java // ------------------------------------- // (C) by Michael Peter Christen; mc@yacy.net // first published on http://www.anomic.de @@ -405,7 +405,8 @@ public final class Protocol { final SeedDB seedDB, final Seed target, final int maxCount, - final long maxTime) { + final long maxTime, + final boolean preferHttps) { // returns a list of if ( target == null ) { return null; @@ -430,15 +431,37 @@ public final class Protocol { // final byte[] result = HTTPConnector.getConnector(MultiProtocolURI.yacybotUserAgent).post(new MultiProtocolURI("http://" + target.getClusterAddress() + "/yacy/urls.xml"), (int) maxTime, target.getHexHash() + ".yacyh", parts); final HTTPClient httpClient = new HTTPClient(ClientIdentification.yacyInternetCrawlerAgent, (int) maxTime); RSSReader reader = null; - for (String ip: target.getIPs()) { + for (final String ip: target.getIPs()) { + MultiProtocolURL targetBaseURL = null; try { - final byte[] result = httpClient.POSTbytes(new MultiProtocolURL("http://" + target.getPublicAddress(ip) + "/yacy/urls.xml"), target.getHexHash() + ".yacyh", parts, false, true); + targetBaseURL = target.getPublicMultiprotocolURL(ip, preferHttps); + byte[] result; + try { + result = httpClient.POSTbytes(new MultiProtocolURL(targetBaseURL, "/yacy/urls.xml"), target.getHexHash() + ".yacyh", parts, false, true); + } catch(final IOException e) { + if(targetBaseURL.isHTTPS()) { + /* Failed with https : retry with http */ + targetBaseURL = target.getPublicMultiprotocolURL(ip, false); + result = httpClient.POSTbytes(new MultiProtocolURL(targetBaseURL, "/yacy/urls.xml"), target.getHexHash() + ".yacyh", parts, false, true); + if(result != null) { + /* Got something with http : mark peer SSL as unavailable on target peer */ + markSSLUnavailableOnPeer(seedDB, target, ip, "yacyClient.queryRemoteCrawlURLs"); + } + } else { + throw e; + } + } reader = RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, result); + } catch(MalformedURLException e) { + Network.log.warn("yacyClient.queryRemoteCrawlURLs malformed target URL for peer '" + target.getName() + + "' on address : " + ip); } catch (final IOException e ) { reader = null; + Network.log.warn("yacyClient.queryRemoteCrawlURLs failed asking peer '" + target.getName() + "': probably bad response from remote peer (1), reader == null"); + } + if (reader != null) { + break; } - if (reader != null) break; - Network.log.warn("yacyClient.queryRemoteCrawlURLs failed asking peer '" + target.getName() + "': probably bad response from remote peer (1), reader == null"); target.put(Seed.RCOUNT, "0"); seedDB.peerActions.interfaceDeparture(target, ip); } @@ -1218,13 +1241,13 @@ public final class Protocol { /* Thread still running : try also with interrupt*/ remoteRequest.interrupt(); } - Network.log.info("SEARCH failed (solr), remote Peer: " + target.getName() + "/" + target.getPublicAddress(ip) + " does not answer (time-out)"); + Network.log.info("SEARCH failed (solr), remote Peer: " + target.getName() + "/" + targetBaseURL + " does not answer (time-out)"); target.setFlagSolrAvailable(false || myseed); return -1; // give up, leave remoteRequest abandoned. } if (rsp[0] == null || docList[0] == null) { - Network.log.info("SEARCH failed (solr), remote Peer: " + target.getName() + "/" + target.getPublicAddress(ip) + " returned null"); + Network.log.info("SEARCH failed (solr), remote Peer: " + target.getName() + "/" + targetBaseURL + " returned null"); if(!myseed) { if(targetBaseURL.startsWith("https")) { /* First mark https unavailable on this peer before removing anything else */ @@ -1515,6 +1538,7 @@ public final class Protocol { } public static Map crawlReceipt( + final Switchboard sb, final Seed mySeed, final Seed target, final String process, @@ -1548,49 +1572,60 @@ public final class Protocol { // prepare request final String salt = crypt.randomSalt(); + + final boolean preferHttps = sb.getConfigBool(SwitchboardConstants.NETWORK_PROTOCOL_HTTPS_PREFERRED, + SwitchboardConstants.NETWORK_PROTOCOL_HTTPS_PREFERRED_DEFAULT); - // determining target address - final String address = target.getPublicAddress(target.getIP()); - if ( address == null ) { - return null; - } - - // send request - try { - // prepare request - final Map parts = basicRequestParts(Switchboard.getSwitchboard(), target.hash, salt); - parts.put("process", UTF8.StringBody(process)); - parts.put("urlhash", UTF8.StringBody(((entry == null) ? "" : ASCII.String(entry.hash())))); - parts.put("result", UTF8.StringBody(result)); - parts.put("reason", UTF8.StringBody(reason)); - parts.put("wordh", UTF8.StringBody(wordhashes)); - final String lurlstr; - if (entry == null) { - lurlstr = ""; - } else { - final ArrayList ldesc = entry.getDescription(); - if (ldesc.isEmpty()) { - lurlstr = entry.toString(); - } else { // add document abstract/description as snippet (remotely stored in description_txt) - lurlstr = entry.toString(ldesc.get(0)); - } - } - parts.put("lurlEntry", UTF8.StringBody(crypt.simpleEncode(lurlstr, salt))); - // send request - // final byte[] content = HTTPConnector.getConnector(MultiProtocolURI.yacybotUserAgent).post(new MultiProtocolURI("http://" + address + "/yacy/crawlReceipt.html"), 10000, target.getHexHash() + ".yacyh", parts); - final HTTPClient httpClient = new HTTPClient(ClientIdentification.yacyInternetCrawlerAgent, 10000); - final byte[] content = - httpClient.POSTbytes( - new MultiProtocolURL("http://" + address + "/yacy/crawlReceipt.html"), - target.getHexHash() + ".yacyh", - parts, - false, true); - return FileUtils.table(content); - } catch (final Exception e ) { - // most probably a network time-out exception - Network.log.warn("yacyClient.crawlReceipt error:" + e.getMessage()); - return null; - } + for (final String ip : target.getIPs()) { + // send request + try { + // prepare request + final Map parts = basicRequestParts(sb, target.hash, salt); + parts.put("process", UTF8.StringBody(process)); + parts.put("urlhash", UTF8.StringBody(((entry == null) ? "" : ASCII.String(entry.hash())))); + parts.put("result", UTF8.StringBody(result)); + parts.put("reason", UTF8.StringBody(reason)); + parts.put("wordh", UTF8.StringBody(wordhashes)); + final String lurlstr; + if (entry == null) { + lurlstr = ""; + } else { + final ArrayList ldesc = entry.getDescription(); + if (ldesc.isEmpty()) { + lurlstr = entry.toString(); + } else { // add document abstract/description as snippet (remotely stored in description_txt) + lurlstr = entry.toString(ldesc.get(0)); + } + } + parts.put("lurlEntry", UTF8.StringBody(crypt.simpleEncode(lurlstr, salt))); + // send request + final HTTPClient httpClient = new HTTPClient(ClientIdentification.yacyInternetCrawlerAgent, 10000); + MultiProtocolURL targetBaseURL = target.getPublicMultiprotocolURL(ip, preferHttps); + byte[] content; + try { + content = httpClient.POSTbytes(new MultiProtocolURL(targetBaseURL, "/yacy/crawlReceipt.html"), + target.getHexHash() + ".yacyh", parts, false, true); + } catch(final IOException e) { + if(targetBaseURL.isHTTPS()) { + /* Failed using https : retry with http */ + targetBaseURL = target.getPublicMultiprotocolURL(ip, false); + content = httpClient.POSTbytes(new MultiProtocolURL(targetBaseURL, "/yacy/crawlReceipt.html"), + target.getHexHash() + ".yacyh", parts, false, true); + if(content != null) { + /* Success with http : mark SSL as unavailable on the target peer */ + markSSLUnavailableOnPeer(sb.peers, target, ip, "yacyClient.crawlReceipt"); + } + } else { + throw e; + } + } + return FileUtils.table(content); + } catch (final Exception e ) { + // most probably a network time-out exception + Network.log.warn("yacyClient.crawlReceipt error:" + e.getMessage()); + } + } + return null; } public static AtomicInteger metadataRetrievalRunning = new AtomicInteger(0); @@ -1798,9 +1833,7 @@ public final class Protocol { targetSeed.getHexHash() + ".yacyh", parts, gzipBody, true); if(content != null) { /* Success with http : mark SSL as unavailable on the target peer */ - Network.log.info("yacyClient.transferRWI SSL unavailable on address " + ip); - targetSeed.setFlagSSLAvailable(false); - Switchboard.getSwitchboard().peers.updateConnected(targetSeed); + markSSLUnavailableOnPeer(Switchboard.getSwitchboard().peers, targetSeed, ip, "yacyClient.transferRWI"); } } else { throw e; @@ -1923,29 +1956,42 @@ public final class Protocol { /** * Receive remote peers profile data * + * @param sb a Switchboard instance holding server environment * @param targetSeed * @return profile or null */ - public static Map getProfile(final Seed targetSeed) { + public static Map getProfile(final Switchboard sb, final Seed targetSeed) { // this post a message to the remote message board final String salt = crypt.randomSalt(); + + final boolean preferHttps = sb.getConfigBool(SwitchboardConstants.NETWORK_PROTOCOL_HTTPS_PREFERRED, + SwitchboardConstants.NETWORK_PROTOCOL_HTTPS_PREFERRED_DEFAULT); - for (String ip : targetSeed.getIPs()) { - String address = targetSeed.getPublicAddress(ip); - if ( address == null ) { - break; - } + for (final String ip : targetSeed.getIPs()) { try { final Map parts = - basicRequestParts(Switchboard.getSwitchboard(), targetSeed.hash, salt); + basicRequestParts(sb, targetSeed.hash, salt); final HTTPClient httpclient = new HTTPClient(ClientIdentification.yacyInternetCrawlerAgent, 15000); - final byte[] content = - httpclient.POSTbytes( - new MultiProtocolURL("http://" + address + "/yacy/profile.html"), - targetSeed.getHexHash() + ".yacyh", - parts, - false, true); + MultiProtocolURL targetBaseURL = targetSeed.getPublicMultiprotocolURL(ip, preferHttps); + byte[] content; + try { + content = httpclient.POSTbytes(new MultiProtocolURL(targetBaseURL, "/yacy/profile.html"), + targetSeed.getHexHash() + ".yacyh", parts, false, true); + } catch(final IOException e) { + if(targetBaseURL.isHTTPS()) { + /* Failed with https : retry using http */ + targetBaseURL = targetSeed.getPublicMultiprotocolURL(ip, false); + content = httpclient.POSTbytes(new MultiProtocolURL(targetBaseURL, "/yacy/profile.html"), + targetSeed.getHexHash() + ".yacyh", parts, false, true); + if(content != null) { + /* Got something with http : mark peer SSL as unavailable on target peer */ + markSSLUnavailableOnPeer(sb.peers, targetSeed, ip, "yacyClient.getProfile"); + } + } else { + throw e; + } + } return FileUtils.table(content); } catch (final Exception e ) { Network.log.warn("yacyClient.getProfile error:" + e.getMessage()); @@ -2115,4 +2161,26 @@ public final class Protocol { return "?" + sb.toString().substring(1); } + /** + * Mark a SSL/TLS as unavailable on a connected peer and log an information + * level message. Use when http is successful whereas https is not on the target + * peer. All parameters must not be null. + * + * @param seedDB + * the seeds database to update + * @param peer + * the peer to update + * @param address + * the address on peer where http is successful but https fails. + * @param logPrefix + * a prefix to the log message + */ + private static void markSSLUnavailableOnPeer(final SeedDB seedDB, final Seed peer, final String address, + final String logPrefix) { + Network.log.info(logPrefix + " SSL/TLS unavailable on peer '" + peer.getName() + + "' : can be reached using http but not https on address " + address); + peer.setFlagSSLAvailable(false); + seedDB.updateConnected(peer); + } + } diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 9d5447e6c..8a005a772 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -1,4 +1,4 @@ -// plasmaSwitchboard.java +// Switchboard.java // (C) 2004-2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany // first published 2004 on http://yacy.net // @@ -3674,15 +3674,9 @@ public final class Switchboard extends serverSwitch { @Override public void run() { final long t = System.currentTimeMillis(); - final Map response = - Protocol.crawlReceipt( - Switchboard.this.peers.mySeed(), - this.initiatorPeer, - "crawl", - "fill", - "indexed", - this.reference, - ""); + final Map response = Protocol.crawlReceipt(Switchboard.this, + Switchboard.this.peers.mySeed(), this.initiatorPeer, "crawl", "fill", "indexed", this.reference, + ""); if ( response == null ) { Switchboard.this.log.info("Sending crawl receipt for '" + this.reference.url().toNormalform(true)