Added optional https support for remote crawl and profile operations

pull/154/head
luccioman 7 years ago
parent 5db1c9155a
commit 09c4ee56a7

@ -1,4 +1,4 @@
// ViewProfile_p.java
// ViewProfile.java
// -----------------------
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 07.04.2005 on http://yacy.net
@ -113,7 +113,7 @@ public class ViewProfile {
}
// try to get the profile from remote peer
profile = Protocol.getProfile(seed);
profile = Protocol.getProfile(sb, seed);
// if profile did not arrive, say that peer is disconnected
if (profile == null) {

@ -38,6 +38,7 @@ import net.yacy.peers.DHTSelection;
import net.yacy.peers.Protocol;
import net.yacy.peers.Seed;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
@ -52,7 +53,9 @@ public class rct_p {
if (post.containsKey("retrieve")) {
final String peerhash = post.get("peer", null);
final Seed seed = (peerhash == null) ? null : sb.peers.getConnected(peerhash);
final RSSFeed feed = (seed == null) ? null : Protocol.queryRemoteCrawlURLs(sb.peers, seed, 20, 60000);
final boolean preferHttps = sb.getConfigBool(SwitchboardConstants.NETWORK_PROTOCOL_HTTPS_PREFERRED,
SwitchboardConstants.NETWORK_PROTOCOL_HTTPS_PREFERRED_DEFAULT);
final RSSFeed feed = (seed == null) ? null : Protocol.queryRemoteCrawlURLs(sb.peers, seed, 20, 60000, preferHttps);
if (feed != null) {
for (final Hit item: feed) {
//System.out.println("URL=" + item.getLink() + ", desc=" + item.getDescription() + ", pubDate=" + item.getPubDate());

@ -520,7 +520,9 @@ public class CrawlQueues {
}
// we know a peer which should provide remote crawl entries. load them now.
final RSSFeed feed = Protocol.queryRemoteCrawlURLs(this.sb.peers, seed, 60, 10000);
final boolean preferHttps = sb.getConfigBool(SwitchboardConstants.NETWORK_PROTOCOL_HTTPS_PREFERRED,
SwitchboardConstants.NETWORK_PROTOCOL_HTTPS_PREFERRED_DEFAULT);
final RSSFeed feed = Protocol.queryRemoteCrawlURLs(this.sb.peers, seed, 60, 10000, preferHttps);
if (feed == null || feed.isEmpty()) {
// try again and ask another peer
return remoteCrawlLoaderJob();

@ -1,4 +1,4 @@
// yacyClient.java
// Protocol.java
// -------------------------------------
// (C) by Michael Peter Christen; mc@yacy.net
// first published on http://www.anomic.de
@ -405,7 +405,8 @@ public final class Protocol {
final SeedDB seedDB,
final Seed target,
final int maxCount,
final long maxTime) {
final long maxTime,
final boolean preferHttps) {
// returns a list of
if ( target == null ) {
return null;
@ -430,15 +431,37 @@ public final class Protocol {
// final byte[] result = HTTPConnector.getConnector(MultiProtocolURI.yacybotUserAgent).post(new MultiProtocolURI("http://" + target.getClusterAddress() + "/yacy/urls.xml"), (int) maxTime, target.getHexHash() + ".yacyh", parts);
final HTTPClient httpClient = new HTTPClient(ClientIdentification.yacyInternetCrawlerAgent, (int) maxTime);
RSSReader reader = null;
for (String ip: target.getIPs()) {
for (final String ip: target.getIPs()) {
MultiProtocolURL targetBaseURL = null;
try {
final byte[] result = httpClient.POSTbytes(new MultiProtocolURL("http://" + target.getPublicAddress(ip) + "/yacy/urls.xml"), target.getHexHash() + ".yacyh", parts, false, true);
targetBaseURL = target.getPublicMultiprotocolURL(ip, preferHttps);
byte[] result;
try {
result = httpClient.POSTbytes(new MultiProtocolURL(targetBaseURL, "/yacy/urls.xml"), target.getHexHash() + ".yacyh", parts, false, true);
} catch(final IOException e) {
if(targetBaseURL.isHTTPS()) {
/* Failed with https : retry with http */
targetBaseURL = target.getPublicMultiprotocolURL(ip, false);
result = httpClient.POSTbytes(new MultiProtocolURL(targetBaseURL, "/yacy/urls.xml"), target.getHexHash() + ".yacyh", parts, false, true);
if(result != null) {
/* Got something with http : mark peer SSL as unavailable on target peer */
markSSLUnavailableOnPeer(seedDB, target, ip, "yacyClient.queryRemoteCrawlURLs");
}
} else {
throw e;
}
}
reader = RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, result);
} catch(MalformedURLException e) {
Network.log.warn("yacyClient.queryRemoteCrawlURLs malformed target URL for peer '" + target.getName()
+ "' on address : " + ip);
} catch (final IOException e ) {
reader = null;
Network.log.warn("yacyClient.queryRemoteCrawlURLs failed asking peer '" + target.getName() + "': probably bad response from remote peer (1), reader == null");
}
if (reader != null) {
break;
}
if (reader != null) break;
Network.log.warn("yacyClient.queryRemoteCrawlURLs failed asking peer '" + target.getName() + "': probably bad response from remote peer (1), reader == null");
target.put(Seed.RCOUNT, "0");
seedDB.peerActions.interfaceDeparture(target, ip);
}
@ -1218,13 +1241,13 @@ public final class Protocol {
/* Thread still running : try also with interrupt*/
remoteRequest.interrupt();
}
Network.log.info("SEARCH failed (solr), remote Peer: " + target.getName() + "/" + target.getPublicAddress(ip) + " does not answer (time-out)");
Network.log.info("SEARCH failed (solr), remote Peer: " + target.getName() + "/" + targetBaseURL + " does not answer (time-out)");
target.setFlagSolrAvailable(false || myseed);
return -1; // give up, leave remoteRequest abandoned.
}
if (rsp[0] == null || docList[0] == null) {
Network.log.info("SEARCH failed (solr), remote Peer: " + target.getName() + "/" + target.getPublicAddress(ip) + " returned null");
Network.log.info("SEARCH failed (solr), remote Peer: " + target.getName() + "/" + targetBaseURL + " returned null");
if(!myseed) {
if(targetBaseURL.startsWith("https")) {
/* First mark https unavailable on this peer before removing anything else */
@ -1515,6 +1538,7 @@ public final class Protocol {
}
public static Map<String, String> crawlReceipt(
final Switchboard sb,
final Seed mySeed,
final Seed target,
final String process,
@ -1548,49 +1572,60 @@ public final class Protocol {
// prepare request
final String salt = crypt.randomSalt();
final boolean preferHttps = sb.getConfigBool(SwitchboardConstants.NETWORK_PROTOCOL_HTTPS_PREFERRED,
SwitchboardConstants.NETWORK_PROTOCOL_HTTPS_PREFERRED_DEFAULT);
// determining target address
final String address = target.getPublicAddress(target.getIP());
if ( address == null ) {
return null;
}
// send request
try {
// prepare request
final Map<String, ContentBody> parts = basicRequestParts(Switchboard.getSwitchboard(), target.hash, salt);
parts.put("process", UTF8.StringBody(process));
parts.put("urlhash", UTF8.StringBody(((entry == null) ? "" : ASCII.String(entry.hash()))));
parts.put("result", UTF8.StringBody(result));
parts.put("reason", UTF8.StringBody(reason));
parts.put("wordh", UTF8.StringBody(wordhashes));
final String lurlstr;
if (entry == null) {
lurlstr = "";
} else {
final ArrayList<String> ldesc = entry.getDescription();
if (ldesc.isEmpty()) {
lurlstr = entry.toString();
} else { // add document abstract/description as snippet (remotely stored in description_txt)
lurlstr = entry.toString(ldesc.get(0));
}
}
parts.put("lurlEntry", UTF8.StringBody(crypt.simpleEncode(lurlstr, salt)));
// send request
// final byte[] content = HTTPConnector.getConnector(MultiProtocolURI.yacybotUserAgent).post(new MultiProtocolURI("http://" + address + "/yacy/crawlReceipt.html"), 10000, target.getHexHash() + ".yacyh", parts);
final HTTPClient httpClient = new HTTPClient(ClientIdentification.yacyInternetCrawlerAgent, 10000);
final byte[] content =
httpClient.POSTbytes(
new MultiProtocolURL("http://" + address + "/yacy/crawlReceipt.html"),
target.getHexHash() + ".yacyh",
parts,
false, true);
return FileUtils.table(content);
} catch (final Exception e ) {
// most probably a network time-out exception
Network.log.warn("yacyClient.crawlReceipt error:" + e.getMessage());
return null;
}
for (final String ip : target.getIPs()) {
// send request
try {
// prepare request
final Map<String, ContentBody> parts = basicRequestParts(sb, target.hash, salt);
parts.put("process", UTF8.StringBody(process));
parts.put("urlhash", UTF8.StringBody(((entry == null) ? "" : ASCII.String(entry.hash()))));
parts.put("result", UTF8.StringBody(result));
parts.put("reason", UTF8.StringBody(reason));
parts.put("wordh", UTF8.StringBody(wordhashes));
final String lurlstr;
if (entry == null) {
lurlstr = "";
} else {
final ArrayList<String> ldesc = entry.getDescription();
if (ldesc.isEmpty()) {
lurlstr = entry.toString();
} else { // add document abstract/description as snippet (remotely stored in description_txt)
lurlstr = entry.toString(ldesc.get(0));
}
}
parts.put("lurlEntry", UTF8.StringBody(crypt.simpleEncode(lurlstr, salt)));
// send request
final HTTPClient httpClient = new HTTPClient(ClientIdentification.yacyInternetCrawlerAgent, 10000);
MultiProtocolURL targetBaseURL = target.getPublicMultiprotocolURL(ip, preferHttps);
byte[] content;
try {
content = httpClient.POSTbytes(new MultiProtocolURL(targetBaseURL, "/yacy/crawlReceipt.html"),
target.getHexHash() + ".yacyh", parts, false, true);
} catch(final IOException e) {
if(targetBaseURL.isHTTPS()) {
/* Failed using https : retry with http */
targetBaseURL = target.getPublicMultiprotocolURL(ip, false);
content = httpClient.POSTbytes(new MultiProtocolURL(targetBaseURL, "/yacy/crawlReceipt.html"),
target.getHexHash() + ".yacyh", parts, false, true);
if(content != null) {
/* Success with http : mark SSL as unavailable on the target peer */
markSSLUnavailableOnPeer(sb.peers, target, ip, "yacyClient.crawlReceipt");
}
} else {
throw e;
}
}
return FileUtils.table(content);
} catch (final Exception e ) {
// most probably a network time-out exception
Network.log.warn("yacyClient.crawlReceipt error:" + e.getMessage());
}
}
return null;
}
public static AtomicInteger metadataRetrievalRunning = new AtomicInteger(0);
@ -1798,9 +1833,7 @@ public final class Protocol {
targetSeed.getHexHash() + ".yacyh", parts, gzipBody, true);
if(content != null) {
/* Success with http : mark SSL as unavailable on the target peer */
Network.log.info("yacyClient.transferRWI SSL unavailable on address " + ip);
targetSeed.setFlagSSLAvailable(false);
Switchboard.getSwitchboard().peers.updateConnected(targetSeed);
markSSLUnavailableOnPeer(Switchboard.getSwitchboard().peers, targetSeed, ip, "yacyClient.transferRWI");
}
} else {
throw e;
@ -1923,29 +1956,42 @@ public final class Protocol {
/**
* Receive remote peers profile data
*
* @param sb a Switchboard instance holding server environment
* @param targetSeed
* @return profile or null
*/
public static Map<String, String> getProfile(final Seed targetSeed) {
public static Map<String, String> getProfile(final Switchboard sb, final Seed targetSeed) {
// this post a message to the remote message board
final String salt = crypt.randomSalt();
final boolean preferHttps = sb.getConfigBool(SwitchboardConstants.NETWORK_PROTOCOL_HTTPS_PREFERRED,
SwitchboardConstants.NETWORK_PROTOCOL_HTTPS_PREFERRED_DEFAULT);
for (String ip : targetSeed.getIPs()) {
String address = targetSeed.getPublicAddress(ip);
if ( address == null ) {
break;
}
for (final String ip : targetSeed.getIPs()) {
try {
final Map<String, ContentBody> parts =
basicRequestParts(Switchboard.getSwitchboard(), targetSeed.hash, salt);
basicRequestParts(sb, targetSeed.hash, salt);
final HTTPClient httpclient = new HTTPClient(ClientIdentification.yacyInternetCrawlerAgent, 15000);
final byte[] content =
httpclient.POSTbytes(
new MultiProtocolURL("http://" + address + "/yacy/profile.html"),
targetSeed.getHexHash() + ".yacyh",
parts,
false, true);
MultiProtocolURL targetBaseURL = targetSeed.getPublicMultiprotocolURL(ip, preferHttps);
byte[] content;
try {
content = httpclient.POSTbytes(new MultiProtocolURL(targetBaseURL, "/yacy/profile.html"),
targetSeed.getHexHash() + ".yacyh", parts, false, true);
} catch(final IOException e) {
if(targetBaseURL.isHTTPS()) {
/* Failed with https : retry using http */
targetBaseURL = targetSeed.getPublicMultiprotocolURL(ip, false);
content = httpclient.POSTbytes(new MultiProtocolURL(targetBaseURL, "/yacy/profile.html"),
targetSeed.getHexHash() + ".yacyh", parts, false, true);
if(content != null) {
/* Got something with http : mark peer SSL as unavailable on target peer */
markSSLUnavailableOnPeer(sb.peers, targetSeed, ip, "yacyClient.getProfile");
}
} else {
throw e;
}
}
return FileUtils.table(content);
} catch (final Exception e ) {
Network.log.warn("yacyClient.getProfile error:" + e.getMessage());
@ -2115,4 +2161,26 @@ public final class Protocol {
return "?" + sb.toString().substring(1);
}
/**
* Flag SSL/TLS as unavailable on a connected peer and write an information
* level log entry about it. Meant to be called once a request to the peer
* succeeded over http after having failed over https. None of the
* parameters may be null.
*
* @param seedDB
* the seeds database in which the peer entry gets updated
* @param peer
* the peer whose SSL availability flag is cleared
* @param address
* the peer address that answers over http but not over https
* @param logPrefix
* text placed at the beginning of the log message
*/
private static void markSSLUnavailableOnPeer(final SeedDB seedDB, final Seed peer, final String address,
final String logPrefix) {
// Assemble the log line first; append() keeps the same text as plain concatenation
final StringBuilder message = new StringBuilder().append(logPrefix);
message.append(" SSL/TLS unavailable on peer '").append(peer.getName());
message.append("' : can be reached using http but not https on address ").append(address);
Network.log.info(message.toString());

// Clear the flag on the seed, then push the modified seed to the seeds database
peer.setFlagSSLAvailable(false);
seedDB.updateConnected(peer);
}
}

@ -1,4 +1,4 @@
// plasmaSwitchboard.java
// Switchboard.java
// (C) 2004-2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 2004 on http://yacy.net
//
@ -3674,15 +3674,9 @@ public final class Switchboard extends serverSwitch {
@Override
public void run() {
final long t = System.currentTimeMillis();
final Map<String, String> response =
Protocol.crawlReceipt(
Switchboard.this.peers.mySeed(),
this.initiatorPeer,
"crawl",
"fill",
"indexed",
this.reference,
"");
final Map<String, String> response = Protocol.crawlReceipt(Switchboard.this,
Switchboard.this.peers.mySeed(), this.initiatorPeer, "crawl", "fill", "indexed", this.reference,
"");
if ( response == null ) {
Switchboard.this.log.info("Sending crawl receipt for '"
+ this.reference.url().toNormalform(true)

Loading…
Cancel
Save