mirror of
https://github.com/apple/foundationdb.git
synced 2026-01-25 04:18:18 +00:00
Make gRPC server and lifecycle part of worker interface
Also addresses some issues around TLS and initialization. The service is started only if gRPC addresses are provided during startup.
This commit is contained in:
5
.gitignore
vendored
5
.gitignore
vendored
@@ -57,6 +57,7 @@ bindings/java/.classstamp*
|
||||
bindings/java/classes*/
|
||||
bindings/java/javadoc*/
|
||||
packaging/docker/website
|
||||
loopback-cluster/
|
||||
|
||||
# Testing and logging
|
||||
packaging/msi/*.log
|
||||
@@ -109,3 +110,7 @@ temp/
|
||||
/.ccls-cache
|
||||
.clangd/
|
||||
.stignore
|
||||
|
||||
# AI assistants
|
||||
*.aider*
|
||||
*.fdq
|
||||
@@ -37,7 +37,7 @@
|
||||
IPAddress ClusterConnectionString::determineLocalSourceIP() const {
|
||||
int size = coords.size() + hostnames.size();
|
||||
int index = 0;
|
||||
loop {
|
||||
while (true) {
|
||||
try {
|
||||
using namespace boost::asio;
|
||||
|
||||
|
||||
@@ -22,6 +22,7 @@
|
||||
#define FDBCLIENT_CLIENTWORKERINTERFACE_H
|
||||
#pragma once
|
||||
|
||||
#include "fdbrpc/FlowGrpc.h"
|
||||
#include "fdbclient/FDBTypes.h"
|
||||
#include "fdbrpc/FailureMonitor.h"
|
||||
#include "fdbclient/Status.h"
|
||||
@@ -35,17 +36,24 @@ struct ClientWorkerInterface {
|
||||
RequestStream<struct RebootRequest> reboot;
|
||||
RequestStream<struct ProfilerRequest> profiler;
|
||||
RequestStream<struct SetFailureInjection> setFailureInjection;
|
||||
Optional<NetworkAddress> grpcAddress;
|
||||
|
||||
bool operator==(ClientWorkerInterface const& r) const { return id() == r.id(); }
|
||||
bool operator!=(ClientWorkerInterface const& r) const { return id() != r.id(); }
|
||||
UID id() const { return reboot.getEndpoint().token; }
|
||||
NetworkAddress address() const { return reboot.getEndpoint().getPrimaryAddress(); }
|
||||
|
||||
void initEndpoints() { reboot.getEndpoint(TaskPriority::ReadSocket); }
|
||||
void initEndpoints() {
|
||||
reboot.getEndpoint(TaskPriority::ReadSocket);
|
||||
auto grpcInstance = FlowGrpc::instance();
|
||||
if (grpcInstance) {
|
||||
grpcAddress = grpcInstance->server()->getAddress();
|
||||
}
|
||||
}
|
||||
|
||||
template <class Ar>
|
||||
void serialize(Ar& ar) {
|
||||
serializer(ar, reboot, profiler, setFailureInjection);
|
||||
serializer(ar, reboot, profiler, setFailureInjection, grpcAddress);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -28,10 +28,11 @@
|
||||
// ---- FlowGrpc definitions ------
|
||||
|
||||
void FlowGrpc::init(TLSConfig* tls_config, const std::optional<NetworkAddress>& server_addr) {
|
||||
TraceEvent("FlowGrpcInit");
|
||||
FlowGrpc* fg = new FlowGrpc();
|
||||
g_network->setGlobal(INetwork::enGrpcState, (flowGlobalType)fg);
|
||||
|
||||
if (!tls_config) {
|
||||
if (!tls_config || tls_config->isInsecure()) {
|
||||
fg->credentials_ = std::make_shared<GrpcInsecureCredentialProvider>();
|
||||
} else {
|
||||
fg->credentials_ = std::make_shared<GrpcTlsCredentialProvider>(tls_config);
|
||||
@@ -81,7 +82,7 @@ Future<Void> GrpcServer::run() {
|
||||
Future<Void> GrpcServer::runInternal() {
|
||||
ASSERT(state_ == State::Stopped);
|
||||
ASSERT(server_ == nullptr);
|
||||
ASSERT(g_network->isOnMainThread());
|
||||
// ASSERT(g_network->isOnMainThread());
|
||||
|
||||
Future<Void> next = Void();
|
||||
loop {
|
||||
|
||||
@@ -114,7 +114,14 @@ public:
|
||||
~GrpcServer();
|
||||
|
||||
// Returns the singleton instance.
|
||||
static std::shared_ptr<GrpcServer> instance() { return FlowGrpc::instance()->server(); }
|
||||
static std::shared_ptr<GrpcServer> instance() {
|
||||
auto p = FlowGrpc::instance();
|
||||
if (p == nullptr) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
return p->server();
|
||||
}
|
||||
|
||||
// Returns the gRPC server address. Currently, we only listen on single port globally.
|
||||
NetworkAddress getAddress() const { return address_; }
|
||||
|
||||
@@ -407,16 +407,28 @@ struct ProcessData {
|
||||
LocalityData locality;
|
||||
ProcessClass processClass;
|
||||
NetworkAddress address;
|
||||
Optional<NetworkAddress> grpcAddress;
|
||||
|
||||
ProcessData() {}
|
||||
ProcessData(LocalityData locality, ProcessClass processClass, NetworkAddress address)
|
||||
: locality(locality), processClass(processClass), address(address) {}
|
||||
ProcessData(LocalityData locality,
|
||||
ProcessClass processClass,
|
||||
NetworkAddress address,
|
||||
Optional<NetworkAddress> grpcAddress)
|
||||
: locality(locality), processClass(processClass), address(address), grpcAddress(grpcAddress) {}
|
||||
|
||||
// To change this serialization, ProtocolVersion::WorkerListValue must be updated, and downgrades need to be
|
||||
// considered
|
||||
template <class Ar>
|
||||
void serialize(Ar& ar) {
|
||||
serializer(ar, locality, processClass, address);
|
||||
|
||||
if constexpr (!is_fb_function<Ar>) {
|
||||
if (ar.protocolVersion().hasGrpcEndpoint()) {
|
||||
serializer(ar, grpcAddress);
|
||||
}
|
||||
} else {
|
||||
serializer(ar, grpcAddress);
|
||||
}
|
||||
}
|
||||
|
||||
struct sort_by_address {
|
||||
|
||||
@@ -802,8 +802,9 @@ ACTOR Future<Void> workerAvailabilityWatch(WorkerInterface worker,
|
||||
state Future<Void> failed = (worker.address() == g_network->getLocalAddress())
|
||||
? Never()
|
||||
: waitFailureClient(worker.waitFailure, SERVER_KNOBS->WORKER_FAILURE_TIME);
|
||||
cluster->updateWorkerList.set(worker.locality.processId(),
|
||||
ProcessData(worker.locality, startingClass, worker.stableAddress()));
|
||||
cluster->updateWorkerList.set(
|
||||
worker.locality.processId(),
|
||||
ProcessData(worker.locality, startingClass, worker.stableAddress(), worker.grpcAddress()));
|
||||
// This switching avoids a race where the worker can be added to id_worker map after the workerAvailabilityWatch
|
||||
// fails for the worker.
|
||||
wait(delay(0));
|
||||
|
||||
@@ -1215,10 +1215,6 @@ struct CLIOptions {
|
||||
flushAndExit(FDB_EXIT_ERROR);
|
||||
}
|
||||
|
||||
for (auto& s : grpcAddressStrs) {
|
||||
fmt::printf("gRPC Endpoint: %s\n", s);
|
||||
}
|
||||
|
||||
if (role == ServerRole::ConsistencyCheck || role == ServerRole::ConsistencyCheckUrgent) {
|
||||
if (!publicAddressStrs.empty()) {
|
||||
fprintf(stderr, "ERROR: Public address cannot be specified for consistency check processes\n");
|
||||
@@ -2511,6 +2507,13 @@ int main(int argc, char* argv[]) {
|
||||
dataFolder = format("fdb/%d/", opts.publicAddresses.address.port); // SOMEDAY: Better default
|
||||
|
||||
std::vector<Future<Void>> actors(listenErrors.begin(), listenErrors.end());
|
||||
|
||||
#ifdef FLOW_GRPC_ENABLED
|
||||
if (opts.grpcAddressStrs.size() > 0) {
|
||||
FlowGrpc::init(&opts.tlsConfig, NetworkAddress::parse(opts.grpcAddressStrs[0]));
|
||||
actors.push_back(GrpcServer::instance()->run());
|
||||
}
|
||||
#endif
|
||||
actors.push_back(fdbd(opts.connectionFile,
|
||||
opts.localities,
|
||||
opts.processClass,
|
||||
@@ -2528,12 +2531,6 @@ int main(int argc, char* argv[]) {
|
||||
actors.push_back(histogramReport());
|
||||
actors.push_back(metricsReport());
|
||||
|
||||
#ifdef FLOW_GRPC_ENABLED
|
||||
if (opts.grpcAddressStrs.size() > 0) {
|
||||
FlowGrpc::init(&opts.tlsConfig, NetworkAddress::parse(opts.grpcAddressStrs[0]));
|
||||
actors.push_back(GrpcServer::instance()->run());
|
||||
}
|
||||
#endif
|
||||
f = stopAfter(waitForAll(actors));
|
||||
g_network->run();
|
||||
}
|
||||
|
||||
@@ -79,6 +79,7 @@ struct WorkerInterface {
|
||||
NetworkAddress stableAddress() const { return tLog.getEndpoint().getStableAddress(); }
|
||||
Optional<NetworkAddress> secondaryAddress() const { return tLog.getEndpoint().addresses.secondaryAddress; }
|
||||
NetworkAddressList addresses() const { return tLog.getEndpoint().addresses; }
|
||||
Optional<NetworkAddress> grpcAddress() const { return clientInterface.grpcAddress; }
|
||||
|
||||
WorkerInterface() {}
|
||||
WorkerInterface(const LocalityData& locality) : locality(locality) {}
|
||||
|
||||
@@ -40,6 +40,7 @@
|
||||
#include "flow/IRandom.h"
|
||||
#include "flow/Knobs.h"
|
||||
#include "flow/NetworkAddress.h"
|
||||
#include "fdbrpc/FlowGrpc.h"
|
||||
#include "flow/ObjectSerializer.h"
|
||||
#include "flow/Platform.h"
|
||||
#include "flow/ProtocolVersion.h"
|
||||
@@ -74,6 +75,8 @@
|
||||
#include "flow/serialize.h"
|
||||
#include "flow/ChaosMetrics.h"
|
||||
#include "fdbrpc/SimulatorProcessInfo.h"
|
||||
#include "fdbclient/ThreadSafeTransaction.h"
|
||||
#include "flow/ApiVersion.h"
|
||||
|
||||
#ifdef __linux__
|
||||
#include <fcntl.h>
|
||||
@@ -282,6 +285,15 @@ Future<Void> handleIOErrors(Future<Void> actor, IClosable* store, UID id, Future
|
||||
return handleIOErrors(actor, storeError, id, onClosed);
|
||||
}
|
||||
|
||||
Future<Void> deregisterGrpcService(const UID& id) {
|
||||
#ifdef FLOW_GRPC_ENABLED
|
||||
if (GrpcServer::instance() != nullptr) {
|
||||
return GrpcServer::instance()->deregisterRoleServices(id);
|
||||
}
|
||||
#endif
|
||||
return Void();
|
||||
}
|
||||
|
||||
ACTOR Future<Void> workerHandleErrors(FutureStream<ErrorInfo> errors) {
|
||||
loop choose {
|
||||
when(ErrorInfo _err = waitNext(errors)) {
|
||||
@@ -298,12 +310,18 @@ ACTOR Future<Void> workerHandleErrors(FutureStream<ErrorInfo> errors) {
|
||||
|
||||
endRole(err.role, err.id, "Error", ok, err.error);
|
||||
|
||||
state std::optional<Error> rethrow = std::nullopt;
|
||||
if (err.error.code() == error_code_please_reboot ||
|
||||
(err.role == Role::SHARED_TRANSACTION_LOG &&
|
||||
(err.error.code() == error_code_io_error || err.error.code() == error_code_io_timeout)) ||
|
||||
(SERVER_KNOBS->STORAGE_SERVER_REBOOT_ON_IO_TIMEOUT && err.role == Role::STORAGE_SERVER &&
|
||||
err.error.code() == error_code_io_timeout))
|
||||
throw err.error;
|
||||
err.error.code() == error_code_io_timeout)) {
|
||||
rethrow = err.error;
|
||||
}
|
||||
|
||||
if (rethrow != std::nullopt) {
|
||||
throw *rethrow;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -2119,6 +2137,20 @@ bool skipInitRspInSim(const UID workerInterfID, const bool allowDropInSim) {
|
||||
return skip;
|
||||
}
|
||||
|
||||
ACTOR Future<Void> registerWorkerGrpcServices(UID id, Reference<IClusterConnectionRecord> ccr) {
|
||||
if (GrpcServer::instance() == nullptr) {
|
||||
return Never();
|
||||
}
|
||||
|
||||
auto db = Database::createDatabase(ccr, ApiVersion::LATEST_VERSION);
|
||||
Reference<IDatabase> idb = wait(safeThreadFutureToFuture(ThreadSafeDatabase::createFromExistingDatabase(db)));
|
||||
|
||||
auto services = GrpcServer::ServiceList{};
|
||||
GrpcServer::instance()->registerRoleServices(UID(), services);
|
||||
TraceEvent("WorkerGrpcServerStart").detail("Address", GrpcServer::instance()->getAddress());
|
||||
return Never();
|
||||
}
|
||||
|
||||
ACTOR Future<Void> workerServer(Reference<IClusterConnectionRecord> connRecord,
|
||||
Reference<AsyncVar<Optional<ClusterControllerFullInterface>> const> ccInterface,
|
||||
LocalityData locality,
|
||||
@@ -2193,6 +2225,9 @@ ACTOR Future<Void> workerServer(Reference<IClusterConnectionRecord> connRecord,
|
||||
// cluster.
|
||||
state Reference<AsyncVar<bool>> enablePrimaryTxnSystemHealthCheck = makeReference<AsyncVar<bool>>(false);
|
||||
|
||||
wait(yield());
|
||||
state Future<Void> grpc = registerWorkerGrpcServices(interf.id(), connRecord);
|
||||
|
||||
if (FLOW_KNOBS->ENABLE_CHAOS_FEATURES) {
|
||||
TraceEvent(SevInfo, "ChaosFeaturesEnabled");
|
||||
chaosMetricsActor = chaosMetricsLogger();
|
||||
@@ -3226,6 +3261,7 @@ ACTOR Future<Void> workerServer(Reference<IClusterConnectionRecord> connRecord,
|
||||
TraceEvent(SevInfo, "WorkerShutdownComplete", interf.id());
|
||||
}
|
||||
|
||||
wait(deregisterGrpcService(interf.id()));
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -94,3 +94,4 @@ set(FDB_PV_BLOB_GRANULE_FILE_LOGICAL_SIZE "0x0FDB00B072000000LL")
|
||||
set(FDB_PV_BLOB_RANGE_CHANGE_LOG "0x0FDB00B072000000LL")
|
||||
set(FDB_PV_GC_TXN_GENERATIONS "0x0FDB00B073000000LL")
|
||||
set(FDB_PV_MUTATION_CHECKSUM "0x0FDB00B074000000LL")
|
||||
set(FDB_PV_GRPC_ENDPOINT "0x0FDB00B080000000LL")
|
||||
|
||||
@@ -212,6 +212,8 @@ public:
|
||||
|
||||
bool getDisablePlainTextConnection() const;
|
||||
|
||||
bool isInsecure() const { return tlsCertPath.empty() && tlsCertBytes.empty(); }
|
||||
|
||||
#ifndef PRIVATE_EXCEPT_FOR_TLSCONFIG_CPP
|
||||
private:
|
||||
#endif
|
||||
|
||||
@@ -3,6 +3,7 @@ set -euo pipefail
|
||||
|
||||
SERVER_COUNT=1
|
||||
readonly PORT_PREFIX="${PORT_PREFIX:-1500}"
|
||||
readonly GPORT_PREFIX="${GPORT_PREFIX:-2500}"
|
||||
|
||||
# default cluster settings, override with options
|
||||
STATELESS_COUNT=4
|
||||
@@ -51,12 +52,13 @@ function start_servers {
|
||||
exit 1
|
||||
fi
|
||||
local port=$(( PORT_PREFIX + SERVER_COUNT ))
|
||||
local gport=$(( GPORT_PREFIX + SERVER_COUNT ))
|
||||
local zone="${4}-Z-$(( j % REPLICATION_COUNT ))"
|
||||
# There may be more than one knob in KNOBS separated by spaces. Bash will quote
|
||||
# it all with single-quotes because the string has a space in it. We don't want
|
||||
# that behavior; fdbserver won't be able to parse the quoted knobs. To get around
|
||||
# this native bash behavior, we printf the KNOBS string.
|
||||
${2} "${FDB}" -p auto:"${port}" ${KNOBS:+$(printf "%s" "${KNOBS}")} -c "${3}" \
|
||||
${2} "${FDB}" -p auto:"${port}" -p 127.0.0.1:"${gport}":grpc ${KNOBS:+$(printf "%s" "${KNOBS}")} -c "${3}" \
|
||||
-d "${datadir}" -L "${logdir}" -C "${CLUSTER}" \
|
||||
--datacenter_id="${4}" \
|
||||
--locality-zoneid "${zone}" \
|
||||
|
||||
Reference in New Issue
Block a user