Make gRPC server and lifecycle part of worker interface

Also addresses some issues around TLS and initialization. The service
is started only if gRPC addresses are provided during startup.
This commit is contained in:
Vishesh Yadav
2025-10-23 12:26:02 -07:00
parent ccd56bf7db
commit 72d2e96b24
13 changed files with 96 additions and 23 deletions

5
.gitignore vendored
View File

@@ -57,6 +57,7 @@ bindings/java/.classstamp*
bindings/java/classes*/
bindings/java/javadoc*/
packaging/docker/website
loopback-cluster/
# Testing and logging
packaging/msi/*.log
@@ -109,3 +110,7 @@ temp/
/.ccls-cache
.clangd/
.stignore
# AI assistants
*.aider*
*.fdq

View File

@@ -37,7 +37,7 @@
IPAddress ClusterConnectionString::determineLocalSourceIP() const {
int size = coords.size() + hostnames.size();
int index = 0;
loop {
while (true) {
try {
using namespace boost::asio;

View File

@@ -22,6 +22,7 @@
#define FDBCLIENT_CLIENTWORKERINTERFACE_H
#pragma once
#include "fdbrpc/FlowGrpc.h"
#include "fdbclient/FDBTypes.h"
#include "fdbrpc/FailureMonitor.h"
#include "fdbclient/Status.h"
@@ -35,17 +36,24 @@ struct ClientWorkerInterface {
RequestStream<struct RebootRequest> reboot;
RequestStream<struct ProfilerRequest> profiler;
RequestStream<struct SetFailureInjection> setFailureInjection;
Optional<NetworkAddress> grpcAddress;
bool operator==(ClientWorkerInterface const& r) const { return id() == r.id(); }
bool operator!=(ClientWorkerInterface const& r) const { return id() != r.id(); }
UID id() const { return reboot.getEndpoint().token; }
NetworkAddress address() const { return reboot.getEndpoint().getPrimaryAddress(); }
void initEndpoints() { reboot.getEndpoint(TaskPriority::ReadSocket); }
void initEndpoints() {
reboot.getEndpoint(TaskPriority::ReadSocket);
auto grpcInstance = FlowGrpc::instance();
if (grpcInstance) {
grpcAddress = grpcInstance->server()->getAddress();
}
}
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, reboot, profiler, setFailureInjection);
serializer(ar, reboot, profiler, setFailureInjection, grpcAddress);
}
};

View File

@@ -28,10 +28,11 @@
// ---- FlowGrpc definitions ------
void FlowGrpc::init(TLSConfig* tls_config, const std::optional<NetworkAddress>& server_addr) {
TraceEvent("FlowGrpcInit");
FlowGrpc* fg = new FlowGrpc();
g_network->setGlobal(INetwork::enGrpcState, (flowGlobalType)fg);
if (!tls_config) {
if (!tls_config || tls_config->isInsecure()) {
fg->credentials_ = std::make_shared<GrpcInsecureCredentialProvider>();
} else {
fg->credentials_ = std::make_shared<GrpcTlsCredentialProvider>(tls_config);
@@ -81,7 +82,7 @@ Future<Void> GrpcServer::run() {
Future<Void> GrpcServer::runInternal() {
ASSERT(state_ == State::Stopped);
ASSERT(server_ == nullptr);
ASSERT(g_network->isOnMainThread());
// ASSERT(g_network->isOnMainThread());
Future<Void> next = Void();
loop {

View File

@@ -114,7 +114,14 @@ public:
~GrpcServer();
// Returns the singleton instance.
static std::shared_ptr<GrpcServer> instance() { return FlowGrpc::instance()->server(); }
static std::shared_ptr<GrpcServer> instance() {
auto p = FlowGrpc::instance();
if (p == nullptr) {
return nullptr;
}
return p->server();
}
// Returns the gRPC server address. Currently, we only listen on single port globally.
NetworkAddress getAddress() const { return address_; }

View File

@@ -407,16 +407,28 @@ struct ProcessData {
LocalityData locality;
ProcessClass processClass;
NetworkAddress address;
Optional<NetworkAddress> grpcAddress;
ProcessData() {}
ProcessData(LocalityData locality, ProcessClass processClass, NetworkAddress address)
: locality(locality), processClass(processClass), address(address) {}
ProcessData(LocalityData locality,
ProcessClass processClass,
NetworkAddress address,
Optional<NetworkAddress> grpcAddress)
: locality(locality), processClass(processClass), address(address), grpcAddress(grpcAddress) {}
// To change this serialization, ProtocolVersion::WorkerListValue must be updated, and downgrades need to be
// considered
template <class Ar>
void serialize(Ar& ar) {
serializer(ar, locality, processClass, address);
if constexpr (!is_fb_function<Ar>) {
if (ar.protocolVersion().hasGrpcEndpoint()) {
serializer(ar, grpcAddress);
}
} else {
serializer(ar, grpcAddress);
}
}
struct sort_by_address {

View File

@@ -802,8 +802,9 @@ ACTOR Future<Void> workerAvailabilityWatch(WorkerInterface worker,
state Future<Void> failed = (worker.address() == g_network->getLocalAddress())
? Never()
: waitFailureClient(worker.waitFailure, SERVER_KNOBS->WORKER_FAILURE_TIME);
cluster->updateWorkerList.set(worker.locality.processId(),
ProcessData(worker.locality, startingClass, worker.stableAddress()));
cluster->updateWorkerList.set(
worker.locality.processId(),
ProcessData(worker.locality, startingClass, worker.stableAddress(), worker.grpcAddress()));
// This switching avoids a race where the worker can be added to id_worker map after the workerAvailabilityWatch
// fails for the worker.
wait(delay(0));

View File

@@ -1215,10 +1215,6 @@ struct CLIOptions {
flushAndExit(FDB_EXIT_ERROR);
}
for (auto& s : grpcAddressStrs) {
fmt::printf("gRPC Endpoint: %s\n", s);
}
if (role == ServerRole::ConsistencyCheck || role == ServerRole::ConsistencyCheckUrgent) {
if (!publicAddressStrs.empty()) {
fprintf(stderr, "ERROR: Public address cannot be specified for consistency check processes\n");
@@ -2511,6 +2507,13 @@ int main(int argc, char* argv[]) {
dataFolder = format("fdb/%d/", opts.publicAddresses.address.port); // SOMEDAY: Better default
std::vector<Future<Void>> actors(listenErrors.begin(), listenErrors.end());
#ifdef FLOW_GRPC_ENABLED
if (opts.grpcAddressStrs.size() > 0) {
FlowGrpc::init(&opts.tlsConfig, NetworkAddress::parse(opts.grpcAddressStrs[0]));
actors.push_back(GrpcServer::instance()->run());
}
#endif
actors.push_back(fdbd(opts.connectionFile,
opts.localities,
opts.processClass,
@@ -2528,12 +2531,6 @@ int main(int argc, char* argv[]) {
actors.push_back(histogramReport());
actors.push_back(metricsReport());
#ifdef FLOW_GRPC_ENABLED
if (opts.grpcAddressStrs.size() > 0) {
FlowGrpc::init(&opts.tlsConfig, NetworkAddress::parse(opts.grpcAddressStrs[0]));
actors.push_back(GrpcServer::instance()->run());
}
#endif
f = stopAfter(waitForAll(actors));
g_network->run();
}

View File

@@ -79,6 +79,7 @@ struct WorkerInterface {
NetworkAddress stableAddress() const { return tLog.getEndpoint().getStableAddress(); }
Optional<NetworkAddress> secondaryAddress() const { return tLog.getEndpoint().addresses.secondaryAddress; }
NetworkAddressList addresses() const { return tLog.getEndpoint().addresses; }
Optional<NetworkAddress> grpcAddress() const { return clientInterface.grpcAddress; }
WorkerInterface() {}
WorkerInterface(const LocalityData& locality) : locality(locality) {}

View File

@@ -40,6 +40,7 @@
#include "flow/IRandom.h"
#include "flow/Knobs.h"
#include "flow/NetworkAddress.h"
#include "fdbrpc/FlowGrpc.h"
#include "flow/ObjectSerializer.h"
#include "flow/Platform.h"
#include "flow/ProtocolVersion.h"
@@ -74,6 +75,8 @@
#include "flow/serialize.h"
#include "flow/ChaosMetrics.h"
#include "fdbrpc/SimulatorProcessInfo.h"
#include "fdbclient/ThreadSafeTransaction.h"
#include "flow/ApiVersion.h"
#ifdef __linux__
#include <fcntl.h>
@@ -282,6 +285,15 @@ Future<Void> handleIOErrors(Future<Void> actor, IClosable* store, UID id, Future
return handleIOErrors(actor, storeError, id, onClosed);
}
Future<Void> deregisterGrpcService(const UID& id) {
#ifdef FLOW_GRPC_ENABLED
if (GrpcServer::instance() != nullptr) {
return GrpcServer::instance()->deregisterRoleServices(id);
}
#endif
return Void();
}
ACTOR Future<Void> workerHandleErrors(FutureStream<ErrorInfo> errors) {
loop choose {
when(ErrorInfo _err = waitNext(errors)) {
@@ -298,12 +310,18 @@ ACTOR Future<Void> workerHandleErrors(FutureStream<ErrorInfo> errors) {
endRole(err.role, err.id, "Error", ok, err.error);
state std::optional<Error> rethrow = std::nullopt;
if (err.error.code() == error_code_please_reboot ||
(err.role == Role::SHARED_TRANSACTION_LOG &&
(err.error.code() == error_code_io_error || err.error.code() == error_code_io_timeout)) ||
(SERVER_KNOBS->STORAGE_SERVER_REBOOT_ON_IO_TIMEOUT && err.role == Role::STORAGE_SERVER &&
err.error.code() == error_code_io_timeout))
throw err.error;
err.error.code() == error_code_io_timeout)) {
rethrow = err.error;
}
if (rethrow != std::nullopt) {
throw *rethrow;
}
}
}
}
@@ -2119,6 +2137,20 @@ bool skipInitRspInSim(const UID workerInterfID, const bool allowDropInSim) {
return skip;
}
ACTOR Future<Void> registerWorkerGrpcServices(UID id, Reference<IClusterConnectionRecord> ccr) {
if (GrpcServer::instance() == nullptr) {
return Never();
}
auto db = Database::createDatabase(ccr, ApiVersion::LATEST_VERSION);
Reference<IDatabase> idb = wait(safeThreadFutureToFuture(ThreadSafeDatabase::createFromExistingDatabase(db)));
auto services = GrpcServer::ServiceList{};
GrpcServer::instance()->registerRoleServices(UID(), services);
TraceEvent("WorkerGrpcServerStart").detail("Address", GrpcServer::instance()->getAddress());
return Never();
}
ACTOR Future<Void> workerServer(Reference<IClusterConnectionRecord> connRecord,
Reference<AsyncVar<Optional<ClusterControllerFullInterface>> const> ccInterface,
LocalityData locality,
@@ -2193,6 +2225,9 @@ ACTOR Future<Void> workerServer(Reference<IClusterConnectionRecord> connRecord,
// cluster.
state Reference<AsyncVar<bool>> enablePrimaryTxnSystemHealthCheck = makeReference<AsyncVar<bool>>(false);
wait(yield());
state Future<Void> grpc = registerWorkerGrpcServices(interf.id(), connRecord);
if (FLOW_KNOBS->ENABLE_CHAOS_FEATURES) {
TraceEvent(SevInfo, "ChaosFeaturesEnabled");
chaosMetricsActor = chaosMetricsLogger();
@@ -3226,6 +3261,7 @@ ACTOR Future<Void> workerServer(Reference<IClusterConnectionRecord> connRecord,
TraceEvent(SevInfo, "WorkerShutdownComplete", interf.id());
}
wait(deregisterGrpcService(interf.id()));
throw e;
}
}

View File

@@ -94,3 +94,4 @@ set(FDB_PV_BLOB_GRANULE_FILE_LOGICAL_SIZE "0x0FDB00B072000000LL")
set(FDB_PV_BLOB_RANGE_CHANGE_LOG "0x0FDB00B072000000LL")
set(FDB_PV_GC_TXN_GENERATIONS "0x0FDB00B073000000LL")
set(FDB_PV_MUTATION_CHECKSUM "0x0FDB00B074000000LL")
set(FDB_PV_GRPC_ENDPOINT "0x0FDB00B080000000LL")

View File

@@ -212,6 +212,8 @@ public:
bool getDisablePlainTextConnection() const;
bool isInsecure() const { return tlsCertPath.empty() && tlsCertBytes.empty(); }
#ifndef PRIVATE_EXCEPT_FOR_TLSCONFIG_CPP
private:
#endif

View File

@@ -3,6 +3,7 @@ set -euo pipefail
SERVER_COUNT=1
readonly PORT_PREFIX="${PORT_PREFIX:-1500}"
readonly GPORT_PREFIX="${GPORT_PREFIX:-2500}"
# default cluster settings, override with options
STATELESS_COUNT=4
@@ -51,12 +52,13 @@ function start_servers {
exit 1
fi
local port=$(( PORT_PREFIX + SERVER_COUNT ))
local gport=$(( GPORT_PREFIX + SERVER_COUNT ))
local zone="${4}-Z-$(( j % REPLICATION_COUNT ))"
# There may be more than one knob in KNOBS separated by spaces. Bash will quote
# it all with single-quotes because the string has a space in it. We don't want
# that behavior; fdbserver won't be able to parse the quoted knobs. To get around
# this native bash behavior, we printf the KNOBS string.
${2} "${FDB}" -p auto:"${port}" ${KNOBS:+$(printf "%s" "${KNOBS}")} -c "${3}" \
${2} "${FDB}" -p auto:"${port}" -p 127.0.0.1:"${gport}":grpc ${KNOBS:+$(printf "%s" "${KNOBS}")} -c "${3}" \
-d "${datadir}" -L "${logdir}" -C "${CLUSTER}" \
--datacenter_id="${4}" \
--locality-zoneid "${zone}" \