Files
apple-foundationdb/fdbctl/ExcludeCommand.cpp
2026-01-12 11:38:36 -08:00

422 lines
14 KiB
C++

/*
* ExcludeCommand.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2025 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifdef FLOW_GRPC_ENABLED
#include "fdbctl/ControlCommands.h"
#include "fdbclient/IClientApi.h"
#include "fdbclient/ManagementAPI.actor.h"
#include "fdbclient/CoordinationInterface.h"
#include "fdbclient/Schemas.h"
#include "fdbclient/StorageServerInterface.h"
#include "flow/genericactors.actor.h"
#include "fdbctl/ControlCommands.h"
#include "fmt/format.h"
#include <boost/algorithm/string.hpp>
#include <map>
namespace fdbctl {
namespace utils {
// Returns the entries currently stored under special_keys::excludedServersSpecialKeyRange, each reported as
// the key with the range's begin prefix stripped.
//
// Every error except actor_cancelled (which is rethrown) is traced and passed to tr->onError(); once that
// completes the read is attempted again.
Future<std::vector<std::string>> getExcludedServers(Reference<IDatabase> db) {
	Reference<ITransaction> tr = db->createTransaction();
	loop {
		// co_await may not appear inside a catch handler, so a caught error is stashed here and
		// handled once the try/catch has been left.
		Error pendingError;
		try {
			ThreadFuture<RangeResult> rangeFuture =
			    tr->getRange(special_keys::excludedServersSpecialKeyRange, CLIENT_KNOBS->TOO_MANY);
			RangeResult kvs = co_await safeThreadFutureToFuture(rangeFuture);
			// The whole range is expected to arrive in this single read.
			ASSERT(!kvs.more && kvs.size() < CLIENT_KNOBS->TOO_MANY);
			std::vector<std::string> excluded;
			for (const auto& kv : kvs) {
				excluded.push_back(
				    kv.key.removePrefix(special_keys::excludedServersSpecialKeyRange.begin).toString());
			}
			co_return excluded;
		} catch (Error& e) {
			if (e.code() == error_code_actor_cancelled) {
				throw e;
			}
			TraceEvent(SevWarn, "GetExcludedServersError").error(e);
			pendingError = e;
		}
		co_await safeThreadFutureToFuture(tr->onError(pendingError));
	}
}
// Returns the entries currently stored under special_keys::failedServersSpecialKeyRange, each reported as
// the key with the range's begin prefix stripped.
//
// Every error except actor_cancelled (which is rethrown) is traced as "GetFailedServersError" and passed to
// tr->onError(); once that completes the read is attempted again.
Future<std::vector<std::string>> getFailedServers(Reference<IDatabase> db) {
	Reference<ITransaction> tr = db->createTransaction();
	loop {
		// co_await may not appear inside a catch handler, so a caught error is stashed here and
		// handled once the try/catch has been left.
		Error err;
		try {
			ThreadFuture<RangeResult> resultFuture =
			    tr->getRange(special_keys::failedServersSpecialKeyRange, CLIENT_KNOBS->TOO_MANY);
			RangeResult r = co_await safeThreadFutureToFuture(resultFuture);
			// The whole range is expected to arrive in this single read.
			ASSERT(!r.more && r.size() < CLIENT_KNOBS->TOO_MANY);
			std::vector<std::string> failedServers;
			for (const auto& kv : r) {
				failedServers.push_back(
				    kv.key.removePrefix(special_keys::failedServersSpecialKeyRange.begin).toString());
			}
			co_return failedServers;
		} catch (Error& e) {
			if (e.code() == error_code_actor_cancelled) {
				throw e;
			}
			// Use an event name distinct from getExcludedServers() so the two failures can be told apart.
			TraceEvent(SevWarn, "GetFailedServersError").error(e);
			err = e;
		}
		co_await safeThreadFutureToFuture(tr->onError(err));
	}
}
// Returns the entries currently stored under special_keys::excludedLocalitySpecialKeyRange, each reported
// as the key with the range's begin prefix stripped.
//
// Every error except actor_cancelled (which is rethrown) is traced and passed to tr->onError(); once that
// completes the read is attempted again.
Future<std::vector<std::string>> getExcludedLocalities(Reference<IDatabase> db) {
	Reference<ITransaction> tr = db->createTransaction();
	loop {
		// co_await may not appear inside a catch handler, so a caught error is stashed here and
		// handled once the try/catch has been left.
		Error pendingError;
		try {
			ThreadFuture<RangeResult> rangeFuture =
			    tr->getRange(special_keys::excludedLocalitySpecialKeyRange, CLIENT_KNOBS->TOO_MANY);
			RangeResult kvs = co_await safeThreadFutureToFuture(rangeFuture);
			// The whole range is expected to arrive in this single read.
			ASSERT(!kvs.more && kvs.size() < CLIENT_KNOBS->TOO_MANY);
			std::vector<std::string> localities;
			for (const auto& kv : kvs) {
				localities.push_back(
				    kv.key.removePrefix(special_keys::excludedLocalitySpecialKeyRange.begin).toString());
			}
			co_return localities;
		} catch (Error& e) {
			if (e.code() == error_code_actor_cancelled) {
				throw e;
			}
			TraceEvent(SevWarn, "GetExcludedLocalitiesError").error(e);
			pendingError = e;
		}
		co_await safeThreadFutureToFuture(tr->onError(pendingError));
	}
}
// Reads fdbctl::special_keys::exclusionInProgressSpecialKeyRange through the caller's transaction and
// parses every key (minus the range's begin prefix) as a NetworkAddress.
//
// No retry happens here: an error from the read propagates to the caller, which owns `tr`.
Future<std::set<NetworkAddress>> getInProgressExclusion(Reference<ITransaction> tr) {
	ThreadFuture<RangeResult> rangeFuture =
	    tr->getRange(fdbctl::special_keys::exclusionInProgressSpecialKeyRange, CLIENT_KNOBS->TOO_MANY);
	RangeResult kvs = co_await safeThreadFutureToFuture(rangeFuture);
	// The whole range is expected to arrive in this single read.
	ASSERT(!kvs.more && kvs.size() < CLIENT_KNOBS->TOO_MANY);
	std::set<NetworkAddress> addresses;
	for (const auto& kv : kvs) {
		const std::string text =
		    kv.key.removePrefix(fdbctl::special_keys::exclusionInProgressSpecialKeyRange.begin).toString();
		addresses.insert(NetworkAddress::parse(text));
	}
	co_return addresses;
}
// Returns the entries currently stored under special_keys::failedLocalitySpecialKeyRange, each reported as
// the key with the range's begin prefix stripped.
//
// Every error except actor_cancelled (which is rethrown) is traced and passed to tr->onError(); once that
// completes the read is attempted again.
Future<std::vector<std::string>> getFailedLocalities(Reference<IDatabase> db) {
	Reference<ITransaction> tr = db->createTransaction();
	loop {
		// co_await may not appear inside a catch handler, so a caught error is stashed here and
		// handled once the try/catch has been left.
		Error pendingError;
		try {
			ThreadFuture<RangeResult> rangeFuture =
			    tr->getRange(special_keys::failedLocalitySpecialKeyRange, CLIENT_KNOBS->TOO_MANY);
			RangeResult kvs = co_await safeThreadFutureToFuture(rangeFuture);
			// The whole range is expected to arrive in this single read.
			ASSERT(!kvs.more && kvs.size() < CLIENT_KNOBS->TOO_MANY);
			std::vector<std::string> failedLocalities;
			for (const auto& kv : kvs) {
				failedLocalities.push_back(
				    kv.key.removePrefix(special_keys::failedLocalitySpecialKeyRange.begin).toString());
			}
			co_return failedLocalities;
		} catch (Error& e) {
			if (e.code() == error_code_actor_cancelled) {
				throw e;
			}
			TraceEvent(SevWarn, "GetFailedLocalitiesError").error(e);
			pendingError = e;
		}
		co_await safeThreadFutureToFuture(tr->onError(pendingError));
	}
}
} // namespace utils
// Writes one special-key entry per address in `servers` and per string in `localities`, then commits them
// in a single transaction.
//
//   failed - when true the entries go under the failed* special key ranges, otherwise under the excluded*
//            ranges.
//   force  - when true the matching force-option special key is also set (separately for servers and for
//            localities, and only if the respective collection is non-empty).
//
// Error handling:
//   - actor_cancelled is rethrown untouched.
//   - special_keys_api_failure is not retried: the last line of the message returned by
//     utils::getSpecialKeysFailureErrorMessage() is traced and the error is rethrown to the caller.
//   - every other error is traced and passed to tr->onError(); once that completes the write is attempted
//     again.
// Each failure is traced exactly once.
Future<Void> excludeServersAndLocalities(Reference<IDatabase> db,
                                         std::vector<AddressExclusion> servers,
                                         std::unordered_set<std::string> localities,
                                         bool failed,
                                         bool force) {
	Reference<ITransaction> tr = db->createTransaction();
	loop {
		// co_await may not appear inside a catch handler, so a caught error is stashed here and
		// handled once the try/catch has been left.
		Error err;
		tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES);
		tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
		try {
			if (force && !servers.empty()) {
				tr->set(failed ? special_keys::failedForceOptionSpecialKey
				               : special_keys::excludedForceOptionSpecialKey,
				        ValueRef());
			}
			for (const auto& s : servers) {
				Key addr = failed ? special_keys::failedServersSpecialKeyRange.begin.withSuffix(s.toString())
				                  : special_keys::excludedServersSpecialKeyRange.begin.withSuffix(s.toString());
				tr->set(addr, ValueRef());
			}
			if (force && !localities.empty()) {
				tr->set(failed ? special_keys::failedLocalityForceOptionSpecialKey
				               : special_keys::excludedLocalityForceOptionSpecialKey,
				        ValueRef());
			}
			for (const auto& l : localities) {
				Key addr = failed ? special_keys::failedLocalitySpecialKeyRange.begin.withSuffix(l)
				                  : special_keys::excludedLocalitySpecialKeyRange.begin.withSuffix(l);
				tr->set(addr, ValueRef());
			}
			co_await safeThreadFutureToFuture(tr->commit());
			co_return;
		} catch (Error& e) {
			if (e.code() == error_code_actor_cancelled) {
				throw e;
			}
			// Tracing is deferred to the code below so that each failure is reported only once.
			err = e;
		}
		if (err.code() == error_code_special_keys_api_failure) {
			std::string errorMsgStr = co_await utils::getSpecialKeysFailureErrorMessage(tr);
			// The message normally ends with '\n'; skip that terminator when looking for the newline
			// that precedes the last line. Messages shorter than two characters are used as-is so the
			// search offset cannot wrap around.
			std::string lastLine = errorMsgStr;
			if (errorMsgStr.size() >= 2) {
				const auto pos = errorMsgStr.find_last_of('\n', errorMsgStr.size() - 2);
				if (pos != std::string::npos) {
					lastLine = errorMsgStr.substr(pos + 1);
				}
			}
			TraceEvent(SevWarn, "ExcludeServersAndLocalitiesError").error(err).detail("Message", lastLine);
			throw err;
		}
		TraceEvent(SevWarn, "ExcludeServersAndLocalitiesError").error(err);
		co_await safeThreadFutureToFuture(tr->onError(err));
	}
}
// Reports which servers still have an exclusion in progress, optionally polling until none of the
// requested `exclusions` is among them.
//
// Returns an empty set when no exclusion is in progress at all, or when none of the in-progress addresses
// is covered by an entry of `exclusions`. Otherwise:
//   - waitForAllExcluded == false: returns the full in-progress set as read.
//   - waitForAllExcluded == true:  waits roughly one second (jittered) and checks again.
//
// actor_cancelled is rethrown; every other error is traced and passed to tr->onError() before the check
// is attempted again.
Future<std::set<NetworkAddress>> checkForExcludingServers(Reference<IDatabase> db,
                                                          std::set<AddressExclusion> exclusions,
                                                          bool waitForAllExcluded) {
	std::set<NetworkAddress> inProgressExclusion;
	Reference<ITransaction> tr = db->createTransaction();
	loop {
		// co_await may not appear inside a catch handler, so a caught error is stashed here and
		// handled once the try/catch has been left.
		Error err;
		inProgressExclusion.clear();
		try {
			tr->setOption(FDBTransactionOptions::SPECIAL_KEY_SPACE_ENABLE_WRITES);
			tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
			inProgressExclusion = co_await utils::getInProgressExclusion(tr);
			if (inProgressExclusion.empty()) {
				co_return inProgressExclusion;
			}
			// Check if all of the specified exclusions are done.
			bool allExcluded = true;
			for (const auto& inProgressAddr : inProgressExclusion) {
				for (const auto& exclusion : exclusions) {
					// We found an exclusion that is still in progress
					if (exclusion.excludes(inProgressAddr)) {
						allExcluded = false;
						break;
					}
				}
				if (!allExcluded) {
					break;
				}
			}
			if (allExcluded) {
				inProgressExclusion.clear();
				co_return inProgressExclusion;
			}
			if (!waitForAllExcluded) {
				break;
			}
			co_await delayJittered(1.0); // SOMEDAY: watches!
			// Nothing failed: go straight to the next poll. Falling through would hand a
			// default-constructed Error to tr->onError() below.
			continue;
		} catch (Error& e) {
			if (e.code() == error_code_actor_cancelled) {
				throw;
			}
			TraceEvent(SevWarn, "CheckForExcludingServersError").error(e);
			err = e;
		}
		// Only reached after a caught error.
		co_await safeThreadFutureToFuture(tr->onError(err));
	}
	co_return inProgressExclusion;
}
// gRPC handler for an exclude request.
//
// Request fields read: force(), no_wait(), failed(), localities(), processes().
// Reply fields written: data_movement_complete, excluded_addresses, absent_addresses.
//
// Steps:
//   1. Fetch worker process data and storage server interfaces.
//   2. Validate the requested localities and addresses; localities are additionally resolved to addresses
//      via getAddressesByLocality()/getServerAddressesByLocality().
//   3. Write the exclusions with excludeServersAndLocalities().
//   4. Call checkForExcludingServers(); it polls until done unless no_wait() is set.
//   5. Fill the reply, listing as "absent" every excluded address that has no matching worker.
//
// Returns:
//   - INVALID_ARGUMENT for a malformed locality or address, or when neither is supplied.
//   - INTERNAL when writing the exclusions fails or any other Error escapes.
//   - OK otherwise.
Future<grpc::Status> exclude(Reference<IDatabase> db, const ExcludeRequest* req, ExcludeReply* rep) {
	try {
		std::vector<ProcessData> workers;
		std::map<std::string, StorageServerInterface> server_interfaces;
		// Both lookups are started before either one is awaited.
		Future<bool> future_workers = utils::getWorkersProcessData(db, &workers);
		Future<Void> future_server_interfaces = utils::getStorageServerInterfaces(db, &server_interfaces);
		// NOTE(review): the bool produced by getWorkersProcessData() is discarded here — confirm that
		// a false result does not need to be reported to the client.
		co_await success(future_workers);
		co_await future_server_interfaces;

		const bool force = req->force();
		const bool waitForAllExcluded = !req->no_wait();
		const bool markFailed = req->failed();

		std::set<AddressExclusion> exclusionSet;
		std::unordered_set<std::string> exclusionLocalities;
		std::vector<std::string> noMatchLocalities;
		const std::string localityPrefix = LocalityData::ExcludeLocalityPrefix.toString();
		for (auto& loc : req->localities()) {
			// The locality comes straight from the request, so a malformed value is a client error
			// rather than an internal invariant violation.
			if (!loc.starts_with(localityPrefix) || loc.find(':') == std::string::npos) {
				co_return grpc::Status(
				    grpc::StatusCode::INVALID_ARGUMENT,
				    fmt::format("Invalid locality '{}': it must start with '{}' and contain ':'.",
				                loc,
				                localityPrefix));
			}
			exclusionLocalities.insert(loc);
			auto localityAddresses = getAddressesByLocality(workers, loc);
			auto localityServerAddresses = getServerAddressesByLocality(server_interfaces, loc);
			if (localityAddresses.empty() && localityServerAddresses.empty()) {
				noMatchLocalities.push_back(loc);
			}
			exclusionSet.insert(localityAddresses.begin(), localityAddresses.end());
			exclusionSet.insert(localityServerAddresses.begin(), localityServerAddresses.end());
		}

		std::vector<AddressExclusion> exclusionAddresses;
		for (auto& addr : req->processes()) {
			auto a = AddressExclusion::parse(addr);
			if (!a.isValid()) {
				co_return grpc::Status(grpc::StatusCode::INVALID_ARGUMENT,
				                       fmt::format("Invalid network endpoint address: '{}'", addr));
			}
			exclusionSet.insert(a);
			exclusionAddresses.push_back(a);
		}

		// The validation if a locality or address has no match is done below and will result in a warning. If we abort
		// here the provided locality and/or address will not be excluded.
		if (exclusionAddresses.empty() && exclusionLocalities.empty()) {
			co_return grpc::Status(grpc::StatusCode::INVALID_ARGUMENT,
			                       "At least one valid network endpoint address or a locality must be provided.");
		}

		try {
			co_await excludeServersAndLocalities(db, exclusionAddresses, exclusionLocalities, markFailed, force);
		} catch (Error& e) {
			co_return grpc::Status(grpc::StatusCode::INTERNAL, fmt::format("error: {}", e.name()));
		}

		std::set<NetworkAddress> notExcludedServers =
		    co_await checkForExcludingServers(db, exclusionSet, waitForAllExcluded);

		// Determine if data movement is complete
		rep->set_data_movement_complete(notExcludedServers.empty());

		// Populate the list of excluded addresses
		for (const auto& addr : exclusionSet) {
			rep->add_excluded_addresses(addr.toString());
		}

		// Build a map of worker addresses for quick lookup
		std::map<IPAddress, std::set<uint16_t>> workerPorts;
		for (const auto& worker : workers) {
			workerPorts[worker.address.ip].insert(worker.address.port);
		}

		// Report every excluded address that has no corresponding worker. An exclusion whose port is
		// not greater than zero only needs some worker on that IP to count as present.
		for (const auto& addr : exclusionSet) {
			const auto worker = workerPorts.find(addr.ip);
			const bool absent =
			    worker == workerPorts.end() || (addr.port > 0 && worker->second.count(addr.port) == 0);
			if (absent) {
				rep->add_absent_addresses(addr.toString());
			}
		}

		// Report warnings for localities with no matches
		if (!noMatchLocalities.empty()) {
			TraceEvent(SevWarn, "ExcludeLocalitiesNoMatch")
			    .detail("Count", noMatchLocalities.size())
			    .detail("Localities", boost::algorithm::join(noMatchLocalities, ", "));
		}
		co_return grpc::Status::OK;
	} catch (const Error& e) {
		co_return grpc::Status(grpc::StatusCode::INTERNAL,
		                       fmt::format("Error getting worker information: {}", e.what()));
	}
}
// gRPC handler that reports the current exclusion state.
//
// Fills `rep` with the excluded addresses, excluded localities, failed addresses and failed localities
// (fetched one after another through the fdbctl::utils getters), followed by the addresses whose exclusion
// is still in progress.
//
// `req` is not read. Returns OK on success, or INTERNAL carrying the error text when any step throws an
// Error. The in-progress read uses a fresh transaction and is attempted only once.
Future<grpc::Status> excludeStatus(Reference<IDatabase> db, const ExcludeStatusRequest* req, ExcludeStatusReply* rep) {
	try {
		std::vector<std::string> excludedAddresses = co_await fdbctl::utils::getExcludedServers(db);
		std::vector<std::string> excludedLocalities = co_await fdbctl::utils::getExcludedLocalities(db);
		std::vector<std::string> failedAddresses = co_await fdbctl::utils::getFailedServers(db);
		std::vector<std::string> failedLocalities = co_await fdbctl::utils::getFailedLocalities(db);
		for (const auto& address : excludedAddresses) {
			rep->add_excluded_addresses(address);
		}
		for (const auto& locality : excludedLocalities) {
			rep->add_excluded_localities(locality);
		}
		for (const auto& address : failedAddresses) {
			rep->add_failed_addresses(address);
		}
		for (const auto& locality : failedLocalities) {
			rep->add_failed_localities(locality);
		}
		Reference<ITransaction> tr = db->createTransaction();
		std::set<NetworkAddress> inProgressExclusion = co_await utils::getInProgressExclusion(tr);
		for (const auto& addr : inProgressExclusion) {
			rep->add_in_progress_excludes(addr.toString());
		}
		co_return grpc::Status::OK;
	} catch (const Error& e) {
		// This handler never fetches worker information, so name the operation that actually failed.
		co_return grpc::Status(grpc::StatusCode::INTERNAL,
		                       fmt::format("Error getting exclusion status: {}", e.what()));
	}
}
} // namespace fdbctl
#endif