/* * NativeAPI.actor.cpp * * This source file is part of the FoundationDB open source project * * Copyright 2013-2024 Apple Inc. and the FoundationDB project authors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include "fdbclient/NativeAPI.actor.h" #include #include #include #include #include #include #include #include #include #include #include #include "boost/algorithm/string.hpp" #include "fdbclient/Knobs.h" #include "flow/CodeProbe.h" #include "fmt/format.h" #include "fdbclient/FDBOptions.g.h" #include "fdbclient/FDBTypes.h" #include "fdbrpc/FailureMonitor.h" #include "fdbrpc/MultiInterface.h" #include "fdbrpc/TenantInfo.h" #include "fdbclient/ActorLineageProfiler.h" #include "fdbclient/AnnotateActor.h" #include "fdbclient/Atomic.h" #include "fdbclient/BlobGranuleCommon.h" #include "fdbclient/BlobGranuleRequest.actor.h" #include "fdbclient/ClusterInterface.h" #include "fdbclient/ClusterConnectionFile.h" #include "fdbclient/ClusterConnectionMemoryRecord.h" #include "fdbclient/CoordinationInterface.h" #include "fdbclient/CommitTransaction.h" #include "fdbclient/DatabaseContext.h" #include "fdbclient/GlobalConfig.actor.h" #include "fdbclient/IKnobCollection.h" #include "fdbclient/JsonBuilder.h" #include "fdbclient/KeyBackedTypes.actor.h" #include "fdbclient/KeyRangeMap.h" #include "fdbclient/ManagementAPI.actor.h" #include "fdbclient/NameLineage.h" #include "fdbclient/CommitProxyInterface.h" #include "fdbclient/MonitorLeader.h" #include "fdbclient/MutationList.h" #include "fdbclient/ReadYourWrites.h" #include "fdbclient/ParallelStream.actor.h" #include "fdbclient/SpecialKeySpace.actor.h" #include "fdbclient/StorageServerInterface.h" #include "fdbclient/SystemData.h" #include "fdbclient/Tenant.h" #include "fdbclient/TenantSpecialKeys.actor.h" #include "fdbclient/TransactionLineage.h" #include "fdbclient/versions.h" #include "fdbrpc/WellKnownEndpoints.h" #include "fdbrpc/LoadBalance.h" #include "fdbrpc/Net2FileSystem.h" #include "fdbrpc/simulator.h" #include "fdbrpc/sim_validation.h" #include "flow/Arena.h" #include "flow/ActorCollection.h" #include "flow/DeterministicRandom.h" #include "flow/Error.h" #include "flow/FastRef.h" #include "flow/IRandom.h" #include "flow/Trace.h" #include "flow/ProtocolVersion.h" #include "flow/flow.h" #include "flow/genericactors.actor.h" #include "flow/Knobs.h" #include "flow/Platform.h" #include "flow/SystemMonitor.h" #include "flow/TLSConfig.actor.h" #include "fdbclient/Tracing.h" #include "flow/UnitTest.h" #include "flow/network.h" #include "flow/serialize.h" #ifdef ADDRESS_SANITIZER #include #endif #ifdef WIN32 #define WIN32_LEAN_AND_MEAN #include #undef min #undef max #else #include #endif #include "flow/actorcompiler.h" // This must be the last #include. 
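// Note on the ACTOR functions throughout this file: flow's actor compiler rewrites
// each ACTOR function into a class implementing a Future-based coroutine, which is
// why "flow/actorcompiler.h" above must be the last #include. A minimal sketch of
// the pattern (illustrative only; `exampleWait` is a hypothetical name, not part of
// this file):
//
//   ACTOR Future<Void> exampleWait(double seconds) {
//       wait(delay(seconds)); // suspends this actor without blocking the event loop
//       return Void();
//   }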
template class RequestStream; template struct NetNotifiedQueue; extern const char* getSourceVersion(); namespace { TransactionLineageCollector transactionLineageCollector; NameLineageCollector nameLineageCollector; template Future loadBalance( DatabaseContext* ctx, const Reference alternatives, RequestStream Interface::*channel, const Request& request = Request(), TaskPriority taskID = TaskPriority::DefaultPromiseEndpoint, AtMostOnce atMostOnce = AtMostOnce::False, // if true, throws request_maybe_delivered() instead of retrying automatically QueueModel* model = nullptr, bool compareReplicas = false, int requiredReplicas = 0) { if (alternatives->hasCaches) { return loadBalance( alternatives->locations(), channel, request, taskID, atMostOnce, model, compareReplicas, requiredReplicas); } return fmap( [ctx](auto const& res) { if (res.cached) { ctx->updateCache.trigger(); } return res; }, loadBalance( alternatives->locations(), channel, request, taskID, atMostOnce, model, compareReplicas, requiredReplicas)); } } // namespace FDB_BOOLEAN_PARAM(TransactionRecordLogInfo); // Whether or not a request should include the tenant name FDB_BOOLEAN_PARAM(UseTenant); // Whether a blob granule request is a request for the mapping to read, or a request to get granule boundaries FDB_BOOLEAN_PARAM(JustGranules); NetworkOptions networkOptions; TLSConfig tlsConfig(TLSEndpointType::CLIENT); // The default values, TRACE_DEFAULT_ROLL_SIZE and TRACE_DEFAULT_MAX_LOGS_SIZE are located in Trace.h. NetworkOptions::NetworkOptions() : traceRollSize(TRACE_DEFAULT_ROLL_SIZE), traceMaxLogsSize(TRACE_DEFAULT_MAX_LOGS_SIZE), traceLogGroup("default"), traceFormat("xml"), traceClockSource("now"), traceInitializeOnSetup(false), supportedVersions(new ReferencedObject>>()), runLoopProfilingEnabled(false), primaryClient(true) {} static const Key CLIENT_LATENCY_INFO_PREFIX = "client_latency/"_sr; static const Key CLIENT_LATENCY_INFO_CTR_PREFIX = "client_latency_counter/"_sr; void DatabaseContext::addTssMapping(StorageServerInterface const& ssi, StorageServerInterface const& tssi) { auto result = tssMapping.find(ssi.id()); // Update tss endpoint mapping if ss isn't in mapping, or the interface it mapped to changed if (result == tssMapping.end() || result->second.getValue.getEndpoint().token.first() != tssi.getValue.getEndpoint().token.first()) { Reference metrics; if (result == tssMapping.end()) { // new TSS pairing metrics = makeReference(); tssMetrics[tssi.id()] = metrics; tssMapping[ssi.id()] = tssi; } else { ASSERT(result->second.id() == tssi.id()); metrics = tssMetrics[tssi.id()]; result->second = tssi; } // data requests duplicated for load and data comparison queueModel.updateTssEndpoint(ssi.getValue.getEndpoint().token.first(), TSSEndpointData(tssi.id(), tssi.getValue.getEndpoint(), metrics)); queueModel.updateTssEndpoint(ssi.getKey.getEndpoint().token.first(), TSSEndpointData(tssi.id(), tssi.getKey.getEndpoint(), metrics)); queueModel.updateTssEndpoint(ssi.getKeyValues.getEndpoint().token.first(), TSSEndpointData(tssi.id(), tssi.getKeyValues.getEndpoint(), metrics)); queueModel.updateTssEndpoint(ssi.getMappedKeyValues.getEndpoint().token.first(), TSSEndpointData(tssi.id(), tssi.getMappedKeyValues.getEndpoint(), metrics)); queueModel.updateTssEndpoint(ssi.getKeyValuesStream.getEndpoint().token.first(), TSSEndpointData(tssi.id(), tssi.getKeyValuesStream.getEndpoint(), metrics)); queueModel.updateTssEndpoint(ssi.changeFeedStream.getEndpoint().token.first(), TSSEndpointData(tssi.id(), tssi.changeFeedStream.getEndpoint(), 
metrics)); // non-data requests duplicated for load queueModel.updateTssEndpoint(ssi.watchValue.getEndpoint().token.first(), TSSEndpointData(tssi.id(), tssi.watchValue.getEndpoint(), metrics)); queueModel.updateTssEndpoint(ssi.splitMetrics.getEndpoint().token.first(), TSSEndpointData(tssi.id(), tssi.splitMetrics.getEndpoint(), metrics)); queueModel.updateTssEndpoint(ssi.getReadHotRanges.getEndpoint().token.first(), TSSEndpointData(tssi.id(), tssi.getReadHotRanges.getEndpoint(), metrics)); queueModel.updateTssEndpoint(ssi.getRangeSplitPoints.getEndpoint().token.first(), TSSEndpointData(tssi.id(), tssi.getRangeSplitPoints.getEndpoint(), metrics)); queueModel.updateTssEndpoint(ssi.overlappingChangeFeeds.getEndpoint().token.first(), TSSEndpointData(tssi.id(), tssi.overlappingChangeFeeds.getEndpoint(), metrics)); // duplicated to ensure feed data cleanup queueModel.updateTssEndpoint(ssi.changeFeedPop.getEndpoint().token.first(), TSSEndpointData(tssi.id(), tssi.changeFeedPop.getEndpoint(), metrics)); } } void DatabaseContext::removeTssMapping(StorageServerInterface const& ssi) { auto result = tssMapping.find(ssi.id()); if (result != tssMapping.end()) { tssMetrics.erase(ssi.id()); tssMapping.erase(result); queueModel.removeTssEndpoint(ssi.getValue.getEndpoint().token.first()); queueModel.removeTssEndpoint(ssi.getKey.getEndpoint().token.first()); queueModel.removeTssEndpoint(ssi.getKeyValues.getEndpoint().token.first()); queueModel.removeTssEndpoint(ssi.getMappedKeyValues.getEndpoint().token.first()); queueModel.removeTssEndpoint(ssi.getKeyValuesStream.getEndpoint().token.first()); queueModel.removeTssEndpoint(ssi.watchValue.getEndpoint().token.first()); queueModel.removeTssEndpoint(ssi.splitMetrics.getEndpoint().token.first()); queueModel.removeTssEndpoint(ssi.getReadHotRanges.getEndpoint().token.first()); queueModel.removeTssEndpoint(ssi.getRangeSplitPoints.getEndpoint().token.first()); } } void DatabaseContext::addSSIdTagMapping(const UID& uid, const Tag& tag) { ssidTagMapping[uid] = tag; } void DatabaseContext::getLatestCommitVersionForSSID(const UID& ssid, Tag& tag, Version& commitVersion) { // initialization tag = invalidTag; commitVersion = invalidVersion; auto iter = ssidTagMapping.find(ssid); if (iter != ssidTagMapping.end()) { tag = iter->second; if (ssVersionVectorCache.hasVersion(tag)) { commitVersion = ssVersionVectorCache.getVersion(tag); } } } void DatabaseContext::getLatestCommitVersion(const StorageServerInterface& ssi, Version readVersion, VersionVector& latestCommitVersion) { latestCommitVersion.clear(); if (ssVersionVectorCache.getMaxVersion() == invalidVersion) { return; } // Error checking (based on the assumption that the read version was not obtained // from the client's grv cache). if (readVersion > ssVersionVectorCache.getMaxVersion()) { TraceEvent(SevError, "ReadVersionExceedsVersionVectorMax") .detail("ReadVersion", readVersion) .detail("VersionVector", ssVersionVectorCache.toString()); if (g_network->isSimulated()) { ASSERT(false); } else { return; // Do not return a stale commit version in production. 
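// Past this guard, the read version is consistent with the cached version vector:
// below we look up this storage server's tag and its latest known commit version,
// and attach that version only when it is strictly older than the read version.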
} } Tag tag = invalidTag; Version commitVersion = invalidVersion; getLatestCommitVersionForSSID(ssi.id(), tag, commitVersion); if (tag != invalidTag && commitVersion != invalidVersion && commitVersion < readVersion) { latestCommitVersion.setVersion(tag, commitVersion); } } void DatabaseContext::getLatestCommitVersions(const Reference& locationInfo, Reference info, VersionVector& latestCommitVersions) { latestCommitVersions.clear(); if (info->readOptions.present() && info->readOptions.get().debugID.present()) { g_traceBatch.addEvent( "TransactionDebug", info->readOptions.get().debugID.get().first(), "NativeAPI.getLatestCommitVersions"); } if (!info->readVersionObtainedFromGrvProxy) { return; } if (ssVersionVectorCache.getMaxVersion() == invalidVersion) { return; } if (info->readVersion() > ssVersionVectorCache.getMaxVersion()) { if (!CLIENT_KNOBS->FORCE_GRV_CACHE_OFF && !info->options.skipGrvCache && info->options.useGrvCache) { return; } else { TraceEvent(SevError, "GetLatestCommitVersions") .detail("ReadVersion", info->readVersion()) .detail("VersionVector", ssVersionVectorCache.toString()); ASSERT(false); } } std::map> versionMap; // order the versions to be returned for (int i = 0; i < locationInfo->locations()->size(); i++) { Tag tag = invalidTag; Version commitVersion = invalidVersion; // latest commit version getLatestCommitVersionForSSID(locationInfo->locations()->getId(i), tag, commitVersion); bool updatedVersionMap = false; if (tag != invalidTag && commitVersion != invalidVersion && commitVersion < info->readVersion()) { updatedVersionMap = true; versionMap[commitVersion].insert(tag); } // Do not log if commitVersion >= readVersion. if (!updatedVersionMap && commitVersion == invalidVersion) { TraceEvent(SevDebug, "CommitVersionNotFoundForSS") .detail("InSSIDMap", tag != invalidTag ? 1 : 0) .detail("Tag", tag) .detail("CommitVersion", commitVersion) .detail("ReadVersion", info->readVersion()) .detail("VersionVector", ssVersionVectorCache.toString()) .setMaxEventLength(11000) .setMaxFieldLength(10000); ++transactionCommitVersionNotFoundForSS; } } // insert the commit versions in the version vector. for (auto& iter : versionMap) { latestCommitVersions.setVersion(iter.second, iter.first); } } void updateCachedReadVersionShared(double t, Version v, DatabaseSharedState* p) { MutexHolder mutex(p->mutexLock); if (v >= p->grvCacheSpace.cachedReadVersion) { //TraceEvent(SevDebug, "CacheReadVersionUpdate") // .detail("Version", v) // .detail("CurTime", t) // .detail("LastVersion", p->grvCacheSpace.cachedReadVersion) // .detail("LastTime", p->grvCacheSpace.lastGrvTime); p->grvCacheSpace.cachedReadVersion = v; if (t > p->grvCacheSpace.lastGrvTime) { p->grvCacheSpace.lastGrvTime = t; } } } void DatabaseContext::updateCachedReadVersion(double t, Version v) { if (sharedStatePtr) { return updateCachedReadVersionShared(t, v, sharedStatePtr); } if (v >= cachedReadVersion) { //TraceEvent(SevDebug, "CachedReadVersionUpdate") // .detail("Version", v) // .detail("GrvStartTime", t) // .detail("LastVersion", cachedReadVersion) // .detail("LastTime", lastGrvTime); cachedReadVersion = v; // Since the time is based on the start of the request, it's possible that we // get a newer version with an older time. // (Request started earlier, but was latest to reach the proxy) // Only update time when strictly increasing (?) 
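// Worked example: request A starts at t=1.0 and request B at t=1.2. If A is the
// last to reach a GRV proxy, it can return v=200 after B already returned v=100;
// processing A's reply then advances cachedReadVersion (200 >= 100) but leaves
// lastGrvTime at 1.2, keeping both cached values conservative.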
if (t > lastGrvTime) { lastGrvTime = t; } } } Version DatabaseContext::getCachedReadVersion() { if (sharedStatePtr) { MutexHolder mutex(sharedStatePtr->mutexLock); return sharedStatePtr->grvCacheSpace.cachedReadVersion; } return cachedReadVersion; } double DatabaseContext::getLastGrvTime() { if (sharedStatePtr) { MutexHolder mutex(sharedStatePtr->mutexLock); return sharedStatePtr->grvCacheSpace.lastGrvTime; } return lastGrvTime; } Reference StorageServerInfo::getInterface(DatabaseContext* cx, StorageServerInterface const& ssi, LocalityData const& locality) { auto it = cx->server_interf.find(ssi.id()); if (it != cx->server_interf.end()) { if (it->second->interf.getValue.getEndpoint().token != ssi.getValue.getEndpoint().token) { if (it->second->interf.locality == ssi.locality) { // FIXME: load balance holds pointers to individual members of the interface, and this assignment will // swap out the object they are // pointing to. This is technically correct, but is very unnatural. We may want to refactor load // balance to take an AsyncVar> so that it is notified when the interface // changes. it->second->interf = ssi; } else { it->second->notifyContextDestroyed(); Reference loc(new StorageServerInfo(cx, ssi, locality)); cx->server_interf[ssi.id()] = loc.getPtr(); return loc; } } return Reference::addRef(it->second); } Reference loc(new StorageServerInfo(cx, ssi, locality)); cx->server_interf[ssi.id()] = loc.getPtr(); return loc; } void StorageServerInfo::notifyContextDestroyed() { cx = nullptr; } StorageServerInfo::~StorageServerInfo() { if (cx) { auto it = cx->server_interf.find(interf.id()); if (it != cx->server_interf.end()) cx->server_interf.erase(it); cx = nullptr; } } std::string printable(const VectorRef& val) { std::string s; for (int i = 0; i < val.size(); i++) s = s + printable(val[i].key) + format(":%d ", val[i].value.size()); return s; } std::string printable(const KeyValueRef& val) { return printable(val.key) + format(":%d ", val.value.size()); } std::string printable(const VectorRef& val) { std::string s; for (int i = 0; i < val.size(); i++) s = s + printable(val[i]) + " "; return s; } std::string printable(const StringRef& val) { return val.printable(); } std::string printable(const std::string& str) { return StringRef(str).printable(); } std::string printable(const KeyRangeRef& range) { return printable(range.begin) + " - " + printable(range.end); } std::string printable(const VectorRef& val) { std::string s; for (int i = 0; i < val.size(); i++) s = s + printable(val[i]) + " "; return s; } int unhex(char c) { if (c >= '0' && c <= '9') return c - '0'; if (c >= 'a' && c <= 'f') return c - 'a' + 10; if (c >= 'A' && c <= 'F') return c - 'A' + 10; UNREACHABLE(); } std::string unprintable(std::string const& val) { std::string s; for (int i = 0; i < val.size(); i++) { char c = val[i]; if (c == '\\') { if (++i == val.size()) ASSERT(false); if (val[i] == '\\') { s += '\\'; } else if (val[i] == 'x') { if (i + 2 >= val.size()) ASSERT(false); s += char((unhex(val[i + 1]) << 4) + unhex(val[i + 2])); i += 2; } else ASSERT(false); } else s += c; } return s; } void DatabaseContext::validateVersion(Version version) const { // Version could be 0 if the INITIALIZE_NEW_DATABASE option is set. In that case, it is illegal to perform any // reads. We throw client_invalid_operation because the caller didn't directly set the version, so the // version_invalid error might be confusing. 
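// Both rejected cases below are versions at which no read can ever be valid: 0
// means no read version was set, and on a switchable database anything below
// minAcceptableReadVersion would silently span the cluster switch.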
if (version == 0) { throw client_invalid_operation(); } if (switchable && version < minAcceptableReadVersion) { CODE_PROBE(true, "Attempted to read a version lower than any this client has seen from the current cluster"); throw transaction_too_old(); } ASSERT(version > 0 || version == latestVersion); } void validateOptionValuePresent(Optional value) { if (!value.present()) { throw invalid_option_value(); } } void validateOptionValueNotPresent(Optional value) { if (value.present() && value.get().size() > 0) { throw invalid_option_value(); } } void dumpMutations(const MutationListRef& mutations) { for (auto m = mutations.begin(); m; ++m) { switch (m->type) { case MutationRef::SetValue: printf(" '%s' := '%s'\n", printable(m->param1).c_str(), printable(m->param2).c_str()); break; case MutationRef::AddValue: printf(" '%s' += '%s'", printable(m->param1).c_str(), printable(m->param2).c_str()); break; case MutationRef::ClearRange: printf(" Clear ['%s','%s')\n", printable(m->param1).c_str(), printable(m->param2).c_str()); break; default: printf(" Unknown mutation %d('%s','%s')\n", m->type, printable(m->param1).c_str(), printable(m->param2).c_str()); break; } } } template <> void addref(DatabaseContext* ptr) { ptr->addref(); } template <> void delref(DatabaseContext* ptr) { ptr->delref(); } void traceTSSErrors(const char* name, UID tssId, const std::unordered_map& errorsByCode) { TraceEvent ev(name, tssId); for (auto& it : errorsByCode) { ev.detail("E" + std::to_string(it.first), it.second); } } /* For each request type, this will produce Count {SS,TSS}{Mean,P50,P90,P99} Example: GetValueLatencySSMean */ void traceSSOrTSSPercentiles(TraceEvent& ev, const std::string name, DDSketch& sample) { ev.detail(name + "Mean", sample.mean()); // don't log the larger percentiles unless we actually have enough samples to log the accurate percentile instead of // the largest sample in this window if (sample.getPopulationSize() >= 3) { ev.detail(name + "P50", sample.median()); } if (sample.getPopulationSize() >= 10) { ev.detail(name + "P90", sample.percentile(0.90)); } if (sample.getPopulationSize() >= 100) { ev.detail(name + "P99", sample.percentile(0.99)); } } void traceTSSPercentiles(TraceEvent& ev, const std::string name, DDSketch& ssSample, DDSketch& tssSample) { ASSERT(ssSample.getPopulationSize() == tssSample.getPopulationSize()); ev.detail(name + "Count", ssSample.getPopulationSize()); if (ssSample.getPopulationSize() > 0) { traceSSOrTSSPercentiles(ev, name + "SS", ssSample); traceSSOrTSSPercentiles(ev, name + "TSS", tssSample); } } ACTOR Future tssLogger(DatabaseContext* cx) { state double lastLogged = 0; loop { wait(delay(CLIENT_KNOBS->TSS_METRICS_LOGGING_INTERVAL, TaskPriority::FlushTrace)); // Log each TSS pair separately for (const auto& it : cx->tssMetrics) { if (it.second->detailedMismatches.size()) { cx->tssMismatchStream.send( std::pair>(it.first, it.second->detailedMismatches)); } // Do error histograms as separate event if (it.second->ssErrorsByCode.size()) { traceTSSErrors("TSS_SSErrors", it.first, it.second->ssErrorsByCode); } if (it.second->tssErrorsByCode.size()) { traceTSSErrors("TSS_TSSErrors", it.first, it.second->tssErrorsByCode); } TraceEvent tssEv("TSSClientMetrics", cx->dbId); tssEv.detail("TSSID", it.first) .detail("Elapsed", (lastLogged == 0) ? 
0 : now() - lastLogged) .detail("Internal", cx->internal); it.second->cc.logToTraceEvent(tssEv); traceTSSPercentiles(tssEv, "GetValueLatency", it.second->SSgetValueLatency, it.second->TSSgetValueLatency); traceTSSPercentiles( tssEv, "GetKeyValuesLatency", it.second->SSgetKeyValuesLatency, it.second->TSSgetKeyValuesLatency); traceTSSPercentiles(tssEv, "GetKeyLatency", it.second->SSgetKeyLatency, it.second->TSSgetKeyLatency); traceTSSPercentiles(tssEv, "GetMappedKeyValuesLatency", it.second->SSgetMappedKeyValuesLatency, it.second->TSSgetMappedKeyValuesLatency); it.second->clear(); } lastLogged = now(); } } ACTOR Future databaseLogger(DatabaseContext* cx) { state double lastLogged = 0; loop { wait(delay(CLIENT_KNOBS->SYSTEM_MONITOR_INTERVAL, TaskPriority::FlushTrace)); bool logTraces = !g_network->isSimulated() || BUGGIFY_WITH_PROB(0.01); if (logTraces) { TraceEvent ev("TransactionMetrics", cx->dbId); ev.detail("Elapsed", (lastLogged == 0) ? 0 : now() - lastLogged) .detail("Cluster", cx->getConnectionRecord() ? cx->getConnectionRecord()->getConnectionString().clusterKeyName().toString() : "") .detail("Internal", cx->internal); cx->cc.logToTraceEvent(ev); ev.detail("LocationCacheEntryCount", cx->locationCache.size()); ev.detail("MeanLatency", cx->latencies.mean()) .detail("MedianLatency", cx->latencies.median()) .detail("Latency90", cx->latencies.percentile(0.90)) .detail("Latency98", cx->latencies.percentile(0.98)) .detail("MaxLatency", cx->latencies.max()) .detail("MeanRowReadLatency", cx->readLatencies.mean()) .detail("MedianRowReadLatency", cx->readLatencies.median()) .detail("MaxRowReadLatency", cx->readLatencies.max()) .detail("MeanGRVLatency", cx->GRVLatencies.mean()) .detail("MedianGRVLatency", cx->GRVLatencies.median()) .detail("MaxGRVLatency", cx->GRVLatencies.max()) .detail("MeanCommitLatency", cx->commitLatencies.mean()) .detail("MedianCommitLatency", cx->commitLatencies.median()) .detail("MaxCommitLatency", cx->commitLatencies.max()) .detail("MeanMutationsPerCommit", cx->mutationsPerCommit.mean()) .detail("MedianMutationsPerCommit", cx->mutationsPerCommit.median()) .detail("MaxMutationsPerCommit", cx->mutationsPerCommit.max()) .detail("MeanBytesPerCommit", cx->bytesPerCommit.mean()) .detail("MedianBytesPerCommit", cx->bytesPerCommit.median()) .detail("MaxBytesPerCommit", cx->bytesPerCommit.max()) .detail("NumLocalityCacheEntries", cx->locationCache.size()); } if (cx->usedAnyChangeFeeds && logTraces) { TraceEvent feedEv("ChangeFeedClientMetrics", cx->dbId); feedEv.detail("Elapsed", (lastLogged == 0) ? 0 : now() - lastLogged) .detail("Cluster", cx->getConnectionRecord() ? cx->getConnectionRecord()->getConnectionString().clusterKeyName().toString() : "") .detail("Internal", cx->internal); cx->ccFeed.logToTraceEvent(feedEv); } if (cx->anyBGReads && logTraces) { TraceEvent bgReadEv("BlobGranuleReadMetrics", cx->dbId); bgReadEv.detail("Elapsed", (lastLogged == 0) ? 0 : now() - lastLogged) .detail("Cluster", cx->getConnectionRecord() ? 
cx->getConnectionRecord()->getConnectionString().clusterKeyName().toString() : "") .detail("Internal", cx->internal); // add counters cx->ccBG.logToTraceEvent(bgReadEv); // add latencies bgReadEv.detail("MeanBGLatency", cx->bgLatencies.mean()) .detail("MedianBGLatency", cx->bgLatencies.median()) .detail("MaxBGLatency", cx->bgLatencies.max()) .detail("MeanBGGranulesPerRequest", cx->bgGranulesPerRequest.mean()) .detail("MedianBGGranulesPerRequest", cx->bgGranulesPerRequest.median()) .detail("MaxBGGranulesPerRequest", cx->bgGranulesPerRequest.max()); } cx->latencies.clear(); cx->readLatencies.clear(); cx->GRVLatencies.clear(); cx->commitLatencies.clear(); cx->mutationsPerCommit.clear(); cx->bytesPerCommit.clear(); cx->bgLatencies.clear(); cx->bgGranulesPerRequest.clear(); lastLogged = now(); } } struct TrInfoChunk { ValueRef value; Key key; }; ACTOR static Future transactionInfoCommitActor(Transaction* tr, std::vector* chunks) { state const Key clientLatencyAtomicCtr = CLIENT_LATENCY_INFO_CTR_PREFIX.withPrefix(fdbClientInfoPrefixRange.begin); state int retryCount = 0; loop { try { tr->reset(); tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr->setOption(FDBTransactionOptions::LOCK_AWARE); state Future> vstamp = tr->getVersionstamp(); int64_t numCommitBytes = 0; for (auto& chunk : *chunks) { tr->atomicOp(chunk.key, chunk.value, MutationRef::SetVersionstampedKey); numCommitBytes += chunk.key.size() + chunk.value.size() - 4; // subtract number of bytes of key that denotes version stamp index } tr->atomicOp(clientLatencyAtomicCtr, StringRef((uint8_t*)&numCommitBytes, 8), MutationRef::AddValue); wait(tr->commit()); return Void(); } catch (Error& e) { retryCount++; if (retryCount == 10) throw; wait(tr->onError(e)); } } } ACTOR static Future delExcessClntTxnEntriesActor(Transaction* tr, int64_t clientTxInfoSizeLimit) { state const Key clientLatencyName = CLIENT_LATENCY_INFO_PREFIX.withPrefix(fdbClientInfoPrefixRange.begin); state const Key clientLatencyAtomicCtr = CLIENT_LATENCY_INFO_CTR_PREFIX.withPrefix(fdbClientInfoPrefixRange.begin); TraceEvent(SevInfo, "DelExcessClntTxnEntriesCalled").log(); loop { try { tr->reset(); tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr->setOption(FDBTransactionOptions::LOCK_AWARE); Optional ctrValue = wait(tr->get(KeyRef(clientLatencyAtomicCtr), Snapshot::True)); if (!ctrValue.present()) { TraceEvent(SevInfo, "NumClntTxnEntriesNotFound").log(); return Void(); } state int64_t txInfoSize = 0; ASSERT(ctrValue.get().size() == sizeof(int64_t)); memcpy(&txInfoSize, ctrValue.get().begin(), ctrValue.get().size()); if (txInfoSize < clientTxInfoSizeLimit) return Void(); int getRangeByteLimit = (txInfoSize - clientTxInfoSizeLimit) < CLIENT_KNOBS->TRANSACTION_SIZE_LIMIT ? 
(txInfoSize - clientTxInfoSizeLimit) : CLIENT_KNOBS->TRANSACTION_SIZE_LIMIT; GetRangeLimits limit(GetRangeLimits::ROW_LIMIT_UNLIMITED, getRangeByteLimit); RangeResult txEntries = wait(tr->getRange(KeyRangeRef(clientLatencyName, strinc(clientLatencyName)), limit)); state int64_t numBytesToDel = 0; KeyRef endKey; for (auto& kv : txEntries) { endKey = kv.key; numBytesToDel += kv.key.size() + kv.value.size(); if (txInfoSize - numBytesToDel <= clientTxInfoSizeLimit) break; } if (numBytesToDel) { tr->clear(KeyRangeRef(txEntries[0].key, strinc(endKey))); TraceEvent(SevInfo, "DeletingExcessCntTxnEntries").detail("BytesToBeDeleted", numBytesToDel); int64_t bytesDel = -numBytesToDel; tr->atomicOp(clientLatencyAtomicCtr, StringRef((uint8_t*)&bytesDel, 8), MutationRef::AddValue); wait(tr->commit()); } if (txInfoSize - numBytesToDel <= clientTxInfoSizeLimit) return Void(); } catch (Error& e) { wait(tr->onError(e)); } } } // Delref and addref self to give self a chance to get destroyed. ACTOR static Future refreshTransaction(DatabaseContext* self, Transaction* tr) { *tr = Transaction(); wait(delay(0)); // Give ourselves the chance to get cancelled if self was destroyed *tr = Transaction(Database(Reference::addRef(self))); return Void(); } // The reason for getting a pointer to DatabaseContext instead of a reference counted object is because reference // counting will increment reference count for DatabaseContext which holds the future of this actor. This creates a // cyclic reference and hence this actor and Database object will not be destroyed at all. ACTOR static Future clientStatusUpdateActor(DatabaseContext* cx) { state const std::string clientLatencyName = CLIENT_LATENCY_INFO_PREFIX.withPrefix(fdbClientInfoPrefixRange.begin).toString(); state Transaction tr; state std::vector commitQ; state int txBytes = 0; loop { // Need to make sure that we eventually destroy tr. We can't rely on getting cancelled to do this because of // the cyclic reference to self. wait(refreshTransaction(cx, &tr)); try { ASSERT(cx->clientStatusUpdater.outStatusQ.empty()); cx->clientStatusUpdater.inStatusQ.swap(cx->clientStatusUpdater.outStatusQ); // Split Transaction Info into chunks state std::vector trChunksQ; for (auto& entry : cx->clientStatusUpdater.outStatusQ) { auto& bw = entry.second; int64_t value_size_limit = BUGGIFY ? deterministicRandom()->randomInt(1e3, CLIENT_KNOBS->VALUE_SIZE_LIMIT) : CLIENT_KNOBS->VALUE_SIZE_LIMIT; int num_chunks = (bw.getLength() + value_size_limit - 1) / value_size_limit; std::string random_id = deterministicRandom()->randomAlphaNumeric(16); std::string user_provided_id = entry.first.size() ? entry.first + "/" : ""; for (int i = 0; i < num_chunks; i++) { TrInfoChunk chunk; BinaryWriter chunkBW(Unversioned()); chunkBW << bigEndian32(i + 1) << bigEndian32(num_chunks); chunk.key = KeyRef(clientLatencyName + std::string(10, '\x00') + "/" + random_id + "/" + chunkBW.toValue().toString() + "/" + user_provided_id + std::string(4, '\x00')); int32_t pos = littleEndian32(clientLatencyName.size()); memcpy(mutateString(chunk.key) + chunk.key.size() - sizeof(int32_t), &pos, sizeof(int32_t)); if (i == num_chunks - 1) { chunk.value = ValueRef(static_cast(bw.getData()) + (i * value_size_limit), bw.getLength() - (i * value_size_limit)); } else { chunk.value = ValueRef(static_cast(bw.getData()) + (i * value_size_limit), value_size_limit); } trChunksQ.push_back(std::move(chunk)); } } // Commit the chunks splitting into different transactions if needed state int64_t dataSizeLimit = BUGGIFY ? 
deterministicRandom()->randomInt(200e3, 1.5 * CLIENT_KNOBS->TRANSACTION_SIZE_LIMIT)
                                                : 0.8 * CLIENT_KNOBS->TRANSACTION_SIZE_LIMIT;
            state std::vector<TrInfoChunk>::iterator tracking_iter = trChunksQ.begin();
            ASSERT(commitQ.empty() && (txBytes == 0));
            loop {
                state std::vector<TrInfoChunk>::iterator iter = tracking_iter;
                txBytes = 0;
                commitQ.clear();
                try {
                    while (iter != trChunksQ.end()) {
                        if (iter->value.size() + iter->key.size() + txBytes > dataSizeLimit) {
                            wait(transactionInfoCommitActor(&tr, &commitQ));
                            tracking_iter = iter;
                            commitQ.clear();
                            txBytes = 0;
                        }
                        commitQ.push_back(*iter);
                        txBytes += iter->value.size() + iter->key.size();
                        ++iter;
                    }
                    if (!commitQ.empty()) {
                        wait(transactionInfoCommitActor(&tr, &commitQ));
                        commitQ.clear();
                        txBytes = 0;
                    }
                    break;
                } catch (Error& e) {
                    if (e.code() == error_code_transaction_too_large) {
                        dataSizeLimit /= 2;
                        ASSERT(dataSizeLimit >= CLIENT_KNOBS->VALUE_SIZE_LIMIT + CLIENT_KNOBS->KEY_SIZE_LIMIT);
                    } else {
                        TraceEvent(SevWarnAlways, "ClientTrInfoErrorCommit").error(e).detail("TxBytes", txBytes);
                        commitQ.clear();
                        txBytes = 0;
                        throw;
                    }
                }
            }
            cx->clientStatusUpdater.outStatusQ.clear();
            wait(cx->globalConfig->onInitialized());
            double sampleRate =
                cx->globalConfig->get(fdbClientInfoTxnSampleRate, std::numeric_limits<double>::infinity());
            double clientSamplingProbability =
                std::isinf(sampleRate) ? CLIENT_KNOBS->CSI_SAMPLING_PROBABILITY : sampleRate;
            int64_t sizeLimit = cx->globalConfig->get(fdbClientInfoTxnSizeLimit, -1);
            int64_t clientTxnInfoSizeLimit = sizeLimit == -1 ? CLIENT_KNOBS->CSI_SIZE_LIMIT : sizeLimit;
            if (!trChunksQ.empty() && deterministicRandom()->random01() < clientSamplingProbability)
                wait(delExcessClntTxnEntriesActor(&tr, clientTxnInfoSizeLimit));

            wait(delay(CLIENT_KNOBS->CSI_STATUS_DELAY));
        } catch (Error& e) {
            if (e.code() == error_code_actor_cancelled) {
                throw;
            }
            cx->clientStatusUpdater.outStatusQ.clear();
            TraceEvent(SevWarnAlways, "UnableToWriteClientStatus").error(e);
            wait(delay(10.0));
        }
    }
}

ACTOR Future<Void> assertFailure(GrvProxyInterface remote, Future<ErrorOr<GetReadVersionReply>> reply) {
    try {
        ErrorOr<GetReadVersionReply> res = wait(reply);
        if (!res.isError()) {
            TraceEvent(SevError, "GotStaleReadVersion")
                .detail("Remote", remote.getConsistentReadVersion.getEndpoint().addresses.address.toString())
                .detail("Provisional", remote.provisional)
                .detail("ReadVersion", res.get().version);
            ASSERT_WE_THINK(false);
        }
    } catch (Error& e) {
        if (e.code() == error_code_actor_cancelled) {
            throw;
        }
        // we want this to fail -- so getting here is good, we'll just ignore the error.
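// assertFailure() is the check used by attemptGRVFromOldProxies() below: after a
// recovery we deliberately request a read version from proxies of the previous
// generation and expect the request to fail; a successful reply would mean a stale
// proxy handed out a read version, i.e. a potential causal consistency violation.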
} return Void(); } Future attemptGRVFromOldProxies(std::vector oldProxies, std::vector newProxies) { auto debugID = nondeterministicRandom()->randomUniqueID(); g_traceBatch.addEvent("AttemptGRVFromOldProxyDebug", debugID.first(), "NativeAPI.attemptGRVFromOldProxies.Start"); Span span("NAPI:VerifyCausalReadRisky"_loc); std::vector> replies; replies.reserve(oldProxies.size()); GetReadVersionRequest req( span.context, 1, TransactionPriority::IMMEDIATE, GetReadVersionRequest::FLAG_CAUSAL_READ_RISKY); TraceEvent evt("AttemptGRVFromOldProxies"); evt.detail("NumOldProxies", oldProxies.size()).detail("NumNewProxies", newProxies.size()); auto traceProxies = [&](std::vector& proxies, std::string const& key) { for (int i = 0; i < proxies.size(); ++i) { auto k = key + std::to_string(i); evt.detail(k.c_str(), proxies[i].id()); } }; traceProxies(oldProxies, "OldProxy"s); traceProxies(newProxies, "NewProxy"s); evt.log(); for (auto& i : oldProxies) { req.reply = ReplyPromise(); replies.push_back(assertFailure(i, i.getConsistentReadVersion.tryGetReply(req))); } return waitForAll(replies); } ACTOR static Future monitorClientDBInfoChange(DatabaseContext* cx, Reference const> clientDBInfo, AsyncTrigger* proxiesChangeTrigger) { state std::vector curCommitProxies; state std::vector curGrvProxies; state ActorCollection actors(false); state Future clientDBInfoOnChange = clientDBInfo->onChange(); curCommitProxies = clientDBInfo->get().commitProxies; curGrvProxies = clientDBInfo->get().grvProxies; loop { choose { when(wait(clientDBInfoOnChange)) { clientDBInfoOnChange = clientDBInfo->onChange(); if (clientDBInfo->get().commitProxies != curCommitProxies || clientDBInfo->get().grvProxies != curGrvProxies) { // This condition is a bit complicated. Here we want to verify that we're unable to receive a read // version from a proxy of an old generation after a successful recovery. The conditions are: // 1. We only do this with a configured probability. // 2. If the old set of Grv proxies is empty, there's nothing to do // 3. If the new set of Grv proxies is empty, it means the recovery is not complete. So if an old // Grv proxy still gives out read versions, this would be correct behavior. // 4. If we see a provisional proxy, it means the recovery didn't complete yet, so the same as (3) // applies. 
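// The condition below encodes (1)-(4): random01() < verifyCausalReadsProp gates
// the configured probability (1), !curGrvProxies.empty() covers (2), and the
// checks on the new GRV proxy list and its provisional flag cover (3) and (4).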
if (deterministicRandom()->random01() < cx->verifyCausalReadsProp && !curGrvProxies.empty() && !clientDBInfo->get().grvProxies.empty() && !clientDBInfo->get().grvProxies[0].provisional) { actors.add(attemptGRVFromOldProxies(curGrvProxies, clientDBInfo->get().grvProxies)); } curCommitProxies = clientDBInfo->get().commitProxies; curGrvProxies = clientDBInfo->get().grvProxies; proxiesChangeTrigger->trigger(); } } when(wait(actors.getResult())) { UNSTOPPABLE_ASSERT(false); } } } } void updateLocationCacheWithCaches(DatabaseContext* self, const std::map& removed, const std::map& added) { // TODO: this needs to be more clever in the future auto ranges = self->locationCache.ranges(); for (auto iter = ranges.begin(); iter != ranges.end(); ++iter) { if (iter->value() && iter->value()->hasCaches) { auto& val = iter->value(); std::vector>> interfaces; interfaces.reserve(val->size() - removed.size() + added.size()); for (int i = 0; i < val->size(); ++i) { const auto& interf = (*val)[i]; if (removed.count(interf->interf.id()) == 0) { interfaces.emplace_back(interf); } } for (const auto& p : added) { interfaces.push_back(makeReference>(p.second)); } iter->value() = makeReference(interfaces, true); } } } Reference addCaches(const Reference& loc, const std::vector>>& other) { std::vector>> interfaces; interfaces.reserve(loc->size() + other.size()); for (int i = 0; i < loc->size(); ++i) { interfaces.emplace_back((*loc)[i]); } interfaces.insert(interfaces.end(), other.begin(), other.end()); return makeReference(interfaces, true); } ACTOR Future updateCachedRanges(DatabaseContext* self, std::map* cacheServers) { state Transaction tr; state Value trueValue = storageCacheValue(std::vector{ 0 }); state Value falseValue = storageCacheValue(std::vector{}); try { loop { // Need to make sure that we eventually destroy tr. We can't rely on getting cancelled to do this because of // the cyclic reference to self. tr = Transaction(); wait(delay(0)); // Give ourselves the chance to get cancelled if self was destroyed wait(brokenPromiseToNever(self->updateCache.onTrigger())); // brokenPromiseToNever because self might get // destroyed elsewhere while we're waiting here. 
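// brokenPromiseToNever() (from flow/genericactors.actor.h) converts the
// broken_promise error -- thrown if the trigger is destroyed together with
// `self` -- into a Future that never fires, so this actor simply parks at the
// wait above until it is cancelled rather than throwing.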
tr = Transaction(Database(Reference::addRef(self))); tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); tr.setOption(FDBTransactionOptions::READ_LOCK_AWARE); try { RangeResult range = wait(tr.getRange(storageCacheKeys, CLIENT_KNOBS->TOO_MANY)); ASSERT(!range.more); std::vector>> cacheInterfaces; cacheInterfaces.reserve(cacheServers->size()); for (const auto& p : *cacheServers) { cacheInterfaces.push_back(makeReference>(p.second)); } bool currCached = false; KeyRef begin, end; for (const auto& kv : range) { // These booleans have to flip consistently ASSERT(currCached == (kv.value == falseValue)); if (kv.value == trueValue) { begin = kv.key.substr(storageCacheKeys.begin.size()); currCached = true; } else { currCached = false; end = kv.key.substr(storageCacheKeys.begin.size()); KeyRangeRef cachedRange{ begin, end }; auto ranges = self->locationCache.containedRanges(cachedRange); KeyRef containedRangesBegin, containedRangesEnd, prevKey; if (!ranges.empty()) { containedRangesBegin = ranges.begin().range().begin; } for (auto iter = ranges.begin(); iter != ranges.end(); ++iter) { containedRangesEnd = iter->range().end; if (iter->value() && !iter->value()->hasCaches) { iter->value() = addCaches(iter->value(), cacheInterfaces); } } auto iter = self->locationCache.rangeContaining(begin); if (iter->value() && !iter->value()->hasCaches) { if (end >= iter->range().end) { Key endCopy = iter->range().end; // Copy because insertion invalidates iterator self->locationCache.insert(KeyRangeRef{ begin, endCopy }, addCaches(iter->value(), cacheInterfaces)); } else { self->locationCache.insert(KeyRangeRef{ begin, end }, addCaches(iter->value(), cacheInterfaces)); } } iter = self->locationCache.rangeContainingKeyBefore(end); if (iter->value() && !iter->value()->hasCaches) { Key beginCopy = iter->range().begin; // Copy because insertion invalidates iterator self->locationCache.insert(KeyRangeRef{ beginCopy, end }, addCaches(iter->value(), cacheInterfaces)); } } } wait(delay(2.0)); // we want to wait at least some small amount of time before // updating this list again } catch (Error& e) { wait(tr.onError(e)); } } } catch (Error& e) { TraceEvent(SevError, "UpdateCachedRangesFailed").error(e); throw; } } // The reason for getting a pointer to DatabaseContext instead of a reference counted object is because reference // counting will increment reference count for DatabaseContext which holds the future of this actor. This creates a // cyclic reference and hence this actor and Database object will not be destroyed at all. ACTOR Future monitorCacheList(DatabaseContext* self) { state Transaction tr; state std::map cacheServerMap; state Future updateRanges = updateCachedRanges(self, &cacheServerMap); state Backoff backoff; // if no caches are configured, we don't want to run this actor at all // so we just wait for the first trigger from a storage server wait(self->updateCache.onTrigger()); try { loop { // Need to make sure that we eventually destroy tr. We can't rely on getting cancelled to do this because of // the cyclic reference to self. 
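// refreshTransaction() (defined earlier in this file) is the standard pattern for
// actors that hold a raw DatabaseContext*; it makes the delay(0) a safe
// cancellation point before re-binding the transaction:
//
//   *tr = Transaction();   // drop the old Database reference first
//   wait(delay(0));        // if self was destroyed, we get cancelled here
//   *tr = Transaction(Database(Reference<DatabaseContext>::addRef(self)));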
wait(refreshTransaction(self, &tr)); try { RangeResult cacheList = wait(tr.getRange(storageCacheServerKeys, CLIENT_KNOBS->TOO_MANY)); ASSERT(!cacheList.more); bool hasChanges = false; std::map allCacheServers; for (auto kv : cacheList) { auto ssi = BinaryReader::fromStringRef(kv.value, IncludeVersion()); allCacheServers.emplace(ssi.id(), ssi); } std::map newCacheServers; std::map deletedCacheServers; std::set_difference(allCacheServers.begin(), allCacheServers.end(), cacheServerMap.begin(), cacheServerMap.end(), std::insert_iterator>( newCacheServers, newCacheServers.begin())); std::set_difference(cacheServerMap.begin(), cacheServerMap.end(), allCacheServers.begin(), allCacheServers.end(), std::insert_iterator>( deletedCacheServers, deletedCacheServers.begin())); hasChanges = !(newCacheServers.empty() && deletedCacheServers.empty()); if (hasChanges) { updateLocationCacheWithCaches(self, deletedCacheServers, newCacheServers); } cacheServerMap = std::move(allCacheServers); wait(delay(5.0)); backoff = Backoff(); } catch (Error& e) { wait(tr.onError(e)); wait(backoff.onError()); } } } catch (Error& e) { TraceEvent(SevError, "MonitorCacheListFailed").error(e); throw; } } ACTOR static Future handleTssMismatches(DatabaseContext* cx) { state Reference tr; state KeyBackedMap tssMapDB = KeyBackedMap(tssMappingKeys.begin); state KeyBackedMap tssMismatchDB = KeyBackedMap(tssMismatchKeys.begin); loop { // state std::pair> data = waitNext(cx->tssMismatchStream.getFuture()); // return to calling actor, don't do this as part of metrics loop wait(delay(0)); // find ss pair id so we can remove it from the mapping state UID tssPairID; bool found = false; for (const auto& it : cx->tssMapping) { if (it.second.id() == data.first) { tssPairID = it.first; found = true; break; } } if (found) { state bool quarantine = CLIENT_KNOBS->QUARANTINE_TSS_ON_MISMATCH; TraceEvent(SevWarnAlways, quarantine ? 
"TSS_QuarantineMismatch" : "TSS_KillMismatch") .detail("TSSID", data.first.toString()); CODE_PROBE(quarantine, "Quarantining TSS because it got mismatch"); CODE_PROBE(!quarantine, "Killing TSS because it got mismatch"); tr = makeReference(Database(Reference::addRef(cx))); state int tries = 0; loop { try { tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS); if (quarantine) { tr->set(tssQuarantineKeyFor(data.first), ""_sr); } else { tr->clear(serverTagKeyFor(data.first)); } tssMapDB.erase(tr, tssPairID); for (const DetailedTSSMismatch& d : data.second) { // -> mismatch data tssMismatchDB.set(tr, Tuple::makeTuple(data.first.toString(), d.timestamp, d.mismatchId.toString()), d.traceString); } wait(tr->commit()); break; } catch (Error& e) { wait(tr->onError(e)); } tries++; if (tries > 10) { // Give up, it'll get another mismatch or a human will investigate eventually TraceEvent("TSS_MismatchGaveUp").detail("TSSID", data.first.toString()); break; } } // clear out txn so that the extra DatabaseContext ref gets decref'd and we can free cx tr = makeReference(); } else { CODE_PROBE(true, "Not handling TSS with mismatch because it's already gone"); } } } ACTOR static Future backgroundGrvUpdater(DatabaseContext* cx) { state Transaction tr; state double grvDelay = 0.001; state Backoff backoff; try { loop { if (CLIENT_KNOBS->FORCE_GRV_CACHE_OFF) return Void(); wait(refreshTransaction(cx, &tr)); state double curTime = now(); state double lastTime = cx->getLastGrvTime(); state double lastProxyTime = cx->lastProxyRequestTime; TraceEvent(SevDebug, "BackgroundGrvUpdaterBefore") .detail("CurTime", curTime) .detail("LastTime", lastTime) .detail("GrvDelay", grvDelay) .detail("CachedReadVersion", cx->getCachedReadVersion()) .detail("CachedTime", cx->getLastGrvTime()) .detail("Gap", curTime - lastTime) .detail("Bound", CLIENT_KNOBS->MAX_VERSION_CACHE_LAG - grvDelay); if (curTime - lastTime >= (CLIENT_KNOBS->MAX_VERSION_CACHE_LAG - grvDelay) || curTime - lastProxyTime > CLIENT_KNOBS->MAX_PROXY_CONTACT_LAG) { try { tr.setOption(FDBTransactionOptions::SKIP_GRV_CACHE); wait(success(tr.getReadVersion())); cx->lastProxyRequestTime = curTime; grvDelay = (grvDelay + (now() - curTime)) / 2.0; TraceEvent(SevDebug, "BackgroundGrvUpdaterSuccess") .detail("GrvDelay", grvDelay) .detail("CachedReadVersion", cx->getCachedReadVersion()) .detail("CachedTime", cx->getLastGrvTime()); backoff = Backoff(); } catch (Error& e) { TraceEvent(SevInfo, "BackgroundGrvUpdaterTxnError").errorUnsuppressed(e); wait(tr.onError(e)); wait(backoff.onError()); } } else { wait( delay(std::max(0.001, std::min(CLIENT_KNOBS->MAX_PROXY_CONTACT_LAG - (curTime - lastProxyTime), (CLIENT_KNOBS->MAX_VERSION_CACHE_LAG - grvDelay) - (curTime - lastTime))))); } } } catch (Error& e) { TraceEvent(SevInfo, "BackgroundGrvUpdaterFailed").errorUnsuppressed(e); throw; } } inline HealthMetrics populateHealthMetrics(const HealthMetrics& detailedMetrics, bool detailedOutput) { if (detailedOutput) { return detailedMetrics; } else { HealthMetrics result; result.update(detailedMetrics, false, false); return result; } } ACTOR static Future getHealthMetricsActor(DatabaseContext* cx, bool detailed, bool sendDetailedRequest) { loop { choose { when(wait(cx->onProxiesChanged())) {} when(GetHealthMetricsReply rep = wait(basicLoadBalance(cx->getGrvProxies(UseProvisionalProxies::False), &GrvProxyInterface::getHealthMetrics, GetHealthMetricsRequest(sendDetailedRequest)))) { 
cx->healthMetrics.update(rep.healthMetrics, sendDetailedRequest, true); cx->healthMetricsLastUpdated = now(); if (sendDetailedRequest) { cx->detailedHealthMetricsLastUpdated = now(); } return populateHealthMetrics(cx->healthMetrics, detailed); } } } } Future DatabaseContext::getHealthMetrics(bool detailed = false) { if (now() - healthMetricsLastUpdated < CLIENT_KNOBS->AGGREGATE_HEALTH_METRICS_MAX_STALENESS) { return populateHealthMetrics(healthMetrics, detailed); } bool sendDetailedRequest = detailed && now() - detailedHealthMetricsLastUpdated > CLIENT_KNOBS->DETAILED_HEALTH_METRICS_MAX_STALENESS; return getHealthMetricsActor(this, detailed, sendDetailedRequest); } Future> DatabaseContext::getStorageStats(const UID& id, double maxStaleness) { if (now() - detailedHealthMetricsLastUpdated < maxStaleness) { auto it = healthMetrics.storageStats.find(id); return it == healthMetrics.storageStats.end() ? Optional() : it->second; } return map(getHealthMetricsActor(this, true, true), [&id](auto metrics) -> Optional { auto it = metrics.storageStats.find(id); return it == metrics.storageStats.end() ? Optional() : it->second; }); } // register a special key(s) implementation under the specified module void DatabaseContext::registerSpecialKeysImpl(SpecialKeySpace::MODULE module, SpecialKeySpace::IMPLTYPE type, std::unique_ptr&& impl, int deprecatedVersion) { // if deprecated, add the implementation when the api version is less than the deprecated version if (deprecatedVersion == -1 || apiVersion.version() < deprecatedVersion) { specialKeySpace->registerKeyRange(module, type, impl->getKeyRange(), impl.get()); specialKeySpaceModules.push_back(std::move(impl)); } } ACTOR Future getWorkerInterfaces(Reference clusterRecord); ACTOR Future> getJSON(Database db, std::string jsonField = ""); struct SingleSpecialKeyImpl : SpecialKeyRangeReadImpl { Future getRange(ReadYourWritesTransaction* ryw, KeyRangeRef kr, GetRangeLimits limitsHint) const override { ASSERT(kr.contains(k)); return map(f(ryw), [k = k](Optional v) { RangeResult result; if (v.present()) { result.push_back_deep(result.arena(), KeyValueRef(k, v.get())); } return result; }); } SingleSpecialKeyImpl(KeyRef k, const std::function>(ReadYourWritesTransaction*)>& f, bool supportsTenants = false) : SpecialKeyRangeReadImpl(singleKeyRange(k)), k(k), f(f), tenantSupport(supportsTenants) {} bool supportsTenants() const override { CODE_PROBE(tenantSupport, "Single special key in tenant"); return tenantSupport; }; private: Key k; std::function>(ReadYourWritesTransaction*)> f; bool tenantSupport; }; class HealthMetricsRangeImpl : public SpecialKeyRangeAsyncImpl { public: explicit HealthMetricsRangeImpl(KeyRangeRef kr); Future getRange(ReadYourWritesTransaction* ryw, KeyRangeRef kr, GetRangeLimits limitsHint) const override; }; static RangeResult healthMetricsToKVPairs(const HealthMetrics& metrics, KeyRangeRef kr) { RangeResult result; if (CLIENT_BUGGIFY) return result; if (kr.contains("\xff\xff/metrics/health/aggregate"_sr) && metrics.worstStorageDurabilityLag != 0) { json_spirit::mObject statsObj; statsObj["batch_limited"] = metrics.batchLimited; statsObj["tps_limit"] = metrics.tpsLimit; statsObj["worst_storage_durability_lag"] = metrics.worstStorageDurabilityLag; statsObj["limiting_storage_durability_lag"] = metrics.limitingStorageDurabilityLag; statsObj["worst_storage_queue"] = metrics.worstStorageQueue; statsObj["limiting_storage_queue"] = metrics.limitingStorageQueue; statsObj["worst_log_queue"] = metrics.worstTLogQueue; std::string statsString = 
json_spirit::write_string(json_spirit::mValue(statsObj), json_spirit::Output_options::raw_utf8); ValueRef bytes(result.arena(), statsString); result.push_back(result.arena(), KeyValueRef("\xff\xff/metrics/health/aggregate"_sr, bytes)); } // tlog stats { int phase = 0; // Avoid comparing twice per loop iteration for (const auto& [uid, logStats] : metrics.tLogQueue) { StringRef k{ StringRef(uid.toString()).withPrefix("\xff\xff/metrics/health/log/"_sr, result.arena()) }; if (phase == 0 && k >= kr.begin) { phase = 1; } if (phase == 1) { if (k < kr.end) { json_spirit::mObject statsObj; statsObj["log_queue"] = logStats; std::string statsString = json_spirit::write_string(json_spirit::mValue(statsObj), json_spirit::Output_options::raw_utf8); ValueRef bytes(result.arena(), statsString); result.push_back(result.arena(), KeyValueRef(k, bytes)); } else { break; } } } } // Storage stats { int phase = 0; // Avoid comparing twice per loop iteration for (const auto& [uid, storageStats] : metrics.storageStats) { StringRef k{ StringRef(uid.toString()).withPrefix("\xff\xff/metrics/health/storage/"_sr, result.arena()) }; if (phase == 0 && k >= kr.begin) { phase = 1; } if (phase == 1) { if (k < kr.end) { json_spirit::mObject statsObj; statsObj["storage_durability_lag"] = storageStats.storageDurabilityLag; statsObj["storage_queue"] = storageStats.storageQueue; statsObj["cpu_usage"] = storageStats.cpuUsage; statsObj["disk_usage"] = storageStats.diskUsage; std::string statsString = json_spirit::write_string(json_spirit::mValue(statsObj), json_spirit::Output_options::raw_utf8); ValueRef bytes(result.arena(), statsString); result.push_back(result.arena(), KeyValueRef(k, bytes)); } else { break; } } } } return result; } ACTOR static Future healthMetricsGetRangeActor(ReadYourWritesTransaction* ryw, KeyRangeRef kr) { HealthMetrics metrics = wait(ryw->getDatabase()->getHealthMetrics( /*detailed ("per process")*/ kr.intersects( KeyRangeRef("\xff\xff/metrics/health/storage/"_sr, "\xff\xff/metrics/health/storage0"_sr)) || kr.intersects(KeyRangeRef("\xff\xff/metrics/health/log/"_sr, "\xff\xff/metrics/health/log0"_sr)))); return healthMetricsToKVPairs(metrics, kr); } HealthMetricsRangeImpl::HealthMetricsRangeImpl(KeyRangeRef kr) : SpecialKeyRangeAsyncImpl(kr) {} Future HealthMetricsRangeImpl::getRange(ReadYourWritesTransaction* ryw, KeyRangeRef kr, GetRangeLimits limitsHint) const { return healthMetricsGetRangeActor(ryw, kr); } ACTOR Future getClusterId(Database db) { while (!db->clientInfo->get().clusterId.isValid()) { wait(db->clientInfo->onChange()); } return db->clientInfo->get().clusterId; } void DatabaseContext::initializeSpecialCounters() { specialCounter(cc, "OutstandingWatches", [this] { return outstandingWatches; }); specialCounter(cc, "WatchMapSize", [this] { return watchMap.size(); }); } DatabaseContext::DatabaseContext(Reference>> connectionRecord, Reference> clientInfo, Reference> const> coordinator, Future clientInfoMonitor, TaskPriority taskID, LocalityData const& clientLocality, EnableLocalityLoadBalance enableLocalityLoadBalance, LockAware lockAware, IsInternal internal, int _apiVersion, IsSwitchable switchable, Optional defaultTenant) : dbId(deterministicRandom()->randomUniqueID()), lockAware(lockAware), switchable(switchable), connectionRecord(connectionRecord), proxyProvisional(false), clientLocality(clientLocality), enableLocalityLoadBalance(enableLocalityLoadBalance), defaultTenant(defaultTenant), internal(internal), cc("TransactionMetrics", dbId.toString()), transactionReadVersions("ReadVersions", 
cc), transactionReadVersionsThrottled("ReadVersionsThrottled", cc), transactionReadVersionsCompleted("ReadVersionsCompleted", cc), transactionReadVersionBatches("ReadVersionBatches", cc), transactionBatchReadVersions("BatchPriorityReadVersions", cc), transactionDefaultReadVersions("DefaultPriorityReadVersions", cc), transactionImmediateReadVersions("ImmediatePriorityReadVersions", cc), transactionBatchReadVersionsCompleted("BatchPriorityReadVersionsCompleted", cc), transactionDefaultReadVersionsCompleted("DefaultPriorityReadVersionsCompleted", cc), transactionImmediateReadVersionsCompleted("ImmediatePriorityReadVersionsCompleted", cc), transactionLogicalReads("LogicalUncachedReads", cc), transactionPhysicalReads("PhysicalReadRequests", cc), transactionPhysicalReadsCompleted("PhysicalReadRequestsCompleted", cc), transactionGetKeyRequests("GetKeyRequests", cc), transactionGetValueRequests("GetValueRequests", cc), transactionGetRangeRequests("GetRangeRequests", cc), transactionGetMappedRangeRequests("GetMappedRangeRequests", cc), transactionGetRangeStreamRequests("GetRangeStreamRequests", cc), transactionWatchRequests("WatchRequests", cc), transactionGetAddressesForKeyRequests("GetAddressesForKeyRequests", cc), transactionBytesRead("BytesRead", cc), transactionKeysRead("KeysRead", cc), transactionMetadataVersionReads("MetadataVersionReads", cc), transactionCommittedMutations("CommittedMutations", cc), transactionCommittedMutationBytes("CommittedMutationBytes", cc), transactionSetMutations("SetMutations", cc), transactionClearMutations("ClearMutations", cc), transactionAtomicMutations("AtomicMutations", cc), transactionsCommitStarted("CommitStarted", cc), transactionsCommitCompleted("CommitCompleted", cc), transactionKeyServerLocationRequests("KeyServerLocationRequests", cc), transactionKeyServerLocationRequestsCompleted("KeyServerLocationRequestsCompleted", cc), transactionBlobGranuleLocationRequests("BlobGranuleLocationRequests", cc), transactionBlobGranuleLocationRequestsCompleted("BlobGranuleLocationRequestsCompleted", cc), transactionStatusRequests("StatusRequests", cc), transactionTenantLookupRequests("TenantLookupRequests", cc), transactionTenantLookupRequestsCompleted("TenantLookupRequestsCompleted", cc), transactionsTooOld("TooOld", cc), transactionsFutureVersions("FutureVersions", cc), transactionsNotCommitted("NotCommitted", cc), transactionsMaybeCommitted("MaybeCommitted", cc), transactionsResourceConstrained("ResourceConstrained", cc), transactionsProcessBehind("ProcessBehind", cc), transactionsThrottled("Throttled", cc), transactionsLockRejected("LockRejected", cc), transactionsExpensiveClearCostEstCount("ExpensiveClearCostEstCount", cc), transactionGrvFullBatches("NumGrvFullBatches", cc), transactionGrvTimedOutBatches("NumGrvTimedOutBatches", cc), transactionCommitVersionNotFoundForSS("CommitVersionNotFoundForSS", cc), anyBGReads(false), ccBG("BlobGranuleReadMetrics", dbId.toString()), bgReadInputBytes("BGReadInputBytes", ccBG), bgReadOutputBytes("BGReadOutputBytes", ccBG), bgReadSnapshotRows("BGReadSnapshotRows", ccBG), bgReadRowsCleared("BGReadRowsCleared", ccBG), bgReadRowsInserted("BGReadRowsInserted", ccBG), bgReadRowsUpdated("BGReadRowsUpdated", ccBG), bgLatencies(), bgGranulesPerRequest(), usedAnyChangeFeeds(false), ccFeed("ChangeFeedClientMetrics", dbId.toString()), feedStreamStarts("FeedStreamStarts", ccFeed), feedMergeStreamStarts("FeedMergeStreamStarts", ccFeed), feedErrors("FeedErrors", ccFeed), feedNonRetriableErrors("FeedNonRetriableErrors", ccFeed), 
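// ccFeed and ccBG are kept as separate counter collections so that
// databaseLogger() can skip the ChangeFeedClientMetrics and BlobGranuleReadMetrics
// trace events entirely for clients that never use change feeds or blob granules
// (gated on usedAnyChangeFeeds / anyBGReads above).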
feedPops("FeedPops", ccFeed), feedPopsFallback("FeedPopsFallback", ccFeed), latencies(), readLatencies(), commitLatencies(), GRVLatencies(), mutationsPerCommit(), bytesPerCommit(), outstandingWatches(0), sharedStatePtr(nullptr), lastGrvTime(0.0), cachedReadVersion(0), lastRkBatchThrottleTime(0.0), lastRkDefaultThrottleTime(0.0), lastProxyRequestTime(0.0), transactionTracingSample(false), taskID(taskID), clientInfo(clientInfo), clientInfoMonitor(clientInfoMonitor), coordinator(coordinator), apiVersion(_apiVersion), mvCacheInsertLocation(0), healthMetricsLastUpdated(0), detailedHealthMetricsLastUpdated(0), smoothMidShardSize(CLIENT_KNOBS->SHARD_STAT_SMOOTH_AMOUNT), specialKeySpace(std::make_unique(specialKeys.begin, specialKeys.end, /* test */ false)), connectToDatabaseEventCacheHolder(format("ConnectToDatabase/%s", dbId.toString().c_str())) { TraceEvent("DatabaseContextCreated", dbId).backtrace(); connected = (clientInfo->get().commitProxies.size() && clientInfo->get().grvProxies.size()) ? Void() : clientInfo->onChange(); metadataVersionCache.resize(CLIENT_KNOBS->METADATA_VERSION_CACHE_SIZE); maxOutstandingWatches = CLIENT_KNOBS->DEFAULT_MAX_OUTSTANDING_WATCHES; snapshotRywEnabled = apiVersion.hasSnapshotRYW() ? 1 : 0; logger = databaseLogger(this) && tssLogger(this); locationCacheSize = g_network->isSimulated() ? CLIENT_KNOBS->LOCATION_CACHE_EVICTION_SIZE_SIM : CLIENT_KNOBS->LOCATION_CACHE_EVICTION_SIZE; getValueSubmitted.init("NativeAPI.GetValueSubmitted"_sr); getValueCompleted.init("NativeAPI.GetValueCompleted"_sr); clientDBInfoMonitor = monitorClientDBInfoChange(this, clientInfo, &proxiesChangeTrigger); tssMismatchHandler = handleTssMismatches(this); clientStatusUpdater.actor = clientStatusUpdateActor(this); cacheListMonitor = monitorCacheList(this); smoothMidShardSize.reset(CLIENT_KNOBS->INIT_MID_SHARD_BYTES); globalConfig = std::make_unique(this); if (apiVersion.version() >= 740) { registerSpecialKeysImpl( SpecialKeySpace::MODULE::METRICS, SpecialKeySpace::IMPLTYPE::READONLY, std::make_unique( singleKeyRange("fault_tolerance_metrics_json"_sr) .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::METRICS).begin))); } if (apiVersion.version() >= 700) { registerSpecialKeysImpl(SpecialKeySpace::MODULE::ERRORMSG, SpecialKeySpace::IMPLTYPE::READONLY, std::make_unique( SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::ERRORMSG).begin, [](ReadYourWritesTransaction* ryw) -> Future> { if (ryw->getSpecialKeySpaceErrorMsg().present()) return Optional(ryw->getSpecialKeySpaceErrorMsg().get()); else return Optional(); }, true)); registerSpecialKeysImpl( SpecialKeySpace::MODULE::MANAGEMENT, SpecialKeySpace::IMPLTYPE::READWRITE, std::make_unique( KeyRangeRef("options/"_sr, "options0"_sr) .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin))); registerSpecialKeysImpl( SpecialKeySpace::MODULE::MANAGEMENT, SpecialKeySpace::IMPLTYPE::READWRITE, std::make_unique(SpecialKeySpace::getManagementApiCommandRange("exclude"))); registerSpecialKeysImpl( SpecialKeySpace::MODULE::MANAGEMENT, SpecialKeySpace::IMPLTYPE::READWRITE, std::make_unique(SpecialKeySpace::getManagementApiCommandRange("failed"))); registerSpecialKeysImpl(SpecialKeySpace::MODULE::MANAGEMENT, SpecialKeySpace::IMPLTYPE::READWRITE, std::make_unique( SpecialKeySpace::getManagementApiCommandRange("excludedlocality"))); registerSpecialKeysImpl(SpecialKeySpace::MODULE::MANAGEMENT, SpecialKeySpace::IMPLTYPE::READWRITE, std::make_unique( 
SpecialKeySpace::getManagementApiCommandRange("failedlocality"))); registerSpecialKeysImpl( SpecialKeySpace::MODULE::MANAGEMENT, SpecialKeySpace::IMPLTYPE::READONLY, std::make_unique( KeyRangeRef("in_progress_exclusion/"_sr, "in_progress_exclusion0"_sr) .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin))); registerSpecialKeysImpl( SpecialKeySpace::MODULE::CONFIGURATION, SpecialKeySpace::IMPLTYPE::READWRITE, std::make_unique( KeyRangeRef("process/class_type/"_sr, "process/class_type0"_sr) .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::CONFIGURATION).begin))); registerSpecialKeysImpl( SpecialKeySpace::MODULE::CONFIGURATION, SpecialKeySpace::IMPLTYPE::READONLY, std::make_unique( KeyRangeRef("process/class_source/"_sr, "process/class_source0"_sr) .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::CONFIGURATION).begin))); registerSpecialKeysImpl( SpecialKeySpace::MODULE::MANAGEMENT, SpecialKeySpace::IMPLTYPE::READWRITE, std::make_unique( singleKeyRange("db_locked"_sr) .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin))); registerSpecialKeysImpl( SpecialKeySpace::MODULE::MANAGEMENT, SpecialKeySpace::IMPLTYPE::READWRITE, std::make_unique( singleKeyRange("consistency_check_suspended"_sr) .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin))); registerSpecialKeysImpl( SpecialKeySpace::MODULE::GLOBALCONFIG, SpecialKeySpace::IMPLTYPE::READWRITE, std::make_unique(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::GLOBALCONFIG))); registerSpecialKeysImpl( SpecialKeySpace::MODULE::TRACING, SpecialKeySpace::IMPLTYPE::READWRITE, std::make_unique(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::TRACING))); registerSpecialKeysImpl( SpecialKeySpace::MODULE::CONFIGURATION, SpecialKeySpace::IMPLTYPE::READWRITE, std::make_unique( KeyRangeRef("coordinators/"_sr, "coordinators0"_sr) .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::CONFIGURATION).begin))); registerSpecialKeysImpl( SpecialKeySpace::MODULE::MANAGEMENT, SpecialKeySpace::IMPLTYPE::READONLY, std::make_unique( singleKeyRange("auto_coordinators"_sr) .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin))); registerSpecialKeysImpl( SpecialKeySpace::MODULE::MANAGEMENT, SpecialKeySpace::IMPLTYPE::READWRITE, std::make_unique( singleKeyRange("min_required_commit_version"_sr) .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin))); registerSpecialKeysImpl( SpecialKeySpace::MODULE::MANAGEMENT, SpecialKeySpace::IMPLTYPE::READWRITE, std::make_unique( singleKeyRange("version_epoch"_sr) .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin))); registerSpecialKeysImpl( SpecialKeySpace::MODULE::MANAGEMENT, SpecialKeySpace::IMPLTYPE::READWRITE, std::make_unique( KeyRangeRef("profiling/"_sr, "profiling0"_sr) .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin)), /* deprecated */ ApiVersion::withClientProfilingDeprecated().version()); registerSpecialKeysImpl( SpecialKeySpace::MODULE::MANAGEMENT, SpecialKeySpace::IMPLTYPE::READWRITE, std::make_unique( KeyRangeRef("maintenance/"_sr, "maintenance0"_sr) .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin))); registerSpecialKeysImpl( SpecialKeySpace::MODULE::MANAGEMENT, SpecialKeySpace::IMPLTYPE::READWRITE, std::make_unique( KeyRangeRef("data_distribution/"_sr, 
"data_distribution0"_sr) .withPrefix(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::MANAGEMENT).begin))); registerSpecialKeysImpl( SpecialKeySpace::MODULE::ACTORLINEAGE, SpecialKeySpace::IMPLTYPE::READONLY, std::make_unique(SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::ACTORLINEAGE))); registerSpecialKeysImpl(SpecialKeySpace::MODULE::ACTOR_PROFILER_CONF, SpecialKeySpace::IMPLTYPE::READWRITE, std::make_unique( SpecialKeySpace::getModuleRange(SpecialKeySpace::MODULE::ACTOR_PROFILER_CONF))); } if (apiVersion.version() >= 630) { registerSpecialKeysImpl(SpecialKeySpace::MODULE::TRANSACTION, SpecialKeySpace::IMPLTYPE::READONLY, std::make_unique(conflictingKeysRange)); registerSpecialKeysImpl(SpecialKeySpace::MODULE::TRANSACTION, SpecialKeySpace::IMPLTYPE::READONLY, std::make_unique(readConflictRangeKeysRange)); registerSpecialKeysImpl(SpecialKeySpace::MODULE::TRANSACTION, SpecialKeySpace::IMPLTYPE::READONLY, std::make_unique(writeConflictRangeKeysRange)); registerSpecialKeysImpl(SpecialKeySpace::MODULE::METRICS, SpecialKeySpace::IMPLTYPE::READONLY, std::make_unique(ddStatsRange)); registerSpecialKeysImpl(SpecialKeySpace::MODULE::METRICS, SpecialKeySpace::IMPLTYPE::READONLY, std::make_unique( KeyRangeRef("\xff\xff/metrics/health/"_sr, "\xff\xff/metrics/health0"_sr))); registerSpecialKeysImpl(SpecialKeySpace::MODULE::WORKERINTERFACE, SpecialKeySpace::IMPLTYPE::READONLY, std::make_unique( KeyRangeRef("\xff\xff/worker_interfaces/"_sr, "\xff\xff/worker_interfaces0"_sr))); registerSpecialKeysImpl(SpecialKeySpace::MODULE::STATUSJSON, SpecialKeySpace::IMPLTYPE::READONLY, std::make_unique( "\xff\xff/status/json"_sr, [](ReadYourWritesTransaction* ryw) -> Future> { if (ryw->getDatabase().getPtr() && ryw->getDatabase()->getConnectionRecord()) { ++ryw->getDatabase()->transactionStatusRequests; return getJSON(ryw->getDatabase()); } else { return Optional(); } }, true)); registerSpecialKeysImpl(SpecialKeySpace::MODULE::CLUSTERFILEPATH, SpecialKeySpace::IMPLTYPE::READONLY, std::make_unique( "\xff\xff/cluster_file_path"_sr, [](ReadYourWritesTransaction* ryw) -> Future> { try { if (ryw->getDatabase().getPtr() && ryw->getDatabase()->getConnectionRecord()) { Optional output = StringRef(ryw->getDatabase()->getConnectionRecord()->getLocation()); return output; } } catch (Error& e) { return e; } return Optional(); }, true)); registerSpecialKeysImpl( SpecialKeySpace::MODULE::CONNECTIONSTRING, SpecialKeySpace::IMPLTYPE::READONLY, std::make_unique( "\xff\xff/connection_string"_sr, [](ReadYourWritesTransaction* ryw) -> Future> { try { if (ryw->getDatabase().getPtr() && ryw->getDatabase()->getConnectionRecord()) { Reference f = ryw->getDatabase()->getConnectionRecord(); Optional output = StringRef(f->getConnectionString().toString()); return output; } } catch (Error& e) { return e; } return Optional(); }, true)); registerSpecialKeysImpl(SpecialKeySpace::MODULE::CLUSTERID, SpecialKeySpace::IMPLTYPE::READONLY, std::make_unique( "\xff\xff/cluster_id"_sr, [](ReadYourWritesTransaction* ryw) -> Future> { try { if (ryw->getDatabase().getPtr()) { return map(getClusterId(ryw->getDatabase()), [](UID id) { return Optional(StringRef(id.toString())); }); } } catch (Error& e) { return e; } return Optional(); }, true)); registerSpecialKeysImpl( SpecialKeySpace::MODULE::MANAGEMENT, SpecialKeySpace::IMPLTYPE::READWRITE, std::make_unique(SpecialKeySpace::getManagementApiCommandRange("tenant"))); } throttleExpirer = recurring([this]() { expireThrottles(); }, CLIENT_KNOBS->TAG_THROTTLE_EXPIRATION_INTERVAL); if (BUGGIFY) { 
DatabaseContext::debugUseTags = true; } initializeSpecialCounters(); } DatabaseContext::DatabaseContext(const Error& err) : deferredError(err), internal(IsInternal::False), cc("TransactionMetrics"), transactionReadVersions("ReadVersions", cc), transactionReadVersionsThrottled("ReadVersionsThrottled", cc), transactionReadVersionsCompleted("ReadVersionsCompleted", cc), transactionReadVersionBatches("ReadVersionBatches", cc), transactionBatchReadVersions("BatchPriorityReadVersions", cc), transactionDefaultReadVersions("DefaultPriorityReadVersions", cc), transactionImmediateReadVersions("ImmediatePriorityReadVersions", cc), transactionBatchReadVersionsCompleted("BatchPriorityReadVersionsCompleted", cc), transactionDefaultReadVersionsCompleted("DefaultPriorityReadVersionsCompleted", cc), transactionImmediateReadVersionsCompleted("ImmediatePriorityReadVersionsCompleted", cc), transactionLogicalReads("LogicalUncachedReads", cc), transactionPhysicalReads("PhysicalReadRequests", cc), transactionPhysicalReadsCompleted("PhysicalReadRequestsCompleted", cc), transactionGetKeyRequests("GetKeyRequests", cc), transactionGetValueRequests("GetValueRequests", cc), transactionGetRangeRequests("GetRangeRequests", cc), transactionGetMappedRangeRequests("GetMappedRangeRequests", cc), transactionGetRangeStreamRequests("GetRangeStreamRequests", cc), transactionWatchRequests("WatchRequests", cc), transactionGetAddressesForKeyRequests("GetAddressesForKeyRequests", cc), transactionBytesRead("BytesRead", cc), transactionKeysRead("KeysRead", cc), transactionMetadataVersionReads("MetadataVersionReads", cc), transactionCommittedMutations("CommittedMutations", cc), transactionCommittedMutationBytes("CommittedMutationBytes", cc), transactionSetMutations("SetMutations", cc), transactionClearMutations("ClearMutations", cc), transactionAtomicMutations("AtomicMutations", cc), transactionsCommitStarted("CommitStarted", cc), transactionsCommitCompleted("CommitCompleted", cc), transactionKeyServerLocationRequests("KeyServerLocationRequests", cc), transactionKeyServerLocationRequestsCompleted("KeyServerLocationRequestsCompleted", cc), transactionBlobGranuleLocationRequests("BlobGranuleLocationRequests", cc), transactionBlobGranuleLocationRequestsCompleted("BlobGranuleLocationRequestsCompleted", cc), transactionStatusRequests("StatusRequests", cc), transactionTenantLookupRequests("TenantLookupRequests", cc), transactionTenantLookupRequestsCompleted("TenantLookupRequestsCompleted", cc), transactionsTooOld("TooOld", cc), transactionsFutureVersions("FutureVersions", cc), transactionsNotCommitted("NotCommitted", cc), transactionsMaybeCommitted("MaybeCommitted", cc), transactionsResourceConstrained("ResourceConstrained", cc), transactionsProcessBehind("ProcessBehind", cc), transactionsThrottled("Throttled", cc), transactionsLockRejected("LockRejected", cc), transactionsExpensiveClearCostEstCount("ExpensiveClearCostEstCount", cc), transactionGrvFullBatches("NumGrvFullBatches", cc), transactionGrvTimedOutBatches("NumGrvTimedOutBatches", cc), transactionCommitVersionNotFoundForSS("CommitVersionNotFoundForSS", cc), anyBGReads(false), ccBG("BlobGranuleReadMetrics"), bgReadInputBytes("BGReadInputBytes", ccBG), bgReadOutputBytes("BGReadOutputBytes", ccBG), bgReadSnapshotRows("BGReadSnapshotRows", ccBG), bgReadRowsCleared("BGReadRowsCleared", ccBG), bgReadRowsInserted("BGReadRowsInserted", ccBG), bgReadRowsUpdated("BGReadRowsUpdated", ccBG), bgLatencies(), bgGranulesPerRequest(), usedAnyChangeFeeds(false), ccFeed("ChangeFeedClientMetrics"), 
feedStreamStarts("FeedStreamStarts", ccFeed), feedMergeStreamStarts("FeedMergeStreamStarts", ccFeed), feedErrors("FeedErrors", ccFeed), feedNonRetriableErrors("FeedNonRetriableErrors", ccFeed), feedPops("FeedPops", ccFeed), feedPopsFallback("FeedPopsFallback", ccFeed), latencies(), readLatencies(), commitLatencies(), GRVLatencies(), mutationsPerCommit(), bytesPerCommit(), sharedStatePtr(nullptr), transactionTracingSample(false), smoothMidShardSize(CLIENT_KNOBS->SHARD_STAT_SMOOTH_AMOUNT), connectToDatabaseEventCacheHolder(format("ConnectToDatabase/%s", dbId.toString().c_str())), outstandingWatches(0) { initializeSpecialCounters(); } // Static constructor used by server processes to create a DatabaseContext // For internal (fdbserver) use only Database DatabaseContext::create(Reference> clientInfo, Future clientInfoMonitor, LocalityData clientLocality, EnableLocalityLoadBalance enableLocalityLoadBalance, TaskPriority taskID, LockAware lockAware, int apiVersion, IsSwitchable switchable) { return Database(new DatabaseContext(Reference>>(), clientInfo, makeReference>>(), clientInfoMonitor, taskID, clientLocality, enableLocalityLoadBalance, lockAware, IsInternal::True, apiVersion, switchable)); } DatabaseContext::~DatabaseContext() { cacheListMonitor.cancel(); clientDBInfoMonitor.cancel(); monitorTssInfoChange.cancel(); tssMismatchHandler.cancel(); initializeChangeFeedCache = Void(); storage = nullptr; changeFeedStorageCommitter = Void(); if (grvUpdateHandler.isValid()) { grvUpdateHandler.cancel(); } if (sharedStatePtr) { sharedStatePtr->delRef(sharedStatePtr); } for (auto it = server_interf.begin(); it != server_interf.end(); it = server_interf.erase(it)) it->second->notifyContextDestroyed(); ASSERT_ABORT(server_interf.empty()); locationCache.insert(allKeys, Reference()); for (auto& it : notAtLatestChangeFeeds) { it.second->context = nullptr; } for (auto& it : changeFeedUpdaters) { it.second->context = nullptr; } TraceEvent("DatabaseContextDestructed", dbId).backtrace(); } Optional DatabaseContext::getCachedLocation(const TenantInfo& tenant, const KeyRef& key, Reverse isBackward) { Arena arena; KeyRef resolvedKey = key; if (tenant.hasTenant()) { CODE_PROBE(true, "Database context get cached location with tenant"); resolvedKey = resolvedKey.withPrefix(tenant.prefix.get(), arena); } auto range = isBackward ? locationCache.rangeContainingKeyBefore(resolvedKey) : locationCache.rangeContaining(resolvedKey); if (range->value()) { return KeyRangeLocationInfo(toPrefixRelativeRange(range->range(), tenant.prefix), range->value()); } return Optional(); } bool DatabaseContext::getCachedLocations(const TenantInfo& tenant, const KeyRangeRef& range, std::vector& result, int limit, Reverse reverse) { result.clear(); Arena arena; KeyRangeRef resolvedRange = range; if (tenant.hasTenant()) { CODE_PROBE(true, "Database context get cached locations with tenant"); resolvedRange = resolvedRange.withPrefix(tenant.prefix.get(), arena); } auto begin = locationCache.rangeContaining(resolvedRange.begin); auto end = locationCache.rangeContainingKeyBefore(resolvedRange.end); loop { auto r = reverse ? 
end : begin; if (!r->value()) { CODE_PROBE(result.size(), "had some but not all cached locations"); result.clear(); return false; } result.emplace_back(toPrefixRelativeRange(r->range() & resolvedRange, tenant.prefix), r->value()); if (result.size() == limit || begin == end) { break; } if (reverse) --end; else ++begin; } return true; } Reference DatabaseContext::setCachedLocation(const KeyRangeRef& absoluteKeys, const std::vector& servers) { std::vector>> serverRefs; serverRefs.reserve(servers.size()); for (const auto& interf : servers) { serverRefs.push_back(StorageServerInfo::getInterface(this, interf, clientLocality)); } int maxEvictionAttempts = 100, attempts = 0; auto loc = makeReference(serverRefs); while (locationCache.size() > locationCacheSize && attempts < maxEvictionAttempts) { CODE_PROBE(true, "NativeAPI storage server locationCache entry evicted"); attempts++; auto r = locationCache.randomRange(); Key begin = r.begin(), end = r.end(); // insert invalidates r, so can't be passed a mere reference into it locationCache.insert(KeyRangeRef(begin, end), Reference()); } locationCache.insert(absoluteKeys, loc); return loc; } void DatabaseContext::invalidateCache(const Optional& tenantPrefix, const KeyRef& key, Reverse isBackward) { Arena arena; KeyRef resolvedKey = key; if (tenantPrefix.present() && !tenantPrefix.get().empty()) { CODE_PROBE(true, "Database context invalidate cache for tenant key"); resolvedKey = resolvedKey.withPrefix(tenantPrefix.get(), arena); } if (isBackward) { locationCache.rangeContainingKeyBefore(resolvedKey)->value() = Reference(); } else { locationCache.rangeContaining(resolvedKey)->value() = Reference(); } } void DatabaseContext::invalidateCache(const Optional& tenantPrefix, const KeyRangeRef& keys) { Arena arena; KeyRangeRef resolvedKeys = keys; if (tenantPrefix.present() && !tenantPrefix.get().empty()) { CODE_PROBE(true, "Database context invalidate cache for tenant range"); resolvedKeys = resolvedKeys.withPrefix(tenantPrefix.get(), arena); } auto rs = locationCache.intersectingRanges(resolvedKeys); Key begin = rs.begin().begin(), end = rs.end().begin(); // insert invalidates rs, so can't be passed a mere reference into it locationCache.insert(KeyRangeRef(begin, end), Reference()); } void DatabaseContext::setFailedEndpointOnHealthyServer(const Endpoint& endpoint) { if (failedEndpointsOnHealthyServersInfo.find(endpoint) == failedEndpointsOnHealthyServersInfo.end()) { failedEndpointsOnHealthyServersInfo[endpoint] = EndpointFailureInfo{ .startTime = now(), .lastRefreshTime = now() }; } } void DatabaseContext::updateFailedEndpointRefreshTime(const Endpoint& endpoint) { if (failedEndpointsOnHealthyServersInfo.find(endpoint) == failedEndpointsOnHealthyServersInfo.end()) { // The endpoint is not failed. Nothing to update. 
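// (Entries are created only by setFailedEndpointOnHealthyServer, so a miss here means the
// endpoint was never marked failed, or has since been cleared.)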
return; } failedEndpointsOnHealthyServersInfo[endpoint].lastRefreshTime = now(); } Optional DatabaseContext::getEndpointFailureInfo(const Endpoint& endpoint) { if (failedEndpointsOnHealthyServersInfo.find(endpoint) == failedEndpointsOnHealthyServersInfo.end()) { return Optional(); } return failedEndpointsOnHealthyServersInfo[endpoint]; } void DatabaseContext::clearFailedEndpointOnHealthyServer(const Endpoint& endpoint) { failedEndpointsOnHealthyServersInfo.erase(endpoint); } Future DatabaseContext::onProxiesChanged() { backoffDelay = 0.0; return this->proxiesChangeTrigger.onTrigger(); } bool DatabaseContext::sampleReadTags() const { double sampleRate = globalConfig->get(transactionTagSampleRate, CLIENT_KNOBS->READ_TAG_SAMPLE_RATE); return sampleRate > 0 && deterministicRandom()->random01() <= sampleRate; } bool DatabaseContext::sampleOnCost(uint64_t cost) const { double sampleCost = globalConfig->get(transactionTagSampleCost, CLIENT_KNOBS->COMMIT_SAMPLE_COST); if (sampleCost <= 0) return false; return deterministicRandom()->random01() <= (double)cost / sampleCost; } int64_t extractIntOption(Optional value, int64_t minValue, int64_t maxValue) { validateOptionValuePresent(value); if (value.get().size() != 8) { throw invalid_option_value(); } int64_t passed = *((int64_t*)(value.get().begin())); if (passed > maxValue || passed < minValue) { throw invalid_option_value(); } return passed; } uint64_t extractHexOption(StringRef value) { char* end; uint64_t id = strtoull(value.toString().c_str(), &end, 16); if (*end) throw invalid_option_value(); return id; } void DatabaseContext::setOption(FDBDatabaseOptions::Option option, Optional value) { int defaultFor = FDBDatabaseOptions::optionInfo.getMustExist(option).defaultFor; if (defaultFor >= 0) { ASSERT(FDBTransactionOptions::optionInfo.find((FDBTransactionOptions::Option)defaultFor) != FDBTransactionOptions::optionInfo.end()); TraceEvent(SevDebug, "DatabaseContextSetPersistentOption").detail("Option", option).detail("Value", value); transactionDefaults.addOption((FDBTransactionOptions::Option)defaultFor, value.castTo>()); } else { switch (option) { case FDBDatabaseOptions::LOCATION_CACHE_SIZE: locationCacheSize = (int)extractIntOption(value, 0, std::numeric_limits::max()); break; case FDBDatabaseOptions::MACHINE_ID: clientLocality = LocalityData(clientLocality.processId(), value.present() ? Standalone(value.get()) : Optional>(), clientLocality.machineId(), clientLocality.dcId()); if (clientInfo->get().commitProxies.size()) commitProxies = makeReference(clientInfo->get().commitProxies); if (clientInfo->get().grvProxies.size()) grvProxies = makeReference(clientInfo->get().grvProxies, BalanceOnRequests::True); server_interf.clear(); locationCache.insert(allKeys, Reference()); break; case FDBDatabaseOptions::MAX_WATCHES: maxOutstandingWatches = (int)extractIntOption(value, 0, CLIENT_KNOBS->ABSOLUTE_MAX_WATCHES); break; case FDBDatabaseOptions::DATACENTER_ID: clientLocality = LocalityData(clientLocality.processId(), clientLocality.zoneId(), clientLocality.machineId(), value.present() ? 
Standalone(value.get()) : Optional>()); if (clientInfo->get().commitProxies.size()) commitProxies = makeReference(clientInfo->get().commitProxies); if (clientInfo->get().grvProxies.size()) grvProxies = makeReference(clientInfo->get().grvProxies, BalanceOnRequests::True); server_interf.clear(); locationCache.insert(allKeys, Reference()); break; case FDBDatabaseOptions::SNAPSHOT_RYW_ENABLE: validateOptionValueNotPresent(value); snapshotRywEnabled++; break; case FDBDatabaseOptions::SNAPSHOT_RYW_DISABLE: validateOptionValueNotPresent(value); snapshotRywEnabled--; break; case FDBDatabaseOptions::USE_CONFIG_DATABASE: validateOptionValueNotPresent(value); useConfigDatabase = true; break; case FDBDatabaseOptions::TEST_CAUSAL_READ_RISKY: verifyCausalReadsProp = double(extractIntOption(value, 0, 100)) / 100.0; break; default: break; } } } void DatabaseContext::increaseWatchCounter() { if (outstandingWatches >= maxOutstandingWatches) throw too_many_watches(); ++outstandingWatches; } void DatabaseContext::decreaseWatchCounter() { --outstandingWatches; ASSERT(outstandingWatches >= 0); } Future DatabaseContext::onConnected() const { return connected; } ACTOR static Future switchConnectionRecordImpl(Reference connRecord, DatabaseContext* self) { CODE_PROBE(true, "Switch connection file"); TraceEvent("SwitchConnectionRecord") .detail("ClusterFile", connRecord->toString()) .detail("ConnectionString", connRecord->getConnectionString().toString()); // Reset state from former cluster. self->commitProxies.clear(); self->grvProxies.clear(); self->minAcceptableReadVersion = std::numeric_limits::max(); self->invalidateCache({}, allKeys); self->ssVersionVectorCache.clear(); auto clearedClientInfo = self->clientInfo->get(); clearedClientInfo.commitProxies.clear(); clearedClientInfo.grvProxies.clear(); clearedClientInfo.id = deterministicRandom()->randomUniqueID(); self->clientInfo->set(clearedClientInfo); self->connectionRecord->set(connRecord); state Database db(Reference::addRef(self)); state Transaction tr(db); loop { tr.setOption(FDBTransactionOptions::READ_LOCK_AWARE); try { TraceEvent("SwitchConnectionRecordAttemptingGRV").log(); Version v = wait(tr.getReadVersion()); TraceEvent("SwitchConnectionRecordGotRV") .detail("ReadVersion", v) .detail("MinAcceptableReadVersion", self->minAcceptableReadVersion); ASSERT(self->minAcceptableReadVersion != std::numeric_limits::max()); self->connectionFileChangedTrigger.trigger(); return Void(); } catch (Error& e) { TraceEvent("SwitchConnectionRecordError").detail("Error", e.what()); wait(tr.onError(e)); } } } Reference DatabaseContext::getConnectionRecord() { if (connectionRecord) { return connectionRecord->get(); } return Reference(); } Future DatabaseContext::switchConnectionRecord(Reference standby) { ASSERT(switchable); return switchConnectionRecordImpl(standby, this); } Future DatabaseContext::connectionFileChanged() { return connectionFileChangedTrigger.onTrigger(); } void DatabaseContext::expireThrottles() { for (auto& priorityItr : throttledTags) { for (auto tagItr = priorityItr.second.begin(); tagItr != priorityItr.second.end();) { if (tagItr->second.expired()) { CODE_PROBE(true, "Expiring client throttle"); tagItr = priorityItr.second.erase(tagItr); } else { ++tagItr; } } } } // Initialize tracing for FDB client // // connRecord is necessary for determining the local IP, which is then included in the trace // file name, and also used to annotate all trace events. // // If trace_initialize_on_setup is not set, tracing is initialized when opening a database. 
// In that case we can immediately determine the IP. Thus, we can use the IP in the // trace file name and annotate all events with it. // // If trace_initialize_on_setup network option is set, tracing is at first initialized without // connRecord and thus without the local IP. In that case we cannot use the local IP in the // trace file names. The IP is then provided by a repeated call to initializeClientTracing // when opening a database. All tracing events from this point are annotated with the local IP // // If tracing initialization is completed, further calls to initializeClientTracing are ignored void initializeClientTracing(Reference connRecord, Optional apiVersion) { if (!networkOptions.traceDirectory.present()) { return; } bool initialized = traceFileIsOpen(); if (initialized && (isTraceLocalAddressSet() || !connRecord)) { // Tracing initialization is completed return; } // Network must be created before initializing tracing ASSERT(g_network); Optional localAddress; if (connRecord) { auto publicIP = connRecord->getConnectionString().determineLocalSourceIP(); localAddress = NetworkAddress(publicIP, ::getpid()); } platform::ImageInfo imageInfo = platform::getImageInfo(); if (initialized) { // Tracing already initialized, just need to update the IP address setTraceLocalAddress(localAddress.get()); TraceEvent("ClientStart") .detail("SourceVersion", getSourceVersion()) .detail("Version", FDB_VT_VERSION) .detail("PackageName", FDB_VT_PACKAGE_NAME) .detailf("ActualTime", "%lld", DEBUG_DETERMINISM ? 0 : time(nullptr)) .detail("ApiVersion", apiVersion) .detail("ClientLibrary", imageInfo.fileName) .detailf("ImageOffset", "%p", imageInfo.offset) .detail("Primary", networkOptions.primaryClient) .trackLatest("ClientStart"); } else { // Initialize tracing selectTraceFormatter(networkOptions.traceFormat); selectTraceClockSource(networkOptions.traceClockSource); addUniversalTraceField("ClientDescription", format("%s-%s-%" PRIu64, networkOptions.primaryClient ? "primary" : "external", FDB_VT_VERSION, deterministicRandom()->randomUInt64())); std::string identifier = networkOptions.traceFileIdentifier; openTraceFile(localAddress, networkOptions.traceRollSize, networkOptions.traceMaxLogsSize, networkOptions.traceDirectory.get(), "trace", networkOptions.traceLogGroup, identifier, networkOptions.tracePartialFileSuffix, InitializeTraceMetrics::True); TraceEvent("ClientStart") .detail("SourceVersion", getSourceVersion()) .detail("Version", FDB_VT_VERSION) .detail("PackageName", FDB_VT_PACKAGE_NAME) .detailf("ActualTime", "%lld", DEBUG_DETERMINISM ? 
0 : time(nullptr)) .detail("ApiVersion", apiVersion) .detail("ClientLibrary", imageInfo.fileName) .detailf("ImageOffset", "%p", imageInfo.offset) .detail("Primary", networkOptions.primaryClient) .trackLatest("ClientStart"); g_network->initMetrics(); FlowTransport::transport().initMetrics(); } // Initialize system monitoring once the local IP is available if (localAddress.present()) { initializeSystemMonitorMachineState(SystemMonitorMachineState(IPAddress(localAddress.get().ip))); systemMonitor(); uncancellable(recurring(&systemMonitor, CLIENT_KNOBS->SYSTEM_MONITOR_INTERVAL, TaskPriority::FlushTrace)); } } // Creates a database object that represents a connection to a cluster // This constructor uses a preallocated DatabaseContext that may have been created // on another thread Database Database::createDatabase(Reference connRecord, int apiVersion, IsInternal internal, LocalityData const& clientLocality, DatabaseContext* preallocatedDb) { if (!g_network) throw network_not_setup(); ASSERT(TraceEvent::isNetworkThread()); initializeClientTracing(connRecord, apiVersion); g_network->initTLS(); auto clientInfo = makeReference>(); auto coordinator = makeReference>>(); auto connectionRecord = makeReference>>(); connectionRecord->set(connRecord); Future clientInfoMonitor = monitorProxies(connectionRecord, clientInfo, coordinator, networkOptions.supportedVersions, StringRef(networkOptions.traceLogGroup), internal); DatabaseContext* db; if (preallocatedDb) { db = new (preallocatedDb) DatabaseContext(connectionRecord, clientInfo, coordinator, clientInfoMonitor, TaskPriority::DefaultEndpoint, clientLocality, EnableLocalityLoadBalance::True, LockAware::False, internal, apiVersion, IsSwitchable::True); } else { db = new DatabaseContext(connectionRecord, clientInfo, coordinator, clientInfoMonitor, TaskPriority::DefaultEndpoint, clientLocality, EnableLocalityLoadBalance::True, LockAware::False, internal, apiVersion, IsSwitchable::True); } auto database = Database(db); database->globalConfig->init(Reference const>(clientInfo), std::addressof(clientInfo->get())); database->globalConfig->trigger(samplingFrequency, samplingProfilerUpdateFrequency); database->globalConfig->trigger(samplingWindow, samplingProfilerUpdateWindow); TraceEvent("ConnectToDatabase", database->dbId) .detail("Version", FDB_VT_VERSION) .detail("ClusterFile", connRecord ? connRecord->toString() : "None") .detail("ConnectionString", connRecord ? 
connRecord->getConnectionString().toString() : "None") .detail("ClientLibrary", platform::getImageInfo().fileName) .detail("Primary", networkOptions.primaryClient) .detail("Internal", internal) .trackLatest(database->connectToDatabaseEventCacheHolder.trackingKey); return database; } Database Database::createDatabase(std::string connFileName, int apiVersion, IsInternal internal, LocalityData const& clientLocality) { Reference rccr = ClusterConnectionFile::openOrDefault(connFileName); return Database::createDatabase(rccr, apiVersion, internal, clientLocality); } Database Database::createSimulatedExtraDatabase(std::string connectionString, Optional defaultTenant) { auto extraFile = makeReference(ClusterConnectionString(connectionString)); Database db = Database::createDatabase(extraFile, ApiVersion::LATEST_VERSION); db->defaultTenant = defaultTenant; return db; } const UniqueOrderedOptionList& Database::getTransactionDefaults() const { ASSERT(db); return db->transactionDefaults; } void setNetworkOption(FDBNetworkOptions::Option option, Optional value) { std::regex identifierRegex("^[a-zA-Z0-9_]*$"); switch (option) { // SOMEDAY: If the network is already started, should these five throw an error? case FDBNetworkOptions::TRACE_ENABLE: networkOptions.traceDirectory = value.present() ? value.get().toString() : ""; break; case FDBNetworkOptions::TRACE_ROLL_SIZE: validateOptionValuePresent(value); networkOptions.traceRollSize = extractIntOption(value, 0, std::numeric_limits::max()); break; case FDBNetworkOptions::TRACE_MAX_LOGS_SIZE: validateOptionValuePresent(value); networkOptions.traceMaxLogsSize = extractIntOption(value, 0, std::numeric_limits::max()); break; case FDBNetworkOptions::TRACE_FORMAT: validateOptionValuePresent(value); networkOptions.traceFormat = value.get().toString(); if (!validateTraceFormat(networkOptions.traceFormat)) { fprintf(stderr, "Unrecognized trace format: `%s'\n", networkOptions.traceFormat.c_str()); throw invalid_option_value(); } break; case FDBNetworkOptions::TRACE_FILE_IDENTIFIER: validateOptionValuePresent(value); networkOptions.traceFileIdentifier = value.get().toString(); if (networkOptions.traceFileIdentifier.length() > CLIENT_KNOBS->TRACE_LOG_FILE_IDENTIFIER_MAX_LENGTH) { fprintf(stderr, "Trace file identifier provided is too long.\n"); throw invalid_option_value(); } else if (!std::regex_match(networkOptions.traceFileIdentifier, identifierRegex)) { fprintf(stderr, "Trace file identifier should only contain alphanumerics and underscores.\n"); throw invalid_option_value(); } break; case FDBNetworkOptions::TRACE_LOG_GROUP: if (value.present()) { if (traceFileIsOpen()) { setTraceLogGroup(value.get().toString()); } else { networkOptions.traceLogGroup = value.get().toString(); } } break; case FDBNetworkOptions::TRACE_CLOCK_SOURCE: validateOptionValuePresent(value); networkOptions.traceClockSource = value.get().toString(); if (!validateTraceClockSource(networkOptions.traceClockSource)) { fprintf(stderr, "Unrecognized trace clock source: `%s'\n", networkOptions.traceClockSource.c_str()); throw invalid_option_value(); } break; case FDBNetworkOptions::TRACE_PARTIAL_FILE_SUFFIX: validateOptionValuePresent(value); networkOptions.tracePartialFileSuffix = value.get().toString(); break; case FDBNetworkOptions::TRACE_INITIALIZE_ON_SETUP: networkOptions.traceInitializeOnSetup = true; break; case FDBNetworkOptions::KNOB: { validateOptionValuePresent(value); std::string optionValue = value.get().toString(); TraceEvent("SetKnob").detail("KnobString", optionValue); size_t eq = 
optionValue.find_first_of('='); if (eq == optionValue.npos) { TraceEvent(SevWarnAlways, "InvalidKnobString").detail("KnobString", optionValue); throw invalid_option_value(); } std::string knobName = optionValue.substr(0, eq); std::string knobValueString = optionValue.substr(eq + 1); try { auto knobValue = IKnobCollection::parseKnobValue(knobName, knobValueString, IKnobCollection::Type::CLIENT); if (g_network) { IKnobCollection::getMutableGlobalKnobCollection().setKnob(knobName, knobValue); } else { networkOptions.knobs[knobName] = knobValue; } } catch (Error& e) { TraceEvent(SevWarnAlways, "UnrecognizedKnob").detail("Knob", knobName.c_str()); fprintf(stderr, "FoundationDB client ignoring unrecognized knob option '%s'\n", knobName.c_str()); } break; } case FDBNetworkOptions::TLS_PLUGIN: validateOptionValuePresent(value); break; case FDBNetworkOptions::TLS_CERT_PATH: validateOptionValuePresent(value); tlsConfig.setCertificatePath(value.get().toString()); break; case FDBNetworkOptions::TLS_CERT_BYTES: { validateOptionValuePresent(value); tlsConfig.setCertificateBytes(value.get().toString()); break; } case FDBNetworkOptions::TLS_CA_PATH: { validateOptionValuePresent(value); tlsConfig.setCAPath(value.get().toString()); break; } case FDBNetworkOptions::TLS_CA_BYTES: { validateOptionValuePresent(value); tlsConfig.setCABytes(value.get().toString()); break; } case FDBNetworkOptions::TLS_PASSWORD: validateOptionValuePresent(value); tlsConfig.setPassword(value.get().toString()); break; case FDBNetworkOptions::TLS_KEY_PATH: validateOptionValuePresent(value); tlsConfig.setKeyPath(value.get().toString()); break; case FDBNetworkOptions::TLS_KEY_BYTES: { validateOptionValuePresent(value); tlsConfig.setKeyBytes(value.get().toString()); break; } case FDBNetworkOptions::TLS_VERIFY_PEERS: validateOptionValuePresent(value); tlsConfig.clearVerifyPeers(); tlsConfig.addVerifyPeers(value.get().toString()); break; case FDBNetworkOptions::TLS_DISABLE_PLAINTEXT_CONNECTION: tlsConfig.setDisablePlainTextConnection(true); break; case FDBNetworkOptions::CLIENT_BUGGIFY_ENABLE: enableClientBuggify(); break; case FDBNetworkOptions::CLIENT_BUGGIFY_DISABLE: disableClientBuggify(); break; case FDBNetworkOptions::CLIENT_BUGGIFY_SECTION_ACTIVATED_PROBABILITY: validateOptionValuePresent(value); clearClientBuggifySections(); P_CLIENT_BUGGIFIED_SECTION_ACTIVATED = double(extractIntOption(value, 0, 100)) / 100.0; break; case FDBNetworkOptions::CLIENT_BUGGIFY_SECTION_FIRED_PROBABILITY: validateOptionValuePresent(value); P_CLIENT_BUGGIFIED_SECTION_FIRES = double(extractIntOption(value, 0, 100)) / 100.0; break; case FDBNetworkOptions::DISABLE_CLIENT_STATISTICS_LOGGING: validateOptionValueNotPresent(value); networkOptions.logClientInfo = false; break; case FDBNetworkOptions::SUPPORTED_CLIENT_VERSIONS: { // The multi-version API should be providing us these guarantees ASSERT(g_network); ASSERT(value.present()); Standalone> supportedVersions; std::vector supportedVersionsStrings = value.get().splitAny(";"_sr); for (StringRef versionString : supportedVersionsStrings) { #ifdef ADDRESS_SANITIZER __lsan_disable(); #endif // LSAN reports that we leak this allocation in client // tests, but I cannot seem to figure out why. AFAICT // it's not actually leaking. If it is a leak, it's only a few bytes. 
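// push_back_deep copies the parsed ClientVersionRef into supportedVersions' arena, so the
// stored versions stay valid after the option value's original memory is released.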
supportedVersions.push_back_deep(supportedVersions.arena(), ClientVersionRef(versionString)); #ifdef ADDRESS_SANITIZER __lsan_enable(); #endif } ASSERT(supportedVersions.size() > 0); networkOptions.supportedVersions->set(supportedVersions); break; } case FDBNetworkOptions::ENABLE_RUN_LOOP_PROFILING: // Same as ENABLE_SLOW_TASK_PROFILING validateOptionValueNotPresent(value); networkOptions.runLoopProfilingEnabled = true; break; case FDBNetworkOptions::DISTRIBUTED_CLIENT_TRACER: { validateOptionValuePresent(value); std::string tracer = value.get().toString(); if (tracer == "none" || tracer == "disabled") { openTracer(TracerType::DISABLED); } else if (tracer == "logfile" || tracer == "file" || tracer == "log_file") { openTracer(TracerType::LOG_FILE); } else if (tracer == "network_lossy") { openTracer(TracerType::NETWORK_LOSSY); } else { fprintf(stderr, "ERROR: Unknown or unsupported tracer: `%s'", tracer.c_str()); throw invalid_option_value(); } break; } case FDBNetworkOptions::EXTERNAL_CLIENT: networkOptions.primaryClient = false; break; default: break; } } // update the network busyness on a 1s cadence ACTOR Future monitorNetworkBusyness() { state double prevTime = now(); loop { wait(delay(CLIENT_KNOBS->NETWORK_BUSYNESS_MONITOR_INTERVAL, TaskPriority::FlushTrace)); double elapsed = now() - prevTime; // get elapsed time from last execution prevTime = now(); struct NetworkMetrics::PriorityStats& tracker = g_network->networkInfo.metrics.starvationTrackerNetworkBusyness; if (tracker.active) { // update metrics tracker.duration += now() - tracker.windowedTimer; tracker.maxDuration = std::max(tracker.maxDuration, now() - tracker.timer); tracker.windowedTimer = now(); } double busyFraction = std::min(elapsed, tracker.duration) / elapsed; // The burstiness score is an indicator of the maximum busyness spike over the measurement interval. // It scales linearly from 0 to 1 as the largest burst goes from the start to the saturation threshold. // This allows us to account for saturation that happens in smaller bursts than the measurement interval. // // Burstiness will not be calculated if the saturation threshold is smaller than the start threshold or // if either value is negative. 
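// Roughly, the computation below is:
//   burstiness = clamp((maxDuration - START_THRESHOLD) / (SATURATED_THRESHOLD - START_THRESHOLD), 0, 1)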
double burstiness = 0; if (CLIENT_KNOBS->BUSYNESS_SPIKE_START_THRESHOLD >= 0 && CLIENT_KNOBS->BUSYNESS_SPIKE_SATURATED_THRESHOLD >= CLIENT_KNOBS->BUSYNESS_SPIKE_START_THRESHOLD) { burstiness = std::min(1.0, std::max(0.0, tracker.maxDuration - CLIENT_KNOBS->BUSYNESS_SPIKE_START_THRESHOLD) / std::max(1e-6, CLIENT_KNOBS->BUSYNESS_SPIKE_SATURATED_THRESHOLD - CLIENT_KNOBS->BUSYNESS_SPIKE_START_THRESHOLD)); } g_network->networkInfo.metrics.networkBusyness = std::max(busyFraction, burstiness); tracker.duration = 0; tracker.maxDuration = 0; } } static void setupGlobalKnobs() { IKnobCollection::setGlobalKnobCollection(IKnobCollection::Type::CLIENT, Randomize::False, IsSimulated::False); for (const auto& [knobName, knobValue] : networkOptions.knobs) { IKnobCollection::getMutableGlobalKnobCollection().setKnob(knobName, knobValue); } } // Setup g_network and start monitoring for network busyness void setupNetwork(uint64_t transportId, UseMetrics useMetrics) { if (g_network) throw network_already_setup(); if (!networkOptions.logClientInfo.present()) networkOptions.logClientInfo = true; setupGlobalKnobs(); g_network = newNet2(tlsConfig, false, useMetrics || networkOptions.traceDirectory.present()); g_network->addStopCallback(Net2FileSystem::stop); FlowTransport::createInstance(true, transportId, WLTOKEN_RESERVED_COUNT); Net2FileSystem::newFileSystem(); if (networkOptions.traceInitializeOnSetup) { ::initializeClientTracing({}, {}); } uncancellable(monitorNetworkBusyness()); } void runNetwork() { if (!g_network) { throw network_not_setup(); } if (!g_network->checkRunnable()) { throw network_cannot_be_restarted(); } if (networkOptions.traceDirectory.present() && networkOptions.runLoopProfilingEnabled) { setupRunLoopProfiler(); } g_network->run(); if (networkOptions.traceDirectory.present()) systemMonitor(); } void stopNetwork() { if (!g_network) throw network_not_setup(); TraceEvent("ClientStopNetwork").log(); if (networkOptions.traceDirectory.present() && networkOptions.runLoopProfilingEnabled) { stopRunLoopProfiler(); } g_network->stop(); } void DatabaseContext::updateProxies() { if (proxiesLastChange == clientInfo->get().id) return; proxiesLastChange = clientInfo->get().id; commitProxies.clear(); grvProxies.clear(); ssVersionVectorCache.clear(); bool commitProxyProvisional = false, grvProxyProvisional = false; if (clientInfo->get().commitProxies.size()) { commitProxies = makeReference(clientInfo->get().commitProxies); commitProxyProvisional = clientInfo->get().commitProxies[0].provisional; } if (clientInfo->get().grvProxies.size()) { grvProxies = makeReference(clientInfo->get().grvProxies, BalanceOnRequests::True); grvProxyProvisional = clientInfo->get().grvProxies[0].provisional; } if (clientInfo->get().commitProxies.size() && clientInfo->get().grvProxies.size()) { ASSERT(commitProxyProvisional == grvProxyProvisional); proxyProvisional = commitProxyProvisional; } } Reference DatabaseContext::getCommitProxies(UseProvisionalProxies useProvisionalProxies) { updateProxies(); if (proxyProvisional && !useProvisionalProxies) { return Reference(); } return commitProxies; } Reference DatabaseContext::getGrvProxies(UseProvisionalProxies useProvisionalProxies) { updateProxies(); if (proxyProvisional && !useProvisionalProxies) { return Reference(); } return grvProxies; } bool DatabaseContext::isCurrentGrvProxy(UID proxyId) const { for (const auto& proxy : clientInfo->get().grvProxies) { if (proxy.id() == proxyId) return true; } CODE_PROBE(true, "stale GRV proxy detected", probe::decoration::rare); return false; } 
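// The location and tenant lookup actors below all follow the same pattern for talking to
// the commit proxies: the request races against onProxiesChanged(), so any change in the
// proxy set abandons the in-flight request and retries it against the new proxies. An
// illustrative sketch only (SomeReply/someEndpoint/SomeRequest are placeholders, not names
// from this file):
//
//   loop {
//       choose {
//           when(wait(cx->onProxiesChanged())) {} // proxy set changed; loop and retry
//           when(SomeReply rep = wait(basicLoadBalance(cx->getCommitProxies(useProvisionalProxies),
//                                                      &CommitProxyInterface::someEndpoint,
//                                                      SomeRequest(...),
//                                                      TaskPriority::DefaultPromiseEndpoint))) {
//               return rep;
//           }
//       }
//   }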
// Actor which will wait until the MultiInterface returned by the DatabaseContext cx is not // nullptr ACTOR Future> getCommitProxiesFuture(DatabaseContext* cx, UseProvisionalProxies useProvisionalProxies) { loop { Reference commitProxies = cx->getCommitProxies(useProvisionalProxies); if (commitProxies) return commitProxies; wait(cx->onProxiesChanged()); } } // Returns a future which will not be set until the CommitProxyInfo of this DatabaseContext is not nullptr Future> DatabaseContext::getCommitProxiesFuture( UseProvisionalProxies useProvisionalProxies) { return ::getCommitProxiesFuture(this, useProvisionalProxies); } void GetRangeLimits::decrement(VectorRef const& data) { if (rows != GetRangeLimits::ROW_LIMIT_UNLIMITED) { ASSERT(data.size() <= rows); rows -= data.size(); } minRows = std::max(0, minRows - data.size()); if (bytes != GetRangeLimits::BYTE_LIMIT_UNLIMITED) bytes = std::max(0, bytes - (int)data.expectedSize() - (8 - (int)sizeof(KeyValueRef)) * data.size()); } void GetRangeLimits::decrement(KeyValueRef const& data) { minRows = std::max(0, minRows - 1); if (rows != GetRangeLimits::ROW_LIMIT_UNLIMITED) rows--; if (bytes != GetRangeLimits::BYTE_LIMIT_UNLIMITED) bytes = std::max(0, bytes - (int)8 - (int)data.expectedSize()); } void GetRangeLimits::decrement(VectorRef const& data) { if (rows != GetRangeLimits::ROW_LIMIT_UNLIMITED) { ASSERT(data.size() <= rows); rows -= data.size(); } minRows = std::max(0, minRows - data.size()); // TODO: For now, expectedSize only considers the size of the original key values, but not the underlying queries or // results. Also, double check it is correct when dealing with sizeof(MappedKeyValueRef). if (bytes != GetRangeLimits::BYTE_LIMIT_UNLIMITED) bytes = std::max(0, bytes - (int)data.expectedSize() - (8 - (int)sizeof(MappedKeyValueRef)) * data.size()); } void GetRangeLimits::decrement(MappedKeyValueRef const& data) { minRows = std::max(0, minRows - 1); if (rows != GetRangeLimits::ROW_LIMIT_UNLIMITED) rows--; // TODO: For now, expectedSize only considers the size of the original key values, but not the underlying queries or // results. Also, double check it is correct when dealing with sizeof(MappedKeyValueRef). if (bytes != GetRangeLimits::BYTE_LIMIT_UNLIMITED) bytes = std::max(0, bytes - (int)8 - (int)data.expectedSize()); } // True if either the row or byte limit has been reached bool GetRangeLimits::isReached() const { return rows == 0 || (bytes == 0 && minRows == 0); } // True if data would cause the row or byte limit to be reached bool GetRangeLimits::reachedBy(VectorRef const& data) const { return (rows != GetRangeLimits::ROW_LIMIT_UNLIMITED && data.size() >= rows) || (bytes != GetRangeLimits::BYTE_LIMIT_UNLIMITED && (int)data.expectedSize() + (8 - (int)sizeof(KeyValueRef)) * data.size() >= bytes && data.size() >= minRows); } bool GetRangeLimits::hasByteLimit() const { return bytes != GetRangeLimits::BYTE_LIMIT_UNLIMITED; } bool GetRangeLimits::hasRowLimit() const { return rows != GetRangeLimits::ROW_LIMIT_UNLIMITED; } bool GetRangeLimits::hasSatisfiedMinRows() const { return hasByteLimit() && minRows == 0; } AddressExclusion AddressExclusion::parse(StringRef const& key) { // Must not change: serialized to the database! auto parsedIp = IPAddress::parse(key.toString()); if (parsedIp.present()) { return AddressExclusion(parsedIp.get()); } // Not a whole machine, includes `port'. 
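// For example, "10.0.0.1" excludes every process on that IP, while "10.0.0.1:4500"
// excludes only the single process listening on port 4500.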
try { auto addr = NetworkAddress::parse(key.toString()); if (addr.isTLS()) { TraceEvent(SevWarnAlways, "AddressExclusionParseError") .detail("String", key) .detail("Description", "Address inclusion string should not include `:tls' suffix."); return AddressExclusion(); } return AddressExclusion(addr.ip, addr.port); } catch (Error&) { TraceEvent(SevWarnAlways, "AddressExclusionParseError").detail("String", key); return AddressExclusion(); } } Tenant::Tenant(Database cx, TenantName name) : idFuture(cx->lookupTenant(name)), name(name) {} Tenant::Tenant(int64_t id) : idFuture(id) {} Tenant::Tenant(Future id, Optional name) : idFuture(id), name(name) {} int64_t Tenant::id() const { ASSERT(idFuture.isReady()); return idFuture.get(); } Future Tenant::getIdFuture() const { return idFuture; } KeyRef Tenant::prefix() const { ASSERT(idFuture.isReady()); if (bigEndianId == -1) { bigEndianId = bigEndian64(idFuture.get()); } return StringRef(reinterpret_cast(&bigEndianId), TenantAPI::PREFIX_SIZE); } std::string Tenant::description() const { StringRef nameStr = name.castTo().orDefault(""_sr); if (idFuture.canGet()) { return format("%.*s (%lld)", nameStr.size(), nameStr.begin(), idFuture.get()); } else { return format("%.*s", nameStr.size(), nameStr.begin()); } } Future> getValue(Reference const& trState, Key const& key, UseTenant const& useTenant = UseTenant::True, TransactionRecordLogInfo const& recordLogInfo = TransactionRecordLogInfo::True); Future getRange(Reference const& trState, KeySelector const& begin, KeySelector const& end, GetRangeLimits const& limits, Reverse const& reverse, UseTenant const& useTenant); ACTOR Future> fetchServerInterface(Reference trState, UID id) { Optional val = wait(getValue(trState, serverListKeyFor(id), UseTenant::False, TransactionRecordLogInfo::False)); if (!val.present()) { // A storage server has been removed from serverList since we read keyServers return Optional(); } return decodeServerListValue(val.get()); } ACTOR Future>> transactionalGetServerInterfaces( Reference trState, std::vector ids) { state std::vector>> serverListEntries; serverListEntries.reserve(ids.size()); for (int s = 0; s < ids.size(); s++) { serverListEntries.push_back(fetchServerInterface(trState, ids[s])); } std::vector> serverListValues = wait(getAll(serverListEntries)); std::vector serverInterfaces; for (int s = 0; s < serverListValues.size(); s++) { if (!serverListValues[s].present()) { // A storage server has been removed from ServerList since we read keyServers return Optional>(); } serverInterfaces.push_back(serverListValues[s].get()); } return serverInterfaces; } void updateTssMappings(Database cx, const GetKeyServerLocationsReply& reply) { // Since a ss -> tss mapping is included in resultsTssMapping iff that SS is in results and has a tss pair, // all SS in results that do not have a mapping present must not have a tss pair. 
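// Build an id -> interface index of the returned servers, consume entries as the reply's
// mappings are applied, and treat whatever remains as servers without a TSS pair (dropping
// any stale mapping they may still have).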
std::unordered_map ssiById; for (const auto& [_, shard] : reply.results) { for (auto& ssi : shard) { ssiById[ssi.id()] = &ssi; } } for (const auto& mapping : reply.resultsTssMapping) { auto ssi = ssiById.find(mapping.first); ASSERT(ssi != ssiById.end()); cx->addTssMapping(*ssi->second, mapping.second); ssiById.erase(mapping.first); } // if SS didn't have a mapping above, it's still in the ssiById map, so remove its tss mapping for (const auto& it : ssiById) { cx->removeTssMapping(*it.second); } } void updateTagMappings(Database cx, const GetKeyServerLocationsReply& reply) { for (const auto& mapping : reply.resultsTagMapping) { cx->addSSIdTagMapping(mapping.first, mapping.second); } } // If isBackward == true, returns the shard containing the key before 'key' (an infinitely long, inexpressible key). // Otherwise returns the shard containing key ACTOR Future getKeyLocation_internal(Database cx, TenantInfo tenant, Key key, SpanContext spanContext, Optional debugID, UseProvisionalProxies useProvisionalProxies, Reverse isBackward, Version version) { state Span span("NAPI:getKeyLocation"_loc, spanContext); if (isBackward) { ASSERT(key != allKeys.begin && key <= allKeys.end); } else { ASSERT(key < allKeys.end); } if (debugID.present()) g_traceBatch.addEvent("TransactionDebug", debugID.get().first(), "NativeAPI.getKeyLocation.Before"); loop { try { wait(cx->getBackoff()); ++cx->transactionKeyServerLocationRequests; choose { when(wait(cx->onProxiesChanged())) {} when(GetKeyServerLocationsReply rep = wait(basicLoadBalance( cx->getCommitProxies(useProvisionalProxies), &CommitProxyInterface::getKeyServersLocations, GetKeyServerLocationsRequest( span.context, tenant, key, Optional(), 100, isBackward, version, key.arena()), TaskPriority::DefaultPromiseEndpoint))) { ++cx->transactionKeyServerLocationRequestsCompleted; if (debugID.present()) g_traceBatch.addEvent( "TransactionDebug", debugID.get().first(), "NativeAPI.getKeyLocation.After"); ASSERT(rep.results.size() == 1); auto locationInfo = cx->setCachedLocation(rep.results[0].first, rep.results[0].second); updateTssMappings(cx, rep); updateTagMappings(cx, rep); cx->updateBackoff(success()); return KeyRangeLocationInfo( KeyRange(toPrefixRelativeRange(rep.results[0].first, tenant.prefix), rep.arena), locationInfo); } } } catch (Error& e) { if (e.code() == error_code_commit_proxy_memory_limit_exceeded) { // Eats commit_proxy_memory_limit_exceeded error from commit proxies TraceEvent(SevWarnAlways, "CommitProxyOverloadedForKeyLocation").suppressFor(5); cx->updateBackoff(e); continue; } throw; } } } // Checks if `endpoint` is failed on a healthy server or not. Returns true if we need to refresh the location cache for // the endpoint. bool checkOnlyEndpointFailed(const Database& cx, const Endpoint& endpoint) { if (IFailureMonitor::failureMonitor().onlyEndpointFailed(endpoint)) { // This endpoint is failed, but the server is still healthy. There are two cases this can happen: // - There is a recent bounce in the cluster where the endpoints in SSes get updated. // - The SS is failed and terminated on a server, but the server is kept running. // To account for the first case, we invalidate the cache and issue GetKeyLocation requests to the proxy to // update the cache with the new SS points. However, if the failure is caused by the second case, the // requested key location will continue to be the failed endpoint until the data movement is finished. 
// But every read will generate a GetKeyLocation request to the proxies (and still get the failed endpoint
// back), which may overload the proxy and affect data movement speed. Therefore, we only refresh the
// location cache for a short period of time; after the initial grace period, during which we keep retrying
// key-location resolution, we slow down to resolving it only once every
// `LOCATION_CACHE_FAILED_ENDPOINT_RETRY_INTERVAL`.
        cx->setFailedEndpointOnHealthyServer(endpoint);
        const auto& failureInfo = cx->getEndpointFailureInfo(endpoint);
        ASSERT(failureInfo.present());
        if (now() - failureInfo.get().startTime < CLIENT_KNOBS->LOCATION_CACHE_ENDPOINT_FAILURE_GRACE_PERIOD ||
            now() - failureInfo.get().lastRefreshTime > CLIENT_KNOBS->LOCATION_CACHE_FAILED_ENDPOINT_RETRY_INTERVAL) {
            cx->updateFailedEndpointRefreshTime(endpoint);
            return true;
        }
    } else {
        cx->clearFailedEndpointOnHealthyServer(endpoint);
    }
    return false;
}

template <class F>
Future<KeyRangeLocationInfo> getKeyLocation(Database const& cx,
                                            TenantInfo const& tenant,
                                            Key const& key,
                                            F StorageServerInterface::*member,
                                            SpanContext spanContext,
                                            Optional<UID> debugID,
                                            UseProvisionalProxies useProvisionalProxies,
                                            Reverse isBackward,
                                            Version version) {
    // we first check whether this range is cached
    Optional<KeyRangeLocationInfo> locationInfo = cx->getCachedLocation(tenant, key, isBackward);
    if (!locationInfo.present()) {
        return getKeyLocation_internal(cx, tenant, key, spanContext, debugID, useProvisionalProxies, isBackward, version);
    }

    bool onlyEndpointFailedAndNeedRefresh = false;
    for (int i = 0; i < locationInfo.get().locations->size(); i++) {
        if (checkOnlyEndpointFailed(cx, locationInfo.get().locations->get(i, member).getEndpoint())) {
            onlyEndpointFailedAndNeedRefresh = true;
        }
    }

    if (onlyEndpointFailedAndNeedRefresh) {
        cx->invalidateCache(tenant.prefix, key);
        // Refresh the cache with a new getKeyLocations request made to the proxies.
        return getKeyLocation_internal(cx, tenant, key, spanContext, debugID, useProvisionalProxies, isBackward, version);
    }

    return locationInfo.get();
}

template <class F>
Future<KeyRangeLocationInfo> getKeyLocation(Reference<TransactionState> trState,
                                            Key const& key,
                                            F StorageServerInterface::*member,
                                            Reverse isBackward,
                                            UseTenant useTenant) {
    CODE_PROBE(!useTenant, "Get key location ignoring tenant");
    return getKeyLocation(trState->cx,
                          useTenant ? trState->getTenantInfo() : TenantInfo(),
                          key,
                          member,
                          trState->spanContext,
                          trState->readOptions.present() ? trState->readOptions.get().debugID : Optional<UID>(),
                          trState->useProvisionalProxies,
                          isBackward,
                          trState->readVersionFuture.isValid() && trState->readVersionFuture.isReady()
                              ? trState->readVersion()
                              : latestVersion);
}

void DatabaseContext::updateBackoff(const Error& err) {
    switch (err.code()) {
    case error_code_success:
        backoffDelay = backoffDelay / CLIENT_KNOBS->BACKOFF_GROWTH_RATE;
        if (backoffDelay < CLIENT_KNOBS->DEFAULT_BACKOFF) {
            backoffDelay = 0.0;
        }
        break;

    case error_code_commit_proxy_memory_limit_exceeded:
        ++transactionsResourceConstrained;
        if (backoffDelay == 0.0) {
            backoffDelay = CLIENT_KNOBS->DEFAULT_BACKOFF;
        } else {
            backoffDelay = std::min(backoffDelay * CLIENT_KNOBS->BACKOFF_GROWTH_RATE,
                                    CLIENT_KNOBS->RESOURCE_CONSTRAINED_MAX_BACKOFF);
        }
        break;

    default:
        ASSERT_WE_THINK(false);
    }
}

ACTOR Future<std::vector<KeyRangeLocationInfo>> getKeyRangeLocations_internal(
    Database cx,
    TenantInfo tenant,
    KeyRange keys,
    int limit,
    Reverse reverse,
    SpanContext spanContext,
    Optional<UID> debugID,
    UseProvisionalProxies useProvisionalProxies,
    Version version) {
    state Span span("NAPI:getKeyRangeLocations"_loc, spanContext);
    if (debugID.present())
        g_traceBatch.addEvent("TransactionDebug", debugID.get().first(), "NativeAPI.getKeyLocations.Before");

    loop {
        try {
            wait(cx->getBackoff());

            ++cx->transactionKeyServerLocationRequests;
            choose {
                when(wait(cx->onProxiesChanged())) {}
                when(GetKeyServerLocationsReply _rep = wait(basicLoadBalance(
                         cx->getCommitProxies(useProvisionalProxies),
                         &CommitProxyInterface::getKeyServersLocations,
                         GetKeyServerLocationsRequest(
                             span.context, tenant, keys.begin, keys.end, limit, reverse, version, keys.arena()),
                         TaskPriority::DefaultPromiseEndpoint))) {
                    ++cx->transactionKeyServerLocationRequestsCompleted;
                    state GetKeyServerLocationsReply rep = _rep;
                    if (debugID.present())
                        g_traceBatch.addEvent(
                            "TransactionDebug", debugID.get().first(), "NativeAPI.getKeyLocations.After");
                    ASSERT(rep.results.size());

                    state std::vector<KeyRangeLocationInfo> results;
                    state int shard = 0;
                    for (; shard < rep.results.size(); shard++) {
                        // FIXME: these shards are being inserted into the map sequentially, it would be much more CPU
                        // efficient to save the map pairs and insert them all at once.
                        results.emplace_back(
                            (toPrefixRelativeRange(rep.results[shard].first, tenant.prefix) & keys),
                            cx->setCachedLocation(rep.results[shard].first, rep.results[shard].second));
                        wait(yield());
                    }
                    updateTssMappings(cx, rep);
                    updateTagMappings(cx, rep);

                    cx->updateBackoff(success());
                    return results;
                }
            }
        } catch (Error& e) {
            if (e.code() == error_code_commit_proxy_memory_limit_exceeded) {
                // Eats commit_proxy_memory_limit_exceeded error from commit proxies
                TraceEvent(SevWarnAlways, "CommitProxyOverloadedForRangeLocation").suppressFor(5);
                cx->updateBackoff(e);
                continue;
            }
            throw;
        }
    }
}

// Get the SS locations for each shard intersecting the 'keys' key-range;
// Returned vector size is the number of shards in the input keys key-range.
// Returned vector elements are <ShardRange, storage server location info> pairs, where
// ShardRange is the shard's key-range clipped to the given key range (note that both the
// lookup above and the cached-location path intersect each shard's boundaries with 'keys').
// Example: If the function is queried with key range (b, d) spanning shards [a, b1), [b1, c),
// and [c, d1), the returned list of pairs could be something like:
// [([b, b1), locationInfo), ([b1, c), locationInfo), ([c, d), locationInfo)].
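// An illustrative call shape (a sketch only; `shardLimit` is a placeholder, not a name from
// this file), resolving up to `shardLimit` shard locations before dispatching per-shard reads:
//
//   std::vector<KeyRangeLocationInfo> locations = wait(getKeyRangeLocations(
//       trState, keys, shardLimit, Reverse::False, &StorageServerInterface::getKeyValues, UseTenant::True));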
template Future> getKeyRangeLocations(Database const& cx, TenantInfo const& tenant, KeyRange const& keys, int limit, Reverse reverse, F StorageServerInterface::*member, SpanContext const& spanContext, Optional const& debugID, UseProvisionalProxies useProvisionalProxies, Version version) { ASSERT(!keys.empty()); std::vector locations; if (!cx->getCachedLocations(tenant, keys, locations, limit, reverse)) { return getKeyRangeLocations_internal( cx, tenant, keys, limit, reverse, spanContext, debugID, useProvisionalProxies, version); } bool foundFailed = false; for (const auto& locationInfo : locations) { bool onlyEndpointFailedAndNeedRefresh = false; for (int i = 0; i < locationInfo.locations->size(); i++) { if (checkOnlyEndpointFailed(cx, locationInfo.locations->get(i, member).getEndpoint())) { onlyEndpointFailedAndNeedRefresh = true; } } if (onlyEndpointFailedAndNeedRefresh) { cx->invalidateCache(tenant.prefix, locationInfo.range.begin); foundFailed = true; } } if (foundFailed) { // Refresh the cache with a new getKeyRangeLocations made to proxies. return getKeyRangeLocations_internal( cx, tenant, keys, limit, reverse, spanContext, debugID, useProvisionalProxies, version); } return locations; } template Future> getKeyRangeLocations(Reference trState, KeyRange const& keys, int limit, Reverse reverse, F StorageServerInterface::*member, UseTenant useTenant) { CODE_PROBE(!useTenant, "Get key range locations ignoring tenant"); return getKeyRangeLocations(trState->cx, useTenant ? trState->getTenantInfo(AllowInvalidTenantID::True) : TenantInfo(), keys, limit, reverse, member, trState->spanContext, trState->readOptions.present() ? trState->readOptions.get().debugID : Optional(), trState->useProvisionalProxies, trState->readVersionFuture.isValid() && trState->readVersionFuture.isReady() ? trState->readVersion() : latestVersion); } ACTOR Future>> getBlobGranuleLocations_internal( Database cx, TenantInfo tenant, KeyRange keys, int limit, Reverse reverse, JustGranules justGranules, SpanContext spanContext, Optional debugID, UseProvisionalProxies useProvisionalProxies, Version version, bool* more) { state Span span("NAPI:getBlobGranuleLocations"_loc, spanContext); if (debugID.present()) g_traceBatch.addEvent("TransactionDebug", debugID.get().first(), "NativeAPI.getBlobGranuleLocations.Before"); CODE_PROBE(tenant.hasTenant(), "NativeAPI getBlobGranuleLocations has tenant"); loop { ++cx->transactionBlobGranuleLocationRequests; choose { when(wait(cx->onProxiesChanged())) {} when(GetBlobGranuleLocationsReply _rep = wait(basicLoadBalance(cx->getCommitProxies(useProvisionalProxies), &CommitProxyInterface::getBlobGranuleLocations, GetBlobGranuleLocationsRequest(span.context, tenant, keys.begin, keys.end, limit, reverse, justGranules, version, keys.arena()), TaskPriority::DefaultPromiseEndpoint))) { ++cx->transactionBlobGranuleLocationRequestsCompleted; state GetBlobGranuleLocationsReply rep = _rep; if (debugID.present()) g_traceBatch.addEvent( "TransactionDebug", debugID.get().first(), "NativeAPI.getBlobGranuleLocations.After"); // if justGranules, we can get an empty mapping, otherwise, an empty mapping should have been an error ASSERT(justGranules || rep.results.size()); ASSERT(!rep.more || !rep.results.empty()); *more = rep.more; state std::vector> results; state int granule = 0; for (auto& bwInterf : rep.bwInterfs) { cx->blobWorker_interf.insert({ bwInterf.id(), bwInterf }); } for (; granule < rep.results.size(); granule++) { // FIXME: cache mapping? 
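// Granule ranges in the reply are tenant-prefixed; translate them back to tenant-relative
// keyspace and, unless only granule boundaries were requested, clip them to the query range.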
KeyRange range(toPrefixRelativeRange(rep.results[granule].first, tenant.prefix)); if (!justGranules) { range = range & keys; } results.emplace_back(range, rep.results[granule].second); wait(yield()); } return results; } } } } // Get the Blob Worker locations for each granule in the 'keys' key-range, similar to getKeyRangeLocations Future>> getBlobGranuleLocations(Database const& cx, TenantInfo const& tenant, KeyRange const& keys, int limit, Reverse reverse, JustGranules justGranules, SpanContext const& spanContext, Optional const& debugID, UseProvisionalProxies useProvisionalProxies, Version version, bool* more) { ASSERT(!keys.empty()); // FIXME: wrap this with location caching for blob workers like getKeyRangeLocations has return getBlobGranuleLocations_internal( cx, tenant, keys, limit, reverse, justGranules, spanContext, debugID, useProvisionalProxies, version, more); } Future>> getBlobGranuleLocations(Reference trState, KeyRange const& keys, int limit, Reverse reverse, UseTenant useTenant, JustGranules justGranules, bool* more) { return getBlobGranuleLocations( trState->cx, useTenant ? trState->getTenantInfo(AllowInvalidTenantID::True) : TenantInfo(), keys, limit, reverse, justGranules, trState->spanContext, trState->readOptions.present() ? trState->readOptions.get().debugID : Optional(), trState->useProvisionalProxies, trState->readVersionFuture.isValid() && trState->readVersionFuture.isReady() ? trState->readVersion() : latestVersion, more); } ACTOR Future warmRange_impl(Reference trState, KeyRange keys) { state int totalRanges = 0; state int totalRequests = 0; wait(trState->startTransaction()); loop { std::vector locations = wait(getKeyRangeLocations_internal( trState->cx, trState->getTenantInfo(), keys, CLIENT_KNOBS->WARM_RANGE_SHARD_LIMIT, Reverse::False, trState->spanContext, trState->readOptions.present() ? trState->readOptions.get().debugID : Optional(), trState->useProvisionalProxies, trState->readVersion())); totalRanges += CLIENT_KNOBS->WARM_RANGE_SHARD_LIMIT; totalRequests++; if (locations.size() == 0 || totalRanges >= trState->cx->locationCacheSize || locations[locations.size() - 1].range.end >= keys.end) break; keys = KeyRangeRef(locations[locations.size() - 1].range.end, keys.end); if (totalRequests % 20 == 0) { // To avoid blocking the proxies from starting other transactions, occasionally get a read version. state Transaction tr(trState->cx, trState->tenant()); loop { try { tr.setOption(FDBTransactionOptions::LOCK_AWARE); tr.setOption(FDBTransactionOptions::CAUSAL_READ_RISKY); wait(success(tr.getReadVersion())); break; } catch (Error& e) { wait(tr.onError(e)); } } } } return Void(); } SpanContext generateSpanID(bool transactionTracingSample, SpanContext parentContext = SpanContext()) { if (parentContext.isValid()) { return SpanContext(parentContext.traceID, deterministicRandom()->randomUInt64(), parentContext.m_Flags); } if (transactionTracingSample) { return SpanContext(deterministicRandom()->randomUniqueID(), deterministicRandom()->randomUInt64(), deterministicRandom()->random01() <= FLOW_KNOBS->TRACING_SAMPLE_RATE ? 
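/* (This per-root-span draw makes the sampling decision once per transaction: with
   TRACING_SAMPLE_RATE = 0.01, for example, roughly 1% of traced transactions would
   be marked sampled. Spans created later from this context inherit the parent's
   flags, as in the parentContext branch above.) */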
TraceFlags::sampled : TraceFlags::unsampled); } return SpanContext( deterministicRandom()->randomUniqueID(), deterministicRandom()->randomUInt64(), TraceFlags::unsampled); } ACTOR Future lookupTenantImpl(DatabaseContext* cx, TenantName tenant) { loop { try { wait(cx->getBackoff()); ++cx->transactionTenantLookupRequests; choose { when(wait(cx->onProxiesChanged())) {} when(GetTenantIdReply rep = wait(basicLoadBalance(cx->getCommitProxies(UseProvisionalProxies::False), &CommitProxyInterface::getTenantId, GetTenantIdRequest(tenant, latestVersion), TaskPriority::DefaultPromiseEndpoint))) { ++cx->transactionTenantLookupRequestsCompleted; cx->updateBackoff(success()); return rep.tenantId; } } } catch (Error& e) { if (e.code() == error_code_commit_proxy_memory_limit_exceeded) { CODE_PROBE(true, "Lookup tenant memory limit exceeded"); TraceEvent(SevWarnAlways, "CommitProxyOverloadedForTenant").suppressFor(5); // Eats commit_proxy_memory_limit_exceeded error from commit proxies cx->updateBackoff(e); continue; } throw; } } } Future DatabaseContext::lookupTenant(TenantName tenant) { return lookupTenantImpl(this, tenant); } TransactionState::TransactionState(Database cx, Optional> tenant, TaskPriority taskID, SpanContext spanContext, Reference trLogInfo) : cx(cx), trLogInfo(trLogInfo), options(cx), taskID(taskID), spanContext(spanContext), readVersionObtainedFromGrvProxy(true), tenant_(tenant), tenantSet(tenant.present()) {} Reference TransactionState::cloneAndReset(Reference newTrLogInfo, bool generateNewSpan) const { SpanContext newSpanContext = generateNewSpan ? generateSpanID(cx->transactionTracingSample) : spanContext; Reference newState = makeReference(cx, tenant_, cx->taskID, newSpanContext, newTrLogInfo); if (!cx->apiVersionAtLeast(16)) { newState->options = options; } newState->readVersionFuture = Future(); newState->metadataVersion = Promise>(); newState->numErrors = numErrors; newState->startTime = startTime; newState->committedVersion = committedVersion; newState->conflictingKeys = conflictingKeys; newState->tenantSet = tenantSet; return newState; } TenantInfo TransactionState::getTenantInfo(AllowInvalidTenantID allowInvalidTenantId /* = false */) { Optional> const& t = tenant(); if (options.rawAccess) { CODE_PROBE(true, "Get tenant info raw access transaction"); return TenantInfo(); } else if (!cx->internal && cx->clientInfo->get().clusterType == ClusterType::METACLUSTER_MANAGEMENT) { CODE_PROBE(true, "Get tenant info invalid management cluster access", probe::decoration::rare); throw management_cluster_invalid_access(); } else if (!cx->internal && cx->clientInfo->get().tenantMode == TenantMode::REQUIRED && !t.present()) { CODE_PROBE(true, "Get tenant info tenant name required", probe::decoration::rare); throw tenant_name_required(); } else if (!t.present()) { CODE_PROBE(true, "Get tenant info without tenant"); return TenantInfo(); } else if (cx->clientInfo->get().tenantMode == TenantMode::DISABLED && t.present()) { // If we are running provisional proxies, we allow a tenant request to go through since we don't know the tenant // mode. Such a transaction would not be allowed to commit without enabling provisional commits because either // the commit proxies will be provisional or the read version will be too old. 
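// (In other words, tenants_disabled is only thrown once a real, non-provisional GRV
//  proxy has confirmed the cluster's tenant mode; while only provisional proxies are
//  known, the check is deferred and the transaction simply cannot commit.)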
if (!cx->clientInfo->get().grvProxies.empty() && !cx->clientInfo->get().grvProxies[0].provisional) { CODE_PROBE(true, "Get tenant info use tenant in disabled tenant mode", probe::decoration::rare); throw tenants_disabled(); } else { CODE_PROBE(true, "Get tenant info provisional proxies"); ASSERT(!useProvisionalProxies); } } ASSERT(t.present() && (allowInvalidTenantId || t.get()->id() != TenantInfo::INVALID_TENANT)); return TenantInfo( (allowInvalidTenantId && !t.get()->ready().isReady()) ? TenantInfo::INVALID_TENANT : t.get()->id(), authToken); } // Returns the tenant used in this transaction. If the tenant is unset and raw access isn't specified, then the default // tenant from DatabaseContext is applied to this transaction (note: the default tenant is typically unset, but in // simulation could be something different). // // This function should not be called in the transaction constructor or in the setOption function to allow a user the // opportunity to set raw access. Optional> const& TransactionState::tenant() { hasTenant(ResolveDefaultTenant::True); return tenant_; } // Returns true if the tenant has been set, but does not cause default tenant resolution. This is useful in setOption // (where we do not want to call tenant()) if we want to enforce that an option not be set on a Tenant transaction (e.g. // for raw access). bool TransactionState::hasTenant(ResolveDefaultTenant resolveDefaultTenant) { if (!tenantSet && resolveDefaultTenant) { if (!options.rawAccess && cx->defaultTenant.present()) { tenant_ = makeReference(cx->lookupTenant(cx->defaultTenant.get()), cx->defaultTenant); } tenantSet = true; } return tenant_.present(); } ACTOR Future startTransaction(Reference trState) { wait(success(trState->readVersionFuture)); if (trState->hasTenant()) { wait(trState->tenant().get()->ready()); } return Void(); } Future TransactionState::startTransaction(uint32_t readVersionFlags) { if (!startFuture.isValid()) { if (!readVersionFuture.isValid()) { readVersionFuture = getReadVersion(readVersionFlags); } if (readVersionFuture.isReady() && (!hasTenant() || tenant().get()->ready().isReady())) { startFuture = Void(); } else { startFuture = ::startTransaction(Reference::addRef(this)); } } return startFuture; } Future Transaction::warmRange(KeyRange keys) { return warmRange_impl(trState, keys); } ACTOR Future> getValue(Reference trState, Key key, UseTenant useTenant, TransactionRecordLogInfo recordLogInfo) { wait(trState->startTransaction()); CODE_PROBE(trState->hasTenant(), "NativeAPI getValue has tenant"); state Span span("NAPI:getValue"_loc, trState->spanContext); if (useTenant && trState->hasTenant()) { span.addAttribute("tenant"_sr, trState->tenant().get()->name.castTo().orDefault(""_sr)); } trState->cx->validateVersion(trState->readVersion()); loop { state KeyRangeLocationInfo locationInfo = wait(getKeyLocation(trState, key, &StorageServerInterface::getValue, Reverse::False, useTenant)); state Optional getValueID = Optional(); state uint64_t startTime; state double startTimeD; state VersionVector ssLatestCommitVersions; state Optional readOptions = trState->readOptions; trState->cx->getLatestCommitVersions(locationInfo.locations, trState, ssLatestCommitVersions); try { if (trState->readOptions.present() && trState->readOptions.get().debugID.present()) { getValueID = nondeterministicRandom()->randomUniqueID(); readOptions.get().debugID = getValueID; g_traceBatch.addAttach( "GetValueAttachID", trState->readOptions.get().debugID.get().first(), getValueID.get().first()); 
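// (g_traceBatch events are keyed by the first 64 bits of a UID; the addAttach above
//  links the transaction-wide debugID to this read's freshly generated getValueID so
//  the per-request "GetValueDebug" events that follow can be correlated back to the
//  owning transaction when the trace batch is analyzed.)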
g_traceBatch.addEvent("GetValueDebug", getValueID.get().first(), "NativeAPI.getValue.Before"); //.detail("TaskID", g_network->getCurrentTask()); /*TraceEvent("TransactionDebugGetValueInfo", getValueID.get()) .detail("Key", key) .detail("ReqVersion", ver) .detail("Servers", describe(ssi.second->get()));*/ } ++trState->cx->getValueSubmitted; startTime = timer_int(); startTimeD = now(); ++trState->cx->transactionPhysicalReads; state GetValueReply reply; try { if (CLIENT_BUGGIFY_WITH_PROB(.01)) { throw deterministicRandom()->randomChoice( std::vector{ transaction_too_old(), future_version() }); } choose { when(wait(trState->cx->connectionFileChanged())) { throw transaction_too_old(); } when(GetValueReply _reply = wait( loadBalance(trState->cx.getPtr(), locationInfo.locations, &StorageServerInterface::getValue, GetValueRequest(span.context, useTenant ? trState->getTenantInfo() : TenantInfo(), key, trState->readVersion(), trState->cx->sampleReadTags() ? trState->options.readTags : Optional(), readOptions, ssLatestCommitVersions), TaskPriority::DefaultPromiseEndpoint, AtMostOnce::False, trState->cx->enableLocalityLoadBalance ? &trState->cx->queueModel : nullptr, trState->options.enableReplicaConsistencyCheck, trState->options.requiredReplicas))) { reply = _reply; } } ++trState->cx->transactionPhysicalReadsCompleted; } catch (Error&) { ++trState->cx->transactionPhysicalReadsCompleted; throw; } double latency = now() - startTimeD; trState->cx->readLatencies.addSample(latency); if (trState->trLogInfo && recordLogInfo) { int valueSize = reply.value.present() ? reply.value.get().size() : 0; trState->trLogInfo->addLog(FdbClientLogEvents::EventGet(startTimeD, trState->cx->clientLocality.dcId(), latency, valueSize, key, trState->tenant().flatMapRef(&Tenant::name))); } trState->cx->getValueCompleted->latency = timer_int() - startTime; trState->cx->getValueCompleted->log(); trState->totalCost += getReadOperationCost(key.size() + (reply.value.present() ? reply.value.get().size() : 0)); if (getValueID.present()) { g_traceBatch.addEvent("GetValueDebug", getValueID.get().first(), "NativeAPI.getValue.After"); //.detail("TaskID", g_network->getCurrentTask()); /*TraceEvent("TransactionDebugGetValueDone", getValueID.get()) .detail("Key", key) .detail("ReqVersion", ver) .detail("ReplySize", reply.value.present() ? reply.value.get().size() : -1);*/ } trState->cx->transactionBytesRead += reply.value.present() ? reply.value.get().size() : 0; ++trState->cx->transactionKeysRead; return reply.value; } catch (Error& e) { trState->cx->getValueCompleted->latency = timer_int() - startTime; trState->cx->getValueCompleted->log(); if (getValueID.present()) { g_traceBatch.addEvent("GetValueDebug", getValueID.get().first(), "NativeAPI.getValue.Error"); //.detail("TaskID", g_network->getCurrentTask()); /*TraceEvent("TransactionDebugGetValueDone", getValueID.get()) .detail("Key", key) .detail("ReqVersion", ver) .detail("ReplySize", reply.value.present() ? reply.value.get().size() : -1);*/ } if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed) { trState->cx->invalidateCache(useTenant ? 
trState->tenant().mapRef(&Tenant::prefix) : Optional(), key); wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, trState->taskID)); } else { if (trState->trLogInfo && recordLogInfo) trState->trLogInfo->addLog( FdbClientLogEvents::EventGetError(startTimeD, trState->cx->clientLocality.dcId(), static_cast(e.code()), key, trState->tenant().flatMapRef(&Tenant::name))); throw e; } } } } ACTOR Future getKey(Reference trState, KeySelector k, UseTenant useTenant = UseTenant::True) { CODE_PROBE(!useTenant, "Get key ignoring tenant"); wait(trState->startTransaction()); CODE_PROBE(trState->hasTenant(), "NativeAPI getKey has tenant"); state Optional getKeyID; state Optional readOptions = trState->readOptions; state Span span("NAPI:getKey"_loc, trState->spanContext); if (trState->readOptions.present() && trState->readOptions.get().debugID.present()) { getKeyID = nondeterministicRandom()->randomUniqueID(); readOptions.get().debugID = getKeyID; g_traceBatch.addAttach( "GetKeyAttachID", trState->readOptions.get().debugID.get().first(), getKeyID.get().first()); g_traceBatch.addEvent( "GetKeyDebug", getKeyID.get().first(), "NativeAPI.getKey.AfterVersion"); //.detail("StartKey", // k.getKey()).detail("Offset",k.offset).detail("OrEqual",k.orEqual); } loop { if (k.getKey() == allKeys.end) { if (k.offset > 0) { return allKeys.end; } k.orEqual = false; } else if (k.getKey() == allKeys.begin && k.offset <= 0) { return Key(); } Key locationKey(k.getKey(), k.arena()); state KeyRangeLocationInfo locationInfo = wait(getKeyLocation( trState, locationKey, &StorageServerInterface::getKey, Reverse{ k.isBackward() }, useTenant)); state VersionVector ssLatestCommitVersions; trState->cx->getLatestCommitVersions(locationInfo.locations, trState, ssLatestCommitVersions); try { if (getKeyID.present()) g_traceBatch.addEvent( "GetKeyDebug", getKeyID.get().first(), "NativeAPI.getKey.Before"); //.detail("StartKey", // k.getKey()).detail("Offset",k.offset).detail("OrEqual",k.orEqual); ++trState->cx->transactionPhysicalReads; GetKeyRequest req(span.context, useTenant ? trState->getTenantInfo() : TenantInfo(), k, trState->readVersion(), trState->cx->sampleReadTags() ? trState->options.readTags : Optional(), readOptions, ssLatestCommitVersions); req.arena.dependsOn(k.arena()); state GetKeyReply reply; try { choose { when(wait(trState->cx->connectionFileChanged())) { throw transaction_too_old(); } when(GetKeyReply _reply = wait( loadBalance(trState->cx.getPtr(), locationInfo.locations, &StorageServerInterface::getKey, req, TaskPriority::DefaultPromiseEndpoint, AtMostOnce::False, trState->cx->enableLocalityLoadBalance ? &trState->cx->queueModel : nullptr, trState->options.enableReplicaConsistencyCheck, trState->options.requiredReplicas))) { reply = _reply; } } ++trState->cx->transactionPhysicalReadsCompleted; } catch (Error&) { ++trState->cx->transactionPhysicalReadsCompleted; throw; } if (getKeyID.present()) g_traceBatch.addEvent("GetKeyDebug", getKeyID.get().first(), "NativeAPI.getKey.After"); //.detail("NextKey",reply.sel.key).detail("Offset", // reply.sel.offset).detail("OrEqual", k.orEqual); k = reply.sel; if (!k.offset && k.orEqual) { return k.getKey(); } } catch (Error& e) { if (getKeyID.present()) g_traceBatch.addEvent("GetKeyDebug", getKeyID.get().first(), "NativeAPI.getKey.Error"); if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed) { trState->cx->invalidateCache(useTenant ? 
trState->tenant().mapRef(&Tenant::prefix) : Optional(), k.getKey(), Reverse{ k.isBackward() }); wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, trState->taskID)); } else { TraceEvent(SevInfo, "GetKeyError").error(e).detail("AtKey", k.getKey()).detail("Offset", k.offset); throw e; } } } } ACTOR Future waitForCommittedVersion(Database cx, Version version, SpanContext spanContext) { state Span span("NAPI:waitForCommittedVersion"_loc, spanContext); loop { try { choose { when(wait(cx->onProxiesChanged())) {} when(GetReadVersionReply v = wait(basicLoadBalance( cx->getGrvProxies(UseProvisionalProxies::False), &GrvProxyInterface::getConsistentReadVersion, GetReadVersionRequest( span.context, 0, TransactionPriority::IMMEDIATE, cx->ssVersionVectorCache.getMaxVersion()), cx->taskID))) { cx->minAcceptableReadVersion = std::min(cx->minAcceptableReadVersion, v.version); if (v.midShardSize > 0) cx->smoothMidShardSize.setTotal(v.midShardSize); if (cx->versionVectorCacheActive(v.ssVersionVectorDelta)) { if (cx->isCurrentGrvProxy(v.proxyId)) { cx->ssVersionVectorCache.applyDelta(v.ssVersionVectorDelta); } else { cx->ssVersionVectorCache.clear(); } } if (v.version >= version) return v.version; // SOMEDAY: Do the wait on the server side, possibly use less expensive source of committed version // (causal consistency is not needed for this purpose) wait(delay(CLIENT_KNOBS->FUTURE_VERSION_RETRY_DELAY, cx->taskID)); } } } catch (Error& e) { if (e.code() == error_code_batch_transaction_throttled || e.code() == error_code_grv_proxy_memory_limit_exceeded) { // GRV Proxy returns an error wait(delayJittered(CLIENT_KNOBS->GRV_ERROR_RETRY_DELAY)); } else { TraceEvent(SevError, "WaitForCommittedVersionError").error(e); throw; } } } } ACTOR Future getRawVersion(Reference trState) { state Span span("NAPI:getRawVersion"_loc, trState->spanContext); loop { choose { when(wait(trState->cx->onProxiesChanged())) {} when(GetReadVersionReply v = wait(basicLoadBalance(trState->cx->getGrvProxies(UseProvisionalProxies::False), &GrvProxyInterface::getConsistentReadVersion, GetReadVersionRequest(trState->spanContext, 0, TransactionPriority::IMMEDIATE, trState->cx->ssVersionVectorCache.getMaxVersion()), trState->cx->taskID))) { if (trState->cx->versionVectorCacheActive(v.ssVersionVectorDelta)) { if (trState->cx->isCurrentGrvProxy(v.proxyId)) { trState->cx->ssVersionVectorCache.applyDelta(v.ssVersionVectorDelta); } else { trState->cx->ssVersionVectorCache.clear(); } } return v.version; } } } } ACTOR Future readVersionBatcher( DatabaseContext* cx, FutureStream, Optional>> versionStream, uint32_t flags); ACTOR Future watchValue(Database cx, Reference parameters) { state Span span("NAPI:watchValue"_loc, parameters->spanContext); state Version ver = parameters->version; cx->validateVersion(parameters->version); ASSERT(parameters->version != latestVersion); CODE_PROBE(parameters->tenant.hasTenant(), "NativeAPI watchValue has tenant"); loop { state KeyRangeLocationInfo locationInfo = wait(getKeyLocation(cx, parameters->tenant, parameters->key, &StorageServerInterface::watchValue, parameters->spanContext, parameters->debugID, parameters->useProvisionalProxies, Reverse::False, parameters->version)); try { state Optional watchValueID = Optional(); if (parameters->debugID.present()) { watchValueID = nondeterministicRandom()->randomUniqueID(); g_traceBatch.addAttach( "WatchValueAttachID", parameters->debugID.get().first(), watchValueID.get().first()); g_traceBatch.addEvent("WatchValueDebug", watchValueID.get().first(), 
"NativeAPI.watchValue.Before"); //.detail("TaskID", g_network->getCurrentTask()); } state WatchValueReply resp; choose { when(WatchValueReply r = wait( loadBalance(cx.getPtr(), locationInfo.locations, &StorageServerInterface::watchValue, WatchValueRequest(span.context, parameters->tenant, parameters->key, parameters->value, ver, cx->sampleReadTags() ? parameters->tags : Optional(), watchValueID), TaskPriority::DefaultPromiseEndpoint))) { resp = r; } when(wait(cx->connectionRecord ? cx->connectionRecord->onChange() : Never())) { wait(Never()); } } if (watchValueID.present()) { g_traceBatch.addEvent("WatchValueDebug", watchValueID.get().first(), "NativeAPI.watchValue.After"); } // FIXME: wait for known committed version on the storage server before replying, // cannot do this until the storage server is notified on knownCommittedVersion changes from tlog (faster // than the current update loop) Version v = wait(waitForCommittedVersion(cx, resp.version, span.context)); // False if there is a master failure between getting the response // and getting the committed version, Dependent on // SERVER_KNOBS->MAX_VERSIONS_IN_FLIGHT. Set to around half of the // max versions in flight in an attempt to reliably recognize when // a recovery has occurred, but avoid triggering if it just takes a // little while to get the committed version. bool buggifyRetry = g_network->isSimulated() && !g_simulator->speedUpSimulation && BUGGIFY_WITH_PROB(0.1); CODE_PROBE(buggifyRetry, "Watch buggifying version gap retry"); if (v - resp.version < 50'000'000 && !buggifyRetry) { return resp.version; } ver = v; if (watchValueID.present()) { g_traceBatch.addEvent("WatchValueDebug", watchValueID.get().first(), "NativeAPI.watchValue.Retry"); } } catch (Error& e) { if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed) { cx->invalidateCache(parameters->tenant.prefix, parameters->key); wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, parameters->taskID)); } else if (e.code() == error_code_watch_cancelled || e.code() == error_code_process_behind) { // clang-format off CODE_PROBE(e.code() == error_code_watch_cancelled, "Too many watches on the storage server, poll for changes instead"); CODE_PROBE(e.code() == error_code_process_behind, "The storage servers are all behind", probe::decoration::rare); // clang-format on wait(delay(CLIENT_KNOBS->WATCH_POLLING_TIME, parameters->taskID)); } else if (e.code() == error_code_timed_out) { // The storage server occasionally times out watches in case // it was cancelled CODE_PROBE(true, "A watch timed out"); wait(delay(CLIENT_KNOBS->FUTURE_VERSION_RETRY_DELAY, parameters->taskID)); } else { state Error err = e; wait(delay(CLIENT_KNOBS->FUTURE_VERSION_RETRY_DELAY, parameters->taskID)); throw err; } } } } ACTOR Future watchStorageServerResp(int64_t tenantId, Key key, Database cx) { loop { try { state Reference metadata = cx->getWatchMetadata(tenantId, key); if (!metadata.isValid()) return Void(); Version watchVersion = wait(watchValue(cx, metadata->parameters)); metadata = cx->getWatchMetadata(tenantId, key); if (!metadata.isValid()) return Void(); // case 1: version_1 (SS) >= version_2 (map) if (watchVersion >= metadata->parameters->version) { cx->deleteWatchMetadata(tenantId, key); if (metadata->watchPromise.canBeSet()) metadata->watchPromise.send(watchVersion); } // ABA happens else { CODE_PROBE(true, "ABA issue where the version returned from the server is less than the version in the map"); // case 2: version_1 < version_2 and future_count == 1 
if (metadata->watchPromise.getFutureReferenceCount() == 1) { cx->deleteWatchMetadata(tenantId, key); } } } catch (Error& e) { if (e.code() == error_code_operation_cancelled) { throw e; } Reference metadata = cx->getWatchMetadata(tenantId, key); if (!metadata.isValid()) { return Void(); } else if (metadata->watchPromise.getFutureReferenceCount() == 1) { cx->deleteWatchMetadata(tenantId, key); return Void(); } else if (e.code() == error_code_future_version) { continue; } cx->deleteWatchMetadata(tenantId, key); metadata->watchPromise.sendError(e); throw e; } } } ACTOR Future sameVersionDiffValue(Database cx, Reference parameters) { state ReadYourWritesTransaction tr(cx, parameters->tenant.hasTenant() ? makeReference(parameters->tenant.tenantId) : Optional>()); loop { try { if (!parameters->tenant.hasTenant()) { tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); } state Optional valSS = wait(tr.get(parameters->key)); Reference metadata = cx->getWatchMetadata(parameters->tenant.tenantId, parameters->key); // val_3 != val_1 (storage server value doesn't match value in map) if (metadata.isValid() && valSS != metadata->parameters->value) { cx->deleteWatchMetadata(parameters->tenant.tenantId, parameters->key); metadata->watchPromise.send(parameters->version); metadata->watchFutureSS.cancel(); } // val_3 == val_2 (storage server value matches value passed into the function -> new watch) if (valSS == parameters->value && tr.getTransactionState()->tenantId() == parameters->tenant.tenantId) { metadata = makeReference(parameters); cx->setWatchMetadata(metadata); metadata->watchFutureSS = watchStorageServerResp(parameters->tenant.tenantId, parameters->key, cx); } // if val_3 != val_2 if (valSS != parameters->value) return Void(); // val_3 == val_2 wait(success(metadata->watchPromise.getFuture())); return Void(); } catch (Error& e) { wait(tr.onError(e)); } } } Future getWatchFuture(Database cx, Reference parameters) { Reference metadata = cx->getWatchMetadata(parameters->tenant.tenantId, parameters->key); // case 1: key not in map if (!metadata.isValid()) { metadata = makeReference(parameters); cx->setWatchMetadata(metadata); metadata->watchFutureSS = watchStorageServerResp(parameters->tenant.tenantId, parameters->key, cx); return success(metadata->watchPromise.getFuture()); } // case 2: val_1 == val_2 (received watch with same value as key already in the map so just update) else if (metadata->parameters->value == parameters->value) { if (parameters->version > metadata->parameters->version) { metadata->parameters = parameters; } return success(metadata->watchPromise.getFuture()); } // case 3: val_1 != val_2 && version_2 > version_1 (received watch with different value and a higher version so // recreate in SS) else if (parameters->version > metadata->parameters->version) { CODE_PROBE(true, "Setting a watch that has a different value than the one in the map but a higher version (newer)"); cx->deleteWatchMetadata(parameters->tenant.tenantId, parameters->key); metadata->watchPromise.send(parameters->version); metadata->watchFutureSS.cancel(); metadata = makeReference(parameters); cx->setWatchMetadata(metadata); metadata->watchFutureSS = watchStorageServerResp(parameters->tenant.tenantId, parameters->key, cx); return success(metadata->watchPromise.getFuture()); } // case 5: val_1 != val_2 && version_1 == version_2 (received watch with different value but same version) else if (metadata->parameters->version == parameters->version) { CODE_PROBE(true, "Setting a watch which has a different value than the one 
in the map but the same version"); return sameVersionDiffValue(cx, parameters); } CODE_PROBE(true, "Setting a watch which has a different value than the one in the map but a lower version (older)"); // case 4: val_1 != val_2 && version_2 < version_1 return Void(); } namespace { // NOTE: Since an ACTOR could receive multiple exceptions for a single catch clause, e.g. broken promise together with // operation cancelled, If the decreaseWatchRefCount is placed at the catch clause, it might be triggered for multiple // times. One could check if the SAV isSet, but seems a more intuitive way is to use RAII-style constructor/destructor // pair. Yet the object has to be constructed after a wait statement, so it must be trivially-constructible. This // requires move-assignment operator implemented. class WatchRefCountUpdater { Database cx; int64_t tenantID; KeyRef key; Version version; public: WatchRefCountUpdater() = default; WatchRefCountUpdater(const Database& cx_, const int64_t tenantID_, KeyRef key_, const Version& ver) : cx(cx_), tenantID(tenantID_), key(key_), version(ver) {} WatchRefCountUpdater& operator=(WatchRefCountUpdater&& other) { if (cx.getReference()) { cx->decreaseWatchRefCount(tenantID, key, version); } cx = std::move(other.cx); tenantID = std::move(other.tenantID); key = std::move(other.key); version = std::move(other.version); cx->increaseWatchRefCount(tenantID, key, version); return *this; } ~WatchRefCountUpdater() { if (cx.getReference()) { cx->decreaseWatchRefCount(tenantID, key, version); } } }; } // namespace ACTOR Future watchValueMap(Future version, TenantInfo tenant, Key key, Optional value, Database cx, TagSet tags, SpanContext spanContext, TaskPriority taskID, Optional debugID, UseProvisionalProxies useProvisionalProxies) { state Version ver = wait(version); state WatchRefCountUpdater watchRefCountUpdater(cx, tenant.tenantId, key, ver); wait(getWatchFuture(cx, makeReference( tenant, key, value, ver, tags, spanContext, taskID, debugID, useProvisionalProxies))); return Void(); } template void transformRangeLimits(GetRangeLimits limits, Reverse reverse, GetKeyValuesFamilyRequest& req) { if (limits.bytes != 0) { if (!limits.hasRowLimit()) req.limit = CLIENT_KNOBS->REPLY_BYTE_LIMIT; // Can't get more than this many rows anyway else req.limit = std::min(CLIENT_KNOBS->REPLY_BYTE_LIMIT, limits.rows); if (reverse) req.limit *= -1; if (!limits.hasByteLimit()) req.limitBytes = CLIENT_KNOBS->REPLY_BYTE_LIMIT; else req.limitBytes = std::min(CLIENT_KNOBS->REPLY_BYTE_LIMIT, limits.bytes); } else { req.limitBytes = CLIENT_KNOBS->REPLY_BYTE_LIMIT; req.limit = reverse ? -limits.minRows : limits.minRows; } } template PublicRequestStream StorageServerInterface::*getRangeRequestStream() { if constexpr (std::is_same::value) { return &StorageServerInterface::getKeyValues; } else if (std::is_same::value) { return &StorageServerInterface::getMappedKeyValues; } else { UNREACHABLE(); } } ACTOR template Future getExactRange(Reference trState, KeyRange keys, Key mapper, GetRangeLimits limits, Reverse reverse, UseTenant useTenant) { state RangeResultFamily output; // TODO - ljoswiak parent or link? 
state Span span("NAPI:getExactRange"_loc, trState->spanContext); CODE_PROBE(trState->hasTenant() && useTenant, "NativeAPI getExactRange has tenant"); CODE_PROBE(!useTenant, "NativeAPI getExactRange ignoring tenant"); if (useTenant && trState->hasTenant()) { span.addAttribute("tenant"_sr, trState->tenant().get()->name.castTo().orDefault(""_sr)); } // printf("getExactRange( '%s', '%s' )\n", keys.begin.toString().c_str(), keys.end.toString().c_str()); loop { state std::vector locations = wait(getKeyRangeLocations(trState, keys, CLIENT_KNOBS->GET_RANGE_SHARD_LIMIT, reverse, getRangeRequestStream(), useTenant)); ASSERT(locations.size()); state int shard = 0; loop { const KeyRangeRef& range = locations[shard].range; GetKeyValuesFamilyRequest req; req.mapper = mapper; req.arena.dependsOn(mapper.arena()); req.tenantInfo = useTenant ? trState->getTenantInfo() : TenantInfo(); req.version = trState->readVersion(); req.begin = firstGreaterOrEqual(range.begin); req.end = firstGreaterOrEqual(range.end); req.spanContext = span.context; trState->cx->getLatestCommitVersions(locations[shard].locations, trState, req.ssLatestCommitVersions); // keep shard's arena around in case of async tss comparison req.arena.dependsOn(locations[shard].range.arena()); transformRangeLimits(limits, reverse, req); ASSERT(req.limitBytes > 0 && req.limit != 0 && req.limit < 0 == reverse); // FIXME: buggify byte limits on internal functions that use them, instead of globally req.tags = trState->cx->sampleReadTags() ? trState->options.readTags : Optional(); req.options = trState->readOptions; try { if (trState->readOptions.present() && trState->readOptions.get().debugID.present()) { g_traceBatch.addEvent("TransactionDebug", trState->readOptions.get().debugID.get().first(), "NativeAPI.getExactRange.Before"); /*TraceEvent("TransactionDebugGetExactRangeInfo", trState->readOptions.get().debugID.get()) .detail("ReqBeginKey", req.begin.getKey()) .detail("ReqEndKey", req.end.getKey()) .detail("ReqLimit", req.limit) .detail("ReqLimitBytes", req.limitBytes) .detail("ReqVersion", req.version) .detail("Reverse", reverse) .detail("Servers", locations[shard].locations->locations()->description());*/ } ++trState->cx->transactionPhysicalReads; state GetKeyValuesFamilyReply rep; try { choose { when(wait(trState->cx->connectionFileChanged())) { throw transaction_too_old(); } when(GetKeyValuesFamilyReply _rep = wait(loadBalance( trState->cx.getPtr(), locations[shard].locations, getRangeRequestStream(), req, TaskPriority::DefaultPromiseEndpoint, AtMostOnce::False, trState->cx->enableLocalityLoadBalance ? 
&trState->cx->queueModel : nullptr, trState->options.enableReplicaConsistencyCheck, trState->options.requiredReplicas))) { rep = _rep; } } ++trState->cx->transactionPhysicalReadsCompleted; } catch (Error&) { ++trState->cx->transactionPhysicalReadsCompleted; throw; } if (trState->readOptions.present() && trState->readOptions.get().debugID.present()) g_traceBatch.addEvent("TransactionDebug", trState->readOptions.get().debugID.get().first(), "NativeAPI.getExactRange.After"); output.arena().dependsOn(rep.arena); output.append(output.arena(), rep.data.begin(), rep.data.size()); if (limits.hasRowLimit() && rep.data.size() > limits.rows) { TraceEvent(SevError, "GetExactRangeTooManyRows") .detail("RowLimit", limits.rows) .detail("DeliveredRows", output.size()); ASSERT(false); } limits.decrement(rep.data); if (limits.isReached()) { output.more = true; return output; } bool more = rep.more; // If the reply says there is more but we know that we finished the shard, then fix rep.more if (reverse && more && rep.data.size() > 0 && output[output.size() - 1].key == locations[shard].range.begin) more = false; if (more) { if (!rep.data.size()) { TraceEvent(SevError, "GetExactRangeError") .detail("Reason", "More data indicated but no rows present") .detail("LimitBytes", limits.bytes) .detail("LimitRows", limits.rows) .detail("OutputSize", output.size()) .detail("OutputBytes", output.expectedSize()) .detail("BlockSize", rep.data.size()) .detail("BlockBytes", rep.data.expectedSize()); ASSERT(false); } CODE_PROBE(true, "GetKeyValuesFamilyReply.more in getExactRange"); // Make next request to the same shard with a beginning key just after the last key returned if (reverse) locations[shard].range = KeyRangeRef(locations[shard].range.begin, output[output.size() - 1].key); else locations[shard].range = KeyRangeRef(keyAfter(output[output.size() - 1].key), locations[shard].range.end); } bool redoKeyLocationRequest = false; if (!more || locations[shard].range.empty()) { CODE_PROBE(true, "getExactrange (!more || locations[shard].first.empty())"); if (shard == locations.size() - 1) { const KeyRangeRef& range = locations[shard].range; KeyRef begin = reverse ? keys.begin : range.end; KeyRef end = reverse ? range.begin : keys.end; if (begin >= end) { output.more = false; return output; } keys = KeyRangeRef(begin, end); redoKeyLocationRequest = true; } ++shard; } // Soft byte limit - return results early if the user specified a byte limit and we got results // This can prevent problems where the desired range spans many shards and would be too slow to // fetch entirely. if (limits.hasSatisfiedMinRows() && output.size() > 0) { output.more = true; return output; } if (redoKeyLocationRequest) { CODE_PROBE(true, "Multiple requests of key locations"); break; } } catch (Error& e) { if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed) { const KeyRangeRef& range = locations[shard].range; if (reverse) keys = KeyRangeRef(keys.begin, range.end); else keys = KeyRangeRef(range.begin, keys.end); trState->cx->invalidateCache( useTenant ? 
trState->tenant().mapRef(&Tenant::prefix) : Optional<KeyRef>(), keys);
					wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, trState->taskID));
					break;
				} else {
					TraceEvent(SevInfo, "GetExactRangeError")
					    .error(e)
					    .detail("Tenant", trState->tenant())
					    .detail("ShardBegin", locations[shard].range.begin)
					    .detail("ShardEnd", locations[shard].range.end);
					throw;
				}
			}
		}
	}
}

Future<Key> resolveKey(Reference<TransactionState> trState, KeySelector const& key, UseTenant useTenant) {
	if (key.isFirstGreaterOrEqual())
		return Future<Key>(key.getKey());

	if (key.isFirstGreaterThan())
		return Future<Key>(keyAfter(key.getKey()));

	return getKey(trState, key, useTenant);
}

ACTOR template <class GetKeyValuesFamilyRequest, class GetKeyValuesFamilyReply, class RangeResultFamily>
Future<RangeResultFamily> getRangeFallback(Reference<TransactionState> trState,
                                           KeySelector begin,
                                           KeySelector end,
                                           Key mapper,
                                           GetRangeLimits limits,
                                           Reverse reverse,
                                           UseTenant useTenant) {
	CODE_PROBE(trState->hasTenant() && useTenant, "NativeAPI getRangeFallback has tenant");
	CODE_PROBE(!useTenant, "NativeAPI getRangeFallback ignoring tenant");

	Future<Key> fb = resolveKey(trState, begin, useTenant);
	state Future<Key> fe = resolveKey(trState, end, useTenant);

	state Key b = wait(fb);
	state Key e = wait(fe);

	if (b >= e) {
		return RangeResultFamily();
	}

	// if e is allKeys.end, we have read through the end of the database/tenant
	// if b is allKeys.begin, we have either read through the beginning of the database/tenant,
	// or allKeys.begin exists in the database/tenant and will be part of the conflict range anyways

	RangeResultFamily _r = wait(getExactRange<GetKeyValuesFamilyRequest, GetKeyValuesFamilyReply, RangeResultFamily>(
	    trState, KeyRangeRef(b, e), mapper, limits, reverse, useTenant));
	RangeResultFamily r = _r;

	if (b == allKeys.begin && ((reverse && !r.more) || !reverse))
		r.readToBegin = true;

	// TODO: this currently causes us to have a conflict range that is too large if our end key resolves to the
	// key after the last key in the database. In that case, we don't need a conflict between the last key and
	// the end of the database.
	//
	// If fixed, the ConflictRange test can be updated to stop checking for this condition.
	if (e == allKeys.end && ((!reverse && !r.more) || reverse))
		r.readThroughEnd = true;

	ASSERT(!limits.hasRowLimit() || r.size() <= limits.rows);

	// If we were limiting bytes and the returned range exceeds the requested byte limit (plus an allowance for
	// one maximum-size system key and one maximum-size value), log a warning
	if (limits.hasByteLimit() &&
	    r.expectedSize() >
	        size_t(limits.bytes + CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT + CLIENT_KNOBS->VALUE_SIZE_LIMIT + 1) &&
	    limits.minRows == 0) {
		TraceEvent(SevWarnAlways, "GetRangeFallbackTooMuchData")
		    .detail("LimitBytes", limits.bytes)
		    .detail("DeliveredBytes", r.expectedSize())
		    .detail("LimitRows", limits.rows)
		    .detail("DeliveredRows", r.size());
	}

	return r;
}

int64_t inline getRangeResultFamilyBytes(RangeResultRef result) {
	return result.expectedSize();
}

int64_t inline getRangeResultFamilyBytes(MappedRangeResultRef result) {
	int64_t bytes = 0;
	for (const MappedKeyValueRef& mappedKeyValue : result) {
		bytes += mappedKeyValue.key.size() + mappedKeyValue.value.size();
		auto& reqAndResult = mappedKeyValue.reqAndResult;
		if (std::holds_alternative<GetValueReqAndResultRef>(reqAndResult)) {
			auto getValue = std::get<GetValueReqAndResultRef>(reqAndResult);
			bytes += getValue.expectedSize();
		} else if (std::holds_alternative<GetRangeReqAndResultRef>(reqAndResult)) {
			auto getRange = std::get<GetRangeReqAndResultRef>(reqAndResult);
			bytes += getRange.result.expectedSize();
		} else {
			throw internal_error();
		}
	}
	return bytes;
}

// TODO: Client should add mapped keys to conflict ranges.
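// Worked example (illustrative, hypothetical numbers): consider one MappedKeyValueRef
// whose primary row is key "idx/42" (6 bytes) with an empty value, and whose
// secondary request was a nested get-range returning two 10-byte records:
//
//     bytes charged = 6 (key) + 0 (value) + 20 (nested result expectedSize) = 26
//
// Summed over all rows, this is what getRangeResultFamilyBytes reports, and it feeds
// getReadOperationCost() in getRangeFinished below, so mapped reads are billed for
// the data they actually materialize rather than just the primary rows.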
template // RangeResult or MappedRangeResult void getRangeFinished(Reference trState, double startTime, KeySelector begin, KeySelector end, Snapshot snapshot, Promise> conflictRange, Reverse reverse, RangeResultFamily result) { int64_t bytes = getRangeResultFamilyBytes(result); trState->totalCost += getReadOperationCost(bytes); trState->cx->transactionBytesRead += bytes; trState->cx->transactionKeysRead += result.size(); if (trState->trLogInfo) { trState->trLogInfo->addLog(FdbClientLogEvents::EventGetRange(startTime, trState->cx->clientLocality.dcId(), now() - startTime, bytes, begin.getKey(), end.getKey(), trState->tenant().flatMapRef(&Tenant::name))); } if (!snapshot) { Key rangeBegin; Key rangeEnd; if (result.readToBegin) { rangeBegin = allKeys.begin; } else if (((!reverse || !result.more || begin.offset > 1) && begin.offset > 0) || result.size() == 0) { rangeBegin = Key(begin.getKey(), begin.arena()); } else { rangeBegin = reverse ? result.end()[-1].key : result[0].key; } if (end.offset > begin.offset && end.getKey() < rangeBegin) { rangeBegin = Key(end.getKey(), end.arena()); } if (result.readThroughEnd) { rangeEnd = allKeys.end; } else if (((reverse || !result.more || end.offset <= 0) && end.offset <= 1) || result.size() == 0) { rangeEnd = Key(end.getKey(), end.arena()); } else { rangeEnd = keyAfter(reverse ? result[0].key : result.end()[-1].key); } if (begin.offset < end.offset && begin.getKey() > rangeEnd) { rangeEnd = Key(begin.getKey(), begin.arena()); } conflictRange.send(std::make_pair(rangeBegin, rangeEnd)); } } ACTOR template Future getRange(Reference trState, KeySelector begin, KeySelector end, Key mapper, GetRangeLimits limits, Promise> conflictRange, Snapshot snapshot, Reverse reverse, UseTenant useTenant = UseTenant::True) { // state using RangeResultRefFamily = typename RangeResultFamily::RefType; state GetRangeLimits originalLimits(limits); state KeySelector originalBegin = begin; state KeySelector originalEnd = end; state RangeResultFamily output; state Span span("NAPI:getRange"_loc, trState->spanContext); state Optional getRangeID = Optional(); CODE_PROBE(trState->hasTenant() && useTenant, "NativeAPI getExactRange has tenant"); CODE_PROBE(!useTenant, "Get range ignoring tenant"); if (useTenant && trState->hasTenant()) { span.addAttribute("tenant"_sr, trState->tenant().get()->name.castTo().orDefault(""_sr)); } try { wait(trState->startTransaction()); trState->cx->validateVersion(trState->readVersion()); state double startTime = now(); if (begin.getKey() == allKeys.begin && begin.offset < 1) { output.readToBegin = true; begin = KeySelector(firstGreaterOrEqual(begin.getKey()), begin.arena()); } ASSERT(!limits.isReached()); ASSERT((!limits.hasRowLimit() || limits.rows >= limits.minRows) && limits.minRows >= 0); loop { if (end.getKey() == allKeys.begin && (end.offset < 1 || end.isFirstGreaterOrEqual())) { getRangeFinished( trState, startTime, originalBegin, originalEnd, snapshot, conflictRange, reverse, output); return output; } Key locationKey = reverse ? Key(end.getKey(), end.arena()) : Key(begin.getKey(), begin.arena()); Reverse locationBackward{ reverse ? (end - 1).isBackward() : begin.isBackward() }; state KeyRangeLocationInfo beginServer = wait(getKeyLocation( trState, locationKey, getRangeRequestStream(), locationBackward, useTenant)); state KeyRange shard = beginServer.range; state bool modifiedSelectors = false; state GetKeyValuesFamilyRequest req; req.mapper = mapper; req.arena.dependsOn(mapper.arena()); req.tenantInfo = useTenant ? 
trState->getTenantInfo() : TenantInfo(); req.options = trState->readOptions; req.version = trState->readVersion(); trState->cx->getLatestCommitVersions(beginServer.locations, trState, req.ssLatestCommitVersions); // In case of async tss comparison, also make req arena depend on begin, end, and/or shard's arena depending // on which is used bool dependOnShard = false; if (reverse && (begin - 1).isDefinitelyLess(shard.begin) && (!begin.isFirstGreaterOrEqual() || begin.getKey() != shard.begin)) { // In this case we would be setting modifiedSelectors to true, but // not modifying anything req.begin = firstGreaterOrEqual(shard.begin); modifiedSelectors = true; req.arena.dependsOn(shard.arena()); dependOnShard = true; } else { req.begin = begin; req.arena.dependsOn(begin.arena()); } if (!reverse && end.isDefinitelyGreater(shard.end)) { req.end = firstGreaterOrEqual(shard.end); modifiedSelectors = true; if (!dependOnShard) { req.arena.dependsOn(shard.arena()); } } else { req.end = end; req.arena.dependsOn(end.arena()); } transformRangeLimits(limits, reverse, req); ASSERT(req.limitBytes > 0 && req.limit != 0 && req.limit < 0 == reverse); req.tags = trState->cx->sampleReadTags() ? trState->options.readTags : Optional(); req.spanContext = span.context; if (trState->readOptions.present() && trState->readOptions.get().debugID.present()) { getRangeID = nondeterministicRandom()->randomUniqueID(); g_traceBatch.addAttach( "TransactionAttachID", trState->readOptions.get().debugID.get().first(), getRangeID.get().first()); } try { if (getRangeID.present()) { g_traceBatch.addEvent("TransactionDebug", getRangeID.get().first(), "NativeAPI.getRange.Before"); /* if (trState->readOptions.present() && trState->readOptions.get().debugID.present()) { TraceEvent("TransactionDebugGetRangeInfo", trState->readOptions.get().debugID.get()) .detail("ReqBeginKey", req.begin.getKey()) .detail("ReqEndKey", req.end.getKey()) .detail("OriginalBegin", originalBegin.toString()) .detail("OriginalEnd", originalEnd.toString()) .detail("Begin", begin.toString()) .detail("End", end.toString()) .detail("Shard", shard) .detail("ReqLimit", req.limit) .detail("ReqLimitBytes", req.limitBytes) .detail("ReqVersion", req.version) .detail("Reverse", reverse) .detail("ModifiedSelectors", modifiedSelectors) .detail("Servers", beginServer.locations->locations()->description()); }*/ } ++trState->cx->transactionPhysicalReads; state GetKeyValuesFamilyReply rep; try { if (CLIENT_BUGGIFY_WITH_PROB(.01)) { throw deterministicRandom()->randomChoice( std::vector{ transaction_too_old(), future_version() }); } // state AnnotateActor annotation(currentLineage); GetKeyValuesFamilyReply _rep = wait(loadBalance(trState->cx.getPtr(), beginServer.locations, getRangeRequestStream(), req, TaskPriority::DefaultPromiseEndpoint, AtMostOnce::False, trState->cx->enableLocalityLoadBalance ? 
&trState->cx->queueModel : nullptr, trState->options.enableReplicaConsistencyCheck, trState->options.requiredReplicas)); rep = _rep; ++trState->cx->transactionPhysicalReadsCompleted; } catch (Error&) { ++trState->cx->transactionPhysicalReadsCompleted; throw; } if (getRangeID.present()) { g_traceBatch.addEvent("TransactionDebug", getRangeID.get().first(), "NativeAPI.getRange.After"); //.detail("SizeOf", rep.data.size()); /* if (trState->readOptions.present() && trState->readOptions.get().debugID.present()) { TraceEvent("TransactionDebugGetRangeDone", trState->readOptions.get().debugID.get()) .detail("ReqBeginKey", req.begin.getKey()) .detail("ReqEndKey", req.end.getKey()) .detail("RepIsMore", rep.more) .detail("VersionReturned", rep.version) .detail("RowsReturned", rep.data.size()); }*/ } ASSERT(!rep.more || rep.data.size()); ASSERT(!limits.hasRowLimit() || rep.data.size() <= limits.rows); limits.decrement(rep.data); if (reverse && begin.isLastLessOrEqual() && rep.data.size() && rep.data.end()[-1].key == begin.getKey()) { modifiedSelectors = false; } bool finished = limits.isReached() || (!modifiedSelectors && !rep.more) || limits.hasSatisfiedMinRows(); bool readThrough = modifiedSelectors && !rep.more; // optimization: first request got all data--just return it if (finished && !output.size()) { bool readToBegin = output.readToBegin; bool readThroughEnd = output.readThroughEnd; using RangeResultRefFamily = typename RangeResultFamily::RefType; output = RangeResultFamily( RangeResultRefFamily(rep.data, modifiedSelectors || limits.isReached() || rep.more), rep.arena); output.readToBegin = readToBegin; output.readThroughEnd = readThroughEnd; if (BUGGIFY && limits.hasByteLimit() && output.size() > std::max(1, originalLimits.minRows) && (!std::is_same::value)) { // Copy instead of resizing because TSS maybe be using output's arena for comparison. This only // happens in simulation so it's fine // disable it on prefetch, because boundary entries serve as continuations RangeResultFamily copy; int newSize = deterministicRandom()->randomInt(std::max(1, originalLimits.minRows), output.size()); for (int i = 0; i < newSize; i++) { copy.push_back_deep(copy.arena(), output[i]); } output = copy; output.more = true; getRangeFinished( trState, startTime, originalBegin, originalEnd, snapshot, conflictRange, reverse, output); return output; } if (readThrough) { output.arena().dependsOn(shard.arena()); // As modifiedSelectors is true, more is also true. Then set readThrough to the shard boundary. ASSERT(modifiedSelectors); output.more = true; output.setReadThrough(reverse ? shard.begin : shard.end); } getRangeFinished( trState, startTime, originalBegin, originalEnd, snapshot, conflictRange, reverse, output); if (!output.more) { ASSERT(!output.readThrough.present()); } return output; } output.arena().dependsOn(rep.arena); output.append(output.arena(), rep.data.begin(), rep.data.size()); if (finished) { output.more = modifiedSelectors || limits.isReached() || rep.more; if (readThrough) { output.arena().dependsOn(shard.arena()); output.setReadThrough(reverse ? 
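/* (When the selectors were clipped to this shard (modifiedSelectors), record the
   shard boundary as the point the scan read through, so a resumed request knows
   where to continue: the shard's begin key for a reverse scan, its end otherwise.) */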
shard.begin : shard.end); } getRangeFinished( trState, startTime, originalBegin, originalEnd, snapshot, conflictRange, reverse, output); if (!output.more) { ASSERT(!output.readThrough.present()); } return output; } if (!rep.more) { ASSERT(modifiedSelectors); CODE_PROBE(true, "!GetKeyValuesFamilyReply.more and modifiedSelectors in getRange"); if (!rep.data.size()) { RangeResultFamily result = wait( getRangeFallback( trState, originalBegin, originalEnd, mapper, originalLimits, reverse, useTenant)); getRangeFinished( trState, startTime, originalBegin, originalEnd, snapshot, conflictRange, reverse, result); return result; } if (reverse) end = firstGreaterOrEqual(shard.begin); else begin = firstGreaterOrEqual(shard.end); } else { CODE_PROBE(true, "GetKeyValuesFamilyReply.more in getRange"); if (reverse) end = firstGreaterOrEqual(output[output.size() - 1].key); else begin = firstGreaterThan(output[output.size() - 1].key); } } catch (Error& e) { if (getRangeID.present()) { g_traceBatch.addEvent("TransactionDebug", getRangeID.get().first(), "NativeAPI.getRange.Error"); TraceEvent("TransactionDebugError", getRangeID.get()).error(e); } if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed) { trState->cx->invalidateCache(useTenant ? trState->tenant().mapRef(&Tenant::prefix) : Optional(), reverse ? end.getKey() : begin.getKey(), Reverse{ reverse ? (end - 1).isBackward() : begin.isBackward() }); if (e.code() == error_code_wrong_shard_server) { RangeResultFamily result = wait( getRangeFallback( trState, originalBegin, originalEnd, mapper, originalLimits, reverse, useTenant)); getRangeFinished( trState, startTime, originalBegin, originalEnd, snapshot, conflictRange, reverse, result); return result; } wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, trState->taskID)); } else { if (trState->trLogInfo) trState->trLogInfo->addLog( FdbClientLogEvents::EventGetRangeError(startTime, trState->cx->clientLocality.dcId(), static_cast(e.code()), begin.getKey(), end.getKey(), trState->tenant().flatMapRef(&Tenant::name))); throw e; } } } } catch (Error& e) { if (conflictRange.canBeSet()) { conflictRange.send(std::make_pair(Key(), Key())); } throw; } } template struct TSSDuplicateStreamData { PromiseStream stream; Promise tssComparisonDone; // empty constructor for optional? 
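// (This struct pairs the duplicated storage-server reply stream with a completion
//  signal: the client forwards each SS reply into `stream`, while the
//  tssStreamComparison actor below drains it, compares against the TSS's replies,
//  and fires tssComparisonDone when either side ends or a mismatch is recorded.)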
TSSDuplicateStreamData() {} TSSDuplicateStreamData(PromiseStream stream) : stream(stream) {} bool done() { return tssComparisonDone.getFuture().isReady(); } void setDone() { if (tssComparisonDone.canBeSet()) { tssComparisonDone.send(Void()); } } ~TSSDuplicateStreamData() {} }; // Error tracking here is weird, and latency doesn't really mean the same thing here as it does with normal tss // comparisons, so this is pretty much just counting mismatches ACTOR template static Future tssStreamComparison(Request request, TSSDuplicateStreamData streamData, ReplyPromiseStream tssReplyStream, TSSEndpointData tssData) { state bool ssEndOfStream = false; state bool tssEndOfStream = false; state Optional ssReply = Optional(); state Optional tssReply = Optional(); loop { // reset replies ssReply = Optional(); tssReply = Optional(); state double startTime = now(); // wait for ss response try { REPLYSTREAM_TYPE(Request) _ssReply = waitNext(streamData.stream.getFuture()); ssReply = _ssReply; } catch (Error& e) { if (e.code() == error_code_actor_cancelled) { streamData.setDone(); throw; } if (e.code() == error_code_end_of_stream) { // ss response will be set to empty, to compare to the SS response if it wasn't empty and cause a // mismatch ssEndOfStream = true; } else { tssData.metrics->ssError(e.code()); } CODE_PROBE(e.code() != error_code_end_of_stream, "SS got error in TSS stream comparison"); } state double sleepTime = std::max(startTime + FLOW_KNOBS->LOAD_BALANCE_TSS_TIMEOUT - now(), 0.0); // wait for tss response try { choose { when(REPLYSTREAM_TYPE(Request) _tssReply = waitNext(tssReplyStream.getFuture())) { tssReply = _tssReply; } when(wait(delay(sleepTime))) { ++tssData.metrics->tssTimeouts; CODE_PROBE(true, "Got TSS timeout in stream comparison", probe::decoration::rare); } } } catch (Error& e) { if (e.code() == error_code_actor_cancelled) { streamData.setDone(); throw; } if (e.code() == error_code_end_of_stream) { // tss response will be set to empty, to compare to the SS response if it wasn't empty and cause a // mismatch tssEndOfStream = true; } else { tssData.metrics->tssError(e.code()); } CODE_PROBE(e.code() != error_code_end_of_stream, "TSS got error in TSS stream comparison"); } if (!ssEndOfStream || !tssEndOfStream) { ++tssData.metrics->streamComparisons; } // if both are successful, compare if (ssReply.present() && tssReply.present()) { // compare results // FIXME: this code is pretty much identical to LoadBalance.h // TODO could add team check logic in if we added synchronous way to turn this into a fixed getRange request // and send it to the whole team and compare? I think it's fine to skip that for streaming though // skip tss comparison if both are end of stream if ((!ssEndOfStream || !tssEndOfStream) && !TSS_doCompare(ssReply.get(), tssReply.get())) { CODE_PROBE(true, "TSS mismatch in stream comparison"); TraceEvent mismatchEvent( (g_network->isSimulated() && g_simulator->tssMode == ISimulator::TSSMode::EnabledDropMutations) ? 
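/* (Severity choice: in simulation runs where the TSS intentionally drops mutations,
   mismatches are expected and are logged as SevWarnAlways; in any other configuration
   a stream mismatch is a genuine correctness signal and is logged as SevError.) */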
SevWarnAlways : SevError, LB_mismatchTraceName(request, TSS_COMPARISON)); mismatchEvent.setMaxEventLength(FLOW_KNOBS->TSS_LARGE_TRACE_SIZE); mismatchEvent.detail("TSSID", tssData.tssId); if (tssData.metrics->shouldRecordDetailedMismatch()) { TSS_traceMismatch(mismatchEvent, request, ssReply.get(), tssReply.get(), TSS_COMPARISON); CODE_PROBE(FLOW_KNOBS->LOAD_BALANCE_TSS_MISMATCH_TRACE_FULL, "Tracing Full TSS Mismatch in stream comparison", probe::decoration::rare); CODE_PROBE(!FLOW_KNOBS->LOAD_BALANCE_TSS_MISMATCH_TRACE_FULL, "Tracing Partial TSS Mismatch in stream comparison and storing the rest in FDB"); if (!FLOW_KNOBS->LOAD_BALANCE_TSS_MISMATCH_TRACE_FULL) { mismatchEvent.disable(); UID mismatchUID = deterministicRandom()->randomUniqueID(); tssData.metrics->recordDetailedMismatchData(mismatchUID, mismatchEvent.getFields().toString()); // record a summarized trace event instead TraceEvent summaryEvent((g_network->isSimulated() && g_simulator->tssMode == ISimulator::TSSMode::EnabledDropMutations) ? SevWarnAlways : SevError, LB_mismatchTraceName(request, TSS_COMPARISON)); summaryEvent.detail("TSSID", tssData.tssId).detail("MismatchId", mismatchUID); } } else { // don't record trace event mismatchEvent.disable(); } streamData.setDone(); return Void(); } } if (!ssReply.present() || !tssReply.present() || ssEndOfStream || tssEndOfStream) { // if both streams don't still have more data, stop comparison streamData.setDone(); return Void(); } } } // Currently only used for GetKeyValuesStream but could easily be plugged for other stream types // User of the stream has to forward the SS's responses to the returned promise stream, if it is set template Optional> maybeDuplicateTSSStreamFragment(Request& req, QueueModel* model, RequestStream const* ssStream) { if (model) { Optional tssData = model->getTssData(ssStream->getEndpoint().token.first()); if (tssData.present()) { CODE_PROBE(true, "duplicating stream to TSS"); resetReply(req); // FIXME: optimize to avoid creating new netNotifiedQueueWithAcknowledgements for each stream duplication RequestStream tssRequestStream(tssData.get().endpoint); ReplyPromiseStream tssReplyStream = tssRequestStream.getReplyStream(req); PromiseStream ssDuplicateReplyStream; TSSDuplicateStreamData streamData(ssDuplicateReplyStream); model->addActor.send(tssStreamComparison(req, streamData, tssReplyStream, tssData.get())); return Optional>(streamData); } } return Optional>(); } // Streams all of the KV pairs in a target key range into a ParallelStream fragment ACTOR Future getRangeStreamFragment(Reference trState, ParallelStream::Fragment* results, KeyRange keys, GetRangeLimits limits, Snapshot snapshot, Reverse reverse, SpanContext spanContext) { loop { state std::vector locations = wait(getKeyRangeLocations(trState, keys, CLIENT_KNOBS->GET_RANGE_SHARD_LIMIT, reverse, &StorageServerInterface::getKeyValuesStream, UseTenant::True)); ASSERT(locations.size()); state int shard = 0; loop { const KeyRange& range = locations[shard].range; state Optional> tssDuplicateStream; state GetKeyValuesStreamRequest req; req.tenantInfo = trState->getTenantInfo(); req.version = trState->readVersion(); req.begin = firstGreaterOrEqual(range.begin); req.end = firstGreaterOrEqual(range.end); req.spanContext = spanContext; req.limit = reverse ? 
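/* (Sign convention, as in transformRangeLimits earlier: a negative limit tells the
   storage server to scan the shard in reverse. The stream variant effectively lifts
   the row and byte caps and relies on streaming flow control for pacing instead.) */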
-CLIENT_KNOBS->REPLY_BYTE_LIMIT : CLIENT_KNOBS->REPLY_BYTE_LIMIT; req.limitBytes = std::numeric_limits::max(); req.options = trState->readOptions; trState->cx->getLatestCommitVersions(locations[shard].locations, trState, req.ssLatestCommitVersions); // keep shard's arena around in case of async tss comparison req.arena.dependsOn(range.arena()); ASSERT(req.limitBytes > 0 && req.limit != 0 && req.limit < 0 == reverse); // FIXME: buggify byte limits on internal functions that use them, instead of globally req.tags = trState->cx->sampleReadTags() ? trState->options.readTags : Optional(); try { if (trState->readOptions.present() && trState->readOptions.get().debugID.present()) { g_traceBatch.addEvent("TransactionDebug", trState->readOptions.get().debugID.get().first(), "NativeAPI.RangeStream.Before"); } ++trState->cx->transactionPhysicalReads; state GetKeyValuesStreamReply rep; if (locations[shard].locations->size() == 0) { wait(trState->cx->connectionFileChanged()); results->sendError(transaction_too_old()); return Void(); } state int useIdx = -1; loop { // FIXME: create a load balance function for this code so future users of reply streams do not have // to duplicate this code int count = 0; for (int i = 0; i < locations[shard].locations->size(); i++) { if (!IFailureMonitor::failureMonitor() .getState(locations[shard] .locations->get(i, &StorageServerInterface::getKeyValuesStream) .getEndpoint()) .failed) { if (deterministicRandom()->random01() <= 1.0 / ++count) { useIdx = i; } } } if (useIdx >= 0) { break; } std::vector> ok(locations[shard].locations->size()); for (int i = 0; i < ok.size(); i++) { ok[i] = IFailureMonitor::failureMonitor().onStateEqual( locations[shard] .locations->get(i, &StorageServerInterface::getKeyValuesStream) .getEndpoint(), FailureStatus(false)); } // Making this SevWarn means a lot of clutter if (now() - g_network->networkInfo.newestAlternativesFailure > 1 || deterministicRandom()->random01() < 0.01) { TraceEvent("AllAlternativesFailed") .detail("Alternatives", locations[shard].locations->description()); } wait(allAlternativesFailedDelay(quorum(ok, 1))); } state ReplyPromiseStream replyStream = locations[shard] .locations->get(useIdx, &StorageServerInterface::getKeyValuesStream) .getReplyStream(req); tssDuplicateStream = maybeDuplicateTSSStreamFragment( req, trState->cx->enableLocalityLoadBalance ? 
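			    // (the queue model is only passed when locality-aware load balancing is enabled; without it,
			    // maybeDuplicateTSSStreamFragment cannot look up a TSS pairing and skips stream duplication)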
&trState->cx->queueModel : nullptr, &locations[shard].locations->get(useIdx, &StorageServerInterface::getKeyValuesStream)); state bool breakAgain = false; loop { wait(results->onEmpty()); try { choose { when(wait(trState->cx->connectionFileChanged())) { results->sendError(transaction_too_old()); if (tssDuplicateStream.present() && !tssDuplicateStream.get().done()) { tssDuplicateStream.get().stream.sendError(transaction_too_old()); } return Void(); } when(GetKeyValuesStreamReply _rep = waitNext(replyStream.getFuture())) { rep = _rep; } } ++trState->cx->transactionPhysicalReadsCompleted; } catch (Error& e) { ++trState->cx->transactionPhysicalReadsCompleted; if (e.code() == error_code_broken_promise) { if (tssDuplicateStream.present() && !tssDuplicateStream.get().done()) { tssDuplicateStream.get().stream.sendError(connection_failed()); } throw connection_failed(); } if (e.code() != error_code_end_of_stream) { if (tssDuplicateStream.present() && !tssDuplicateStream.get().done()) { tssDuplicateStream.get().stream.sendError(e); } throw; } rep = GetKeyValuesStreamReply(); } if (trState->readOptions.present() && trState->readOptions.get().debugID.present()) g_traceBatch.addEvent("TransactionDebug", trState->readOptions.get().debugID.get().first(), "NativeAPI.getExactRange.After"); RangeResult output(RangeResultRef(rep.data, rep.more), rep.arena); if (tssDuplicateStream.present() && !tssDuplicateStream.get().done()) { // shallow copy the reply with an arena depends, and send it to the duplicate stream for TSS GetKeyValuesStreamReply replyCopy; replyCopy.version = rep.version; replyCopy.more = rep.more; replyCopy.cached = rep.cached; replyCopy.arena.dependsOn(rep.arena); replyCopy.data.append(replyCopy.arena, rep.data.begin(), rep.data.size()); tssDuplicateStream.get().stream.send(replyCopy); } int64_t bytes = 0; for (const KeyValueRef& kv : output) { bytes += kv.key.size() + kv.value.size(); } trState->cx->transactionBytesRead += bytes; trState->cx->transactionKeysRead += output.size(); // If the reply says there is more but we know that we finished the shard, then fix rep.more if (reverse && output.more && rep.data.size() > 0 && output[output.size() - 1].key == locations[shard].range.begin) { output.more = false; } if (output.more) { if (!rep.data.size()) { TraceEvent(SevError, "GetRangeStreamError") .detail("Reason", "More data indicated but no rows present") .detail("LimitBytes", limits.bytes) .detail("LimitRows", limits.rows) .detail("OutputSize", output.size()) .detail("OutputBytes", output.expectedSize()) .detail("BlockSize", rep.data.size()) .detail("BlockBytes", rep.data.expectedSize()); ASSERT(false); } CODE_PROBE(true, "GetKeyValuesStreamReply.more in getRangeStream"); // Make next request to the same shard with a beginning key just after the last key returned if (reverse) locations[shard].range = KeyRangeRef(locations[shard].range.begin, output[output.size() - 1].key); else locations[shard].range = KeyRangeRef(keyAfter(output[output.size() - 1].key), locations[shard].range.end); } if (locations[shard].range.empty()) { output.more = false; } if (!output.more) { const KeyRange& range = locations[shard].range; if (shard == locations.size() - 1) { KeyRef begin = reverse ? keys.begin : range.end; KeyRef end = reverse ? 
range.begin : keys.end; if (begin >= end) { if (range.begin == allKeys.begin) { output.readToBegin = true; } if (range.end == allKeys.end) { output.readThroughEnd = true; } output.arena().dependsOn(keys.arena()); // for getRangeStreamFragment, one fragment end doesn't mean it's the end of getRange // so set 'more' to true output.more = true; output.setReadThrough(reverse ? keys.begin : keys.end); results->send(std::move(output)); results->finish(); if (tssDuplicateStream.present() && !tssDuplicateStream.get().done()) { tssDuplicateStream.get().stream.sendError(end_of_stream()); } return Void(); } keys = KeyRangeRef(begin, end); breakAgain = true; } else { ++shard; } output.arena().dependsOn(range.arena()); // if it's not the last shard, set more to true and readThrough to the shard boundary output.more = true; output.setReadThrough(reverse ? range.begin : range.end); results->send(std::move(output)); break; } ASSERT(output.size()); if (keys.begin == allKeys.begin && !reverse) { output.readToBegin = true; } if (keys.end == allKeys.end && reverse) { output.readThroughEnd = true; } results->send(std::move(output)); } if (breakAgain) { break; } } catch (Error& e) { // send errors to tss duplicate stream, including actor_cancelled if (tssDuplicateStream.present() && !tssDuplicateStream.get().done()) { tssDuplicateStream.get().stream.sendError(e); } if (e.code() == error_code_actor_cancelled) { throw; } if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed || e.code() == error_code_connection_failed || e.code() == error_code_request_maybe_delivered) { const KeyRangeRef& range = locations[shard].range; if (reverse) keys = KeyRangeRef(keys.begin, range.end); else keys = KeyRangeRef(range.begin, keys.end); trState->cx->invalidateCache(trState->tenant().mapRef(&Tenant::prefix), keys); wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, trState->taskID)); break; } else { results->sendError(e); return Void(); } } } } } ACTOR Future>> getRangeSplitPoints(Reference trState, KeyRange keys, int64_t chunkSize); static KeyRange intersect(KeyRangeRef lhs, KeyRangeRef rhs) { return KeyRange(KeyRangeRef(std::max(lhs.begin, rhs.begin), std::min(lhs.end, rhs.end))); } // Divides the requested key range into 1MB fragments, create range streams for each fragment, and merges the results so // the client get them in order ACTOR Future getRangeStream(Reference trState, PromiseStream _results, KeySelector begin, KeySelector end, GetRangeLimits limits, Promise> conflictRange, Snapshot snapshot, Reverse reverse) { state ParallelStream results(_results, CLIENT_KNOBS->RANGESTREAM_BUFFERED_FRAGMENTS_LIMIT); // FIXME: better handling to disable row limits ASSERT(!limits.hasRowLimit()); state Span span("NAPI:getRangeStream"_loc, trState->spanContext); wait(trState->startTransaction()); CODE_PROBE(trState->hasTenant(), "NativeAPI getRangeStream has tenant"); trState->cx->validateVersion(trState->readVersion()); Future fb = resolveKey(trState, begin, UseTenant::True); state Future fe = resolveKey(trState, end, UseTenant::True); state Key b = wait(fb); state Key e = wait(fe); if (!snapshot) { // FIXME: this conflict range is too large, and should be updated continuously as results are returned conflictRange.send(std::make_pair(std::min(b, Key(begin.getKey(), begin.arena())), std::max(e, Key(end.getKey(), end.arena())))); } if (b >= e) { wait(results.finish()); return Void(); } // if e is allKeys.end, we have read through the end of the database // if b is allKeys.begin, we have either 
read through the beginning of the database, // or allKeys.begin exists in the database and will be part of the conflict range anyways state std::vector> outstandingRequests; while (b < e) { state KeyRangeLocationInfo locationInfo = wait(getKeyLocation( trState, reverse ? e : b, &StorageServerInterface::getKeyValuesStream, reverse, UseTenant::True)); state KeyRange shardIntersection = intersect(locationInfo.range, KeyRangeRef(b, e)); state Standalone> splitPoints = wait(getRangeSplitPoints(trState, shardIntersection, CLIENT_KNOBS->RANGESTREAM_FRAGMENT_SIZE)); state std::vector toSend; // state std::vector::iterator>> outstandingRequests; if (!splitPoints.empty()) { toSend.push_back(KeyRange(KeyRangeRef(shardIntersection.begin, splitPoints.front()), splitPoints.arena())); for (int i = 0; i < splitPoints.size() - 1; ++i) { toSend.push_back(KeyRange(KeyRangeRef(splitPoints[i], splitPoints[i + 1]), splitPoints.arena())); } toSend.push_back(KeyRange(KeyRangeRef(splitPoints.back(), shardIntersection.end), splitPoints.arena())); } else { toSend.push_back(KeyRange(KeyRangeRef(shardIntersection.begin, shardIntersection.end))); } state int idx = 0; state int useIdx = 0; for (; idx < toSend.size(); ++idx) { useIdx = reverse ? toSend.size() - idx - 1 : idx; if (toSend[useIdx].empty()) { continue; } ParallelStream::Fragment* fragment = wait(results.createFragment()); outstandingRequests.push_back( getRangeStreamFragment(trState, fragment, toSend[useIdx], limits, snapshot, reverse, span.context)); } if (reverse) { e = shardIntersection.begin; } else { b = shardIntersection.end; } } wait(waitForAll(outstandingRequests) && results.finish()); return Void(); } Future getRange(Reference const& trState, KeySelector const& begin, KeySelector const& end, GetRangeLimits const& limits, Reverse const& reverse, UseTenant const& useTenant) { return getRange( trState, begin, end, ""_sr, limits, Promise>(), Snapshot::True, reverse, useTenant); } bool DatabaseContext::debugUseTags = false; const std::vector DatabaseContext::debugTransactionTagChoices = { "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t" }; void debugAddTags(Reference trState) { int numTags = deterministicRandom()->randomInt(0, CLIENT_KNOBS->MAX_TAGS_PER_TRANSACTION + 1); for (int i = 0; i < numTags; ++i) { TransactionTag tag; if (deterministicRandom()->random01() < 0.7) { tag = TransactionTagRef(deterministicRandom()->randomChoice(DatabaseContext::debugTransactionTagChoices)); } else { int length = deterministicRandom()->randomInt(1, CLIENT_KNOBS->MAX_TRANSACTION_TAG_LENGTH + 1); uint8_t* s = new (tag.arena()) uint8_t[length]; for (int j = 0; j < length; ++j) { s[j] = (uint8_t)deterministicRandom()->randomInt(0, 256); } tag.contents() = TransactionTagRef(s, length); } if (deterministicRandom()->coinflip()) { trState->options.readTags.addTag(tag); } trState->options.tags.addTag(tag); } } Transaction::Transaction() : trState(makeReference(TaskPriority::DefaultEndpoint, generateSpanID(false))) {} Transaction::Transaction(Database const& cx, Optional> const& tenant) : trState(makeReference(cx, tenant, cx->taskID, generateSpanID(cx->transactionTracingSample), createTrLogInfoProbabilistically(cx))), span(trState->spanContext, "Transaction"_loc), backoff(CLIENT_KNOBS->DEFAULT_BACKOFF), tr(trState->spanContext) { if (DatabaseContext::debugUseTags) { debugAddTags(trState); } } Transaction::~Transaction() { flushTrLogsIfEnabled(); cancelWatches(); } void Transaction::operator=(Transaction&& r) noexcept { 
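	// Flush any enabled transaction logs for the transaction being overwritten before adopting r's state.
	// Note that `watches` is copied rather than moved below.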
flushTrLogsIfEnabled(); tr = std::move(r.tr); trState = std::move(r.trState); extraConflictRanges = std::move(r.extraConflictRanges); commitResult = std::move(r.commitResult); committing = std::move(r.committing); backoff = r.backoff; watches = r.watches; } void Transaction::flushTrLogsIfEnabled() { if (trState && trState->trLogInfo && trState->trLogInfo->logsAdded && trState->trLogInfo->trLogWriter.getData()) { ASSERT(trState->trLogInfo->flushed == false); trState->cx->clientStatusUpdater.inStatusQ.push_back( { trState->trLogInfo->identifier, std::move(trState->trLogInfo->trLogWriter) }); trState->trLogInfo->flushed = true; } } VersionVector Transaction::getVersionVector() const { return trState->cx->ssVersionVectorCache; } void Transaction::setVersion(Version v) { trState->startTime = now(); if (trState->readVersionFuture.isValid()) throw read_version_already_set(); if (v <= 0) throw version_invalid(); trState->readVersionFuture = v; trState->readVersionObtainedFromGrvProxy = false; } Future> Transaction::get(const Key& key, Snapshot snapshot) { ++trState->cx->transactionLogicalReads; ++trState->cx->transactionGetValueRequests; // ASSERT (key < allKeys.end); // There are no keys in the database with size greater than the max key size if (key.size() > getMaxReadKeySize(key)) { return Optional(); } auto ver = getReadVersion(); /* if (!systemKeys.contains(key)) return Optional(Value()); */ if (!snapshot) tr.transaction.read_conflict_ranges.push_back(tr.arena, singleKeyRange(key, tr.arena)); UseTenant useTenant = UseTenant::True; if (key == metadataVersionKey) { // It is legal to read the metadata version key inside of a tenant. // This will return the global metadata version key. useTenant = UseTenant::False; ++trState->cx->transactionMetadataVersionReads; if (!ver.isReady() || trState->metadataVersion.isSet()) { return trState->metadataVersion.getFuture(); } else { if (ver.isError()) { return ver.getError(); } if (ver.get() == trState->cx->metadataVersionCache[trState->cx->mvCacheInsertLocation].first) { return trState->cx->metadataVersionCache[trState->cx->mvCacheInsertLocation].second; } Version v = ver.get(); int hi = trState->cx->mvCacheInsertLocation; int lo = (trState->cx->mvCacheInsertLocation + 1) % trState->cx->metadataVersionCache.size(); while (hi != lo) { int cu = hi > lo ? 
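				// (binary search over the circular metadataVersion cache: the [lo, hi) window can wrap around,
				// so the midpoint is taken modulo the cache size in the wrapped case)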
(hi + lo) / 2 : ((hi + trState->cx->metadataVersionCache.size() + lo) / 2) % trState->cx->metadataVersionCache.size(); if (v == trState->cx->metadataVersionCache[cu].first) { return trState->cx->metadataVersionCache[cu].second; } if (cu == lo) { break; } if (v < trState->cx->metadataVersionCache[cu].first) { hi = cu; } else { lo = (cu + 1) % trState->cx->metadataVersionCache.size(); } } } } return getValue(trState, key, useTenant); } void Watch::setWatch(Future watchFuture) { this->watchFuture = watchFuture; // Cause the watch loop to go around and start waiting on watchFuture onSetWatchTrigger.send(Void()); } ACTOR Future getTenantMetadata(Reference trState) { wait(trState->startTransaction()); return trState->getTenantInfo(); } Future populateAndGetTenant(Reference trState, Key const& key) { if (!trState->hasTenant() || key == metadataVersionKey) { return TenantInfo(); } else if (trState->startTransaction().canGet()) { return trState->getTenantInfo(); } else { return getTenantMetadata(trState); } } // Restarts a watch after a database switch ACTOR Future restartWatch(Database cx, TenantInfo tenantInfo, Key key, Optional value, TagSet tags, SpanContext spanContext, TaskPriority taskID, Optional debugID, UseProvisionalProxies useProvisionalProxies) { // Remove the reference count as the old watches should be all dropped when switching connectionFile. // The tenantId should be the old one. cx->deleteWatchMetadata(tenantInfo.tenantId, key, /* removeReferenceCount */ true); wait(watchValueMap(cx->minAcceptableReadVersion, tenantInfo, key, value, cx, tags, spanContext, taskID, debugID, useProvisionalProxies)); return Void(); } // FIXME: This seems pretty horrible. Now a Database can't die until all of its watches do... ACTOR Future watch(Reference watch, Database cx, Future tenant, TagSet tags, SpanContext spanContext, TaskPriority taskID, Optional debugID, UseProvisionalProxies useProvisionalProxies) { try { choose { // RYOW write to value that is being watched (if applicable) // Errors when(wait(watch->onChangeTrigger.getFuture())) {} // NativeAPI finished commit and updated watchFuture when(wait(watch->onSetWatchTrigger.getFuture())) { state TenantInfo tenantInfo = wait(tenant); loop { choose { // NativeAPI watchValue future finishes or errors when(wait(watch->watchFuture)) { break; } when(wait(cx->connectionFileChanged())) { CODE_PROBE(true, "Recreated a watch after switch"); watch->watchFuture = restartWatch(cx, tenantInfo, watch->key, watch->value, tags, spanContext, taskID, debugID, useProvisionalProxies); } } } } } } catch (Error& e) { cx->decreaseWatchCounter(); throw; } cx->decreaseWatchCounter(); return Void(); } Future Transaction::getRawReadVersion() { return ::getRawVersion(trState); } Future Transaction::watch(Reference watch) { ++trState->cx->transactionWatchRequests; trState->cx->increaseWatchCounter(); watch->readOptions = trState->readOptions; watches.push_back(watch); return ::watch(watch, trState->cx, populateAndGetTenant(trState, watch->key), trState->options.readTags, trState->spanContext, trState->taskID, trState->readOptions.present() ? 
trState->readOptions.get().debugID : Optional(), trState->useProvisionalProxies); } ACTOR Future>> getAddressesForKeyActor(Reference trState, Key key) { state std::vector ssi; wait(trState->startTransaction()); state Key resolvedKey = key; if (trState->hasTenant()) { CODE_PROBE(true, "NativeAPI getAddressesForKey has tenant"); resolvedKey = key.withPrefix(trState->tenant().get()->prefix()); } // If key >= allKeys.end, then getRange will return a kv-pair with an empty value. This will result in our // serverInterfaces vector being empty, which will cause us to return an empty addresses list. state Key ksKey = keyServersKey(resolvedKey); state RangeResult serverTagResult = wait(getRange(trState, lastLessOrEqual(serverTagKeys.begin), firstGreaterThan(serverTagKeys.end), GetRangeLimits(CLIENT_KNOBS->TOO_MANY), Reverse::False, UseTenant::False)); ASSERT(!serverTagResult.more && serverTagResult.size() < CLIENT_KNOBS->TOO_MANY); Future futureServerUids = getRange( trState, lastLessOrEqual(ksKey), firstGreaterThan(ksKey), GetRangeLimits(1), Reverse::False, UseTenant::False); RangeResult serverUids = wait(futureServerUids); ASSERT(serverUids.size()); // every shard needs to have a team std::vector src; std::vector ignore; // 'ignore' is so named because it is the vector into which we decode the 'dest' servers in // the case where this key is being relocated. But 'src' is the canonical location until // the move is finished, because it could be cancelled at any time. decodeKeyServersValue(serverTagResult, serverUids[0].value, src, ignore); Optional> serverInterfaces = wait(transactionalGetServerInterfaces(trState, src)); ASSERT(serverInterfaces.present()); // since this is happening transactionally, /FF/keyServers and /FF/serverList // need to be consistent with one another ssi = serverInterfaces.get(); Standalone> addresses; for (auto i : ssi) { std::string ipString = trState->options.includePort ? i.address().toString() : i.address().ip.toString(); char* c_string = new (addresses.arena()) char[ipString.length() + 1]; strcpy(c_string, ipString.c_str()); addresses.push_back(addresses.arena(), c_string); } return addresses; } Future>> Transaction::getAddressesForKey(const Key& key) { ++trState->cx->transactionLogicalReads; ++trState->cx->transactionGetAddressesForKeyRequests; return getAddressesForKeyActor(trState, key); } ACTOR Future getKeyAndConflictRange(Reference trState, KeySelector k, Promise> conflictRange) { try { Key rep = wait(getKey(trState, k)); if (k.offset <= 0) conflictRange.send(std::make_pair(rep, k.orEqual ? keyAfter(k.getKey()) : Key(k.getKey(), k.arena()))); else conflictRange.send( std::make_pair(k.orEqual ? 
		                        keyAfter(k.getKey()) : Key(k.getKey(), k.arena()), keyAfter(rep)));
		return rep;
	} catch (Error& e) {
		conflictRange.send(std::make_pair(Key(), Key()));
		throw;
	}
}

Future<Key> Transaction::getKey(const KeySelector& key, Snapshot snapshot) {
	++trState->cx->transactionLogicalReads;
	++trState->cx->transactionGetKeyRequests;
	if (snapshot)
		return ::getKey(trState, key);

	Promise<std::pair<Key, Key>> conflictRange;
	extraConflictRanges.push_back(conflictRange.getFuture());
	return getKeyAndConflictRange(trState, key, conflictRange);
}

template <class GetKeyValuesFamilyRequest>
void increaseCounterForRequest(Database cx) {
	if constexpr (std::is_same<GetKeyValuesFamilyRequest, GetKeyValuesRequest>::value) {
		++cx->transactionGetRangeRequests;
	} else if (std::is_same<GetKeyValuesFamilyRequest, GetMappedKeyValuesRequest>::value) {
		++cx->transactionGetMappedRangeRequests;
	} else {
		UNREACHABLE();
	}
}

template <class GetKeyValuesFamilyRequest, class GetKeyValuesFamilyReply, class RangeResultFamily>
Future<RangeResultFamily> Transaction::getRangeInternal(const KeySelector& begin,
                                                        const KeySelector& end,
                                                        const Key& mapper,
                                                        GetRangeLimits limits,
                                                        Snapshot snapshot,
                                                        Reverse reverse) {
	++trState->cx->transactionLogicalReads;
	increaseCounterForRequest<GetKeyValuesFamilyRequest>(trState->cx);

	if (limits.isReached())
		return RangeResultFamily();

	if (!limits.isValid())
		return range_limits_invalid();

	ASSERT(limits.rows != 0);

	KeySelector b = begin;
	if (b.orEqual) {
		CODE_PROBE(true, "Native begin orEqual==true");
		b.removeOrEqual(b.arena());
	}

	KeySelector e = end;
	if (e.orEqual) {
		CODE_PROBE(true, "Native end orEqual==true");
		e.removeOrEqual(e.arena());
	}

	if (b.offset >= e.offset && b.getKey() >= e.getKey()) {
		CODE_PROBE(true, "Native range inverted");
		return RangeResultFamily();
	}

	if (!snapshot && !std::is_same_v<GetKeyValuesFamilyRequest, GetKeyValuesRequest>) {
		// Currently, NativeAPI does not support serialization for getMappedRange. You should consider using the
		// ReadYourWrites APIs, which wrap around NativeAPI and provide serialization for getMappedRange. (Even if
		// you don't want RYW, you may use the ReadYourWrites APIs with RYW disabled.)
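		//
		// An illustrative sketch (not compiled here) of the recommended path, assuming an open Database `db`,
		// an already-serialized `mapper` key, and an ACTOR context:
		//
		//   state ReadYourWritesTransaction tr(db);
		//   MappedRangeResult res = wait(tr.getMappedRange(
		//       firstGreaterOrEqual("begin"_sr), firstGreaterOrEqual("end"_sr), mapper, GetRangeLimits(100)));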
		throw unsupported_operation();
	}

	Promise<std::pair<Key, Key>> conflictRange;
	if (!snapshot) {
		extraConflictRanges.push_back(conflictRange.getFuture());
	}

	return ::getRange<GetKeyValuesFamilyRequest, GetKeyValuesFamilyReply, RangeResultFamily>(
	    trState, b, e, mapper, limits, conflictRange, snapshot, reverse);
}

Future<RangeResult> Transaction::getRange(const KeySelector& begin,
                                          const KeySelector& end,
                                          GetRangeLimits limits,
                                          Snapshot snapshot,
                                          Reverse reverse) {
	return getRangeInternal<GetKeyValuesRequest, GetKeyValuesReply, RangeResult>(
	    begin, end, ""_sr, limits, snapshot, reverse);
}

Future<MappedRangeResult> Transaction::getMappedRange(const KeySelector& begin,
                                                      const KeySelector& end,
                                                      const Key& mapper,
                                                      GetRangeLimits limits,
                                                      Snapshot snapshot,
                                                      Reverse reverse) {
	return getRangeInternal<GetMappedKeyValuesRequest, GetMappedKeyValuesReply, MappedRangeResult>(
	    begin, end, mapper, limits, snapshot, reverse);
}

Future<RangeResult> Transaction::getRange(const KeySelector& begin,
                                          const KeySelector& end,
                                          int limit,
                                          Snapshot snapshot,
                                          Reverse reverse) {
	return getRange(begin, end, GetRangeLimits(limit), snapshot, reverse);
}

// A method for streaming data from the storage server that is more efficient than getRange when reading large
// amounts of data
Future<Void> Transaction::getRangeStream(PromiseStream<Standalone<RangeResultRef>>& results,
                                         const KeySelector& begin,
                                         const KeySelector& end,
                                         GetRangeLimits limits,
                                         Snapshot snapshot,
                                         Reverse reverse) {
	++trState->cx->transactionLogicalReads;
	++trState->cx->transactionGetRangeStreamRequests;

	// FIXME: limits are not implemented yet, and this code has not been tested with reverse=true
	ASSERT(!limits.hasByteLimit() && !limits.hasRowLimit() && !reverse);

	KeySelector b = begin;
	if (b.orEqual) {
		CODE_PROBE(true, "Native stream begin orEqual==true", probe::decoration::rare);
		b.removeOrEqual(b.arena());
	}

	KeySelector e = end;
	if (e.orEqual) {
		CODE_PROBE(true, "Native stream end orEqual==true", probe::decoration::rare);
		e.removeOrEqual(e.arena());
	}

	if (b.offset >= e.offset && b.getKey() >= e.getKey()) {
		CODE_PROBE(true, "Native stream range inverted", probe::decoration::rare);
		results.sendError(end_of_stream());
		return Void();
	}

	Promise<std::pair<Key, Key>> conflictRange;
	if (!snapshot) {
		extraConflictRanges.push_back(conflictRange.getFuture());
	}

	return forwardErrors(::getRangeStream(trState, results, b, e, limits, conflictRange, snapshot, reverse), results);
}

Future<Void> Transaction::getRangeStream(PromiseStream<Standalone<RangeResultRef>>& results,
                                         const KeySelector& begin,
                                         const KeySelector& end,
                                         int limit,
                                         Snapshot snapshot,
                                         Reverse reverse) {
	return getRangeStream(results, begin, end, GetRangeLimits(limit), snapshot, reverse);
}

void Transaction::addReadConflictRange(KeyRangeRef const& keys) {
	ASSERT(!keys.empty());

	// There aren't any keys in the database with size larger than the max key size, so if the range contains large
	// keys we can translate it to an equivalent one with smaller keys
	KeyRef begin = keys.begin;
	KeyRef end = keys.end;
	int64_t beginMaxSize = getMaxReadKeySize(begin);
	int64_t endMaxSize = getMaxReadKeySize(end);
	if (begin.size() > beginMaxSize) {
		begin = begin.substr(0, beginMaxSize + 1);
	}
	if (end.size() > endMaxSize) {
		end = end.substr(0, endMaxSize + 1);
	}

	KeyRangeRef r = KeyRangeRef(begin, end);

	if (r.empty()) {
		return;
	}

	tr.transaction.read_conflict_ranges.push_back_deep(tr.arena, r);
}

void Transaction::makeSelfConflicting() {
	BinaryWriter wr(Unversioned());
	wr.serializeBytes("\xFF/SC/"_sr);
	wr << deterministicRandom()->randomUniqueID();
	auto r = singleKeyRange(wr.toValue(), tr.arena);
	tr.transaction.read_conflict_ranges.push_back(tr.arena, r);
	tr.transaction.write_conflict_ranges.push_back(tr.arena, r);
}

void Transaction::set(const KeyRef& key, const ValueRef& value, AddConflictRange addConflictRange) {
	++trState->cx->transactionSetMutations;
	if (key.size() >
getMaxWriteKeySize(key, trState->options.rawAccess)) throw key_too_large(); if (value.size() > CLIENT_KNOBS->VALUE_SIZE_LIMIT) throw value_too_large(); auto& req = tr; auto& t = req.transaction; auto r = singleKeyRange(key, req.arena); auto v = ValueRef(req.arena, value); t.mutations.emplace_back(req.arena, MutationRef::SetValue, r.begin, v); trState->totalCost += getWriteOperationCost(key.expectedSize() + value.expectedSize()); if (addConflictRange) { t.write_conflict_ranges.push_back(req.arena, r); } } void Transaction::atomicOp(const KeyRef& key, const ValueRef& operand, MutationRef::Type operationType, AddConflictRange addConflictRange) { ++trState->cx->transactionAtomicMutations; if (key.size() > getMaxWriteKeySize(key, trState->options.rawAccess)) throw key_too_large(); if (operand.size() > CLIENT_KNOBS->VALUE_SIZE_LIMIT) throw value_too_large(); if (apiVersionAtLeast(510)) { if (operationType == MutationRef::Min) operationType = MutationRef::MinV2; else if (operationType == MutationRef::And) operationType = MutationRef::AndV2; } auto& req = tr; auto& t = req.transaction; auto r = singleKeyRange(key, req.arena); auto v = ValueRef(req.arena, operand); t.mutations.emplace_back(req.arena, operationType, r.begin, v); trState->totalCost += getWriteOperationCost(key.expectedSize()); if (addConflictRange && operationType != MutationRef::SetVersionstampedKey) t.write_conflict_ranges.push_back(req.arena, r); CODE_PROBE(true, "NativeAPI atomic operation"); } void TransactionState::addClearCost() { // NOTE: The throttling cost of each clear is assumed to be one page. // This makes computation fast, but can be inaccurate and may // underestimate the cost of large clears. totalCost += CLIENT_KNOBS->TAG_THROTTLING_PAGE_SIZE; } void Transaction::clear(const KeyRangeRef& range, AddConflictRange addConflictRange) { ++trState->cx->transactionClearMutations; auto& req = tr; auto& t = req.transaction; KeyRef begin = range.begin; KeyRef end = range.end; // There aren't any keys in the database with size larger than the max key size, so if range contains large keys // we can translate it to an equivalent one with smaller keys int64_t beginMaxSize = getMaxClearKeySize(begin); int64_t endMaxSize = getMaxClearKeySize(end); if (begin.size() > beginMaxSize) { begin = begin.substr(0, beginMaxSize + 1); } if (end.size() > endMaxSize) { end = end.substr(0, endMaxSize + 1); } auto r = KeyRangeRef(req.arena, KeyRangeRef(begin, end)); if (r.empty()) return; t.mutations.emplace_back(req.arena, MutationRef::ClearRange, r.begin, r.end); trState->addClearCost(); if (addConflictRange) t.write_conflict_ranges.push_back(req.arena, r); } void Transaction::clear(const KeyRef& key, AddConflictRange addConflictRange) { ++trState->cx->transactionClearMutations; // There aren't any keys in the database with size larger than the max key size if (key.size() > getMaxClearKeySize(key)) { return; } auto& req = tr; auto& t = req.transaction; // efficient single key range clear range mutation, see singleKeyRange uint8_t* data = new (req.arena) uint8_t[key.size() + 1]; memcpy(data, key.begin(), key.size()); data[key.size()] = 0; t.mutations.emplace_back( req.arena, MutationRef::ClearRange, KeyRef(data, key.size()), KeyRef(data, key.size() + 1)); trState->addClearCost(); if (addConflictRange) t.write_conflict_ranges.emplace_back(req.arena, KeyRef(data, key.size()), KeyRef(data, key.size() + 1)); } void Transaction::addWriteConflictRange(const KeyRangeRef& keys) { ASSERT(!keys.empty()); auto& req = tr; auto& t = req.transaction; // There 
aren't any keys in the database with size larger than the max key size, so if range contains large keys // we can translate it to an equivalent one with smaller keys KeyRef begin = keys.begin; KeyRef end = keys.end; int64_t beginMaxSize = getMaxKeySize(begin); int64_t endMaxSize = getMaxKeySize(end); if (begin.size() > beginMaxSize) { begin = begin.substr(0, beginMaxSize + 1); } if (end.size() > endMaxSize) { end = end.substr(0, endMaxSize + 1); } KeyRangeRef r = KeyRangeRef(begin, end); if (r.empty()) { return; } t.write_conflict_ranges.push_back_deep(req.arena, r); } double Transaction::getBackoff(int errCode) { double returnedBackoff = backoff; if (errCode == error_code_tag_throttled) { auto priorityItr = trState->cx->throttledTags.find(trState->options.priority); for (auto& tag : trState->options.tags) { if (priorityItr != trState->cx->throttledTags.end()) { auto tagItr = priorityItr->second.find(tag); if (tagItr != priorityItr->second.end()) { CODE_PROBE(true, "Returning throttle backoff"); returnedBackoff = std::max( returnedBackoff, std::min(CLIENT_KNOBS->TAG_THROTTLE_RECHECK_INTERVAL, tagItr->second.throttleDuration())); if (returnedBackoff == CLIENT_KNOBS->TAG_THROTTLE_RECHECK_INTERVAL) { break; } } } } } returnedBackoff *= deterministicRandom()->random01(); // Set backoff for next time if (errCode == error_code_commit_proxy_memory_limit_exceeded || errCode == error_code_grv_proxy_memory_limit_exceeded || errCode == error_code_transaction_throttled_hot_shard || errCode == error_code_transaction_rejected_range_locked) { backoff = std::min(backoff * CLIENT_KNOBS->BACKOFF_GROWTH_RATE, CLIENT_KNOBS->RESOURCE_CONSTRAINED_MAX_BACKOFF); } else { backoff = std::min(backoff * CLIENT_KNOBS->BACKOFF_GROWTH_RATE, trState->options.maxBackoff); } return returnedBackoff; } TransactionOptions::TransactionOptions(Database const& cx) { reset(cx); if (BUGGIFY) { commitOnFirstProxy = true; } } void TransactionOptions::clear() { maxBackoff = CLIENT_KNOBS->DEFAULT_MAX_BACKOFF; getReadVersionFlags = 0; sizeLimit = CLIENT_KNOBS->TRANSACTION_SIZE_LIMIT; maxTransactionLoggingFieldLength = 0; checkWritesEnabled = false; causalWriteRisky = false; commitOnFirstProxy = false; debugDump = false; lockAware = false; readOnly = false; firstInBatch = false; includePort = false; reportConflictingKeys = false; tags = TagSet{}; readTags = TagSet{}; priority = TransactionPriority::DEFAULT; expensiveClearCostEstimation = false; useGrvCache = false; skipGrvCache = false; rawAccess = false; bypassStorageQuota = false; enableReplicaConsistencyCheck = false; requiredReplicas = 0; } TransactionOptions::TransactionOptions() { clear(); } void TransactionOptions::reset(Database const& cx) { clear(); lockAware = cx->lockAware; if (cx->apiVersionAtLeast(630)) { includePort = true; } } void Transaction::resetImpl(bool generateNewSpan) { flushTrLogsIfEnabled(); trState = trState->cloneAndReset(createTrLogInfoProbabilistically(trState->cx), generateNewSpan); tr = CommitTransactionRequest(trState->spanContext); extraConflictRanges.clear(); commitResult = Promise(); committing = Future(); cancelWatches(); } TagSet const& Transaction::getTags() const { return trState->options.tags; } void Transaction::reset() { resetImpl(false); } void Transaction::fullReset() { resetImpl(true); span = Span(trState->spanContext, "Transaction"_loc); backoff = CLIENT_KNOBS->DEFAULT_BACKOFF; } int Transaction::apiVersionAtLeast(int minVersion) const { return trState->cx->apiVersionAtLeast(minVersion); } class MutationBlock { public: bool mutated; bool 
cleared; ValueRef setValue; MutationBlock() : mutated(false) {} MutationBlock(bool _cleared) : mutated(true), cleared(_cleared) {} MutationBlock(ValueRef value) : mutated(true), cleared(false), setValue(value) {} }; bool compareBegin(KeyRangeRef lhs, KeyRangeRef rhs) { return lhs.begin < rhs.begin; } // If there is any intersection between the two given sets of ranges, returns a range that // falls within the intersection Optional intersects(VectorRef lhs, VectorRef rhs) { if (lhs.size() && rhs.size()) { std::sort(lhs.begin(), lhs.end(), compareBegin); std::sort(rhs.begin(), rhs.end(), compareBegin); int l = 0, r = 0; while (l < lhs.size() && r < rhs.size()) { if (lhs[l].end <= rhs[r].begin) l++; else if (rhs[r].end <= lhs[l].begin) r++; else return lhs[l] & rhs[r]; } } return Optional(); } ACTOR void checkWrites(Reference trState, Future committed, Promise outCommitted, CommitTransactionRequest req) { state Version version; try { wait(committed); // If the commit is successful, by definition the transaction still exists for now. Grab the version, and don't // use it again. version = trState->committedVersion; outCommitted.send(Void()); } catch (Error& e) { outCommitted.sendError(e); return; } wait(delay(deterministicRandom()->random01())); // delay between 0 and 1 seconds state KeyRangeMap expectedValues; auto& mutations = req.transaction.mutations; state int mCount = mutations.size(); // debugging info for traceEvent for (int idx = 0; idx < mutations.size(); idx++) { if (mutations[idx].type == MutationRef::SetValue) expectedValues.insert(singleKeyRange(mutations[idx].param1), MutationBlock(mutations[idx].param2)); else if (mutations[idx].type == MutationRef::ClearRange) expectedValues.insert(KeyRangeRef(mutations[idx].param1, mutations[idx].param2), MutationBlock(true)); } try { state Transaction tr(trState->cx); tr.setVersion(version); state int checkedRanges = 0; state KeyRangeMap::Ranges ranges = expectedValues.ranges(); state KeyRangeMap::iterator it = ranges.begin(); for (; it != ranges.end(); ++it) { state MutationBlock m = it->value(); if (m.mutated) { checkedRanges++; if (m.cleared) { RangeResult shouldBeEmpty = wait(tr.getRange(it->range(), 1)); if (shouldBeEmpty.size()) { TraceEvent(SevError, "CheckWritesFailed") .detail("Class", "Clear") .detail("KeyBegin", it->range().begin) .detail("KeyEnd", it->range().end); return; } } else { Optional val = wait(tr.get(it->range().begin)); if (!val.present() || val.get() != m.setValue) { TraceEvent evt(SevError, "CheckWritesFailed"); evt.detail("Class", "Set").detail("Key", it->range().begin).detail("Expected", m.setValue); if (!val.present()) evt.detail("Actual", "_Value Missing_"); else evt.detail("Actual", val.get()); return; } } } } TraceEvent("CheckWritesSuccess") .detail("Version", version) .detail("MutationCount", mCount) .detail("CheckedRanges", checkedRanges); } catch (Error& e) { bool ok = e.code() == error_code_transaction_too_old || e.code() == error_code_future_version; TraceEvent(ok ? 
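		// (transaction_too_old / future_version just mean the check ran at a version we can no longer read,
		// so they are downgraded to warnings; any other error is a genuine check failure)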
SevWarn : SevError, "CheckWritesFailed").error(e); throw; } } FDB_BOOLEAN_PARAM(TenantPrefixPrepended); ACTOR static Future commitDummyTransaction(Reference trState, KeyRange range, TenantPrefixPrepended tenantPrefixPrepended) { state Transaction tr(trState->cx, trState->tenant()); state int retries = 0; state Span span("NAPI:dummyTransaction"_loc, trState->spanContext); tr.span.setParent(span.context); loop { try { TraceEvent("CommitDummyTransaction").detail("Key", range.begin).detail("Retries", retries); tr.trState->options = trState->options; tr.trState->taskID = trState->taskID; tr.trState->authToken = trState->authToken; if (!trState->hasTenant()) { tr.setOption(FDBTransactionOptions::RAW_ACCESS); } else { tr.trState->skipApplyTenantPrefix = tenantPrefixPrepended; CODE_PROBE(true, "Commit of a dummy transaction in tenant keyspace"); } tr.setOption(FDBTransactionOptions::CAUSAL_WRITE_RISKY); tr.setOption(FDBTransactionOptions::LOCK_AWARE); tr.addReadConflictRange(range); tr.addWriteConflictRange(range); wait(tr.commit()); return Void(); } catch (Error& e) { // If the tenant is gone, then our original transaction won't be able to commit if (e.code() == error_code_tenant_not_found) { return Void(); } TraceEvent("CommitDummyTransactionError") .errorUnsuppressed(e) .detail("Key", range.begin) .detail("Retries", retries); wait(tr.onError(e)); } ++retries; } } ACTOR static Future> determineCommitStatus(Reference trState, Version minPossibleCommitVersion, Version maxPossibleCommitVersion, IdempotencyIdRef idempotencyId) { state Transaction tr(trState->cx); state int retries = 0; state Version expiredVersion; state Span span("NAPI:determineCommitStatus"_loc, trState->spanContext); tr.span.setParent(span.context); loop { try { tr.trState->options = trState->options; tr.trState->taskID = trState->taskID; tr.trState->authToken = trState->authToken; tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); tr.setOption(FDBTransactionOptions::READ_LOCK_AWARE); KeyBackedObjectProperty expiredKey(idempotencyIdsExpiredVersion, Unversioned()); IdempotencyIdsExpiredVersion expiredVal = wait(expiredKey.getD(&tr)); expiredVersion = expiredVal.expired; if (expiredVersion >= minPossibleCommitVersion) { throw commit_unknown_result_fatal(); } Version rv = wait(tr.getReadVersion()); TraceEvent("DetermineCommitStatusAttempt") .detail("IdempotencyId", idempotencyId.asStringRefUnsafe()) .detail("Retries", retries) .detail("ReadVersion", rv) .detail("ExpiredVersion", expiredVersion) .detail("MinPossibleCommitVersion", minPossibleCommitVersion) .detail("MaxPossibleCommitVersion", maxPossibleCommitVersion); KeyRange possibleRange = KeyRangeRef(BinaryWriter::toValue(bigEndian64(minPossibleCommitVersion), Unversioned()) .withPrefix(idempotencyIdKeys.begin), BinaryWriter::toValue(bigEndian64(maxPossibleCommitVersion + 1), Unversioned()) .withPrefix(idempotencyIdKeys.begin)); RangeResult range = wait(tr.getRange(possibleRange, CLIENT_KNOBS->TOO_MANY)); ASSERT(!range.more); for (const auto& kv : range) { auto commitResult = kvContainsIdempotencyId(kv, idempotencyId); if (commitResult.present()) { TraceEvent("DetermineCommitStatus") .detail("Committed", 1) .detail("IdempotencyId", idempotencyId.asStringRefUnsafe()) .detail("Retries", retries); return commitResult; } } TraceEvent("DetermineCommitStatus") .detail("Committed", 0) .detail("IdempotencyId", idempotencyId.asStringRefUnsafe()) .detail("Retries", retries); return Optional(); } catch (Error& e) { TraceEvent("DetermineCommitStatusError") .errorUnsuppressed(e) 
.detail("IdempotencyId", idempotencyId.asStringRefUnsafe()) .detail("Retries", retries); wait(tr.onError(e)); } ++retries; } } void Transaction::cancelWatches(Error const& e) { for (int i = 0; i < watches.size(); ++i) if (!watches[i]->onChangeTrigger.isSet()) watches[i]->onChangeTrigger.sendError(e); watches.clear(); } void Transaction::setupWatches() { try { Future watchVersion = getCommittedVersion() > 0 ? getCommittedVersion() : getReadVersion(); for (int i = 0; i < watches.size(); ++i) watches[i]->setWatch( watchValueMap(watchVersion, trState->getTenantInfo(), watches[i]->key, watches[i]->value, trState->cx, trState->options.readTags, trState->spanContext, trState->taskID, trState->readOptions.present() ? trState->readOptions.get().debugID : Optional(), trState->useProvisionalProxies)); watches.clear(); } catch (Error&) { ASSERT(false); // The above code must NOT throw because commit has already occurred. throw internal_error(); } } ACTOR Future> estimateCommitCosts(Reference trState, CommitTransactionRef const* transaction) { state ClientTrCommitCostEstimation trCommitCosts; state KeyRangeRef keyRange; state int i = 0; for (; i < transaction->mutations.size(); ++i) { auto const& mutation = transaction->mutations[i]; if (mutation.type == MutationRef::Type::SetValue || mutation.isAtomicOp()) { trCommitCosts.opsCount++; trCommitCosts.writeCosts += getWriteOperationCost(mutation.expectedSize()); } else if (mutation.type == MutationRef::Type::ClearRange) { trCommitCosts.opsCount++; keyRange = KeyRangeRef(mutation.param1, mutation.param2); if (trState->options.expensiveClearCostEstimation) { StorageMetrics m = wait(trState->cx->getStorageMetrics(keyRange, CLIENT_KNOBS->TOO_MANY, trState)); trCommitCosts.clearIdxCosts.emplace_back(i, getWriteOperationCost(m.bytes)); trCommitCosts.writeCosts += getWriteOperationCost(m.bytes); ++trCommitCosts.expensiveCostEstCount; ++trState->cx->transactionsExpensiveClearCostEstCount; } else { if (trState->hasTenant()) { wait(trState->tenant().get()->ready()); } std::vector locations = wait(getKeyRangeLocations(trState, keyRange, CLIENT_KNOBS->TOO_MANY, Reverse::False, &StorageServerInterface::getShardState, UseTenant::True)); if (locations.empty()) { continue; } uint64_t bytes = 0; if (locations.size() == 1) { bytes = CLIENT_KNOBS->INCOMPLETE_SHARD_PLUS; } else { // small clear on the boundary will hit two shards but be much smaller than the shard size bytes = CLIENT_KNOBS->INCOMPLETE_SHARD_PLUS * 2 + (locations.size() - 2) * (int64_t)trState->cx->smoothMidShardSize.smoothTotal(); } trCommitCosts.clearIdxCosts.emplace_back(i, getWriteOperationCost(bytes)); trCommitCosts.writeCosts += getWriteOperationCost(bytes); } } } // sample on written bytes if (!trState->cx->sampleOnCost(trCommitCosts.writeCosts)) return Optional(); // sample clear op: the expectation of #sampledOp is every COMMIT_SAMPLE_COST sample once // we also scale the cost of mutations whose cost is less than COMMIT_SAMPLE_COST as scaledCost = // min(COMMIT_SAMPLE_COST, cost) If we have 4 transactions: A - 100 1-cost mutations: E[sampled ops] = 1, E[sampled // cost] = 100 B - 1 100-cost mutation: E[sampled ops] = 1, E[sampled cost] = 100 C - 50 2-cost mutations: E[sampled // ops] = 1, E[sampled cost] = 100 D - 1 150-cost mutation and 150 1-cost mutations: E[sampled ops] = 3, E[sampled // cost] = 150cost * 1 + 150 * 100cost * 0.01 = 300 ASSERT(trCommitCosts.writeCosts > 0); std::deque> newClearIdxCosts; for (const auto& [idx, cost] : trCommitCosts.clearIdxCosts) { if (trCommitCosts.writeCosts >= 
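	    // (illustrative numbers: with COMMIT_SAMPLE_COST = 100 and writeCosts = 300, mul = 3, so a clear of
	    // cost 150 is kept with probability min(1, 150 * 3 / 300) = 1 and recorded at max(150, 100) = 150)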
CLIENT_KNOBS->COMMIT_SAMPLE_COST) { double mul = trCommitCosts.writeCosts / std::max(1.0, (double)CLIENT_KNOBS->COMMIT_SAMPLE_COST); if (deterministicRandom()->random01() < cost * mul / trCommitCosts.writeCosts) { newClearIdxCosts.emplace_back( idx, cost < CLIENT_KNOBS->COMMIT_SAMPLE_COST ? CLIENT_KNOBS->COMMIT_SAMPLE_COST : cost); } } else if (deterministicRandom()->random01() < (double)cost / trCommitCosts.writeCosts) { newClearIdxCosts.emplace_back( idx, cost < CLIENT_KNOBS->COMMIT_SAMPLE_COST ? CLIENT_KNOBS->COMMIT_SAMPLE_COST : cost); } } trCommitCosts.clearIdxCosts.swap(newClearIdxCosts); return trCommitCosts; } // TODO: send the prefix as part of the commit request and ship it all the way // through to the storage servers void applyTenantPrefix(CommitTransactionRequest& req, Key tenantPrefix) { VectorRef updatedMutations; updatedMutations.reserve(req.arena, req.transaction.mutations.size()); for (auto& m : req.transaction.mutations) { StringRef param1 = m.param1; StringRef param2 = m.param2; if (m.param1 != metadataVersionKey) { param1 = m.param1.withPrefix(tenantPrefix, req.arena); if (m.type == MutationRef::ClearRange) { param2 = m.param2.withPrefix(tenantPrefix, req.arena); } else if (m.type == MutationRef::SetVersionstampedKey) { uint8_t* key = mutateString(param1); int* offset = reinterpret_cast(&key[param1.size() - 4]); *offset += tenantPrefix.size(); CODE_PROBE(true, "Set versionstamped key in tenant"); } } else { CODE_PROBE(true, "Set metadata version key in tenant"); } updatedMutations.push_back(req.arena, MutationRef(MutationRef::Type(m.type), param1, param2)); } req.transaction.mutations = updatedMutations; VectorRef updatedReadConflictRanges; updatedReadConflictRanges.reserve(req.arena, req.transaction.read_conflict_ranges.size()); for (auto const& rc : req.transaction.read_conflict_ranges) { if (rc.begin != metadataVersionKey) { updatedReadConflictRanges.push_back(req.arena, rc.withPrefix(tenantPrefix, req.arena)); } else { updatedReadConflictRanges.push_back(req.arena, rc); } } req.transaction.read_conflict_ranges = updatedReadConflictRanges; VectorRef updatedWriteConflictRanges; updatedWriteConflictRanges.reserve(req.arena, req.transaction.write_conflict_ranges.size()); for (auto& wc : req.transaction.write_conflict_ranges) { if (wc.begin != metadataVersionKey) { updatedWriteConflictRanges.push_back(req.arena, wc.withPrefix(tenantPrefix, req.arena)); } else { updatedWriteConflictRanges.push_back(req.arena, wc); } } req.transaction.write_conflict_ranges = updatedWriteConflictRanges; } ACTOR static Future tryCommit(Reference trState, CommitTransactionRequest req) { state TraceInterval interval("TransactionCommit"); state double startTime = now(); state Span span("NAPI:tryCommit"_loc, trState->spanContext); state Optional debugID = trState->readOptions.present() ? trState->readOptions.get().debugID : Optional(); state TenantPrefixPrepended tenantPrefixPrepended = TenantPrefixPrepended::False; if (debugID.present()) { TraceEvent(interval.begin()).detail("Parent", debugID.get()); } CODE_PROBE(trState->hasTenant(), "NativeAPI commit has tenant"); // If the read version hasn't already been fetched, then we had no reads and don't need (expensive) full causal // consistency. 
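	// (CAUSAL_READ_RISKY lets the GRV proxy skip its extra causal-consistency confirmation round; since this
	// transaction performed no reads, the version obtained here only seeds conflict checking at commit, so the
	// cheaper path suffices per the reasoning above.)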
state Future startFuture = trState->startTransaction(GetReadVersionRequest::FLAG_CAUSAL_READ_RISKY); try { if (CLIENT_BUGGIFY) { throw deterministicRandom()->randomChoice(std::vector{ not_committed(), transaction_too_old(), commit_proxy_memory_limit_exceeded(), grv_proxy_memory_limit_exceeded(), commit_unknown_result() }); } if (req.tagSet.present() && trState->options.priority < TransactionPriority::IMMEDIATE) { state Future> commitCostFuture = estimateCommitCosts(trState, &req.transaction); wait(startFuture); wait(store(req.commitCostEstimation, commitCostFuture)); } else { wait(startFuture); } req.transaction.read_snapshot = trState->readVersion(); state Key tenantPrefix; // skipApplyTenantPrefix is set only in the context of a commitDummyTransaction() // (see member declaration) if (trState->hasTenant() && !trState->skipApplyTenantPrefix) { applyTenantPrefix(req, trState->tenant().get()->prefix()); tenantPrefixPrepended = TenantPrefixPrepended::True; tenantPrefix = trState->tenant().get()->prefix(); } CODE_PROBE(trState->skipApplyTenantPrefix, "Tenant prefix prepend skipped for dummy transaction"); req.tenantInfo = trState->getTenantInfo(); startTime = now(); state Optional commitID = Optional(); if (debugID.present()) { commitID = nondeterministicRandom()->randomUniqueID(); g_traceBatch.addAttach("CommitAttachID", debugID.get().first(), commitID.get().first()); g_traceBatch.addEvent("CommitDebug", commitID.get().first(), "NativeAPI.commit.Before"); } req.debugID = commitID; state Future reply; // Only gets filled in in the happy path where we don't have to commit on the first proxy or use provisional // proxies state int alternativeChosen = -1; // Only valid if alternativeChosen >= 0 state Reference proxiesUsed; if (trState->options.commitOnFirstProxy) { if (trState->cx->clientInfo->get().firstCommitProxy.present()) { reply = throwErrorOr(brokenPromiseToMaybeDelivered( trState->cx->clientInfo->get().firstCommitProxy.get().commit.tryGetReply(req))); } else { const std::vector& proxies = trState->cx->clientInfo->get().commitProxies; reply = proxies.size() ? 
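				// (if the client doesn't know of any commit proxies yet, park on Never(); the choose below will
				// fire on onProxiesChanged() instead and throw request_maybe_delivered)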
throwErrorOr(brokenPromiseToMaybeDelivered(proxies[0].commit.tryGetReply(req))) : Never(); } } else { proxiesUsed = trState->cx->getCommitProxies(trState->useProvisionalProxies); reply = basicLoadBalance(proxiesUsed, &CommitProxyInterface::commit, req, TaskPriority::DefaultPromiseEndpoint, AtMostOnce::True, &alternativeChosen); } state double grvTime = now(); choose { when(wait(trState->cx->onProxiesChanged())) { reply.cancel(); throw request_maybe_delivered(); } when(CommitID ci = wait(reply)) { Version v = ci.version; if (v != invalidVersion) { if (CLIENT_BUGGIFY) { throw commit_unknown_result(); } trState->cx->updateCachedReadVersion(grvTime, v); if (debugID.present()) TraceEvent(interval.end()).detail("CommittedVersion", v); trState->committedVersion = v; if (v > trState->cx->metadataVersionCache[trState->cx->mvCacheInsertLocation].first) { trState->cx->mvCacheInsertLocation = (trState->cx->mvCacheInsertLocation + 1) % trState->cx->metadataVersionCache.size(); trState->cx->metadataVersionCache[trState->cx->mvCacheInsertLocation] = std::make_pair(v, ci.metadataVersion); } Standalone ret = makeString(10); placeVersionstamp(mutateString(ret), v, ci.txnBatchId); trState->versionstampPromise.send(ret); trState->numErrors = 0; ++trState->cx->transactionsCommitCompleted; trState->cx->transactionCommittedMutations += req.transaction.mutations.size(); trState->cx->transactionCommittedMutationBytes += req.transaction.mutations.expectedSize(); if (commitID.present()) g_traceBatch.addEvent("CommitDebug", commitID.get().first(), "NativeAPI.commit.After"); double latency = now() - startTime; trState->cx->commitLatencies.addSample(latency); trState->cx->latencies.addSample(now() - trState->startTime); if (trState->trLogInfo) trState->trLogInfo->addLog( FdbClientLogEvents::EventCommit_V2(startTime, trState->cx->clientLocality.dcId(), latency, req.transaction.mutations.size(), req.transaction.mutations.expectedSize(), ci.version, req, trState->tenant().flatMapRef(&Tenant::name))); if (trState->automaticIdempotency && alternativeChosen >= 0) { // Automatic idempotency means we're responsible for best effort idempotency id clean up proxiesUsed->getInterface(alternativeChosen) .expireIdempotencyId.send(ExpireIdempotencyIdRequest{ ci.version, uint8_t(ci.txnBatchId >> 8), trState->getTenantInfo() }); } return Void(); } else { // clear the RYW transaction which contains previous conflicting keys trState->conflictingKeys.reset(); if (ci.conflictingKRIndices.present()) { trState->conflictingKeys = std::make_shared>(conflictingKeysFalse, specialKeys.end); state Standalone> conflictingKRIndices = ci.conflictingKRIndices.get(); // drop duplicate indices and merge overlapped ranges // Note: addReadConflictRange in native transaction object does not merge overlapped ranges state std::unordered_set mergedIds(conflictingKRIndices.begin(), conflictingKRIndices.end()); for (auto const& rCRIndex : mergedIds) { const KeyRangeRef kr = req.transaction.read_conflict_ranges[rCRIndex]; const KeyRange krWithPrefix = KeyRangeRef(kr.begin.removePrefix(tenantPrefix).withPrefix(conflictingKeysRange.begin), kr.end.removePrefix(tenantPrefix).withPrefix(conflictingKeysRange.begin)); trState->conflictingKeys->insert(krWithPrefix, conflictingKeysTrue); } } if (debugID.present()) TraceEvent(interval.end()).detail("Conflict", 1); if (commitID.present()) g_traceBatch.addEvent("CommitDebug", commitID.get().first(), "NativeAPI.commit.After"); throw not_committed(); } } } } catch (Error& e) { if (e.code() == 
error_code_request_maybe_delivered || e.code() == error_code_commit_unknown_result) { // We don't know if the commit happened, and it might even still be in flight. if (!trState->options.causalWriteRisky || req.idempotencyId.valid()) { // Make sure it's not still in flight, either by ensuring the master we submitted to is dead, or the // version we submitted with is dead, or by committing a conflicting transaction successfully // if ( cx->getCommitProxies()->masterGeneration <= originalMasterGeneration ) // To ensure the original request is not in flight, we need a key range which intersects its read // conflict ranges We pick a key range which also intersects its write conflict ranges, since that // avoids potentially creating conflicts where there otherwise would be none We make the range as small // as possible (a single key range) to minimize conflicts The intersection will never be empty, because // if it were (since !causalWriteRisky) makeSelfConflicting would have been applied automatically to req KeyRangeRef selfConflictingRange = intersects(req.transaction.write_conflict_ranges, req.transaction.read_conflict_ranges).get(); CODE_PROBE(true, "Waiting for dummy transaction to report commit_unknown_result"); wait( commitDummyTransaction(trState, singleKeyRange(selfConflictingRange.begin), tenantPrefixPrepended)); if (req.idempotencyId.valid()) { Optional commitResult = wait(determineCommitStatus( trState, req.transaction.read_snapshot, req.transaction.read_snapshot + CLIENT_KNOBS->MAX_WRITE_TRANSACTION_LIFE_VERSIONS, req.idempotencyId)); if (commitResult.present()) { Standalone ret = makeString(10); placeVersionstamp( mutateString(ret), commitResult.get().commitVersion, commitResult.get().batchIndex); trState->versionstampPromise.send(ret); CODE_PROBE(true, "AutomaticIdempotencyCommitted"); return Void(); } else { CODE_PROBE(true, "AutomaticIdempotencyNotCommitted"); throw transaction_too_old(); } } } // The user needs to be informed that we aren't sure whether the commit happened. Standard retry loops // retry it anyway (relying on transaction idempotence) but a client might do something else. 
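				// To summarize the above: once the conflicting dummy transaction commits, the original commit can
				// no longer succeed in the future, and the idempotency-id scan distinguishes "already committed"
				// from "will never commit". Without an idempotency id, the best we can report is
				// commit_unknown_result.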
throw commit_unknown_result(); } else { if (e.code() != error_code_transaction_too_old && e.code() != error_code_not_committed && e.code() != error_code_database_locked && e.code() != error_code_commit_proxy_memory_limit_exceeded && e.code() != error_code_grv_proxy_memory_limit_exceeded && e.code() != error_code_batch_transaction_throttled && e.code() != error_code_tag_throttled && e.code() != error_code_process_behind && e.code() != error_code_future_version && e.code() != error_code_tenant_not_found && e.code() != error_code_illegal_tenant_access && e.code() != error_code_proxy_tag_throttled && e.code() != error_code_storage_quota_exceeded && e.code() != error_code_tenant_locked && e.code() != error_code_transaction_throttled_hot_shard && e.code() != error_code_transaction_rejected_range_locked) { TraceEvent(SevError, "TryCommitError").error(e); } if (trState->trLogInfo) trState->trLogInfo->addLog( FdbClientLogEvents::EventCommitError(startTime, trState->cx->clientLocality.dcId(), static_cast(e.code()), req, trState->tenant().flatMapRef(&Tenant::name))); throw; } } } Future Transaction::commitMutations() { try { // if this is a read-only transaction return immediately if (!tr.transaction.write_conflict_ranges.size() && !tr.transaction.mutations.size()) { trState->numErrors = 0; trState->committedVersion = invalidVersion; trState->versionstampPromise.sendError(no_commit_version()); return Void(); } ++trState->cx->transactionsCommitStarted; if (trState->options.readOnly) return transaction_read_only(); trState->cx->mutationsPerCommit.addSample(tr.transaction.mutations.size()); trState->cx->bytesPerCommit.addSample(tr.transaction.mutations.expectedSize()); if (trState->options.tags.size()) tr.tagSet = trState->options.tags; size_t transactionSize = getSize(); if (transactionSize > (uint64_t)FLOW_KNOBS->PACKET_WARNING) { TraceEvent(SevWarn, "LargeTransaction") .suppressFor(1.0) .detail("Size", transactionSize) .detail("NumMutations", tr.transaction.mutations.size()) .detail("ReadConflictSize", tr.transaction.read_conflict_ranges.expectedSize()) .detail("WriteConflictSize", tr.transaction.write_conflict_ranges.expectedSize()) .detail("DebugIdentifier", trState->trLogInfo ? trState->trLogInfo->identifier : ""); } if (!apiVersionAtLeast(300)) { transactionSize = tr.transaction.mutations.expectedSize(); // Old API versions didn't account for conflict ranges when // determining whether to throw transaction_too_large } if (transactionSize > trState->options.sizeLimit) { return transaction_too_large(); } bool isCheckingWrites = trState->options.checkWritesEnabled && deterministicRandom()->random01() < 0.01; for (int i = 0; i < extraConflictRanges.size(); i++) if (extraConflictRanges[i].isReady() && extraConflictRanges[i].get().first < extraConflictRanges[i].get().second) tr.transaction.read_conflict_ranges.emplace_back( tr.arena, extraConflictRanges[i].get().first, extraConflictRanges[i].get().second); if (tr.idempotencyId.valid()) { // We need to be able confirm that this transaction is no longer in // flight, and if the idempotency id is in the read and write // conflict range we can use that. 
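			// The conflict key built below has the form \xff/SC/<idempotency id>. As in makeSelfConflicting(),
			// it is only ever used as a read/write conflict range; no mutation is actually written at this key.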
BinaryWriter wr(Unversioned()); wr.serializeBytes("\xFF/SC/"_sr); wr.serializeBytes(tr.idempotencyId.asStringRefUnsafe()); auto r = singleKeyRange(wr.toValue(), tr.arena); tr.transaction.read_conflict_ranges.push_back(tr.arena, r); tr.transaction.write_conflict_ranges.push_back(tr.arena, r); } if (!trState->options.causalWriteRisky && !intersects(tr.transaction.write_conflict_ranges, tr.transaction.read_conflict_ranges).present()) makeSelfConflicting(); if (isCheckingWrites) { // add all writes into the read conflict range... tr.transaction.read_conflict_ranges.append( tr.arena, tr.transaction.write_conflict_ranges.begin(), tr.transaction.write_conflict_ranges.size()); } if (trState->options.debugDump) { UID u = nondeterministicRandom()->randomUniqueID(); TraceEvent("TransactionDump", u).log(); for (auto i = tr.transaction.mutations.begin(); i != tr.transaction.mutations.end(); ++i) TraceEvent("TransactionMutation", u) .detail("T", i->type) .detail("P1", i->param1) .detail("P2", i->param2); } if (trState->options.lockAware) { tr.flags = tr.flags | CommitTransactionRequest::FLAG_IS_LOCK_AWARE; } if (trState->options.firstInBatch) { tr.flags = tr.flags | CommitTransactionRequest::FLAG_FIRST_IN_BATCH; } if (trState->options.bypassStorageQuota) { tr.flags = tr.flags | CommitTransactionRequest::FLAG_BYPASS_STORAGE_QUOTA; } if (trState->options.reportConflictingKeys) { tr.transaction.report_conflicting_keys = true; } Future commitResult = tryCommit(trState, tr); if (isCheckingWrites) { Promise committed; checkWrites(trState, commitResult, committed, tr); return committed.getFuture(); } return commitResult; } catch (Error& e) { if (e.code() == error_code_transaction_throttled_hot_shard || e.code() == error_code_transaction_rejected_range_locked) { TraceEvent("TransactionThrottledHotShard").error(e); return onError(e); } TraceEvent("ClientCommitError").error(e); return Future(e); } catch (...) 
{ Error e(error_code_unknown_error); TraceEvent("ClientCommitError").error(e); return Future(e); } } ACTOR Future commitAndWatch(Transaction* self) { try { wait(self->commitMutations()); self->getDatabase()->transactionTracingSample = (self->getCommittedVersion() % 60000000) < (60000000 * FLOW_KNOBS->TRACING_SAMPLE_RATE); if (!self->watches.empty()) { self->setupWatches(); } if (!self->apiVersionAtLeast(700)) { self->reset(); } return Void(); } catch (Error& e) { if (e.code() != error_code_actor_cancelled) { if (!self->watches.empty()) { self->cancelWatches(e); } self->trState->versionstampPromise.sendError(transaction_invalid_version()); if (!self->apiVersionAtLeast(700)) { self->reset(); } } throw; } } Future Transaction::commit() { ASSERT(!committing.isValid()); committing = commitAndWatch(this); return committing; } void Transaction::setOption(FDBTransactionOptions::Option option, Optional value) { switch (option) { case FDBTransactionOptions::INITIALIZE_NEW_DATABASE: validateOptionValueNotPresent(value); if (trState->readVersionFuture.isValid()) throw read_version_already_set(); trState->readVersionFuture = Version(0); trState->options.causalWriteRisky = true; break; case FDBTransactionOptions::CAUSAL_READ_RISKY: validateOptionValueNotPresent(value); trState->options.getReadVersionFlags |= GetReadVersionRequest::FLAG_CAUSAL_READ_RISKY; break; case FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE: validateOptionValueNotPresent(value); trState->options.priority = TransactionPriority::IMMEDIATE; break; case FDBTransactionOptions::PRIORITY_BATCH: validateOptionValueNotPresent(value); trState->options.priority = TransactionPriority::BATCH; break; case FDBTransactionOptions::CAUSAL_WRITE_RISKY: validateOptionValueNotPresent(value); trState->options.causalWriteRisky = true; break; case FDBTransactionOptions::COMMIT_ON_FIRST_PROXY: validateOptionValueNotPresent(value); trState->options.commitOnFirstProxy = true; break; case FDBTransactionOptions::CHECK_WRITES_ENABLE: validateOptionValueNotPresent(value); trState->options.checkWritesEnabled = true; break; case FDBTransactionOptions::DEBUG_DUMP: validateOptionValueNotPresent(value); trState->options.debugDump = true; break; case FDBTransactionOptions::TRANSACTION_LOGGING_ENABLE: setOption(FDBTransactionOptions::DEBUG_TRANSACTION_IDENTIFIER, value); setOption(FDBTransactionOptions::LOG_TRANSACTION); break; case FDBTransactionOptions::DEBUG_TRANSACTION_IDENTIFIER: validateOptionValuePresent(value); if (value.get().size() > 100 || value.get().size() == 0) { throw invalid_option_value(); } if (trState->trLogInfo) { if (trState->trLogInfo->identifier.empty()) { trState->trLogInfo->identifier = value.get().printable(); } else if (trState->trLogInfo->identifier != value.get().printable()) { TraceEvent(SevWarn, "CannotChangeDebugTransactionIdentifier") .detail("PreviousIdentifier", trState->trLogInfo->identifier) .detail("NewIdentifier", value.get()); throw client_invalid_operation(); } } else { trState->trLogInfo = makeReference(value.get().printable(), TransactionLogInfo::DONT_LOG); trState->trLogInfo->maxFieldLength = trState->options.maxTransactionLoggingFieldLength; } if (trState->readOptions.present() && trState->readOptions.get().debugID.present()) { TraceEvent(SevInfo, "TransactionBeingTraced") .detail("DebugTransactionID", trState->trLogInfo->identifier) .detail("ServerTraceID", trState->readOptions.get().debugID.get()); } break; case FDBTransactionOptions::LOG_TRANSACTION: validateOptionValueNotPresent(value); if (trState->trLogInfo && 
!trState->trLogInfo->identifier.empty()) { trState->trLogInfo->logTo(TransactionLogInfo::TRACE_LOG); } else { TraceEvent(SevWarn, "DebugTransactionIdentifierNotSet") .detail("Error", "Debug Transaction Identifier option must be set before logging the transaction"); throw client_invalid_operation(); } break; case FDBTransactionOptions::TRANSACTION_LOGGING_MAX_FIELD_LENGTH: validateOptionValuePresent(value); { int maxFieldLength = extractIntOption(value, -1, std::numeric_limits::max()); if (maxFieldLength == 0) { throw invalid_option_value(); } trState->options.maxTransactionLoggingFieldLength = maxFieldLength; } if (trState->trLogInfo) { trState->trLogInfo->maxFieldLength = trState->options.maxTransactionLoggingFieldLength; } break; case FDBTransactionOptions::SERVER_REQUEST_TRACING: validateOptionValueNotPresent(value); debugTransaction(deterministicRandom()->randomUniqueID()); if (trState->trLogInfo && !trState->trLogInfo->identifier.empty() && trState->readOptions.present() && trState->readOptions.get().debugID.present()) { TraceEvent(SevInfo, "TransactionBeingTraced") .detail("DebugTransactionID", trState->trLogInfo->identifier) .detail("ServerTraceID", trState->readOptions.get().debugID.get()); } break; case FDBTransactionOptions::MAX_RETRY_DELAY: validateOptionValuePresent(value); trState->options.maxBackoff = extractIntOption(value, 0, std::numeric_limits::max()) / 1000.0; break; case FDBTransactionOptions::SIZE_LIMIT: validateOptionValuePresent(value); trState->options.sizeLimit = extractIntOption(value, 32, CLIENT_KNOBS->TRANSACTION_SIZE_LIMIT); break; case FDBTransactionOptions::LOCK_AWARE: validateOptionValueNotPresent(value); if (!trState->readOptions.present()) { trState->readOptions = ReadOptions(); } trState->readOptions.get().lockAware = true; trState->options.lockAware = true; trState->options.readOnly = false; break; case FDBTransactionOptions::READ_LOCK_AWARE: validateOptionValueNotPresent(value); if (!trState->readOptions.present()) { trState->readOptions = ReadOptions(); } trState->readOptions.get().lockAware = true; if (!trState->options.lockAware) { trState->options.lockAware = true; trState->options.readOnly = true; } break; case FDBTransactionOptions::FIRST_IN_BATCH: validateOptionValueNotPresent(value); trState->options.firstInBatch = true; break; case FDBTransactionOptions::USE_PROVISIONAL_PROXIES: validateOptionValueNotPresent(value); if (trState->hasTenant()) { Error e = invalid_option(); TraceEvent(SevWarn, "TenantTransactionUseProvisionalProxies").error(e).detail("Tenant", trState->tenant()); throw e; } trState->options.getReadVersionFlags |= GetReadVersionRequest::FLAG_USE_PROVISIONAL_PROXIES; trState->useProvisionalProxies = UseProvisionalProxies::True; break; case FDBTransactionOptions::INCLUDE_PORT_IN_ADDRESS: validateOptionValueNotPresent(value); trState->options.includePort = true; break; case FDBTransactionOptions::TAG: validateOptionValuePresent(value); trState->options.tags.addTag(value.get()); break; case FDBTransactionOptions::AUTO_THROTTLE_TAG: validateOptionValuePresent(value); trState->options.tags.addTag(value.get()); trState->options.readTags.addTag(value.get()); break; case FDBTransactionOptions::SPAN_PARENT: validateOptionValuePresent(value); if (value.get().size() != 33) { throw invalid_option_value(); } CODE_PROBE(true, "Adding link in FDBTransactionOptions::SPAN_PARENT"); span.setParent(BinaryReader::fromStringRef(value.get(), IncludeVersion())); break; case FDBTransactionOptions::REPORT_CONFLICTING_KEYS: 
validateOptionValueNotPresent(value); trState->options.reportConflictingKeys = true; break; case FDBTransactionOptions::EXPENSIVE_CLEAR_COST_ESTIMATION_ENABLE: validateOptionValueNotPresent(value); trState->options.expensiveClearCostEstimation = true; break; case FDBTransactionOptions::USE_GRV_CACHE: validateOptionValueNotPresent(value); if (apiVersionAtLeast(ApiVersion::withGrvCache().version()) && !trState->cx->sharedStatePtr) { throw invalid_option(); } if (trState->numErrors == 0) { trState->options.useGrvCache = true; } break; case FDBTransactionOptions::SKIP_GRV_CACHE: validateOptionValueNotPresent(value); trState->options.skipGrvCache = true; break; case FDBTransactionOptions::READ_SYSTEM_KEYS: case FDBTransactionOptions::ACCESS_SYSTEM_KEYS: case FDBTransactionOptions::RAW_ACCESS: // System key access implies raw access. Native API handles the raw access, // system key access is handled in RYW. validateOptionValueNotPresent(value); if (trState->hasTenant(ResolveDefaultTenant::False)) { Error e = invalid_option(); TraceEvent(SevWarn, "TenantTransactionRawAccess").error(e).detail("Tenant", trState->tenant()); throw e; } trState->options.rawAccess = true; break; case FDBTransactionOptions::BYPASS_STORAGE_QUOTA: trState->options.bypassStorageQuota = true; break; case FDBTransactionOptions::AUTHORIZATION_TOKEN: if (value.present()) trState->authToken = WipedString(value.get()); else trState->authToken.reset(); break; case FDBTransactionOptions::IDEMPOTENCY_ID: validateOptionValuePresent(value); if (!(value.get().size() >= 16 && value.get().size() < 256)) { Error e = invalid_option(); TraceEvent(SevWarn, "IdempotencyIdInvalidSize") .error(e) .detail("IdempotencyId", value.get().printable()) .detail("Recommendation", "Use an idempotency id that's at least 16 bytes and less than 256 bytes"); throw e; } tr.idempotencyId = IdempotencyIdRef(tr.arena, IdempotencyIdRef(value.get())); trState->automaticIdempotency = false; break; case FDBTransactionOptions::AUTOMATIC_IDEMPOTENCY: validateOptionValueNotPresent(value); if (!tr.idempotencyId.valid()) { tr.idempotencyId = IdempotencyIdRef( tr.arena, IdempotencyIdRef(BinaryWriter::toValue(deterministicRandom()->randomUniqueID(), Unversioned()))); } trState->automaticIdempotency = true; break; case FDBTransactionOptions::READ_SERVER_SIDE_CACHE_ENABLE: trState->readOptions.withDefault(ReadOptions()).cacheResult = CacheResult::True; break; case FDBTransactionOptions::READ_SERVER_SIDE_CACHE_DISABLE: trState->readOptions.withDefault(ReadOptions()).cacheResult = CacheResult::False; break; case FDBTransactionOptions::READ_PRIORITY_LOW: trState->readOptions.withDefault(ReadOptions()).type = ReadType::LOW; break; case FDBTransactionOptions::READ_PRIORITY_NORMAL: trState->readOptions.withDefault(ReadOptions()).type = ReadType::NORMAL; break; case FDBTransactionOptions::READ_PRIORITY_HIGH: trState->readOptions.withDefault(ReadOptions()).type = ReadType::HIGH; break; case FDBTransactionOptions::ENABLE_REPLICA_CONSISTENCY_CHECK: validateOptionValueNotPresent(value); trState->options.enableReplicaConsistencyCheck = true; break; case FDBTransactionOptions::CONSISTENCY_CHECK_REQUIRED_REPLICAS: validateOptionValuePresent(value); trState->options.requiredReplicas = extractIntOption(value, -2, std::numeric_limits::max()); default: break; } } ACTOR Future getConsistentReadVersion(SpanContext parentSpan, DatabaseContext* cx, uint32_t transactionCount, TransactionPriority priority, uint32_t flags, TransactionTagMap tags, Optional debugID) { state Span 
span("NAPI:getConsistentReadVersion"_loc, parentSpan); ++cx->transactionReadVersionBatches; if (debugID.present()) g_traceBatch.addEvent("TransactionDebug", debugID.get().first(), "NativeAPI.getConsistentReadVersion.Before"); loop { try { state GetReadVersionRequest req(span.context, transactionCount, priority, cx->ssVersionVectorCache.getMaxVersion(), flags, tags, debugID); state Future onProxiesChanged = cx->onProxiesChanged(); choose { when(wait(onProxiesChanged)) { onProxiesChanged = cx->onProxiesChanged(); } when(GetReadVersionReply v = wait(basicLoadBalance(cx->getGrvProxies(UseProvisionalProxies( flags & GetReadVersionRequest::FLAG_USE_PROVISIONAL_PROXIES)), &GrvProxyInterface::getConsistentReadVersion, req, cx->taskID))) { CODE_PROBE(v.proxyTagThrottledDuration > 0.0, "getConsistentReadVersion received GetReadVersionReply delayed by proxy tag throttling"); if (tags.size() != 0) { auto& priorityThrottledTags = cx->throttledTags[priority]; for (auto& tag : tags) { auto itr = v.tagThrottleInfo.find(tag.first); if (itr == v.tagThrottleInfo.end()) { CODE_PROBE(true, "Removing client throttle"); priorityThrottledTags.erase(tag.first); } else { CODE_PROBE(true, "Setting client throttle"); auto result = priorityThrottledTags.try_emplace(tag.first, itr->second); if (!result.second) { result.first->second.update(itr->second); } } } } if (debugID.present()) g_traceBatch.addEvent( "TransactionDebug", debugID.get().first(), "NativeAPI.getConsistentReadVersion.After"); ASSERT(v.version > 0); cx->minAcceptableReadVersion = std::min(cx->minAcceptableReadVersion, v.version); if (cx->versionVectorCacheActive(v.ssVersionVectorDelta)) { if (cx->isCurrentGrvProxy(v.proxyId)) { cx->ssVersionVectorCache.applyDelta(v.ssVersionVectorDelta); } else { continue; // stale GRV reply, retry } } return v; } } } catch (Error& e) { if (e.code() != error_code_broken_promise && e.code() != error_code_batch_transaction_throttled && e.code() != error_code_grv_proxy_memory_limit_exceeded && e.code() != error_code_proxy_tag_throttled) TraceEvent(SevError, "GetConsistentReadVersionError").error(e); throw; } } } ACTOR Future readVersionBatcher(DatabaseContext* cx, FutureStream versionStream, TransactionPriority priority, uint32_t flags) { state std::vector> requests; state PromiseStream> addActor; state Future collection = actorCollection(addActor.getFuture()); state Future timeout; state Optional debugID; state bool send_batch; state Reference batchSizeDist = Histogram::getHistogram( "GrvBatcher"_sr, "ClientGrvBatchSize"_sr, Histogram::Unit::countLinear, 0, CLIENT_KNOBS->MAX_BATCH_SIZE * 2); state Reference batchIntervalDist = Histogram::getHistogram("GrvBatcher"_sr, "ClientGrvBatchInterval"_sr, Histogram::Unit::milliseconds, 0, CLIENT_KNOBS->GRV_BATCH_TIMEOUT * 1000000 * 2); state Reference grvReplyLatencyDist = Histogram::getHistogram("GrvBatcher"_sr, "ClientGrvReplyLatency"_sr, Histogram::Unit::milliseconds); state double lastRequestTime = now(); state TransactionTagMap tags; // dynamic batching state PromiseStream replyTimes; state double batchTime = 0; state Span span("NAPI:readVersionBatcher"_loc); loop { send_batch = false; choose { when(DatabaseContext::VersionRequest req = waitNext(versionStream)) { if (req.debugID.present()) { if (!debugID.present()) { debugID = nondeterministicRandom()->randomUniqueID(); } g_traceBatch.addAttach("TransactionAttachID", req.debugID.get().first(), debugID.get().first()); } span.addLink(req.spanContext); requests.push_back(req.reply); for (auto tag : req.tags) { ++tags[tag]; } if 
(requests.size() == CLIENT_KNOBS->MAX_BATCH_SIZE) { send_batch = true; ++cx->transactionGrvFullBatches; } else if (!timeout.isValid()) { timeout = delay(batchTime, TaskPriority::GetConsistentReadVersion); } } when(wait(timeout.isValid() ? timeout : Never())) { send_batch = true; ++cx->transactionGrvTimedOutBatches; } // dynamic batching monitors reply latencies when(double reply_latency = waitNext(replyTimes.getFuture())) { double target_latency = reply_latency * 0.5; batchTime = std::min(0.1 * target_latency + 0.9 * batchTime, CLIENT_KNOBS->GRV_BATCH_TIMEOUT); grvReplyLatencyDist->sampleSeconds(reply_latency); } when(wait(collection)) {} // for errors } if (send_batch) { int count = requests.size(); ASSERT(count); batchSizeDist->sampleRecordCounter(count); auto requestTime = now(); batchIntervalDist->sampleSeconds(requestTime - lastRequestTime); lastRequestTime = requestTime; // dynamic batching Promise GRVReply; requests.push_back(GRVReply); addActor.send(ready(timeReply(GRVReply.getFuture(), replyTimes))); Future batch = incrementalBroadcastWithError( getConsistentReadVersion(span.context, cx, count, priority, flags, std::move(tags), std::move(debugID)), std::move(requests), CLIENT_KNOBS->BROADCAST_BATCH_SIZE); span = Span("NAPI:readVersionBatcher"_loc); tags.clear(); debugID = Optional(); requests.clear(); addActor.send(batch); timeout = Future(); } } } ACTOR Future extractReadVersion(Reference trState, Location location, SpanContext spanContext, Future f, Promise> metadataVersion) { state Span span(spanContext, location, trState->spanContext); GetReadVersionReply rep = wait(f); double replyTime = now(); double latency = replyTime - trState->startTime; trState->cx->lastProxyRequestTime = trState->startTime; trState->cx->updateCachedReadVersion(trState->startTime, rep.version); trState->proxyTagThrottledDuration += rep.proxyTagThrottledDuration; if (rep.rkBatchThrottled) { trState->cx->lastRkBatchThrottleTime = replyTime; } if (rep.rkDefaultThrottled) { trState->cx->lastRkDefaultThrottleTime = replyTime; } trState->cx->GRVLatencies.addSample(latency); if (trState->trLogInfo) trState->trLogInfo->addLog(FdbClientLogEvents::EventGetVersion_V3(trState->startTime, trState->cx->clientLocality.dcId(), latency, trState->options.priority, rep.version, trState->tenant().flatMapRef(&Tenant::name))); if (rep.locked && !trState->options.lockAware) throw database_locked(); ++trState->cx->transactionReadVersionsCompleted; switch (trState->options.priority) { case TransactionPriority::IMMEDIATE: ++trState->cx->transactionImmediateReadVersionsCompleted; break; case TransactionPriority::DEFAULT: ++trState->cx->transactionDefaultReadVersionsCompleted; break; case TransactionPriority::BATCH: ++trState->cx->transactionBatchReadVersionsCompleted; break; default: ASSERT(false); } if (trState->options.tags.size() != 0) { auto& priorityThrottledTags = trState->cx->throttledTags[trState->options.priority]; for (auto& tag : trState->options.tags) { auto itr = priorityThrottledTags.find(tag); if (itr != priorityThrottledTags.end()) { if (itr->second.expired()) { priorityThrottledTags.erase(itr); } else if (itr->second.throttleDuration() > 0) { CODE_PROBE(true, "throttling transaction after getting read version"); ++trState->cx->transactionReadVersionsThrottled; throw tag_throttled(); } } } for (auto& tag : trState->options.tags) { auto itr = priorityThrottledTags.find(tag); if (itr != priorityThrottledTags.end()) { itr->second.addReleased(1); } } } if (rep.version > 
trState->cx->metadataVersionCache[trState->cx->mvCacheInsertLocation].first) { trState->cx->mvCacheInsertLocation = (trState->cx->mvCacheInsertLocation + 1) % trState->cx->metadataVersionCache.size(); trState->cx->metadataVersionCache[trState->cx->mvCacheInsertLocation] = std::make_pair(rep.version, rep.metadataVersion); } metadataVersion.send(rep.metadataVersion); if (trState->cx->versionVectorCacheActive(rep.ssVersionVectorDelta)) { if (trState->cx->isCurrentGrvProxy(rep.proxyId)) { trState->cx->ssVersionVectorCache.applyDelta(rep.ssVersionVectorDelta); } else { trState->cx->ssVersionVectorCache.clear(); } } return rep.version; } bool rkThrottlingCooledDown(DatabaseContext* cx, TransactionPriority priority) { if (priority == TransactionPriority::IMMEDIATE) { return true; } else if (priority == TransactionPriority::BATCH) { if (cx->lastRkBatchThrottleTime == 0.0) { return true; } return (now() - cx->lastRkBatchThrottleTime > CLIENT_KNOBS->GRV_CACHE_RK_COOLDOWN); } else if (priority == TransactionPriority::DEFAULT) { if (cx->lastRkDefaultThrottleTime == 0.0) { return true; } return (now() - cx->lastRkDefaultThrottleTime > CLIENT_KNOBS->GRV_CACHE_RK_COOLDOWN); } return false; } Future TransactionState::getReadVersion(uint32_t flags) { ASSERT(!readVersionFuture.isValid()); if (!CLIENT_KNOBS->FORCE_GRV_CACHE_OFF && !options.skipGrvCache && (deterministicRandom()->random01() <= CLIENT_KNOBS->DEBUG_USE_GRV_CACHE_CHANCE || options.useGrvCache) && rkThrottlingCooledDown(cx.getPtr(), options.priority)) { // Upon our first request to use cached RVs, start the background updater if (!cx->grvUpdateHandler.isValid()) { cx->grvUpdateHandler = backgroundGrvUpdater(cx.getPtr()); } Version rv = cx->getCachedReadVersion(); double lastTime = cx->getLastGrvTime(); double requestTime = now(); if (requestTime - lastTime <= CLIENT_KNOBS->MAX_VERSION_CACHE_LAG && rv != Version(0)) { ASSERT(!debug_checkVersionTime(rv, requestTime, "CheckStaleness")); return rv; } // else go through regular GRV path } ++cx->transactionReadVersions; flags |= options.getReadVersionFlags; switch (options.priority) { case TransactionPriority::IMMEDIATE: flags |= GetReadVersionRequest::PRIORITY_SYSTEM_IMMEDIATE; ++cx->transactionImmediateReadVersions; break; case TransactionPriority::DEFAULT: flags |= GetReadVersionRequest::PRIORITY_DEFAULT; ++cx->transactionDefaultReadVersions; break; case TransactionPriority::BATCH: flags |= GetReadVersionRequest::PRIORITY_BATCH; ++cx->transactionBatchReadVersions; break; default: ASSERT(false); } if (options.tags.size() != 0) { double maxThrottleDelay = 0.0; bool canRecheck = false; auto& priorityThrottledTags = cx->throttledTags[options.priority]; for (auto& tag : options.tags) { auto itr = priorityThrottledTags.find(tag); if (itr != priorityThrottledTags.end()) { if (!itr->second.expired()) { maxThrottleDelay = std::max(maxThrottleDelay, itr->second.throttleDuration()); canRecheck = itr->second.canRecheck(); } else { priorityThrottledTags.erase(itr); } } } if (maxThrottleDelay > 0.0 && !canRecheck) { // TODO: allow delaying? 
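			// Reject locally rather than waiting out the throttle: the caller gets the same tag_throttled() error
			// the proxy would have returned, and Transaction::onError treats it as retryable with backoff.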
CODE_PROBE(true, "Throttling tag before GRV request"); ++cx->transactionReadVersionsThrottled; return tag_throttled(); } else { CODE_PROBE(maxThrottleDelay > 0.0, "Rechecking throttle"); } for (auto& tag : options.tags) { auto itr = priorityThrottledTags.find(tag); if (itr != priorityThrottledTags.end()) { itr->second.updateChecked(); } } } auto& batcher = cx->versionBatcher[flags]; if (!batcher.actor.isValid()) { batcher.actor = readVersionBatcher(cx.getPtr(), batcher.stream.getFuture(), options.priority, flags); } Location location = "NAPI:getReadVersion"_loc; SpanContext derivedSpanContext = generateSpanID(cx->transactionTracingSample, spanContext); Optional versionDebugID = readOptions.present() ? readOptions.get().debugID : Optional(); auto const req = DatabaseContext::VersionRequest(derivedSpanContext, options.tags, versionDebugID); batcher.stream.send(req); startTime = now(); return extractReadVersion( Reference::addRef(this), location, spanContext, req.reply.getFuture(), metadataVersion); } Optional Transaction::getCachedReadVersion() const { if (trState->readVersionFuture.canGet()) { return trState->readVersion(); } else { return Optional(); } } double Transaction::getTagThrottledDuration() const { return trState->proxyTagThrottledDuration; } Future> Transaction::getVersionstamp() { if (committing.isValid()) { return transaction_invalid_version(); } return trState->versionstampPromise.getFuture(); } // Gets the protocol version reported by a coordinator via the protocol info interface ACTOR Future getCoordinatorProtocol(NetworkAddress coordinatorAddress) { RequestStream requestStream( Endpoint::wellKnown({ coordinatorAddress }, WLTOKEN_PROTOCOL_INFO)); ProtocolInfoReply reply = wait(retryBrokenPromise(requestStream, ProtocolInfoRequest{})); return reply.version; } // Gets the protocol version reported by a coordinator in its connect packet // If we are unable to get a version from the connect packet (e.g. because we lost connection with the peer), then this // function will return with an unset result. 
// If an expected version is given, this future won't return if the actual protocol version matches the expected version ACTOR Future> getCoordinatorProtocolFromConnectPacket( NetworkAddress coordinatorAddress, Optional expectedVersion) { state Optional> const>> protocolVersion = FlowTransport::transport().getPeerProtocolAsyncVar(coordinatorAddress); if (!protocolVersion.present()) { TraceEvent(SevWarnAlways, "GetCoordinatorProtocolPeerMissing").detail("Address", coordinatorAddress); wait(delay(FLOW_KNOBS->CONNECTION_MONITOR_TIMEOUT)); return Optional(); } loop { if (protocolVersion.get()->get().present() && protocolVersion.get()->get() != expectedVersion) { return protocolVersion.get()->get(); } Future change = protocolVersion.get()->onChange(); if (!protocolVersion.get()->get().present()) { // If we still don't have any connection info after a timeout, retry sending the protocol version request change = timeout(change, FLOW_KNOBS->CONNECTION_MONITOR_TIMEOUT, Void()); } wait(change); if (!protocolVersion.get()->get().present()) { return protocolVersion.get()->get(); } } } // Returns the protocol version reported by the given coordinator // If an expected version is given, the future won't return until the protocol version is different than expected ACTOR Future getClusterProtocolImpl( Reference> const> coordinator, Optional expectedVersion) { state bool needToConnect = true; state Future protocolVersion = Never(); loop { if (!coordinator->get().present()) { wait(coordinator->onChange()); } else { state NetworkAddress coordinatorAddress; if (coordinator->get().get().hostname.present()) { state Hostname h = coordinator->get().get().hostname.get(); wait(store(coordinatorAddress, h.resolveWithRetry())); } else { coordinatorAddress = coordinator->get().get().getLeader.getEndpoint().getPrimaryAddress(); } if (needToConnect) { // Even though we typically rely on the connect packet to get the protocol version, we need to send some // request in order to start a connection. This protocol version request serves that purpose. 
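				// The choose below then races three outcomes: the coordinator changing (reconnect and retry), an
				// explicit ProtocolInfoReply from newer servers, and the protocol version sniffed from the peer's
				// connect packet, which works even against servers that predate the protocol-info endpoint.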
protocolVersion = getCoordinatorProtocol(coordinatorAddress); needToConnect = false; } choose { when(wait(coordinator->onChange())) { needToConnect = true; } when(ProtocolVersion pv = wait(protocolVersion)) { if (!expectedVersion.present() || expectedVersion.get() != pv) { return pv; } protocolVersion = Never(); } // Older versions of FDB don't have an endpoint to return the protocol version, so we get this info from // the connect packet when(Optional pv = wait(getCoordinatorProtocolFromConnectPacket(coordinatorAddress, expectedVersion))) { if (pv.present()) { return pv.get(); } else { needToConnect = true; } } } } } } // Returns the protocol version reported by the coordinator this client is currently connected to // If an expected version is given, the future won't return until the protocol version is different than expected // Note: this will never return if the server is running a protocol from FDB 5.0 or older Future DatabaseContext::getClusterProtocol(Optional expectedVersion) { return getClusterProtocolImpl(coordinator, expectedVersion); } double ClientTagThrottleData::throttleDuration() const { if (expiration <= now()) { return 0.0; } double capacity = (smoothRate.smoothTotal() - smoothReleased.smoothRate()) * CLIENT_KNOBS->TAG_THROTTLE_SMOOTHING_WINDOW; if (capacity >= 1) { return 0.0; } if (tpsRate == 0) { return std::max(0.0, expiration - now()); } return std::min(expiration - now(), capacity / tpsRate); } uint32_t Transaction::getSize() { auto s = tr.transaction.mutations.expectedSize() + tr.transaction.read_conflict_ranges.expectedSize() + tr.transaction.write_conflict_ranges.expectedSize(); return s; } Future Transaction::onError(Error const& e) { if (g_network->isSimulated() && ++trState->numErrors % 10 == 0) { TraceEvent(SevWarnAlways, "TransactionTooManyRetries") .errorUnsuppressed(e) .detail("NumRetries", trState->numErrors); } if (e.code() == error_code_success) { return client_invalid_operation(); } if (e.code() == error_code_not_committed || e.code() == error_code_commit_unknown_result || e.code() == error_code_database_locked || e.code() == error_code_commit_proxy_memory_limit_exceeded || e.code() == error_code_grv_proxy_memory_limit_exceeded || e.code() == error_code_process_behind || e.code() == error_code_batch_transaction_throttled || e.code() == error_code_tag_throttled || e.code() == error_code_blob_granule_request_failed || e.code() == error_code_proxy_tag_throttled || e.code() == error_code_transaction_throttled_hot_shard || (e.code() == error_code_transaction_rejected_range_locked && CLIENT_KNOBS->TRANSACTION_LOCK_REJECTION_RETRIABLE)) { if (e.code() == error_code_not_committed) ++trState->cx->transactionsNotCommitted; else if (e.code() == error_code_commit_unknown_result) ++trState->cx->transactionsMaybeCommitted; else if (e.code() == error_code_commit_proxy_memory_limit_exceeded || e.code() == error_code_grv_proxy_memory_limit_exceeded) ++trState->cx->transactionsResourceConstrained; else if (e.code() == error_code_process_behind) ++trState->cx->transactionsProcessBehind; else if (e.code() == error_code_batch_transaction_throttled || e.code() == error_code_tag_throttled || e.code() == error_code_transaction_throttled_hot_shard) { ++trState->cx->transactionsThrottled; } else if (e.code() == error_code_proxy_tag_throttled) { ++trState->cx->transactionsThrottled; trState->proxyTagThrottledDuration += CLIENT_KNOBS->PROXY_MAX_TAG_THROTTLE_DURATION; } else if (e.code() == error_code_transaction_rejected_range_locked) { ++trState->cx->transactionsLockRejected; } 
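		// Compute the retry delay handed back to the caller. The backoff grows across consecutive retryable
		// errors and is capped by options.maxBackoff (settable via the MAX_RETRY_DELAY transaction option);
		// see getBackoff for the exact growth policy.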
double backoff = getBackoff(e.code()); reset(); return delay(backoff, trState->taskID); } else if (e.code() == error_code_transaction_rejected_range_locked) { ASSERT(!CLIENT_KNOBS->TRANSACTION_LOCK_REJECTION_RETRIABLE); ++trState->cx->transactionsLockRejected; // throw error } if (e.code() == error_code_transaction_too_old || e.code() == error_code_future_version) { if (e.code() == error_code_transaction_too_old) ++trState->cx->transactionsTooOld; else if (e.code() == error_code_future_version) ++trState->cx->transactionsFutureVersions; double maxBackoff = trState->options.maxBackoff; reset(); return delay(std::min(CLIENT_KNOBS->FUTURE_VERSION_RETRY_DELAY, maxBackoff), trState->taskID); } return e; } ACTOR Future getStorageMetricsLargeKeyRange(Database cx, KeyRange keys, Optional> trState); ACTOR Future doGetStorageMetrics(Database cx, TenantInfo tenantInfo, Version version, KeyRange keys, Reference locationInfo, Optional> trState) { try { WaitMetricsRequest req(tenantInfo, version, keys, StorageMetrics(), StorageMetrics()); req.min.bytes = 0; req.max.bytes = -1; StorageMetrics m = wait(loadBalance( locationInfo->locations(), &StorageServerInterface::waitMetrics, req, TaskPriority::DataDistribution)); return m; } catch (Error& e) { if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed) { cx->invalidateCache(tenantInfo.prefix, keys); wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution)); } else if (e.code() == error_code_future_version) { wait(delay(CLIENT_KNOBS->FUTURE_VERSION_RETRY_DELAY, TaskPriority::DataDistribution)); } else { bool ok = e.code() == error_code_tenant_not_found; TraceEvent(ok ? SevInfo : SevError, "DoGetStorageMetricsError").error(e); throw; } StorageMetrics m = wait(getStorageMetricsLargeKeyRange(cx, keys, trState)); return m; } } ACTOR Future getStorageMetricsLargeKeyRange(Database cx, KeyRange keys, Optional> trState) { state Span span("NAPI:GetStorageMetricsLargeKeyRange"_loc); if (trState.present()) { wait(trState.get()->startTransaction()); } state TenantInfo tenantInfo = wait(trState.present() ? populateAndGetTenant(trState.get(), keys.begin) : TenantInfo()); CODE_PROBE(tenantInfo.hasTenant(), "NativeAPI doGetStorageMetricsLargeKeyRange has tenant"); state Version version = trState.present() ? trState.get()->readVersion() : latestVersion; std::vector locations = wait(getKeyRangeLocations(cx, tenantInfo, keys, std::numeric_limits::max(), Reverse::False, &StorageServerInterface::waitMetrics, span.context, Optional(), UseProvisionalProxies::False, version)); state int nLocs = locations.size(); state std::vector> fx(nLocs); state StorageMetrics total; KeyRef partBegin, partEnd; for (int i = 0; i < nLocs; i++) { partBegin = (i == 0) ? keys.begin : locations[i].range.begin; partEnd = (i == nLocs - 1) ? 
keys.end : locations[i].range.end; fx[i] = doGetStorageMetrics( cx, tenantInfo, version, KeyRangeRef(partBegin, partEnd), locations[i].locations, trState); } wait(waitForAll(fx)); for (int i = 0; i < nLocs; i++) { total += fx[i].get(); } return total; } ACTOR Future trackBoundedStorageMetrics(TenantInfo tenantInfo, Version version, KeyRange keys, Reference location, StorageMetrics x, StorageMetrics halfError, PromiseStream deltaStream) { try { loop { WaitMetricsRequest req(tenantInfo, version, keys, x - halfError, x + halfError); StorageMetrics nextX = wait(loadBalance(location->locations(), &StorageServerInterface::waitMetrics, req)); deltaStream.send(nextX - x); x = nextX; } } catch (Error& e) { deltaStream.sendError(e); throw e; } } ACTOR Future waitStorageMetricsMultipleLocations(TenantInfo tenantInfo, Version version, std::vector locations, StorageMetrics min, StorageMetrics max, StorageMetrics permittedError) { state int nLocs = locations.size(); state std::vector> fx(nLocs); state StorageMetrics total; state PromiseStream deltas; state std::vector> wx(fx.size()); state StorageMetrics halfErrorPerMachine = permittedError * (0.5 / nLocs); state StorageMetrics maxPlus = max + halfErrorPerMachine * (nLocs - 1); state StorageMetrics minMinus = min - halfErrorPerMachine * (nLocs - 1); for (int i = 0; i < nLocs; i++) { WaitMetricsRequest req(tenantInfo, version, locations[i].range, StorageMetrics(), StorageMetrics()); req.min.bytes = 0; req.max.bytes = -1; fx[i] = loadBalance(locations[i].locations->locations(), &StorageServerInterface::waitMetrics, req, TaskPriority::DataDistribution); } wait(waitForAll(fx)); // invariant: true total is between (total-permittedError/2, total+permittedError/2) for (int i = 0; i < nLocs; i++) total += fx[i].get(); if (!total.allLessOrEqual(maxPlus)) return total; if (!minMinus.allLessOrEqual(total)) return total; for (int i = 0; i < nLocs; i++) wx[i] = trackBoundedStorageMetrics( tenantInfo, version, locations[i].range, locations[i].locations, fx[i].get(), halfErrorPerMachine, deltas); loop { StorageMetrics delta = waitNext(deltas.getFuture()); total += delta; if (!total.allLessOrEqual(maxPlus)) return total; if (!minMinus.allLessOrEqual(total)) return total; } } ACTOR Future extractMetrics(Future, int>> fMetrics) { std::pair, int> x = wait(fMetrics); return x.first.get(); } ACTOR Future>> getReadHotRanges(Database cx, KeyRange keys) { state Span span("NAPI:GetReadHotRanges"_loc); loop { int64_t shardLimit = 100; // Shard limit here does not really matter since this function is currently only used // to find the read-hot sub ranges within a read-hot shard. std::vector locations = wait(getKeyRangeLocations(cx, TenantInfo(), keys, shardLimit, Reverse::False, &StorageServerInterface::getReadHotRanges, span.context, Optional(), UseProvisionalProxies::False, latestVersion)); try { // TODO: how to handle this? // This function is called whenever a shard becomes read-hot. But somehow the shard was split across more // than one storage server after becoming read-hot and before this function is called, i.e. a race // condition. Should we abort and wait for the newly split shards to be hot again? state int nLocs = locations.size(); // if (nLocs > 1) { // TraceEvent("RHDDebug") // .detail("NumSSIs", nLocs) // .detail("KeysBegin", keys.begin.printable().c_str()) // .detail("KeysEnd", keys.end.printable().c_str()); // } state std::vector> fReplies(nLocs); KeyRef partBegin, partEnd; for (int i = 0; i < nLocs; i++) { partBegin = (i == 0) ? 
keys.begin : locations[i].range.begin;
				partEnd = (i == nLocs - 1) ? keys.end : locations[i].range.end;
				ReadHotSubRangeRequest req(KeyRangeRef(partBegin, partEnd));
				fReplies[i] = loadBalance(locations[i].locations->locations(),
				                          &StorageServerInterface::getReadHotRanges,
				                          req,
				                          TaskPriority::DataDistribution);
			}

			wait(waitForAll(fReplies));

			if (nLocs == 1) {
				CODE_PROBE(true, "Single-shard read hot range request");
				return fReplies[0].get().readHotRanges;
			} else {
				CODE_PROBE(true, "Multi-shard read hot range request");
				Standalone<VectorRef<ReadHotRangeWithMetrics>> results;
				for (int i = 0; i < nLocs; i++) {
					results.append(results.arena(),
					               fReplies[i].get().readHotRanges.begin(),
					               fReplies[i].get().readHotRanges.size());
					results.arena().dependsOn(fReplies[i].get().readHotRanges.arena());
				}
				return results;
			}
		} catch (Error& e) {
			if (e.code() != error_code_wrong_shard_server && e.code() != error_code_all_alternatives_failed) {
				TraceEvent(SevError, "GetReadHotSubRangesError").error(e);
				throw;
			}
			cx->invalidateCache({}, keys);
			wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution));
		}
	}
}

ACTOR Future<Optional<StorageMetrics>> waitStorageMetricsWithLocation(TenantInfo tenantInfo,
                                                                      Version version,
                                                                      KeyRange keys,
                                                                      std::vector<KeyRangeLocationInfo> locations,
                                                                      StorageMetrics min,
                                                                      StorageMetrics max,
                                                                      StorageMetrics permittedError) {
	Future<StorageMetrics> fx;
	if (locations.size() > 1) {
		fx = waitStorageMetricsMultipleLocations(tenantInfo, version, locations, min, max, permittedError);
	} else {
		WaitMetricsRequest req(tenantInfo, version, keys, min, max);
		fx = loadBalance(locations[0].locations->locations(),
		                 &StorageServerInterface::waitMetrics,
		                 req,
		                 TaskPriority::DataDistribution);
	}
	StorageMetrics x = wait(fx);
	return x;
}

ACTOR Future<std::pair<Optional<StorageMetrics>, int>> waitStorageMetrics(
    Database cx,
    KeyRange keys,
    StorageMetrics min,
    StorageMetrics max,
    StorageMetrics permittedError,
    int shardLimit,
    int expectedShardCount,
    Optional<Reference<TransactionState>> trState) {
	state Span span("NAPI:WaitStorageMetrics"_loc, generateSpanID(cx->transactionTracingSample));
	loop {
		if (trState.present()) {
			wait(trState.get()->startTransaction());
		}
		state TenantInfo tenantInfo =
		    wait(trState.present() ? populateAndGetTenant(trState.get(), keys.begin) : TenantInfo());
		CODE_PROBE(tenantInfo.hasTenant(), "NativeAPI waitStorageMetrics has tenant", probe::decoration::rare);
		state Version version = trState.present() ? trState.get()->readVersion() : latestVersion;
		state std::vector<KeyRangeLocationInfo> locations =
		    wait(getKeyRangeLocations(cx,
		                              tenantInfo,
		                              keys,
		                              shardLimit,
		                              Reverse::False,
		                              &StorageServerInterface::waitMetrics,
		                              span.context,
		                              Optional<UID>(),
		                              UseProvisionalProxies::False,
		                              version));
		if (expectedShardCount >= 0 && locations.size() != expectedShardCount) {
			// NOTE(xwang): This happens only when a split shard hasn't been moved to another location. We may need
			// to change this if we allow a split shard to stay in the same location.
			return std::make_pair(Optional<StorageMetrics>(), locations.size());
		}

		// SOMEDAY: Right now, if there are too many shards we delay and check again later. There may be a better
		// solution to this. How could this happen?
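		// Worked example of the penalty path below (hypothetical numbers): with shardLimit = 100, a range that
		// currently maps to 100 or more shards sleeps for STORAGE_METRICS_TOO_MANY_SHARDS_DELAY (jittered) and
		// invalidates the location cache, so the next getKeyRangeLocations() re-fetches the mapping in case the
		// range now spans fewer shards.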
if (locations.size() >= shardLimit) { TraceEvent(SevWarn, "WaitStorageMetricsPenalty") .detail("Keys", keys) .detail("Limit", shardLimit) .detail("LocationSize", locations.size()) .detail("JitteredSecondsOfPenitence", CLIENT_KNOBS->STORAGE_METRICS_TOO_MANY_SHARDS_DELAY); wait(delayJittered(CLIENT_KNOBS->STORAGE_METRICS_TOO_MANY_SHARDS_DELAY, TaskPriority::DataDistribution)); // make sure that the next getKeyRangeLocations() call will actually re-fetch the range cx->invalidateCache(tenantInfo.prefix, keys); continue; } try { Optional res = wait(waitStorageMetricsWithLocation(tenantInfo, version, keys, locations, min, max, permittedError)); if (res.present()) { return std::make_pair(res, -1); } } catch (Error& e) { TraceEvent(SevDebug, "WaitStorageMetricsHandleError").error(e); if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed) { cx->invalidateCache(tenantInfo.prefix, keys); wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution)); } else if (e.code() == error_code_future_version) { wait(delay(CLIENT_KNOBS->FUTURE_VERSION_RETRY_DELAY, TaskPriority::DataDistribution)); } else { bool ok = e.code() == error_code_tenant_not_found; TraceEvent(ok ? SevInfo : SevError, "WaitStorageMetricsError").error(e); throw; } } } } Future, int>> DatabaseContext::waitStorageMetrics( KeyRange const& keys, StorageMetrics const& min, StorageMetrics const& max, StorageMetrics const& permittedError, int shardLimit, int expectedShardCount, Optional> trState) { return ::waitStorageMetrics(Database(Reference::addRef(this)), keys, min, max, permittedError, shardLimit, expectedShardCount, trState); } Future DatabaseContext::getStorageMetrics(KeyRange const& keys, int shardLimit, Optional> trState) { if (shardLimit > 0) { StorageMetrics m; m.bytes = -1; return extractMetrics(::waitStorageMetrics(Database(Reference::addRef(this)), keys, StorageMetrics(), m, StorageMetrics(), shardLimit, -1, trState)); } else { return ::getStorageMetricsLargeKeyRange(Database(Reference::addRef(this)), keys, trState); } } ACTOR Future>> waitDataDistributionMetricsList(Database cx, KeyRange keys, int shardLimit) { loop { choose { when(wait(cx->onProxiesChanged())) {} when(ErrorOr rep = wait(errorOr(basicLoadBalance(cx->getCommitProxies(UseProvisionalProxies::False), &CommitProxyInterface::getDDMetrics, GetDDMetricsRequest(keys, shardLimit))))) { if (rep.isError()) { throw rep.getError(); } return rep.get().storageMetricsList; } } } } Future>> DatabaseContext::getReadHotRanges(KeyRange const& keys) { return ::getReadHotRanges(Database(Reference::addRef(this)), keys); } ACTOR Future>> getRangeSplitPoints(Reference trState, KeyRange keys, int64_t chunkSize) { state Span span("NAPI:GetRangeSplitPoints"_loc, trState->spanContext); if (trState->hasTenant()) { CODE_PROBE(trState->hasTenant(), "NativeAPI getRangeSplitPoints has tenant"); wait(trState->startTransaction()); } loop { state std::vector locations = wait(getKeyRangeLocations(trState, keys, CLIENT_KNOBS->TOO_MANY, Reverse::False, &StorageServerInterface::getRangeSplitPoints, UseTenant::True)); try { state int nLocs = locations.size(); state std::vector> fReplies(nLocs); KeyRef partBegin, partEnd; for (int i = 0; i < nLocs; i++) { partBegin = (i == 0) ? keys.begin : locations[i].range.begin; partEnd = (i == nLocs - 1) ? 
keys.end : locations[i].range.end; SplitRangeRequest req(trState->getTenantInfo(), KeyRangeRef(partBegin, partEnd), chunkSize); fReplies[i] = loadBalance(locations[i].locations->locations(), &StorageServerInterface::getRangeSplitPoints, req, TaskPriority::DataDistribution); } wait(waitForAll(fReplies)); Standalone> results; results.push_back_deep(results.arena(), keys.begin); for (int i = 0; i < nLocs; i++) { if (i > 0) { results.push_back_deep(results.arena(), locations[i].range.begin); // Need this shard boundary } if (fReplies[i].get().splitPoints.size() > 0) { results.append( results.arena(), fReplies[i].get().splitPoints.begin(), fReplies[i].get().splitPoints.size()); results.arena().dependsOn(fReplies[i].get().splitPoints.arena()); } } if (results.back() != keys.end) { results.push_back_deep(results.arena(), keys.end); } return results; } catch (Error& e) { if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed) { trState->cx->invalidateCache(trState->tenant().mapRef(&Tenant::prefix), keys); wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution)); } else { TraceEvent(SevError, "GetRangeSplitPoints").error(e); throw; } } } } Future>> Transaction::getRangeSplitPoints(KeyRange const& keys, int64_t chunkSize) { return ::getRangeSplitPoints(trState, keys, chunkSize); } #define BG_REQUEST_DEBUG false ACTOR Future>> getBlobGranuleRangesActor(Transaction* self, KeyRange keyRange, int rangeLimit) { state KeyRange currentRange = keyRange; state Standalone> results; state bool more = false; if (BG_REQUEST_DEBUG) { fmt::print("Getting Blob Granules for [{0} - {1})\n", keyRange.begin.printable(), keyRange.end.printable()); } if (self->getTenant().present()) { wait(self->getTenant().get()->ready()); } loop { int remaining = std::max(0, rangeLimit - results.size()) + 1; // TODO: knob remaining = std::min(1000, remaining); if (BUGGIFY_WITH_PROB(0.01)) { remaining = std::min(remaining, deterministicRandom()->randomInt(1, 10)); } std::vector> blobGranuleMapping = wait(getBlobGranuleLocations( self->trState, currentRange, remaining, Reverse::False, UseTenant::True, JustGranules::True, &more)); for (auto& it : blobGranuleMapping) { if (!results.empty() && results.back().end > it.first.end) { ASSERT(results.back().end > it.first.begin); ASSERT(results.back().end <= it.first.end); CODE_PROBE(true, "Merge while reading granules", probe::decoration::rare); while (!results.empty() && results.back().begin >= it.first.begin) { // TODO: we can't easily un-allocate the data in the arena for these guys, but that's ok as this // should be rare results.pop_back(); } ASSERT(results.empty() || results.back().end == it.first.begin); } results.push_back_deep(results.arena(), it.first); if (results.size() == rangeLimit) { return results; } } if (!more) { return results; } CODE_PROBE(more, "partial granule mapping"); currentRange = KeyRangeRef(results.back().end, currentRange.end); } } Future>> Transaction::getBlobGranuleRanges(const KeyRange& range, int rangeLimit) { return ::getBlobGranuleRangesActor(this, range, rangeLimit); } // hack (for now) to get blob worker interface into load balance struct BWLocationInfo : MultiInterface> { using Locations = MultiInterface>; explicit BWLocationInfo(const std::vector>>& v) : Locations(v) {} }; ACTOR Future>> readBlobGranulesActor( Transaction* self, KeyRange range, Version begin, Optional read, Version* readVersionOut, int chunkLimit, bool summarize) { // read not present is "use transaction version" 
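	// A minimal usage sketch of the public wrapper (illustrative only; error handling omitted):
	//
	//     state Transaction tr(db);
	//     state Version readVersionOut;
	//     Standalone<VectorRef<BlobGranuleChunkRef>> chunks =
	//         wait(tr.readBlobGranules(normalKeys, 0, Optional<Version>(), &readVersionOut));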
ASSERT(chunkLimit > 0); state KeyRange keyRange = range; state int i; state Version rv; state Standalone> results; state double startTime = now(); if (read.present()) { rv = read.get(); } else { Version _end = wait(self->getReadVersion()); rv = _end; } // Right now just read whole blob range assignments from DB // FIXME: eventually we probably want to cache this and invalidate similarly to storage servers. // Cache misses could still read from the DB, or we could add it to the Transaction State Store and // have proxies serve it from memory. if (BG_REQUEST_DEBUG) { fmt::print("Doing blob granule request [{0} - {1}) @ {2}{3}\n", range.begin.printable(), range.end.printable(), rv, self->getTenant().present() ? " for tenant " + printable(self->getTenant().get()->description()) : ""); } if (self->getTenant().present()) { // ensure tenant is populated for getBlobGranuleLocations request wait(self->getTenant().get()->ready()); } state bool moreMapping = false; state std::vector> blobGranuleMapping = wait(getBlobGranuleLocations(self->trState, keyRange, CLIENT_KNOBS->BG_TOO_MANY_GRANULES, Reverse::False, UseTenant::True, JustGranules::False, &moreMapping)); if (blobGranuleMapping.empty()) { throw blob_granule_transaction_too_old(); } ASSERT(blobGranuleMapping.front().first.begin <= keyRange.begin); ASSERT(moreMapping == blobGranuleMapping.back().first.end < keyRange.end); if (moreMapping) { if (BG_REQUEST_DEBUG) { fmt::print("BG Mapping for [{0} - {1}) too large! ({2}) LastRange=[{3} - {4}): {5}\n", keyRange.begin.printable(), keyRange.end.printable(), blobGranuleMapping.size(), blobGranuleMapping.back().first.begin.printable(), blobGranuleMapping.back().first.end.printable(), blobGranuleMapping.back().second.shortString()); } TraceEvent(SevWarn, "BGMappingTooLarge") .detail("Range", range) .detail("Max", CLIENT_KNOBS->BG_TOO_MANY_GRANULES); throw unsupported_operation(); } ASSERT(blobGranuleMapping.size() <= CLIENT_KNOBS->BG_TOO_MANY_GRANULES); if (BG_REQUEST_DEBUG) { fmt::print("Doing blob granule request @ {}\n", rv); fmt::print("blob worker assignments:\n"); } // Make request for each granule for (i = 0; i < blobGranuleMapping.size(); i++) { state KeyRange granule = blobGranuleMapping[i].first; // if this was a time travel and the request returned larger bounds, skip this chunk if (granule.end <= keyRange.begin) { continue; } state BlobWorkerInterface bwInterf = self->trState->cx->blobWorker_interf[blobGranuleMapping[i].second]; ASSERT(bwInterf.id() != UID()); if (BG_REQUEST_DEBUG) { fmt::print("Blob granule request mapping [{0} - {1})={2}\n", granule.begin.printable(), granule.end.printable(), bwInterf.id().toString().substr(0, 5)); } // prune first/last granules to requested range if (keyRange.begin > granule.begin) { granule = KeyRangeRef(keyRange.begin, granule.end); } if (keyRange.end < granule.end) { granule = KeyRangeRef(granule.begin, keyRange.end); } if (g_network->isSimulated() && !g_simulator->speedUpSimulation && BUGGIFY_WITH_PROB(0.01)) { // simulate as if we read a stale mapping and a different worker owns the granule ASSERT(!self->trState->cx->blobWorker_interf.empty()); CODE_PROBE(true, "Randomizing blob worker id for request"); TraceEvent ev("RandomizingBlobWorkerForReq"); ev.detail("OriginalWorker", bwInterf.id()); int randomIdx = deterministicRandom()->randomInt(0, self->trState->cx->blobWorker_interf.size()); for (auto& it : self->trState->cx->blobWorker_interf) { if (randomIdx == 0) { bwInterf = it.second; break; } randomIdx--; } ev.detail("NewWorker", bwInterf.id()); } 
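		// Each granule is fetched with its own BlobGranuleFileRequest. The single-element "location" built below
		// wraps one blob worker in the load-balancer machinery purely to reuse its retry and failure-monitoring
		// plumbing; no replica choice is being made here.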
state BlobGranuleFileRequest req; req.keyRange = KeyRangeRef(StringRef(req.arena, granule.begin), StringRef(req.arena, granule.end)); req.beginVersion = begin; req.readVersion = rv; req.tenantInfo = self->getTenant().present() ? self->trState->getTenantInfo() : TenantInfo(); req.canCollapseBegin = true; // TODO make this a parameter once we support it req.summarize = summarize; std::vector>> v; v.push_back(makeReference>(bwInterf)); state Reference>> location = makeReference(v); // use load balance with one option for now for retry and error handling try { choose { when(BlobGranuleFileReply rep = wait(loadBalance(location, &BlobWorkerInterface::blobGranuleFileRequest, req, TaskPriority::DefaultPromiseEndpoint, AtMostOnce::False, nullptr))) { if (BG_REQUEST_DEBUG) { fmt::print("Blob granule request for [{0} - {1}) @ {2} - {3} got reply from {4}:\n", granule.begin.printable(), granule.end.printable(), begin, rv, bwInterf.id().toString().substr(0, 5)); } ASSERT(!rep.chunks.empty()); results.arena().dependsOn(rep.arena); for (auto& chunk : rep.chunks) { if (BG_REQUEST_DEBUG) { fmt::print( "[{0} - {1})\n", chunk.keyRange.begin.printable(), chunk.keyRange.end.printable()); fmt::print(" SnapshotFile: {0}\n \n DeltaFiles:\n", chunk.snapshotFile.present() ? chunk.snapshotFile.get().toString().c_str() : ""); for (auto& df : chunk.deltaFiles) { fmt::print(" {0}\n", df.toString()); } fmt::print(" Deltas: ({0})", chunk.newDeltas.size()); if (chunk.newDeltas.size() > 0) { fmt::print(" with version [{0} - {1}]", chunk.newDeltas[0].version, chunk.newDeltas[chunk.newDeltas.size() - 1].version); } fmt::print(" IncludedVersion: {0}\n\n\n", chunk.includedVersion); if (chunk.tenantPrefix.present()) { fmt::print(" TenantPrefix: {0}\n", chunk.tenantPrefix.get().printable()); } } ASSERT(chunk.tenantPrefix.present() == self->getTenant().present()); if (chunk.tenantPrefix.present()) { ASSERT(chunk.tenantPrefix.get() == self->getTenant().get()->prefix()); } if (!results.empty() && results.back().keyRange.end != chunk.keyRange.begin) { ASSERT(results.back().keyRange.end > chunk.keyRange.begin); ASSERT(results.back().keyRange.end <= chunk.keyRange.end); CODE_PROBE(true, "Merge while reading granule range", probe::decoration::rare); while (!results.empty() && results.back().keyRange.begin >= chunk.keyRange.begin) { // TODO: we can't easily un-depend the arenas for these guys, but that's ok as this // should be rare results.pop_back(); } ASSERT(results.empty() || results.back().keyRange.end == chunk.keyRange.begin); } results.push_back(results.arena(), chunk); StringRef chunkEndKey = chunk.keyRange.end; if (chunk.tenantPrefix.present()) { chunkEndKey = chunkEndKey.removePrefix(chunk.tenantPrefix.get()); } keyRange = KeyRangeRef(std::min(chunkEndKey, keyRange.end), keyRange.end); if (summarize && results.size() == chunkLimit) { break; } } if (summarize && results.size() == chunkLimit) { break; } } // if we detect that this blob worker fails, cancel the request, as otherwise load balance will // retry indefinitely with one option when(wait(IFailureMonitor::failureMonitor().onStateEqual( location->get(0, &BlobWorkerInterface::blobGranuleFileRequest).getEndpoint(), FailureStatus(true)))) { if (BG_REQUEST_DEBUG) { fmt::print("readBlobGranules got BW {0} failed\n", bwInterf.id().toString()); } throw connection_failed(); } } } catch (Error& e) { if (BG_REQUEST_DEBUG) { fmt::print("Blob granule request for [{0} - {1}) @ {2} - {3} got error from {4}: {5}\n", granule.begin.printable(), granule.end.printable(), begin, rv, 
	                      bwInterf.id().toString().substr(0, 5),
	                      e.name());
			}
			// worker is up but didn't actually have granule, or connection failed
			if (e.code() == error_code_wrong_shard_server || e.code() == error_code_connection_failed) {
				throw blob_granule_request_failed();
			}
			throw e;
		}
	}

	self->trState->cx->anyBGReads = true;
	self->trState->cx->bgGranulesPerRequest.addSample(results.size());
	self->trState->cx->bgLatencies.addSample(now() - startTime);
	if (readVersionOut != nullptr) {
		*readVersionOut = rv;
	}
	return results;
}

Future<Standalone<VectorRef<BlobGranuleChunkRef>>> Transaction::readBlobGranules(const KeyRange& range,
                                                                                 Version begin,
                                                                                 Optional<Version> readVersion,
                                                                                 Version* readVersionOut) {
	return readBlobGranulesActor(
	    this, range, begin, readVersion, readVersionOut, std::numeric_limits<int>::max(), false);
}

ACTOR Future<Standalone<VectorRef<BlobGranuleSummaryRef>>> summarizeBlobGranulesActor(Transaction* self,
                                                                                      KeyRange range,
                                                                                      Optional<Version> summaryVersion,
                                                                                      int rangeLimit) {
	CODE_PROBE(self->trState->hasTenant(), "NativeAPI summarizeBlobGranules has tenant");
	state Version readVersionOut;
	Standalone<VectorRef<BlobGranuleChunkRef>> chunks =
	    wait(readBlobGranulesActor(self, range, 0, summaryVersion, &readVersionOut, rangeLimit, true));
	ASSERT(chunks.size() <= rangeLimit);
	ASSERT(!summaryVersion.present() || readVersionOut == summaryVersion.get());
	Standalone<VectorRef<BlobGranuleSummaryRef>> summaries;
	summaries.reserve(summaries.arena(), chunks.size());
	for (auto& it : chunks) {
		summaries.push_back(summaries.arena(), summarizeGranuleChunk(summaries.arena(), it));
	}
	return summaries;
}

Future<Standalone<VectorRef<BlobGranuleSummaryRef>>> Transaction::summarizeBlobGranules(
    const KeyRange& range, Optional<Version> summaryVersion, int rangeLimit) {
	return summarizeBlobGranulesActor(this, range, summaryVersion, rangeLimit);
}

void Transaction::addGranuleMaterializeStats(const GranuleMaterializeStats& stats) {
	trState->cx->anyBGReads = true;
	trState->cx->bgReadInputBytes += stats.inputBytes;
	trState->cx->bgReadOutputBytes += stats.outputBytes;
	trState->cx->bgReadSnapshotRows += stats.snapshotRows;
	trState->cx->bgReadRowsCleared += stats.rowsCleared;
	trState->cx->bgReadRowsInserted += stats.rowsInserted;
	trState->cx->bgReadRowsUpdated += stats.rowsUpdated;
}

ACTOR Future<Version> setPerpetualStorageWiggle(Database cx, bool enable, LockAware lockAware) {
	state ReadYourWritesTransaction tr(cx);
	state Version version = invalidVersion;
	loop {
		try {
			tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
			if (lockAware) {
				tr.setOption(FDBTransactionOptions::LOCK_AWARE);
			}

			tr.set(perpetualStorageWiggleKey, enable ? "1"_sr : "0"_sr);
			wait(tr.commit());
			version = tr.getCommittedVersion();
			break;
		} catch (Error& e) {
			wait(tr.onError(e));
		}
	}
	return version;
}

ACTOR Future<Version> checkBlobSubrange(Database db,
                                        Optional<Reference<Tenant>> tenant,
                                        KeyRange keyRange,
                                        Optional<Version> version) {
	state Transaction tr(db, tenant);
	state Optional<Version> summaryVersion;
	if (version.present()) {
		summaryVersion = version.get();
	}
	loop {
		try {
			if (!summaryVersion.present()) {
				// fill summary version at the start, so that retries use the same version
				Version summaryVersion_ = wait(tr.getReadVersion());
				summaryVersion = summaryVersion_;
			}
			// same properties as a read for validating that the granule is readable, just much less memory and
			// network bandwidth used
			wait(success(tr.summarizeBlobGranules(keyRange, summaryVersion, std::numeric_limits<int>::max())));
			return summaryVersion.get();
		} catch (Error& e) {
			wait(tr.onError(e));
		}
	}
}

ACTOR Future<Version> verifyBlobRangeActor(Reference<DatabaseContext> cx,
                                           KeyRange range,
                                           Optional<Version> version,
                                           Optional<Reference<Tenant>> tenant) {
	state Database db(cx);
	state Transaction tr(db, tenant);
	state Standalone<VectorRef<KeyRangeRef>> allRanges;
	state KeyRange curRegion = KeyRangeRef(range.begin, range.begin);
	state Version readVersionOut = invalidVersion;
	state int batchSize = BUGGIFY ? deterministicRandom()->randomInt(2, 10) : CLIENT_KNOBS->BG_TOO_MANY_GRANULES / 2;
	state int loadSize = (BUGGIFY ? deterministicRandom()->randomInt(1, 20) : 20) * batchSize;

	if (version.present()) {
		if (version.get() == latestVersion) {
			loop {
				try {
					Version _version = wait(tr.getReadVersion());
					version = _version;
					break;
				} catch (Error& e) {
					wait(tr.onError(e));
				}
			}
		}
		if (version.get() <= 0) {
			TraceEvent("VerifyBlobInvalidVersion").detail("Range", range).detail("Version", version);
			throw unsupported_operation();
		}
	}

	if (tenant.present()) {
		CODE_PROBE(true, "NativeAPI verifyBlobRange has tenant");
		wait(tenant.get()->ready());
	}

	loop {
		if (curRegion.begin >= range.end) {
			return readVersionOut;
		}
		loop {
			try {
				wait(store(allRanges, tr.getBlobGranuleRanges(KeyRangeRef(curRegion.begin, range.end), loadSize)));
				break;
			} catch (Error& e) {
				wait(tr.onError(e));
			}
		}

		if (allRanges.empty()) {
			if (curRegion.begin < range.end) {
				return invalidVersion;
			}
			return readVersionOut;
		}

		state std::vector<Future<Version>> checkParts;
		// Chunk up to smaller ranges than this limit. Must be smaller than BG_TOO_MANY_GRANULES to not hit the limit
		int batchCount = 0;
		for (auto& it : allRanges) {
			if (it.begin > curRegion.end) {
				return invalidVersion;
			}

			curRegion = KeyRangeRef(curRegion.begin, it.end);
			batchCount++;

			if (batchCount == batchSize) {
				checkParts.push_back(checkBlobSubrange(db, tenant, curRegion, version));
				batchCount = 0;
				curRegion = KeyRangeRef(curRegion.end, curRegion.end);
			}
		}
		if (!curRegion.empty()) {
			checkParts.push_back(checkBlobSubrange(db, tenant, curRegion, version));
		}

		try {
			wait(waitForAll(checkParts));
		} catch (Error& e) {
			if (e.code() == error_code_blob_granule_transaction_too_old) {
				return invalidVersion;
			}
			throw e;
		}
		ASSERT(!checkParts.empty());
		readVersionOut = checkParts.back().get();
		curRegion = KeyRangeRef(curRegion.end, curRegion.end);
	}
}

Future<Version> DatabaseContext::verifyBlobRange(const KeyRange& range,
                                                 Optional<Version> version,
                                                 Optional<Reference<Tenant>> tenant) {
	return verifyBlobRangeActor(Reference<DatabaseContext>::addRef(this), range, version, tenant);
}

ACTOR Future<bool> flushBlobRangeActor(Reference<DatabaseContext> cx,
                                       KeyRange range,
                                       bool compact,
                                       Optional<Version> version,
                                       Optional<Reference<Tenant>> tenant) {
	if (tenant.present()) {
		CODE_PROBE(true, "NativeAPI flushBlobRange has tenant");
		wait(tenant.get()->ready());
		range = range.withPrefix(tenant.get()->prefix());
	}
	state Database db(cx);
	if (!version.present()) {
		state Transaction tr(db);
		Version _v = wait(tr.getReadVersion());
		version = _v;
	}
	FlushGranuleRequest req(-1, range, version.get(), compact);
	try {
		wait(success(doBlobGranuleRequests(db, range, req, &BlobWorkerInterface::flushGranuleRequest)));
		return true;
	} catch (Error& e) {
		if (e.code() == error_code_blob_granule_transaction_too_old) {
			// can't flush data at this version, because no granules
			return false;
		}
		throw e;
	}
}

Future<bool> DatabaseContext::flushBlobRange(const KeyRange& range,
                                             bool compact,
                                             Optional<Version> version,
                                             Optional<Reference<Tenant>> tenant) {
	return flushBlobRangeActor(Reference<DatabaseContext>::addRef(this), range, compact, version, tenant);
}

ACTOR Future<std::vector<std::pair<UID, StorageWiggleValue>>> readStorageWiggleValues(Database cx,
                                                                                      bool primary,
                                                                                      bool use_system_priority) {
	state StorageWiggleData wiggleState;
	state KeyBackedObjectMap<UID, StorageWiggleValue, decltype(IncludeVersion())> metadataMap =
	    wiggleState.wigglingStorageServer(PrimaryRegion(primary));
	state Reference<ReadYourWritesTransaction> tr(new ReadYourWritesTransaction(cx));
	state KeyBackedRangeResult<std::pair<UID, StorageWiggleValue>> res;

	// read the wiggling pairs
	loop {
		try {
			tr->setOption(FDBTransactionOptions::READ_SYSTEM_KEYS);
			tr->setOption(FDBTransactionOptions::READ_LOCK_AWARE);
			if (use_system_priority) {
				tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
			}
			wait(store(res, metadataMap.getRange(tr, UID(0, 0), Optional<UID>(), CLIENT_KNOBS->TOO_MANY)));
			wait(tr->commit());
			break;
		} catch (Error& e) {
			wait(tr->onError(e));
		}
	}
	return res.results;
}
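// Example (illustrative sketch, not exercised in this file): a hypothetical caller that checks whether a range is
// fully readable from blob storage before depending on it. `db` and `myRange` are assumed names; the return value
// follows verifyBlobRangeActor above, where invalidVersion means some granule in the range was not readable.
//
//     state Version verified = wait(db->verifyBlobRange(myRange, latestVersion, {}));
//     if (verified == invalidVersion) {
//         // fall back to reading myRange from storage servers instead of blob granules
//     }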
ACTOR Future<Void> splitStorageMetricsStream(PromiseStream<Key> resultStream,
                                             Database cx,
                                             KeyRange keys,
                                             StorageMetrics limit,
                                             StorageMetrics estimated,
                                             Optional<int> minSplitBytes) {
	state Span span("NAPI:SplitStorageMetricsStream"_loc);
	state Key beginKey = keys.begin;
	state Key globalLastKey = beginKey;
	resultStream.send(beginKey);
	// track used across loops
	state StorageMetrics globalUsed;
	loop {
		state std::vector<KeyRangeLocationInfo> locations =
		    wait(getKeyRangeLocations(cx,
		                              TenantInfo(),
		                              KeyRangeRef(beginKey, keys.end),
		                              CLIENT_KNOBS->STORAGE_METRICS_SHARD_LIMIT,
		                              Reverse::False,
		                              &StorageServerInterface::splitMetrics,
		                              span.context,
		                              Optional<UID>(),
		                              UseProvisionalProxies::False,
		                              latestVersion));
		try {
			//TraceEvent("SplitStorageMetrics").detail("Locations", locations.size());
			state StorageMetrics localUsed = globalUsed;
			state Key localLastKey = globalLastKey;
			state Standalone<VectorRef<KeyRef>> results;
			state int i = 0;
			for (; i < locations.size(); i++) {
				SplitMetricsRequest req(locations[i].range,
				                        limit,
				                        localUsed,
				                        estimated,
				                        i == locations.size() - 1 && keys.end <= locations.back().range.end,
				                        minSplitBytes);
				SplitMetricsReply res = wait(loadBalance(locations[i].locations->locations(),
				                                         &StorageServerInterface::splitMetrics,
				                                         req,
				                                         TaskPriority::DataDistribution));
				if (res.splits.size() && res.splits[0] <= localLastKey) {
					// split points are out of order, possibly because of moving data, throw error to retry
					ASSERT_WE_THINK(false); // FIXME: This seems impossible and doesn't seem to be covered by testing
					throw all_alternatives_failed();
				}

				if (res.splits.size()) {
					results.append(results.arena(), res.splits.begin(), res.splits.size());
					results.arena().dependsOn(res.splits.arena());
					localLastKey = res.splits.back();
				}
				localUsed = res.used;
				//TraceEvent("SplitStorageMetricsResult").detail("Used", used.bytes).detail("Location", i).detail("Size", res.splits.size());
			}

			globalUsed = localUsed;

			// only truncate split at end
			if (keys.end <= locations.back().range.end &&
			    globalUsed.allLessOrEqual(limit * CLIENT_KNOBS->STORAGE_METRICS_UNFAIR_SPLIT_LIMIT) &&
			    results.size() > 1) {
				results.resize(results.arena(), results.size() - 1);
				localLastKey = results.back();
			}
			globalLastKey = localLastKey;

			for (auto& splitKey : results) {
				resultStream.send(splitKey);
			}

			if (keys.end <= locations.back().range.end) {
				resultStream.send(keys.end);
				resultStream.sendError(end_of_stream());
				break;
			} else {
				beginKey = locations.back().range.end;
			}
		} catch (Error& e) {
			if (e.code() == error_code_operation_cancelled) {
				throw e;
			}
			if (e.code() != error_code_wrong_shard_server && e.code() != error_code_all_alternatives_failed) {
				TraceEvent(SevError, "SplitStorageMetricsStreamError").error(e);
				resultStream.sendError(e);
				throw;
			}
			cx->invalidateCache({}, keys);
			wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution));
		}
	}
	return Void();
}

Future<Void> DatabaseContext::splitStorageMetricsStream(const PromiseStream<Key>& resultStream,
                                                        KeyRange const& keys,
                                                        StorageMetrics const& limit,
                                                        StorageMetrics const& estimated,
                                                        Optional<int> const& minSplitBytes) {
	return ::splitStorageMetricsStream(
	    resultStream, Database(Reference<DatabaseContext>::addRef(this)), keys, limit, estimated, minSplitBytes);
}

ACTOR Future<Optional<Standalone<VectorRef<KeyRef>>>> splitStorageMetricsWithLocations(
    std::vector<KeyRangeLocationInfo> locations,
    KeyRange keys,
    StorageMetrics limit,
    StorageMetrics estimated,
    Optional<int> minSplitBytes) {
	state StorageMetrics used;
	state Standalone<VectorRef<KeyRef>> results;
	results.push_back_deep(results.arena(), keys.begin);
	//TraceEvent("SplitStorageMetrics").detail("Locations", locations.size());
	try {
		state int i = 0;
		for (; i < locations.size(); i++) {
			state Key beginKey = locations[i].range.begin;
			loop {
				KeyRangeRef range(beginKey, locations[i].range.end);
				SplitMetricsRequest req(range, limit, used, estimated, i == locations.size() - 1, minSplitBytes);
				SplitMetricsReply res = wait(loadBalance(locations[i].locations->locations(),
				                                         &StorageServerInterface::splitMetrics,
				                                         req,
				                                         TaskPriority::DataDistribution));
				if (res.splits.size() && res.splits[0] <= results.back()) {
					// split points are out of order, possibly because of moving data, throw error to retry
					ASSERT_WE_THINK(false); // FIXME: This seems impossible and doesn't seem to be covered by testing
					throw all_alternatives_failed();
				}

				if (res.splits.size()) {
					results.append(results.arena(), res.splits.begin(), res.splits.size());
					results.arena().dependsOn(res.splits.arena());
				}

				used = res.used;

				if (res.more && res.splits.size()) {
					// Next request will return
					// split points after this one
					beginKey = KeyRef(beginKey.arena(), res.splits.back());
				} else {
					break;
				}
				//TraceEvent("SplitStorageMetricsResult").detail("Used", used.bytes).detail("Location", i).detail("Size", res.splits.size());
			}
		}

		if (used.allLessOrEqual(limit * CLIENT_KNOBS->STORAGE_METRICS_UNFAIR_SPLIT_LIMIT) && results.size() > 1) {
			results.resize(results.arena(), results.size() - 1);
		}

		if (keys.end <= locations.back().range.end) {
			results.push_back_deep(results.arena(), keys.end);
		}
		return results;
	} catch (Error& e) {
		if (e.code() != error_code_wrong_shard_server && e.code() != error_code_all_alternatives_failed) {
			TraceEvent(SevError, "SplitStorageMetricsError").error(e);
			throw;
		}
	}
	return Optional<Standalone<VectorRef<KeyRef>>>();
}

ACTOR Future<Standalone<VectorRef<KeyRef>>> splitStorageMetrics(Database cx,
                                                                KeyRange keys,
                                                                StorageMetrics limit,
                                                                StorageMetrics estimated,
                                                                Optional<int> minSplitBytes) {
	state Span span("NAPI:SplitStorageMetrics"_loc);
	loop {
		state std::vector<KeyRangeLocationInfo> locations =
		    wait(getKeyRangeLocations(cx,
		                              TenantInfo(),
		                              keys,
		                              CLIENT_KNOBS->STORAGE_METRICS_SHARD_LIMIT,
		                              Reverse::False,
		                              &StorageServerInterface::splitMetrics,
		                              span.context,
		                              Optional<UID>(),
		                              UseProvisionalProxies::False,
		                              latestVersion));

		// SOMEDAY: Right now, if there are too many shards we delay and check again later. There may be a better
		// solution to this.
		if (locations.size() == CLIENT_KNOBS->STORAGE_METRICS_SHARD_LIMIT) {
			wait(delay(CLIENT_KNOBS->STORAGE_METRICS_TOO_MANY_SHARDS_DELAY, TaskPriority::DataDistribution));
			cx->invalidateCache({}, keys);
			continue;
		}

		Optional<Standalone<VectorRef<KeyRef>>> results =
		    wait(splitStorageMetricsWithLocations(locations, keys, limit, estimated, minSplitBytes));

		if (results.present()) {
			return results.get();
		}

		cx->invalidateCache({}, keys);
		wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY, TaskPriority::DataDistribution));
	}
}

Future<Standalone<VectorRef<KeyRef>>> DatabaseContext::splitStorageMetrics(KeyRange const& keys,
                                                                           StorageMetrics const& limit,
                                                                           StorageMetrics const& estimated,
                                                                           Optional<int> const& minSplitBytes) {
	return ::splitStorageMetrics(
	    Database(Reference<DatabaseContext>::addRef(this)), keys, limit, estimated, minSplitBytes);
}

void Transaction::checkDeferredError() const {
	trState->cx->checkDeferredError();
}

Reference<TransactionLogInfo> Transaction::createTrLogInfoProbabilistically(const Database& cx) {
	if (!cx->isError()) {
		double sampleRate =
		    cx->globalConfig->get<double>(fdbClientInfoTxnSampleRate, std::numeric_limits<double>::infinity());
		double clientSamplingProbability = std::isinf(sampleRate) ?
CLIENT_KNOBS->CSI_SAMPLING_PROBABILITY : sampleRate; if (((networkOptions.logClientInfo.present() && networkOptions.logClientInfo.get()) || BUGGIFY) && deterministicRandom()->random01() < clientSamplingProbability && (!g_network->isSimulated() || !g_simulator->speedUpSimulation)) { return makeReference(TransactionLogInfo::DATABASE); } } return Reference(); } void Transaction::setTransactionID(UID id) { ASSERT(getSize() == 0); trState->spanContext = SpanContext(id, trState->spanContext.spanID, trState->spanContext.m_Flags); tr.spanContext = trState->spanContext; span.context = trState->spanContext; } void Transaction::setToken(uint64_t token) { ASSERT(getSize() == 0); trState->spanContext = SpanContext(trState->spanContext.traceID, token); } void enableClientInfoLogging() { ASSERT(networkOptions.logClientInfo.present() == false); networkOptions.logClientInfo = true; TraceEvent(SevInfo, "ClientInfoLoggingEnabled").log(); } ACTOR Future snapCreate(Database cx, Standalone snapCmd, UID snapUID) { TraceEvent("SnapCreateEnter").detail("SnapCmd", snapCmd).detail("UID", snapUID); try { loop { choose { when(wait(cx->onProxiesChanged())) {} when(wait(basicLoadBalance(cx->getCommitProxies(UseProvisionalProxies::False), &CommitProxyInterface::proxySnapReq, ProxySnapRequest(snapCmd, snapUID, snapUID), cx->taskID, AtMostOnce::True))) { TraceEvent("SnapCreateExit").detail("SnapCmd", snapCmd).detail("UID", snapUID); return Void(); } } } } catch (Error& e) { TraceEvent("SnapCreateError").error(e).detail("SnapCmd", snapCmd.toString()).detail("UID", snapUID); throw; } } ACTOR template static Future createCheckpointImpl(T tr, std::vector ranges, CheckpointFormat format, Optional actionId) { ASSERT(!tr->getTenant().present()); ASSERT(!ranges.empty()); ASSERT(actionId.present()); TraceEvent(SevDebug, "CreateCheckpointTransactionBegin").detail("Ranges", describe(ranges)); state RangeResult UIDtoTagMap = wait(tr->getRange(serverTagKeys, CLIENT_KNOBS->TOO_MANY)); ASSERT(!UIDtoTagMap.more && UIDtoTagMap.size() < CLIENT_KNOBS->TOO_MANY); state std::unordered_map> rangeMap; state std::unordered_map> srcMap; for (const auto& range : ranges) { RangeResult keyServers = wait(krmGetRanges(tr, keyServersPrefix, range)); ASSERT(!keyServers.more); for (int i = 0; i < keyServers.size() - 1; ++i) { const KeyRangeRef currentRange(keyServers[i].key, keyServers[i + 1].key); std::vector src; std::vector dest; UID srcId; UID destId; decodeKeyServersValue(UIDtoTagMap, keyServers[i].value, src, dest, srcId, destId); rangeMap[srcId].push_back(currentRange); srcMap.emplace(srcId, src); } } if (format == DataMoveRocksCF) { for (const auto& [srcId, ranges] : rangeMap) { // The checkpoint request is sent to all replicas, in case any of them is unhealthy. // An alternative is to choose a healthy replica. 
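			// (Illustrative) for a source team srcId covering ranges R1 and R2, the loop below writes a single
			// pending CheckpointMetaData row keyed by a fresh checkpoint UID whose second half is srcId.first(),
			// roughly: checkpointKeyFor(UID(<random>, srcId.first())) -> { {R1, R2}, format, srcMap[srcId], Pending }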
const UID checkpointID = UID(deterministicRandom()->randomUInt64(), srcId.first()); CheckpointMetaData checkpoint(ranges, format, srcMap[srcId], checkpointID, actionId.get()); checkpoint.setState(CheckpointMetaData::Pending); tr->set(checkpointKeyFor(checkpointID), checkpointValue(checkpoint)); TraceEvent(SevDebug, "CreateCheckpointTransactionShard") .detail("CheckpointKey", checkpointKeyFor(checkpointID)) .detail("CheckpointMetaData", checkpoint.toString()); } } else { throw not_implemented(); } return Void(); } Future createCheckpoint(Reference tr, const std::vector& ranges, CheckpointFormat format, Optional actionId) { return holdWhile(tr, createCheckpointImpl(tr, ranges, format, actionId)); } Future createCheckpoint(Transaction* tr, const std::vector& ranges, CheckpointFormat format, Optional actionId) { return createCheckpointImpl(tr, ranges, format, actionId); } // Gets CheckpointMetaData of the specific keyrange, version and format from one of the storage servers, if none of the // servers have the checkpoint, a checkpoint_not_found error is returned. ACTOR static Future getCheckpointMetaDataInternal(KeyRange range, Version version, CheckpointFormat format, Optional actionId, Reference alternatives, double timeout) { TraceEvent(SevDebug, "GetCheckpointMetaDataInternalBegin") .detail("Range", range) .detail("Version", version) .detail("Format", static_cast(format)) .detail("Locations", alternatives->description()); state std::vector>> futures; state int index = 0; for (index = 0; index < alternatives->size(); ++index) { // For each shard, all storage servers are checked, only one is required. futures.push_back(errorOr(timeoutError(alternatives->getInterface(index).checkpoint.getReply( GetCheckpointRequest({ range }, version, format, actionId)), timeout))); } state Optional error; wait(waitForAll(futures)); TraceEvent(SevDebug, "GetCheckpointMetaDataInternalWaitEnd").detail("Range", range).detail("Version", version); for (index = 0; index < futures.size(); ++index) { if (!futures[index].isReady()) { error = timed_out(); TraceEvent(SevDebug, "GetCheckpointMetaDataInternalSSTimeout") .detail("Range", range) .detail("Version", version) .detail("StorageServer", alternatives->getInterface(index).uniqueID); continue; } if (futures[index].get().isError()) { const Error& e = futures[index].get().getError(); TraceEvent(SevWarn, "GetCheckpointMetaDataInternalError") .errorUnsuppressed(e) .detail("Range", range) .detail("Version", version) .detail("StorageServer", alternatives->getInterface(index).uniqueID); if (e.code() != error_code_checkpoint_not_found || !error.present()) { error = e; } } else { return futures[index].get().get(); } } ASSERT(error.present()); throw error.get(); } ACTOR static Future>> getCheckpointMetaDataForRange( Database cx, KeyRange range, Version version, CheckpointFormat format, Optional actionId, double timeout) { state Span span("NAPI:GetCheckpointMetaDataForRange"_loc); state int index = 0; state std::vector> futures; state std::vector locations; loop { locations.clear(); TraceEvent(SevDebug, "GetCheckpointMetaDataForRangeBegin") .detail("Range", range.toString()) .detail("Version", version) .detail("Format", static_cast(format)); futures.clear(); try { wait(store(locations, getKeyRangeLocations(cx, TenantInfo(), range, CLIENT_KNOBS->TOO_MANY, Reverse::False, &StorageServerInterface::checkpoint, span.context, Optional(), UseProvisionalProxies::False, latestVersion))); for (index = 0; index < locations.size(); ++index) { 
futures.push_back(getCheckpointMetaDataInternal( locations[index].range, version, format, actionId, locations[index].locations, timeout)); TraceEvent(SevDebug, "GetCheckpointShardBegin") .detail("Range", locations[index].range) .detail("Version", version) .detail("StorageServers", locations[index].locations->description()); } choose { when(wait(cx->connectionFileChanged())) { cx->invalidateCache({}, range); } when(wait(waitForAll(futures))) { break; } when(wait(delay(timeout))) { TraceEvent(SevWarn, "GetCheckpointTimeout").detail("Range", range).detail("Version", version); } } } catch (Error& e) { TraceEvent(SevWarn, "GetCheckpointError").errorUnsuppressed(e).detail("Range", range); if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed || e.code() == error_code_connection_failed || e.code() == error_code_broken_promise) { cx->invalidateCache({}, range); wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY)); } else { throw; } } } std::vector> res; for (index = 0; index < futures.size(); ++index) { TraceEvent(SevDebug, "GetCheckpointShardEnd") .detail("Range", locations[index].range) .detail("Checkpoint", futures[index].get().toString()); res.emplace_back(locations[index].range, futures[index].get()); } return res; } ACTOR Future>> getCheckpointMetaData(Database cx, std::vector ranges, Version version, CheckpointFormat format, Optional actionId, double timeout) { state std::vector>>> futures; // TODO(heliu): Avoid send requests to the same shard. for (const auto& range : ranges) { futures.push_back(getCheckpointMetaDataForRange(cx, range, version, format, actionId, timeout)); } std::vector>> results = wait(getAll(futures)); std::vector> res; for (const auto& r : results) { ASSERT(!r.empty()); res.insert(res.end(), r.begin(), r.end()); } return res; } ACTOR Future checkSafeExclusions(Database cx, std::vector exclusions) { TraceEvent("ExclusionSafetyCheckBegin") .detail("NumExclusion", exclusions.size()) .detail("Exclusions", describe(exclusions)); state bool ddCheck; try { loop { choose { when(wait(cx->onProxiesChanged())) {} when(ExclusionSafetyCheckReply _ddCheck = wait(basicLoadBalance(cx->getCommitProxies(UseProvisionalProxies::False), &CommitProxyInterface::exclusionSafetyCheckReq, ExclusionSafetyCheckRequest(exclusions), cx->taskID))) { ddCheck = _ddCheck.safe; break; } } } } catch (Error& e) { if (e.code() != error_code_actor_cancelled) { TraceEvent("ExclusionSafetyCheckError") .error(e) .detail("NumExclusion", exclusions.size()) .detail("Exclusions", describe(exclusions)); } throw; } TraceEvent("ExclusionSafetyCheckCoordinators").log(); state ClientCoordinators coordinatorList(cx->getConnectionRecord()); state std::vector>> leaderServers; leaderServers.reserve(coordinatorList.clientLeaderServers.size()); for (int i = 0; i < coordinatorList.clientLeaderServers.size(); i++) { if (coordinatorList.clientLeaderServers[i].hostname.present()) { leaderServers.push_back(retryGetReplyFromHostname(GetLeaderRequest(coordinatorList.clusterKey, UID()), coordinatorList.clientLeaderServers[i].hostname.get(), WLTOKEN_CLIENTLEADERREG_GETLEADER, TaskPriority::CoordinationReply)); } else { leaderServers.push_back(retryBrokenPromise(coordinatorList.clientLeaderServers[i].getLeader, GetLeaderRequest(coordinatorList.clusterKey, UID()), TaskPriority::CoordinationReply)); } } // Wait for quorum so we don't dismiss live coordinators as unreachable by acting too fast choose { when(wait(smartQuorum(leaderServers, leaderServers.size() / 2 + 1, 1.0))) {} 
when(wait(delay(3.0))) { TraceEvent("ExclusionSafetyCheckNoCoordinatorQuorum").log(); return false; } } int attemptCoordinatorExclude = 0; int coordinatorsUnavailable = 0; for (int i = 0; i < leaderServers.size(); i++) { NetworkAddress leaderAddress = coordinatorList.clientLeaderServers[i].getLeader.getEndpoint().getPrimaryAddress(); if (leaderServers[i].isReady()) { if ((std::count( exclusions.begin(), exclusions.end(), AddressExclusion(leaderAddress.ip, leaderAddress.port)) || std::count(exclusions.begin(), exclusions.end(), AddressExclusion(leaderAddress.ip)))) { attemptCoordinatorExclude++; } } else { coordinatorsUnavailable++; } } int faultTolerance = (leaderServers.size() - 1) / 2 - coordinatorsUnavailable; bool coordinatorCheck = (attemptCoordinatorExclude <= faultTolerance); TraceEvent("ExclusionSafetyCheckFinish") .detail("CoordinatorListSize", leaderServers.size()) .detail("NumExclusions", exclusions.size()) .detail("FaultTolerance", faultTolerance) .detail("AttemptCoordinatorExclude", attemptCoordinatorExclude) .detail("CoordinatorCheck", coordinatorCheck) .detail("DataDistributorCheck", ddCheck); return (ddCheck && coordinatorCheck); } // returns true if we can connect to the given worker interface ACTOR Future verifyInterfaceActor(Reference connectLock, ClientWorkerInterface workerInterf) { wait(connectLock->take()); state FlowLock::Releaser releaser(*connectLock); state ClientLeaderRegInterface leaderInterf(workerInterf.address()); choose { when(Optional rep = wait(brokenPromiseToNever(leaderInterf.getLeader.getReply(GetLeaderRequest())))) { return true; } when(wait(delay(CLIENT_KNOBS->CLI_CONNECT_TIMEOUT))) { // NOTE : change timeout time here if necessary return false; } } } ACTOR static Future rebootWorkerActor(DatabaseContext* cx, ValueRef addr, bool check, int duration) { // ignore negative value if (duration < 0) duration = 0; if (!cx->getConnectionRecord()) return 0; // fetch all workers' addresses and interfaces from CC RangeResult kvs = wait(getWorkerInterfaces(cx->getConnectionRecord())); ASSERT(!kvs.more); // map worker network address to its interface state std::map workerInterfaces; for (const auto& it : kvs) { ClientWorkerInterface workerInterf = BinaryReader::fromStringRef(it.value, IncludeVersion()); Key primaryAddress = it.key.endsWith(":tls"_sr) ? it.key.removeSuffix(":tls"_sr) : it.key; workerInterfaces[primaryAddress] = workerInterf; // Also add mapping from a worker's second address(if present) to its interface if (workerInterf.reboot.getEndpoint().addresses.secondaryAddress.present()) { Key secondAddress = StringRef(workerInterf.reboot.getEndpoint().addresses.secondaryAddress.get().toString()); secondAddress = secondAddress.endsWith(":tls"_sr) ? 
secondAddress.removeSuffix(":tls"_sr) : secondAddress;
			workerInterfaces[secondAddress] = workerInterf;
		}
	}

	// split and get all the requested addresses to send reboot requests
	state std::vector<std::string> addressesVec;
	boost::algorithm::split(addressesVec, addr.toString(), boost::is_any_of(","));
	// Note: reuse this knob from fdbcli, change it if necessary
	Reference<FlowLock> connectLock(new FlowLock(CLIENT_KNOBS->CLI_CONNECT_PARALLELISM));
	state std::vector<Future<bool>> verifyInterfs;
	for (const auto& requestedAddress : addressesVec) {
		// step 1: check that the requested address is in the worker list provided by CC
		if (!workerInterfaces.count(Key(requestedAddress)))
			return 0;
		// step 2: try to establish connections to the requested worker
		verifyInterfs.push_back(verifyInterfaceActor(connectLock, workerInterfaces[Key(requestedAddress)]));
	}
	// step 3: check if we can establish connections to all requested workers, return if not
	wait(waitForAll(verifyInterfs));
	for (const auto& f : verifyInterfs) {
		if (!f.get())
			return 0;
	}
	// step 4: after verifying we can connect to all requested workers, send reboot requests together
	for (const auto& address : addressesVec) {
		// Note: We want to make sure these requests are sent in parallel
		workerInterfaces[Key(address)].reboot.send(RebootRequest(false, check, duration));
	}
	return 1;
}

Future<int64_t> DatabaseContext::rebootWorker(StringRef addr, bool check, int duration) {
	return rebootWorkerActor(this, addr, check, duration);
}

Future<Void> DatabaseContext::forceRecoveryWithDataLoss(StringRef dcId) {
	return forceRecovery(getConnectionRecord(), dcId);
}

ACTOR static Future<Void> createSnapshotActor(DatabaseContext* cx, UID snapUID, StringRef snapCmd) {
	wait(mgmtSnapCreate(cx->clone(), snapCmd, snapUID));
	return Void();
}

Future<Void> DatabaseContext::createSnapshot(StringRef uid, StringRef snapshot_command) {
	std::string uid_str = uid.toString();
	if (!std::all_of(uid_str.begin(), uid_str.end(), [](unsigned char c) { return std::isxdigit(c); }) ||
	    uid_str.size() != 32) {
		// only a 32-character hex string is considered a valid UID
		throw snap_invalid_uid_string();
	}
	return createSnapshotActor(this, UID::fromString(uid_str), snapshot_command);
}

void sharedStateDelRef(DatabaseSharedState* ssPtr) {
	if (--ssPtr->refCount == 0) {
		delete ssPtr;
	}
}

Future<DatabaseSharedState*> DatabaseContext::initSharedState() {
	ASSERT(!sharedStatePtr); // Don't re-initialize shared state if a pointer already exists
	DatabaseSharedState* newState = new DatabaseSharedState();
	// Increment refcount by 1 on creation to account for the one held in the MultiVersionApi map
	// Therefore, on initialization, refCount should be 2 (after also going through setSharedState)
	newState->refCount++;
	newState->delRef = &sharedStateDelRef;
	setSharedState(newState);
	return newState;
}

void DatabaseContext::setSharedState(DatabaseSharedState* p) {
	ASSERT(p->protocolVersion == currentProtocolVersion());
	sharedStatePtr = p;
	sharedStatePtr->refCount++;
}

// FIXME: this has undesired head-of-line-blocking behavior in the case of large version jumps.
// For example, say that the current feed version is 100, and one waiter wants to wait for the feed version >= 1000.
// This will send a request with minVersion=1000. Then say someone wants to wait for feed version >= 200. Because we've
// already blocked this updater on version 1000, even if the feed would already be at version 200+, we won't get an
// empty version response until version 1000.
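// (Illustrative timeline of the limitation described above, in terms of the updater below)
//     version=100, desired=1000 -> ChangeFeedVersionUpdateRequest(1000) is sent and stays outstanding
//     a second waiter then only needs version >= 200 -> no additional request is made, so that waiter
//     is not woken until the minVersion=1000 reply arrives, even if the SS passed 200 long ago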
ACTOR Future storageFeedVersionUpdater(StorageServerInterface interf, ChangeFeedStorageData* self) { loop { if (self->version.get() < self->desired.get()) { wait(delay(CLIENT_KNOBS->CHANGE_FEED_EMPTY_BATCH_TIME) || self->version.whenAtLeast(self->desired.get())); if (self->version.get() < self->desired.get()) { try { ChangeFeedVersionUpdateReply rep = wait(brokenPromiseToNever( interf.changeFeedVersionUpdate.getReply(ChangeFeedVersionUpdateRequest(self->desired.get())))); if (rep.version > self->version.get()) { self->version.set(rep.version); } } catch (Error& e) { if (e.code() != error_code_server_overloaded) { throw; } if (FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY > CLIENT_KNOBS->CHANGE_FEED_EMPTY_BATCH_TIME) { wait(delay(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY - CLIENT_KNOBS->CHANGE_FEED_EMPTY_BATCH_TIME)); } } } } else { wait(self->desired.whenAtLeast(self->version.get() + 1)); } } } ACTOR Future changeFeedCommitter(IKeyValueStore* storage, Reference> commitChangeFeedStorage, int64_t* uncommittedCFBytes) { loop { while (!commitChangeFeedStorage->get()) { wait(commitChangeFeedStorage->onChange()); } *uncommittedCFBytes = 0; commitChangeFeedStorage->set(false); wait(storage->commit()); } } ACTOR Future cleanupChangeFeedCache(DatabaseContext* db) { wait(db->initializeChangeFeedCache); wait(delay(CLIENT_KNOBS->CHANGE_FEED_CACHE_EXPIRE_TIME)); loop { for (auto it = db->changeFeedCaches.begin(); it != db->changeFeedCaches.end(); ++it) { if (!it->second->active && now() - it->second->inactiveTime > CLIENT_KNOBS->CHANGE_FEED_CACHE_EXPIRE_TIME) { Key beginKey = changeFeedCacheKey(it->first.tenantPrefix, it->first.rangeId, it->first.range, 0); Key endKey = changeFeedCacheKey(it->first.tenantPrefix, it->first.rangeId, it->first.range, MAX_VERSION); db->storage->clear(KeyRangeRef(beginKey, endKey)); KeyRange feedRange = singleKeyRange(changeFeedCacheFeedKey(it->first.tenantPrefix, it->first.rangeId, it->first.range)); db->storage->clear(feedRange); db->uncommittedCFBytes += beginKey.size() + endKey.size() + feedRange.expectedSize(); if (db->uncommittedCFBytes > CLIENT_KNOBS->CHANGE_FEED_CACHE_FLUSH_BYTES) { db->commitChangeFeedStorage->set(true); } auto& rangeIdCache = db->rangeId_cacheData[it->first.rangeId]; rangeIdCache.erase(it->first); if (rangeIdCache.empty()) { db->rangeId_cacheData.erase(it->first.rangeId); } db->changeFeedCaches.erase(it); break; } } wait(delay(5.0)); } } ACTOR Future initializeCFCache(DatabaseContext* db) { state Key beginKey = changeFeedCacheFeedKeys.begin; loop { RangeResult res = wait(db->storage->readRange(KeyRangeRef(beginKey, changeFeedCacheFeedKeys.end), CLIENT_KNOBS->CHANGE_FEED_CACHE_LIMIT_BYTES, CLIENT_KNOBS->CHANGE_FEED_CACHE_LIMIT_BYTES)); if (res.size()) { beginKey = keyAfter(res.back().key); } else { ASSERT(!res.more); } for (auto& kv : res) { ChangeFeedCacheRange cf(decodeChangeFeedCacheFeedKey(kv.key)); Reference data = makeReference(); auto val = decodeChangeFeedCacheFeedValue(kv.value); data->version = val.first; data->popped = val.second; data->active = false; data->inactiveTime = now(); db->changeFeedCaches[cf] = data; db->rangeId_cacheData[cf.rangeId][cf] = data; } if (!res.more) { break; } } return Void(); } ACTOR Future handleShutdown(DatabaseContext* db) { try { wait(db->storage->getError()); } catch (Error& e) { TraceEvent("ChangeFeedCacheDiskError").error(e); } db->initializeChangeFeedCache = Void(); db->storage = nullptr; db->changeFeedStorageCommitter = Void(); return Void(); } void DatabaseContext::setStorage(IKeyValueStore* store) { if (storage != 
nullptr) {
		TraceEvent(SevError, "NativeClientMultipleSetStorage");
		return;
	}
	storage = store;
	commitChangeFeedStorage = makeReference<AsyncVar<bool>>(false);
	initializeChangeFeedCache = initializeCFCache(this);
	changeFeedStorageCommitter = changeFeedCommitter(storage, commitChangeFeedStorage, &uncommittedCFBytes) &&
	                             cleanupChangeFeedCache(this) && handleShutdown(this);
}

Reference<ChangeFeedStorageData> DatabaseContext::getStorageData(StorageServerInterface interf) {
	// use token from interface since that changes on SS restart
	UID token = interf.waitFailure.getEndpoint().token;
	auto it = changeFeedUpdaters.find(token);
	if (it == changeFeedUpdaters.end()) {
		Reference<ChangeFeedStorageData> newStorageUpdater = makeReference<ChangeFeedStorageData>();
		newStorageUpdater->id = interf.id();
		newStorageUpdater->interfToken = token;
		newStorageUpdater->updater = storageFeedVersionUpdater(interf, newStorageUpdater.getPtr());
		newStorageUpdater->context = this;
		newStorageUpdater->created = now();
		changeFeedUpdaters[token] = newStorageUpdater.getPtr();
		return newStorageUpdater;
	}
	return Reference<ChangeFeedStorageData>::addRef(it->second);
}

Version DatabaseContext::getMinimumChangeFeedVersion() {
	Version minVersion = std::numeric_limits<Version>::max();
	for (auto& it : changeFeedUpdaters) {
		if (now() - it.second->created > CLIENT_KNOBS->CHANGE_FEED_START_INTERVAL) {
			minVersion = std::min(minVersion, it.second->version.get());
		}
	}
	for (auto& it : notAtLatestChangeFeeds) {
		if (now() - it.second->created > CLIENT_KNOBS->CHANGE_FEED_START_INTERVAL) {
			minVersion = std::min(minVersion, it.second->getVersion());
		}
	}
	return minVersion;
}

void DatabaseContext::setDesiredChangeFeedVersion(Version v) {
	for (auto& it : changeFeedUpdaters) {
		if (it.second->version.get() < v && it.second->desired.get() < v) {
			it.second->desired.set(v);
		}
	}
}

// Because two storage servers can have different representations of a clear at the same version, depending on their
// shard maps at the time of the mutation, it is non-trivial to directly compare change feed streams. Instead we
// compare the presence of data at each version. This both saves on the cpu cost of validation and is sufficient,
// because historically most change feed corruption bugs have been the absence of entire versions, not of a subset of
// mutations within a version.
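// (Illustrative) if the SS summary stream yields versions {105, 110, 120} and the TSS stream yields
// {105, 120}, the comparison below reports a mismatch at 110 with lastMatchingVersion=105, without
// ever comparing the mutation contents of any version.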
struct ChangeFeedTSSValidationData { PromiseStream ssStreamSummary; ReplyPromiseStream tssStream; Future validatorFuture; std::deque> rollbacks; Version popVersion = invalidVersion; bool done = false; ChangeFeedTSSValidationData() {} ChangeFeedTSSValidationData(ReplyPromiseStream tssStream) : tssStream(tssStream) {} void updatePopped(Version newPopVersion) { popVersion = std::max(popVersion, newPopVersion); } bool checkRollback(const MutationsAndVersionRef& m) { if (m.mutations.size() == 1 && m.mutations.back().param1 == lastEpochEndPrivateKey) { if (rollbacks.empty() || rollbacks.back().second < m.version) { Version rollbackVersion; BinaryReader br(m.mutations.back().param2, Unversioned()); br >> rollbackVersion; if (!rollbacks.empty()) { ASSERT(rollbacks.back().second <= rollbackVersion); } rollbacks.push_back({ rollbackVersion, m.version }); } return true; } else { return false; } } bool shouldAddMutation(const MutationsAndVersionRef& m) { return !done && !m.mutations.empty() && !checkRollback(m); } bool isRolledBack(Version v) { if (rollbacks.empty()) { return false; } for (int i = 0; i < rollbacks.size(); i++) { if (v <= rollbacks[i].first) { return false; } if (v < rollbacks[i].second) { return true; } } return false; } void send(const ChangeFeedStreamReply& ssReply) { if (done) { return; } updatePopped(ssReply.popVersion); for (auto& it : ssReply.mutations) { if (shouldAddMutation(it)) { ssStreamSummary.send(it.version); } } } void complete() { done = true; // destroy TSS stream to stop server actor tssStream.reset(); } }; void handleTSSChangeFeedMismatch(const ChangeFeedStreamRequest& request, const TSSEndpointData& tssData, int64_t matchesFound, Version lastMatchingVersion, Version ssVersion, Version tssVersion, Version popVersion) { if (request.canReadPopped) { // There is a known issue where this can return different data between an SS and TSS when a feed was popped but // the SS restarted before the pop could be persisted, for reads that can read popped data. As such, only count // this as a mismatch when !req.canReadPopped return; } CODE_PROBE(true, "TSS mismatch in stream comparison"); if (tssData.metrics->shouldRecordDetailedMismatch()) { TraceEvent mismatchEvent( (g_network->isSimulated() && g_simulator->tssMode == ISimulator::TSSMode::EnabledDropMutations) ? 
SevWarnAlways : SevError, "TSSMismatchChangeFeedStream"); mismatchEvent.setMaxEventLength(FLOW_KNOBS->TSS_LARGE_TRACE_SIZE); // request info mismatchEvent.detail("TSSID", tssData.tssId); mismatchEvent.detail("FeedID", request.rangeID); mismatchEvent.detail("BeginVersion", request.begin); mismatchEvent.detail("EndVersion", request.end); mismatchEvent.detail("StartKey", request.range.begin); mismatchEvent.detail("EndKey", request.range.end); mismatchEvent.detail("CanReadPopped", request.canReadPopped); mismatchEvent.detail("PopVersion", popVersion); mismatchEvent.detail("DebugUID", request.id); // mismatch info mismatchEvent.detail("MatchesFound", matchesFound); mismatchEvent.detail("LastMatchingVersion", lastMatchingVersion); mismatchEvent.detail("SSVersion", ssVersion); mismatchEvent.detail("TSSVersion", tssVersion); CODE_PROBE(FLOW_KNOBS->LOAD_BALANCE_TSS_MISMATCH_TRACE_FULL, "Tracing Full TSS Feed Mismatch in stream comparison", probe::decoration::rare); CODE_PROBE(!FLOW_KNOBS->LOAD_BALANCE_TSS_MISMATCH_TRACE_FULL, "Tracing Partial TSS Feed Mismatch in stream comparison and storing the rest in FDB"); if (!FLOW_KNOBS->LOAD_BALANCE_TSS_MISMATCH_TRACE_FULL) { mismatchEvent.disable(); UID mismatchUID = deterministicRandom()->randomUniqueID(); tssData.metrics->recordDetailedMismatchData(mismatchUID, mismatchEvent.getFields().toString()); // record a summarized trace event instead TraceEvent summaryEvent( (g_network->isSimulated() && g_simulator->tssMode == ISimulator::TSSMode::EnabledDropMutations) ? SevWarnAlways : SevError, "TSSMismatchChangeFeedStream"); summaryEvent.detail("TSSID", tssData.tssId) .detail("MismatchId", mismatchUID) .detail("FeedDebugUID", request.id); } } } ACTOR Future changeFeedTSSValidator(ChangeFeedStreamRequest req, Optional* data, TSSEndpointData tssData) { state bool ssDone = false; state bool tssDone = false; state std::deque ssSummary; state std::deque tssSummary; ASSERT(data->present()); state int64_t matchesFound = 0; state Version lastMatchingVersion = req.begin - 1; loop { // If SS stream gets error, whole stream data gets reset, so it's ok to cancel this actor if (!ssDone && ssSummary.empty()) { try { Version next = waitNext(data->get().ssStreamSummary.getFuture()); ssSummary.push_back(next); } catch (Error& e) { if (e.code() == error_code_actor_cancelled) { throw; } if (e.code() != error_code_end_of_stream) { data->get().complete(); if (e.code() != error_code_operation_cancelled) { tssData.metrics->ssError(e.code()); } throw e; } ssDone = true; if (tssDone) { data->get().complete(); return Void(); } } } if (!tssDone && tssSummary.empty()) { try { choose { when(ChangeFeedStreamReply nextTss = waitNext(data->get().tssStream.getFuture())) { data->get().updatePopped(nextTss.popVersion); for (auto& it : nextTss.mutations) { if (data->get().shouldAddMutation(it)) { tssSummary.push_back(it.version); } } } // if ss has result, tss needs to return it when(wait((ssDone || !ssSummary.empty()) ? 
delay(2.0 * FLOW_KNOBS->LOAD_BALANCE_TSS_TIMEOUT)
				                   : Never())) {
						++tssData.metrics->tssTimeouts;
						data->get().complete();
						return Void();
					}
				}
			} catch (Error& e) {
				if (e.code() == error_code_operation_cancelled) {
					throw e;
				}
				if (e.code() == error_code_end_of_stream) {
					tssDone = true;
					if (ssDone) {
						data->get().complete();
						return Void();
					}
				} else {
					tssData.metrics->tssError(e.code());
					data->get().complete();
					return Void();
				}
			}
		}

		// handle rollbacks and concurrent pops
		while (!ssSummary.empty() &&
		       (ssSummary.front() < data->get().popVersion || data->get().isRolledBack(ssSummary.front()))) {
			ssSummary.pop_front();
		}

		while (!tssSummary.empty() &&
		       (tssSummary.front() < data->get().popVersion || data->get().isRolledBack(tssSummary.front()))) {
			tssSummary.pop_front();
		}

		while (!ssSummary.empty() && !tssSummary.empty()) {
			CODE_PROBE(true, "Comparing TSS change feed data");
			if (ssSummary.front() != tssSummary.front()) {
				CODE_PROBE(true, "TSS change feed mismatch");
				handleTSSChangeFeedMismatch(req,
				                            tssData,
				                            matchesFound,
				                            lastMatchingVersion,
				                            ssSummary.front(),
				                            tssSummary.front(),
				                            data->get().popVersion);
				data->get().complete();
				return Void();
			}
			matchesFound++;
			lastMatchingVersion = ssSummary.front();
			ssSummary.pop_front();
			tssSummary.pop_front();

			while (!data->get().rollbacks.empty() && data->get().rollbacks.front().second <= lastMatchingVersion) {
				data->get().rollbacks.pop_front();
			}
		}

		ASSERT(!ssDone || !tssDone); // both shouldn't be done, otherwise we shouldn't have looped
		if ((ssDone && !tssSummary.empty()) || (tssDone && !ssSummary.empty())) {
			CODE_PROBE(true, "TSS change feed mismatch at end of stream");
			handleTSSChangeFeedMismatch(req,
			                            tssData,
			                            matchesFound,
			                            lastMatchingVersion,
			                            ssDone ? -1 : ssSummary.front(),
			                            tssDone ? -1 : tssSummary.front(),
			                            data->get().popVersion);
			data->get().complete();
			return Void();
		}
	}
}

void maybeDuplicateTSSChangeFeedStream(ChangeFeedStreamRequest& req,
                                       const RequestStream<ChangeFeedStreamRequest>& stream,
                                       QueueModel* model,
                                       Optional<ChangeFeedTSSValidationData>* tssData) {
	if (model) {
		Optional<TSSEndpointData> tssPair = model->getTssData(stream.getEndpoint().token.first());
		if (tssPair.present()) {
			CODE_PROBE(true, "duplicating feed stream to TSS");
			resetReply(req);

			RequestStream<ChangeFeedStreamRequest> tssRequestStream(tssPair.get().endpoint);
			*tssData = Optional<ChangeFeedTSSValidationData>(
			    ChangeFeedTSSValidationData(tssRequestStream.getReplyStream(req)));
			// tie validator actor to the lifetime of the stream being active
			tssData->get().validatorFuture = changeFeedTSSValidator(req, tssData, tssPair.get());
		}
	}
}

ChangeFeedStorageData::~ChangeFeedStorageData() {
	if (context) {
		context->changeFeedUpdaters.erase(interfToken);
	}
}

ChangeFeedData::ChangeFeedData(DatabaseContext* context)
  : dbgid(deterministicRandom()->randomUniqueID()), context(context), notAtLatest(1), created(now()) {
	if (context) {
		context->notAtLatestChangeFeeds[dbgid] = this;
	}
}

ChangeFeedData::~ChangeFeedData() {
	if (context) {
		context->notAtLatestChangeFeeds.erase(dbgid);
	}
}

Version ChangeFeedData::getVersion() {
	return lastReturnedVersion.get();
}

// This function essentially bubbles up, through the change feed client, the information about what has been processed
// on the server. First it makes sure the server has returned all mutations up through the target version, then that
// the native api has consumed and processed them, and then that the fdb client has consumed all of the mutations.
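// Example (illustrative, hypothetical caller): waiting until a feed has fully delivered everything
// through some version `v` before acting on it, given a Reference<ChangeFeedData> `feedResults`
// obtained from a change feed stream call:
//
//     wait(feedResults->whenAtLeast(v));
//     // every mutation with version <= v has now been sent to (and drained from) feedResults->mutations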
ACTOR Future changeFeedWaitLatest(Reference self, Version version) { // wait on SS to have sent up through version std::vector> allAtLeast; for (auto& it : self->storageData) { if (it->version.get() < version) { if (version > it->desired.get()) { it->desired.set(version); } allAtLeast.push_back(it->version.whenAtLeast(version)); } } wait(waitForAll(allAtLeast)); // then, wait on ss streams to have processed up through version std::vector> onEmpty; for (auto& it : self->streams) { if (!it.isEmpty()) { onEmpty.push_back(it.onEmpty()); } } if (onEmpty.size()) { wait(waitForAll(onEmpty)); } if (self->mutations.isEmpty()) { wait(delay(0)); } // wait for merge cursor to fully process everything it read from its individual promise streams, either until it is // done processing or we have up through the desired version while (self->lastReturnedVersion.get() < self->maxSeenVersion && self->lastReturnedVersion.get() < version) { Version target = std::min(self->maxSeenVersion, version); wait(self->lastReturnedVersion.whenAtLeast(target)); } // then, wait for client to have consumed up through version if (self->maxSeenVersion >= version) { // merge cursor may have something buffered but has not yet sent it to self->mutations, just wait for // lastReturnedVersion wait(self->lastReturnedVersion.whenAtLeast(version)); } else { // all mutations <= version are in self->mutations, wait for empty while (!self->mutations.isEmpty()) { wait(self->mutations.onEmpty()); wait(delay(0)); } } return Void(); } ACTOR Future changeFeedWhenAtLatest(Reference self, Version version) { if (version >= self->endVersion) { return Never(); } if (version <= self->getVersion()) { return Void(); } state Future lastReturned = self->lastReturnedVersion.whenAtLeast(version); loop { // only allowed to use empty versions if you're caught up Future waitEmptyVersion = (self->notAtLatest.get() == 0) ? changeFeedWaitLatest(self, version) : Never(); choose { when(wait(waitEmptyVersion)) { break; } when(wait(lastReturned)) { break; } when(wait(self->refresh.getFuture())) {} when(wait(self->notAtLatest.onChange())) {} } } if (self->lastReturnedVersion.get() < version) { self->lastReturnedVersion.set(version); } ASSERT(self->getVersion() >= version); return Void(); } Future ChangeFeedData::whenAtLeast(Version version) { return changeFeedWhenAtLatest(Reference::addRef(this), version); } #define DEBUG_CF_CLIENT_TRACE false ACTOR Future partialChangeFeedStream(StorageServerInterface interf, PromiseStream> results, ReplyPromiseStream replyStream, Version begin, Version end, Reference feedData, Reference storageData, UID debugUID, Optional* tssData) { // calling lastReturnedVersion's callbacks could cause us to be cancelled state Promise refresh = feedData->refresh; state bool atLatestVersion = false; state Version nextVersion = begin; // We don't need to force every other partial stream to do an empty if we get an empty, but if we get actual // mutations back after sending an empty, we may need the other partial streams to get an empty, to advance the // merge cursor, so we can send the mutations we just got. 
// if lastEmpty != invalidVersion, we need to update the desired versions of the other streams BEFORE waiting // onReady once getting a reply state Version lastEmpty = invalidVersion; try { loop { if (nextVersion >= end) { results.sendError(end_of_stream()); return Void(); } choose { when(state ChangeFeedStreamReply rep = waitNext(replyStream.getFuture())) { // handle first empty mutation on stream establishment explicitly if (nextVersion == begin && rep.mutations.size() == 1 && rep.mutations[0].mutations.size() == 0 && rep.mutations[0].version == begin - 1) { continue; } if (DEBUG_CF_CLIENT_TRACE) { TraceEvent(SevDebug, "TraceChangeFeedClientMergeCursorReply", debugUID) .detail("SSID", storageData->id) .detail("AtLatest", atLatestVersion) .detail("FirstVersion", rep.mutations.front().version) .detail("LastVersion", rep.mutations.back().version) .detail("Count", rep.mutations.size()) .detail("MinStreamVersion", rep.minStreamVersion) .detail("PopVersion", rep.popVersion) .detail("RepAtLatest", rep.atLatestVersion); } if (rep.mutations.back().version > feedData->maxSeenVersion) { feedData->maxSeenVersion = rep.mutations.back().version; } if (rep.popVersion > feedData->popVersion) { feedData->popVersion = rep.popVersion; } if (tssData->present()) { tssData->get().updatePopped(rep.popVersion); } if (lastEmpty != invalidVersion && !results.isEmpty()) { for (auto& it : feedData->storageData) { if (refresh.canBeSet() && lastEmpty > it->desired.get()) { it->desired.set(lastEmpty); } } lastEmpty = invalidVersion; } state int resultLoc = 0; while (resultLoc < rep.mutations.size()) { wait(results.onEmpty()); if (rep.mutations[resultLoc].version >= nextVersion) { if (tssData->present() && tssData->get().shouldAddMutation(rep.mutations[resultLoc])) { tssData->get().ssStreamSummary.send(rep.mutations[resultLoc].version); } results.send(rep.mutations[resultLoc]); if (DEBUG_CF_CLIENT_TRACE) { TraceEvent(SevDebug, "TraceChangeFeedClientMergeCursorSend", debugUID) .detail("Version", rep.mutations[resultLoc].version) .detail("Size", rep.mutations[resultLoc].mutations.size()); } // check refresh.canBeSet so that, if we are killed after calling one of these callbacks, we // just skip to the next wait and get actor_cancelled // FIXME: this is somewhat expensive to do every mutation. for (auto& it : feedData->storageData) { if (refresh.canBeSet() && rep.mutations[resultLoc].version > it->desired.get()) { it->desired.set(rep.mutations[resultLoc].version); } } } else { ASSERT(rep.mutations[resultLoc].mutations.empty()); } resultLoc++; } // if we got the empty version that went backwards, don't decrease nextVersion if (rep.mutations.back().version + 1 > nextVersion) { nextVersion = rep.mutations.back().version + 1; } if (refresh.canBeSet() && !atLatestVersion && rep.atLatestVersion) { atLatestVersion = true; feedData->notAtLatest.set(feedData->notAtLatest.get() - 1); if (feedData->notAtLatest.get() == 0 && feedData->context) { feedData->context->notAtLatestChangeFeeds.erase(feedData->dbgid); } } if (refresh.canBeSet() && rep.minStreamVersion > storageData->version.get()) { storageData->version.set(rep.minStreamVersion); } if (DEBUG_CF_CLIENT_TRACE) { TraceEvent(SevDebug, "TraceChangeFeedClientMergeCursorReplyDone", debugUID) .detail("AtLatestNow", atLatestVersion); } } when(wait(atLatestVersion && replyStream.isEmpty() && results.isEmpty() ? 
storageData->version.whenAtLeast(nextVersion) : Future(Never()))) { MutationsAndVersionRef empty; empty.version = storageData->version.get(); results.send(empty); nextVersion = storageData->version.get() + 1; if (DEBUG_CF_CLIENT_TRACE) { TraceEvent(SevDebug, "TraceChangeFeedClientMergeCursorSendEmpty", debugUID) .detail("Version", empty.version); } lastEmpty = empty.version; } when(wait(atLatestVersion && replyStream.isEmpty() && !results.isEmpty() ? results.onEmpty() : Future(Never()))) {} } } } catch (Error& e) { if (DEBUG_CF_CLIENT_TRACE) { TraceEvent(SevDebug, "TraceChangeFeedClientMergeCursorError", debugUID).errorUnsuppressed(e); } if (e.code() == error_code_actor_cancelled) { throw; } results.sendError(e); return Void(); } } void writeMutationsToCache(Reference cacheData, Reference db, Standalone> cacheOut, Key rangeID, KeyRange range, Key tenantPrefix) { if (!cacheData) { return; } ASSERT(cacheData->active); while (!cacheOut.empty() && cacheOut.front().version <= cacheData->latest) { cacheOut.pop_front(); } if (!cacheOut.empty()) { Key durableKey = changeFeedCacheKey(tenantPrefix, rangeID, range, cacheOut.back().version); Value durableValue = changeFeedCacheValue(cacheOut); db->storage->set(KeyValueRef(durableKey, durableValue)); cacheData->latest = cacheOut.back().version; db->uncommittedCFBytes += durableKey.size() + durableValue.size(); if (db->uncommittedCFBytes > CLIENT_KNOBS->CHANGE_FEED_CACHE_FLUSH_BYTES) { db->commitChangeFeedStorage->set(true); } } } ACTOR Future mergeChangeFeedStreamInternal(Reference results, Key rangeID, KeyRange range, std::vector> interfs, std::vector streams, Version* begin, Version end, UID mergeCursorUID, Reference db, Reference cacheData, Key tenantPrefix) { state Promise refresh = results->refresh; // with empty version handling in the partial cursor, all streams will always have a next element with version >= // the minimum version of any stream's next element state std::priority_queue> mutations; if (DEBUG_CF_CLIENT_TRACE) { TraceEvent(SevDebug, "TraceChangeFeedClientMergeCursorStart", mergeCursorUID) .detail("StreamCount", interfs.size()) .detail("Begin", *begin) .detail("End", end); } // previous version of change feed may have put a mutation in the promise stream and then immediately died. 
// Wait for that mutation first, so the promise stream always starts empty
	wait(results->mutations.onEmpty());
	wait(delay(0));
	ASSERT(results->mutations.isEmpty());

	if (DEBUG_CF_CLIENT_TRACE) {
		TraceEvent(SevDebug, "TraceChangeFeedClientMergeCursorGotEmpty", mergeCursorUID);
	}

	// update lastReturned once the previous mutation has been consumed
	if (*begin - 1 > results->lastReturnedVersion.get()) {
		results->lastReturnedVersion.set(*begin - 1);
	}

	state int interfNum = 0;
	state std::vector<MutationAndVersionStream> streamsUsed;
	// initially, pull from all streams
	for (auto& stream : streams) {
		streamsUsed.push_back(stream);
	}
	state Version nextVersion;

	loop {
		// bring all of the streams up to date to ensure we have the latest element from each stream in mutations
		interfNum = 0;
		while (interfNum < streamsUsed.size()) {
			try {
				Standalone<MutationsAndVersionRef> res = waitNext(streamsUsed[interfNum].results.getFuture());
				streamsUsed[interfNum].next = res;
				mutations.push(streamsUsed[interfNum]);
			} catch (Error& e) {
				if (e.code() != error_code_end_of_stream) {
					throw e;
				}
			}
			interfNum++;
		}

		if (mutations.empty()) {
			throw end_of_stream();
		}

		streamsUsed.clear();

		// Without this delay, weird issues with the last stream getting on another stream's callstack can happen
		wait(delay(0));

		// pop first item off queue - this will be the mutation with the lowest version
		Standalone<VectorRef<MutationsAndVersionRef>> nextOut;
		nextVersion = mutations.top().next.version;

		streamsUsed.push_back(mutations.top());
		nextOut.push_back_deep(nextOut.arena(), mutations.top().next);
		mutations.pop();

		// for each other stream that has mutations with the same version, add it to nextOut
		while (!mutations.empty() && mutations.top().next.version == nextVersion) {
			if (mutations.top().next.mutations.size() &&
			    mutations.top().next.mutations.front().param1 != lastEpochEndPrivateKey) {
				nextOut.back().mutations.append_deep(
				    nextOut.arena(), mutations.top().next.mutations.begin(), mutations.top().next.mutations.size());
			}
			streamsUsed.push_back(mutations.top());
			mutations.pop();
		}

		ASSERT(nextOut.size() == 1);
		ASSERT(nextVersion >= *begin);

		*begin = nextVersion + 1;

		if (DEBUG_CF_CLIENT_TRACE) {
			TraceEvent(SevDebug, "TraceChangeFeedClientMergeCursorSending", mergeCursorUID)
			    .detail("Count", streamsUsed.size())
			    .detail("Version", nextVersion);
		}

		// send mutations at nextVersion to the client
		if (nextOut.back().mutations.empty()) {
			ASSERT(results->mutations.isEmpty());
		} else {
			ASSERT(nextOut.back().version > results->lastReturnedVersion.get());
			writeMutationsToCache(cacheData, db, nextOut, rangeID, range, tenantPrefix);
			results->mutations.send(nextOut);
			wait(results->mutations.onEmpty());
			wait(delay(0));
		}

		if (nextVersion > results->lastReturnedVersion.get()) {
			results->lastReturnedVersion.set(nextVersion);
		}
	}
}

ACTOR Future<Void> mergeChangeFeedStream(Reference<DatabaseContext> db,
                                         std::vector<std::pair<StorageServerInterface, KeyRange>> interfs,
                                         Reference<ChangeFeedData> results,
                                         Key rangeID,
                                         KeyRange range,
                                         Version* begin,
                                         Version end,
                                         int replyBufferSize,
                                         bool canReadPopped,
                                         ReadOptions readOptions,
                                         bool encrypted,
                                         Reference<ChangeFeedCacheData> cacheData,
                                         Key tenantPrefix) {
	state std::vector<Future<Void>> fetchers(interfs.size());
	state std::vector<Future<Void>> onErrors(interfs.size());
	state std::vector<MutationAndVersionStream> streams(interfs.size());
	state std::vector<Optional<ChangeFeedTSSValidationData>> tssDatas;
	tssDatas.reserve(interfs.size());
	for (int i = 0; i < interfs.size(); i++) {
		tssDatas.push_back({});
	}

	CODE_PROBE(interfs.size() > 10, "Large change feed merge cursor");
	CODE_PROBE(interfs.size() > 100, "Very large change feed merge cursor");

	state UID mergeCursorUID = UID();
	state std::vector<UID> debugUIDs;
	results->streams.clear();
	for (int i = 0; i < interfs.size(); i++) {
		ChangeFeedStreamRequest
req; req.rangeID = rangeID; req.begin = *begin; req.end = end; req.range = interfs[i].second; req.canReadPopped = canReadPopped; // divide total buffer size among sub-streams, but keep individual streams large enough to be efficient req.replyBufferSize = replyBufferSize / interfs.size(); if (replyBufferSize != -1 && req.replyBufferSize < CLIENT_KNOBS->CHANGE_FEED_STREAM_MIN_BYTES) { req.replyBufferSize = CLIENT_KNOBS->CHANGE_FEED_STREAM_MIN_BYTES; } req.options = readOptions; req.id = deterministicRandom()->randomUniqueID(); req.encrypted = encrypted; debugUIDs.push_back(req.id); mergeCursorUID = UID(mergeCursorUID.first() ^ req.id.first(), mergeCursorUID.second() ^ req.id.second()); results->streams.push_back(interfs[i].first.changeFeedStream.getReplyStream(req)); maybeDuplicateTSSChangeFeedStream(req, interfs[i].first.changeFeedStream, db->enableLocalityLoadBalance ? &db->queueModel : nullptr, &tssDatas[i]); } results->maxSeenVersion = invalidVersion; results->storageData.clear(); Promise refresh = results->refresh; results->refresh = Promise(); for (int i = 0; i < interfs.size(); i++) { results->storageData.push_back(db->getStorageData(interfs[i].first)); } results->notAtLatest.set(interfs.size()); if (results->context) { results->context->notAtLatestChangeFeeds[results->dbgid] = results.getPtr(); results->created = now(); } refresh.send(Void()); for (int i = 0; i < interfs.size(); i++) { if (DEBUG_CF_CLIENT_TRACE) { TraceEvent(SevDebug, "TraceChangeFeedClientMergeCursorInit", debugUIDs[i]) .detail("CursorDebugUID", mergeCursorUID) .detail("Idx", i) .detail("FeedID", rangeID) .detail("MergeRange", KeyRangeRef(interfs.front().second.begin, interfs.back().second.end)) .detail("PartialRange", interfs[i].second) .detail("Begin", *begin) .detail("End", end) .detail("CanReadPopped", canReadPopped); } onErrors[i] = results->streams[i].onError(); fetchers[i] = partialChangeFeedStream(interfs[i].first, streams[i].results, results->streams[i], *begin, end, results, results->storageData[i], debugUIDs[i], &tssDatas[i]); } wait(waitForAny(onErrors) || mergeChangeFeedStreamInternal( results, rangeID, range, interfs, streams, begin, end, mergeCursorUID, db, cacheData, tenantPrefix)); return Void(); } ACTOR Future getChangeFeedRange(Reference db, Database cx, Key rangeID, Version begin = 0) { state Transaction tr(cx); state Key rangeIDKey = rangeID.withPrefix(changeFeedPrefix); auto cacheLoc = db->changeFeedCache.find(rangeID); if (cacheLoc != db->changeFeedCache.end()) { return cacheLoc->second; } loop { try { tr.setOption(FDBTransactionOptions::READ_SYSTEM_KEYS); tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE); tr.setOption(FDBTransactionOptions::LOCK_AWARE); Version readVer = wait(tr.getReadVersion()); if (readVer < begin) { wait(delay(FLOW_KNOBS->PREVENT_FAST_SPIN_DELAY)); tr.reset(); } else { Optional val = wait(tr.get(rangeIDKey)); if (!val.present()) { ASSERT(tr.getReadVersion().isReady()); TraceEvent(SevDebug, "ChangeFeedNotRegisteredGet") .detail("FeedID", rangeID) .detail("FullFeedKey", rangeIDKey) .detail("BeginVersion", begin) .detail("ReadVersion", tr.getReadVersion().get()); throw change_feed_not_registered(); } if (db->changeFeedCache.size() > CLIENT_KNOBS->CHANGE_FEED_CACHE_SIZE) { db->changeFeedCache.clear(); } KeyRange range = std::get<0>(decodeChangeFeedValue(val.get())); db->changeFeedCache[rangeID] = range; return range; } } catch (Error& e) { wait(tr.onError(e)); } } } ACTOR Future singleChangeFeedStreamInternal(KeyRange range, Reference results, Key rangeID, 
Version* begin, Version end, Optional* tssData, Reference db, Reference cacheData, Key tenantPrefix) { state Promise refresh = results->refresh; ASSERT(results->streams.size() == 1); ASSERT(results->storageData.size() == 1); state bool atLatest = false; // wait for any previous mutations in stream to be consumed wait(results->mutations.onEmpty()); wait(delay(0)); ASSERT(results->mutations.isEmpty()); // update lastReturned once the previous mutation has been consumed if (*begin - 1 > results->lastReturnedVersion.get()) { results->lastReturnedVersion.set(*begin - 1); if (!refresh.canBeSet()) { try { // refresh is set if and only if this actor is cancelled wait(Future(Void())); // Catch any unexpected behavior if the above contract is broken ASSERT(false); } catch (Error& e) { ASSERT(e.code() == error_code_actor_cancelled); throw; } } } loop { ASSERT(refresh.canBeSet()); state ChangeFeedStreamReply feedReply = waitNext(results->streams[0].getFuture()); *begin = feedReply.mutations.back().version + 1; if (feedReply.popVersion > results->popVersion) { results->popVersion = feedReply.popVersion; } if (tssData->present()) { tssData->get().updatePopped(feedReply.popVersion); } // don't send completely empty set of mutations to promise stream bool anyMutations = false; for (auto& it : feedReply.mutations) { if (!it.mutations.empty()) { anyMutations = true; break; } } if (anyMutations) { // empty versions can come out of order, as we sometimes send explicit empty versions when restarting a // stream. Anything with mutations should be strictly greater than lastReturnedVersion ASSERT(feedReply.mutations.front().version > results->lastReturnedVersion.get()); if (tssData->present()) { tssData->get().send(feedReply); } writeMutationsToCache(cacheData, db, feedReply.mutations, rangeID, range, tenantPrefix); results->mutations.send( Standalone>(feedReply.mutations, feedReply.arena)); // Because onEmpty returns here before the consuming process, we must do a delay(0) wait(results->mutations.onEmpty()); wait(delay(0)); } // check refresh.canBeSet so that, if we are killed after calling one of these callbacks, we just // skip to the next wait and get actor_cancelled if (feedReply.mutations.back().version > results->lastReturnedVersion.get()) { results->lastReturnedVersion.set(feedReply.mutations.back().version); } if (!refresh.canBeSet()) { try { // refresh is set if and only if this actor is cancelled wait(Future(Void())); // Catch any unexpected behavior if the above contract is broken ASSERT(false); } catch (Error& e) { ASSERT(e.code() == error_code_actor_cancelled); throw; } } if (!atLatest && feedReply.atLatestVersion) { atLatest = true; results->notAtLatest.set(0); if (results->context) { results->context->notAtLatestChangeFeeds.erase(results->dbgid); } } if (feedReply.minStreamVersion > results->storageData[0]->version.get()) { results->storageData[0]->version.set(feedReply.minStreamVersion); } } } ACTOR Future singleChangeFeedStream(Reference db, StorageServerInterface interf, KeyRange range, Reference results, Key rangeID, Version* begin, Version end, int replyBufferSize, bool canReadPopped, ReadOptions readOptions, bool encrypted, Reference cacheData, Key tenantPrefix) { state Database cx(db); state ChangeFeedStreamRequest req; state Optional tssData; req.rangeID = rangeID; req.begin = *begin; req.end = end; req.range = range; req.canReadPopped = canReadPopped; req.replyBufferSize = replyBufferSize; req.options = readOptions; req.id = deterministicRandom()->randomUniqueID(); req.encrypted = 
ACTOR Future<Void> singleChangeFeedStream(Reference<DatabaseContext> db,
                                          StorageServerInterface interf,
                                          KeyRange range,
                                          Reference<ChangeFeedData> results,
                                          Key rangeID,
                                          Version* begin,
                                          Version end,
                                          int replyBufferSize,
                                          bool canReadPopped,
                                          ReadOptions readOptions,
                                          bool encrypted,
                                          Reference<ChangeFeedCacheData> cacheData,
                                          Key tenantPrefix) {
    state Database cx(db);
    state ChangeFeedStreamRequest req;
    state Optional<ChangeFeedTSSValidationData> tssData;
    req.rangeID = rangeID;
    req.begin = *begin;
    req.end = end;
    req.range = range;
    req.canReadPopped = canReadPopped;
    req.replyBufferSize = replyBufferSize;
    req.options = readOptions;
    req.id = deterministicRandom()->randomUniqueID();
    req.encrypted = encrypted;

    if (DEBUG_CF_CLIENT_TRACE) {
        TraceEvent(SevDebug, "TraceChangeFeedClientSingleCursor", req.id)
            .detail("FeedID", rangeID)
            .detail("Range", range)
            .detail("Begin", *begin)
            .detail("End", end)
            .detail("CanReadPopped", canReadPopped);
    }

    results->streams.clear();
    results->streams.push_back(interf.changeFeedStream.getReplyStream(req));

    results->maxSeenVersion = invalidVersion;
    results->storageData.clear();
    results->storageData.push_back(db->getStorageData(interf));
    Promise<Void> refresh = results->refresh;
    results->refresh = Promise<Void>();
    results->notAtLatest.set(1);
    if (results->context) {
        results->context->notAtLatestChangeFeeds[results->dbgid] = results.getPtr();
        results->created = now();
    }
    refresh.send(Void());

    maybeDuplicateTSSChangeFeedStream(
        req, interf.changeFeedStream, cx->enableLocalityLoadBalance ? &cx->queueModel : nullptr, &tssData);

    wait(results->streams[0].onError() ||
         singleChangeFeedStreamInternal(range, results, rangeID, begin, end, &tssData, db, cacheData, tenantPrefix));

    return Void();
}
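// Merges adjacent location ranges that are served by the same storage team, so that one stream is opened per
// team rather than per shard. Team identity is approximated by XOR-ing the location UIDs, which is
// order-independent; see the FIXME below about the (very unlikely) possibility of XOR collisions.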
void coalesceChangeFeedLocations(std::vector<KeyRangeLocationInfo>& locations) {
    // FIXME: only coalesce if same tenant!
    std::vector<UID> teamUIDs;
    bool anyToCoalesce = false;
    teamUIDs.reserve(locations.size());
    for (int i = 0; i < locations.size(); i++) {
        ASSERT(locations[i].locations->size() > 0);
        UID teamUID = locations[i].locations->getId(0);
        for (int j = 1; j < locations[i].locations->size(); j++) {
            UID locUID = locations[i].locations->getId(j);
            teamUID = UID(teamUID.first() ^ locUID.first(), teamUID.second() ^ locUID.second());
        }
        if (!teamUIDs.empty() && teamUIDs.back() == teamUID) {
            anyToCoalesce = true;
        }
        teamUIDs.push_back(teamUID);
    }

    if (!anyToCoalesce) {
        return;
    }

    CODE_PROBE(true, "coalescing change feed locations");

    // FIXME: there's technically a probability of "hash" collisions here, but it's extremely low. Could validate
    // that two teams with the same xor are in fact the same, or fall back to not doing this if it gets a wrong
    // shard server error or something

    std::vector<KeyRangeLocationInfo> coalesced;
    coalesced.reserve(locations.size());
    coalesced.push_back(locations[0]);
    for (int i = 1; i < locations.size(); i++) {
        if (teamUIDs[i] == teamUIDs[i - 1]) {
            coalesced.back().range = KeyRangeRef(coalesced.back().range.begin, locations[i].range.end);
        } else {
            coalesced.push_back(locations[i]);
        }
    }
    locations = coalesced;
}

ACTOR Future<bool> getChangeFeedStreamFromDisk(Reference<DatabaseContext> db,
                                               Reference<ChangeFeedData> results,
                                               Key rangeID,
                                               Version* begin,
                                               Version end,
                                               KeyRange range,
                                               Key tenantPrefix) {
    state bool foundEnd = false;
    loop {
        Key beginKey = changeFeedCacheKey(tenantPrefix, rangeID, range, *begin);
        Key endKey = changeFeedCacheKey(tenantPrefix, rangeID, range, MAX_VERSION);
        state RangeResult res = wait(db->storage->readRange(KeyRangeRef(beginKey, endKey),
                                                            CLIENT_KNOBS->CHANGE_FEED_CACHE_LIMIT_BYTES,
                                                            CLIENT_KNOBS->CHANGE_FEED_CACHE_LIMIT_BYTES));
        state int idx = 0;

        while (!foundEnd && idx < res.size()) {
            Standalone<VectorRef<MutationsAndVersionRef>> mutations = decodeChangeFeedCacheValue(res[idx].value);
            while (!mutations.empty() && mutations.front().version < *begin) {
                mutations.pop_front();
            }
            while (!mutations.empty() && mutations.back().version >= end) {
                mutations.pop_back();
                foundEnd = true;
            }
            if (!mutations.empty()) {
                *begin = mutations.back().version;
                results->mutations.send(mutations);
                wait(results->mutations.onEmpty());
                wait(delay(0));
                if (*begin > results->lastReturnedVersion.get()) {
                    results->lastReturnedVersion.set(*begin);
                }
            }
            (*begin)++;
            idx++;
        }
        if (foundEnd || !res.more) {
            return foundEnd;
        }
    }
}
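// Top-level retry loop for a change feed cursor. Resolves the feed's registered range, looks up the storage
// locations covering the requested range, picks one healthy replica per location, and then runs either a
// merge cursor (multiple locations) or a single cursor. Retriable errors invalidate cached locations and
// retry with backoff; non-retriable errors are forwarded to results->mutations.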
ACTOR Future<Void> getChangeFeedStreamActor(Reference<DatabaseContext> db,
                                            Reference<ChangeFeedData> results,
                                            Key rangeID,
                                            Version begin,
                                            Version end,
                                            KeyRange range,
                                            int replyBufferSize,
                                            bool canReadPopped,
                                            ReadOptions readOptions,
                                            bool encrypted,
                                            Reference<ChangeFeedCacheData> cacheData,
                                            Key tenantPrefix) {
    state Database cx(db);
    state Span span("NAPI:GetChangeFeedStream"_loc);
    state double sleepWithBackoff = CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY;
    state Version lastBeginVersion = invalidVersion;

    loop {
        state KeyRange keys;
        try {
            lastBeginVersion = begin;
            KeyRange fullRange = wait(getChangeFeedRange(db, cx, rangeID, begin));
            keys = fullRange & range;
            state std::vector<KeyRangeLocationInfo> locations =
                wait(getKeyRangeLocations(cx,
                                          TenantInfo(),
                                          keys,
                                          CLIENT_KNOBS->CHANGE_FEED_LOCATION_LIMIT,
                                          Reverse::False,
                                          &StorageServerInterface::changeFeedStream,
                                          span.context,
                                          Optional<UID>(),
                                          UseProvisionalProxies::False,
                                          latestVersion));

            if (locations.size() >= CLIENT_KNOBS->CHANGE_FEED_LOCATION_LIMIT) {
                ASSERT_WE_THINK(false);
                throw unknown_change_feed();
            }

            if (CLIENT_KNOBS->CHANGE_FEED_COALESCE_LOCATIONS && locations.size() > 1) {
                coalesceChangeFeedLocations(locations);
            }

            state std::vector<int> chosenLocations(locations.size());
            state int loc = 0;
            while (loc < locations.size()) {
                // FIXME: create a load balance function for this code so future users of reply streams do not
                // have to duplicate this code
                int count = 0;
                int useIdx = -1;
                for (int i = 0; i < locations[loc].locations->size(); i++) {
                    if (!IFailureMonitor::failureMonitor()
                             .getState(locations[loc]
                                           .locations->get(i, &StorageServerInterface::changeFeedStream)
                                           .getEndpoint())
                             .failed) {
                        if (deterministicRandom()->random01() <= 1.0 / ++count) {
                            useIdx = i;
                        }
                    }
                }

                if (useIdx >= 0) {
                    chosenLocations[loc] = useIdx;
                    loc++;
                    if (g_network->isSimulated() && !g_simulator->speedUpSimulation && BUGGIFY_WITH_PROB(0.01)) {
                        // simulate as if we had to wait for all alternatives delayed, before the next one
                        wait(delay(deterministicRandom()->random01()));
                    }
                    continue;
                }

                std::vector<Future<Void>> ok(locations[loc].locations->size());
                for (int i = 0; i < ok.size(); i++) {
                    ok[i] = IFailureMonitor::failureMonitor().onStateEqual(
                        locations[loc].locations->get(i, &StorageServerInterface::changeFeedStream).getEndpoint(),
                        FailureStatus(false));
                }

                // Making this SevWarn means a lot of clutter
                if (now() - g_network->networkInfo.newestAlternativesFailure > 1 ||
                    deterministicRandom()->random01() < 0.01) {
                    TraceEvent("AllAlternativesFailed").detail("Alternatives", locations[0].locations->description());
                }

                wait(allAlternativesFailedDelay(quorum(ok, 1)));
                loc = 0;
            }

            ++db->feedStreamStarts;

            // Launch either a merge cursor over all chosen locations or a single cursor for one location.
            if (locations.size() > 1) {
                ++db->feedMergeStreamStarts;
                std::vector<std::pair<StorageServerInterface, KeyRange>> interfs;
                for (int i = 0; i < locations.size(); i++) {
                    interfs.emplace_back(locations[i].locations->getInterface(chosenLocations[i]),
                                         locations[i].range & range);
                }
                CODE_PROBE(true, "Change feed merge cursor");
                // TODO (jslocum): validate connectionFileChanged behavior
                wait(mergeChangeFeedStream(db, interfs, results, rangeID, range, &begin, end, replyBufferSize,
                                           canReadPopped, readOptions, encrypted, cacheData, tenantPrefix) ||
                     cx->connectionFileChanged());
            } else {
                CODE_PROBE(true, "Change feed single cursor");
                StorageServerInterface interf = locations[0].locations->getInterface(chosenLocations[0]);
                wait(singleChangeFeedStream(db, interf, range, results, rangeID, &begin, end, replyBufferSize,
                                            canReadPopped, readOptions, encrypted, cacheData, tenantPrefix) ||
                     cx->connectionFileChanged());
            }
        } catch (Error& e) {
            if (e.code() == error_code_actor_cancelled || e.code() == error_code_change_feed_popped) {
                results->streams.clear();
                results->storageData.clear();
                if (e.code() == error_code_change_feed_popped) {
                    ++db->feedNonRetriableErrors;
                    CODE_PROBE(true, "getChangeFeedStreamActor got popped", probe::decoration::rare);
                    results->mutations.sendError(e);
                    results->refresh.sendError(e);
                } else {
                    results->refresh.sendError(change_feed_cancelled());
                }
                throw;
            }
            if (results->notAtLatest.get() == 0) {
                results->notAtLatest.set(1);
                if (results->context) {
                    results->context->notAtLatestChangeFeeds[results->dbgid] = results.getPtr();
                    results->created = now();
                }
            }

            if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed ||
                e.code() == error_code_connection_failed || e.code() == error_code_unknown_change_feed ||
                e.code() == error_code_broken_promise || e.code() == error_code_future_version ||
                e.code() == error_code_request_maybe_delivered ||
                e.code() == error_code_storage_too_many_feed_streams) {
                ++db->feedErrors;
                db->changeFeedCache.erase(rangeID);
                cx->invalidateCache({}, keys);
                if (begin == lastBeginVersion || e.code() == error_code_storage_too_many_feed_streams) {
                    // We didn't read anything since the last failure before failing again.
                    // Back off quickly and exponentially, up to 2 seconds
                    sleepWithBackoff = std::min(2.0, sleepWithBackoff * 5);
                    sleepWithBackoff = std::max(0.1, sleepWithBackoff);
                } else {
                    sleepWithBackoff = CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY;
                }
                TraceEvent("ChangeFeedClientError")
                    .errorUnsuppressed(e)
                    .suppressFor(30.0)
                    .detail("FeedID", rangeID)
                    .detail("BeginVersion", begin)
                    .detail("AnyProgress", begin != lastBeginVersion);
                wait(delay(sleepWithBackoff));
            } else {
                if (e.code() != error_code_end_of_stream) {
                    ++db->feedNonRetriableErrors;
                    TraceEvent("ChangeFeedClientErrorNonRetryable").errorUnsuppressed(e).suppressFor(5.0);
                }
                results->mutations.sendError(e);
                results->refresh.sendError(change_feed_cancelled());
                results->streams.clear();
                results->storageData.clear();
                return Void();
            }
        }
    }
}
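// Wrapper around getChangeFeedStreamActor that integrates the on-disk change feed cache: it serves the prefix
// of the read from the local cache when possible, registers a cache entry for open-ended (end == MAX_VERSION)
// reads so that new mutations get persisted, and marks the cache entry inactive again when the stream
// finishes or fails.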
ACTOR Future<Void> durableChangeFeedMonitor(Reference<DatabaseContext> db,
                                            Reference<ChangeFeedData> results,
                                            Key rangeID,
                                            Version begin,
                                            Version end,
                                            KeyRange range,
                                            int replyBufferSize,
                                            bool canReadPopped,
                                            ReadOptions readOptions,
                                            bool encrypted,
                                            Future<Key> tenantPrefix) {
    state Optional<ChangeFeedCacheRange> cacheRange;
    state Reference<ChangeFeedCacheData> data;
    state Error err = success();
    state Version originalBegin = begin;
    results->endVersion = end;
    db->usedAnyChangeFeeds = true;
    try {
        if (db->storage != nullptr) {
            wait(db->initializeChangeFeedCache);
            Key prefix = wait(tenantPrefix);
            cacheRange = ChangeFeedCacheRange(prefix, rangeID, range);
            if (db->changeFeedCaches.count(cacheRange.get())) {
                auto cacheData = db->changeFeedCaches[cacheRange.get()];
                if (begin < cacheData->popped) {
                    results->mutations.sendError(change_feed_popped());
                    return Void();
                }
                if (cacheData->version <= begin) {
                    bool foundEnd = wait(getChangeFeedStreamFromDisk(db, results, rangeID, &begin, end, range, prefix));
                    if (foundEnd) {
                        results->mutations.sendError(end_of_stream());
                        return Void();
                    }
                }
            }
            if (end == MAX_VERSION) {
                if (!db->changeFeedCaches.count(cacheRange.get())) {
                    data = makeReference<ChangeFeedCacheData>();
                    data->version = begin;
                    data->active = true;
                    db->changeFeedCaches[cacheRange.get()] = data;
                    db->rangeId_cacheData[cacheRange.get().rangeId][cacheRange.get()] = data;
                    Key durableFeedKey = changeFeedCacheFeedKey(cacheRange.get().tenantPrefix, rangeID, range);
                    Value durableFeedValue = changeFeedCacheFeedValue(begin, 0);
                    db->storage->set(KeyValueRef(durableFeedKey, durableFeedValue));
                } else {
                    data = db->changeFeedCaches[cacheRange.get()];
                    if (!data->active && data->version <= begin) {
                        data->active = true;
                        if (originalBegin > data->latest + 1) {
                            data->version = originalBegin;
                            Key durableFeedKey = changeFeedCacheFeedKey(cacheRange.get().tenantPrefix, rangeID, range);
                            Value durableFeedValue = changeFeedCacheFeedValue(originalBegin, data->popped);
                            db->storage->set(KeyValueRef(durableFeedKey, durableFeedValue));
                        }
                    } else {
                        data = Reference<ChangeFeedCacheData>();
                    }
                }
            }
        }
        wait(getChangeFeedStreamActor(db,
                                      results,
                                      rangeID,
                                      begin,
                                      end,
                                      range,
                                      replyBufferSize,
                                      canReadPopped,
                                      readOptions,
                                      encrypted,
                                      data,
                                      cacheRange.present() ? cacheRange.get().tenantPrefix : Key()));
    } catch (Error& e) {
        err = e;
    }
    if (data) {
        data->active = false;
        data->inactiveTime = now();
    }
    if (err.code() != error_code_success) {
        throw err;
    }
    return Void();
}

Future<Void> DatabaseContext::getChangeFeedStream(Reference<ChangeFeedData> results,
                                                  Key rangeID,
                                                  Version begin,
                                                  Version end,
                                                  KeyRange range,
                                                  int replyBufferSize,
                                                  bool canReadPopped,
                                                  ReadOptions readOptions,
                                                  bool encrypted,
                                                  Future<Key> tenantPrefix) {
    return durableChangeFeedMonitor(Reference<DatabaseContext>::addRef(this),
                                    results,
                                    rangeID,
                                    begin,
                                    end,
                                    range,
                                    replyBufferSize,
                                    canReadPopped,
                                    readOptions,
                                    encrypted,
                                    tenantPrefix);
}

Version OverlappingChangeFeedsInfo::getFeedMetadataVersion(const KeyRangeRef& range) const {
    Version v = invalidVersion;
    for (auto& it : feedMetadataVersions) {
        if (it.second > v && it.first.intersects(range)) {
            v = it.second;
        }
    }
    return v;
}

ACTOR Future<OverlappingChangeFeedsReply> singleLocationOverlappingChangeFeeds(Database cx,
                                                                               Reference<LocationInfo> location,
                                                                               KeyRangeRef range,
                                                                               Version minVersion) {
    state OverlappingChangeFeedsRequest req;
    req.range = range;
    req.minVersion = minVersion;

    OverlappingChangeFeedsReply rep = wait(loadBalance(cx.getPtr(),
                                                       location,
                                                       &StorageServerInterface::overlappingChangeFeeds,
                                                       req,
                                                       TaskPriority::DefaultPromiseEndpoint,
                                                       AtMostOnce::False,
                                                       cx->enableLocalityLoadBalance ? &cx->queueModel : nullptr));
    return rep;
}

bool compareChangeFeedResult(const OverlappingChangeFeedEntry& i, const OverlappingChangeFeedEntry& j) {
    return i.feedId < j.feedId;
}
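// Fans out an OverlappingChangeFeedsRequest to every storage location covering `range` and merges the
// replies, deduplicating feeds that appear in multiple replies by keeping the entry with the highest
// metadata version.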
ACTOR Future<OverlappingChangeFeedsInfo> getOverlappingChangeFeedsActor(Reference<DatabaseContext> db,
                                                                        KeyRangeRef range,
                                                                        Version minVersion) {
    state Database cx(db);
    state Span span("NAPI:GetOverlappingChangeFeeds"_loc);

    loop {
        try {
            state std::vector<KeyRangeLocationInfo> locations =
                wait(getKeyRangeLocations(cx,
                                          TenantInfo(),
                                          range,
                                          CLIENT_KNOBS->CHANGE_FEED_LOCATION_LIMIT,
                                          Reverse::False,
                                          &StorageServerInterface::overlappingChangeFeeds,
                                          span.context,
                                          Optional<UID>(),
                                          UseProvisionalProxies::False,
                                          latestVersion));

            if (locations.size() >= CLIENT_KNOBS->CHANGE_FEED_LOCATION_LIMIT) {
                TraceEvent(SevError, "OverlappingRangeTooLarge")
                    .detail("Range", range)
                    .detail("Limit", CLIENT_KNOBS->CHANGE_FEED_LOCATION_LIMIT);
                wait(delay(1.0));
                throw all_alternatives_failed();
            }

            state std::vector<Future<OverlappingChangeFeedsReply>> allOverlappingRequests;
            for (auto& it : locations) {
                allOverlappingRequests.push_back(
                    singleLocationOverlappingChangeFeeds(cx, it.locations, it.range & range, minVersion));
            }
            wait(waitForAll(allOverlappingRequests));

            OverlappingChangeFeedsInfo result;
            std::unordered_map<KeyRef, OverlappingChangeFeedEntry> latestFeedMetadata;
            for (int i = 0; i < locations.size(); i++) {
                result.arena.dependsOn(allOverlappingRequests[i].get().arena);
                result.arena.dependsOn(locations[i].range.arena());
                result.feedMetadataVersions.push_back(
                    { locations[i].range, allOverlappingRequests[i].get().feedMetadataVersion });
                for (auto& it : allOverlappingRequests[i].get().feeds) {
                    auto res = latestFeedMetadata.insert({ it.feedId, it });
                    if (!res.second) {
                        CODE_PROBE(true, "deduping fetched overlapping feed by higher metadata version");
                        if (res.first->second.feedMetadataVersion < it.feedMetadataVersion) {
                            res.first->second = it;
                        }
                    }
                }
            }
            for (auto& it : latestFeedMetadata) {
                result.feeds.push_back(result.arena, it.second);
            }
            return result;
        } catch (Error& e) {
            if (e.code() == error_code_wrong_shard_server || e.code() == error_code_all_alternatives_failed ||
                e.code() == error_code_future_version) {
                cx->invalidateCache({}, range);
                wait(delay(CLIENT_KNOBS->WRONG_SHARD_SERVER_DELAY));
            } else {
                throw e;
            }
        }
    }
}

Future<OverlappingChangeFeedsInfo> DatabaseContext::getOverlappingChangeFeeds(KeyRangeRef range, Version minVersion) {
    return getOverlappingChangeFeedsActor(Reference<DatabaseContext>::addRef(this), range, minVersion);
}

ACTOR static Future<Void> popChangeFeedBackup(Database cx, Key rangeID, Version version) {
    ++cx->feedPopsFallback;
    state Transaction tr(cx);
    loop {
        try {
            tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
            tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
            tr.setOption(FDBTransactionOptions::LOCK_AWARE);
            state Key rangeIDKey = rangeID.withPrefix(changeFeedPrefix);
            Optional<Value> val = wait(tr.get(rangeIDKey));
            if (val.present()) {
                KeyRange range;
                Version popVersion;
                ChangeFeedStatus status;
                std::tie(range, popVersion, status) = decodeChangeFeedValue(val.get());
                if (version > popVersion) {
                    tr.set(rangeIDKey, changeFeedValue(range, version, status));
                }
            } else {
                ASSERT(tr.getReadVersion().isReady());
                TraceEvent(SevDebug, "ChangeFeedNotRegisteredPop")
                    .detail("FeedID", rangeID)
                    .detail("FullFeedKey", rangeIDKey)
                    .detail("PopVersion", version)
                    .detail("ReadVersion", tr.getReadVersion().get());
                throw change_feed_not_registered();
            }
            wait(tr.commit());
            return Void();
        } catch (Error& e) {
            wait(tr.onError(e));
        }
    }
}
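// Pops (discards) change feed mutations at versions less than `version`. First trims the local disk cache,
// then attempts direct changeFeedPop RPCs to the feed's storage servers; if any location is failed, has a
// TSS pair, the location set is too large, or the RPCs time out or hit a retriable error, it falls back to
// popChangeFeedBackup, which durably bumps the pop version in the system keyspace.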
ACTOR Future<Void> popChangeFeedMutationsActor(Reference<DatabaseContext> db, Key rangeID, Version version) {
    state Database cx(db);
    state Key rangeIDKey = rangeID.withPrefix(changeFeedPrefix);
    state Span span("NAPI:PopChangeFeedMutations"_loc);
    db->usedAnyChangeFeeds = true;
    ++db->feedPops;

    if (db->rangeId_cacheData.count(rangeID)) {
        auto& feeds = db->rangeId_cacheData[rangeID];
        for (auto& it : feeds) {
            if (version > it.second->popped) {
                it.second->popped = version;
                Key beginKey = changeFeedCacheKey(it.first.tenantPrefix, it.first.rangeId, it.first.range, 0);
                Key endKey = changeFeedCacheKey(it.first.tenantPrefix, it.first.rangeId, it.first.range, version);
                db->storage->clear(KeyRangeRef(beginKey, endKey));
                Key durableFeedKey = changeFeedCacheFeedKey(it.first.tenantPrefix, it.first.rangeId, it.first.range);
                Value durableFeedValue = changeFeedCacheFeedValue(it.second->version, it.second->popped);
                db->storage->set(KeyValueRef(durableFeedKey, durableFeedValue));
                db->uncommittedCFBytes +=
                    beginKey.size() + endKey.size() + durableFeedKey.size() + durableFeedValue.size();
                if (db->uncommittedCFBytes > CLIENT_KNOBS->CHANGE_FEED_CACHE_FLUSH_BYTES) {
                    db->commitChangeFeedStorage->set(true);
                }
            }
        }
    }

    state KeyRange keys = wait(getChangeFeedRange(db, cx, rangeID));

    state std::vector<KeyRangeLocationInfo> locations =
        wait(getKeyRangeLocations(cx,
                                  TenantInfo(),
                                  keys,
                                  3,
                                  Reverse::False,
                                  &StorageServerInterface::changeFeedPop,
                                  span.context,
                                  Optional<UID>(),
                                  UseProvisionalProxies::False,
                                  latestVersion));

    if (locations.size() > 2) {
        wait(popChangeFeedBackup(cx, rangeID, version));
        return Void();
    }

    auto model = cx->enableLocalityLoadBalance ? &cx->queueModel : nullptr;
    bool foundFailed = false;
    for (int i = 0; i < locations.size() && !foundFailed; i++) {
        for (int j = 0; j < locations[i].locations->size() && !foundFailed; j++) {
            if (IFailureMonitor::failureMonitor()
                    .getState(locations[i].locations->get(j, &StorageServerInterface::changeFeedPop).getEndpoint())
                    .isFailed()) {
                foundFailed = true;
            }
            // for now, if any of the popping SSes has a TSS pair, just always use the backup method
            if (model && model
                             ->getTssData(locations[i]
                                              .locations->get(j, &StorageServerInterface::changeFeedPop)
                                              .getEndpoint()
                                              .token.first())
                             .present()) {
                foundFailed = true;
            }
        }
    }

    if (foundFailed) {
        wait(popChangeFeedBackup(cx, rangeID, version));
        return Void();
    }

    try {
        // FIXME: lookup both the src and dest shards as of the pop version to ensure all locations are popped
        std::vector<Future<Void>> popRequests;
        for (int i = 0; i < locations.size(); i++) {
            for (int j = 0; j < locations[i].locations->size(); j++) {
                popRequests.push_back(locations[i].locations->getInterface(j).changeFeedPop.getReply(
                    ChangeFeedPopRequest(rangeID, version, locations[i].range)));
            }
        }
        choose {
            when(wait(waitForAll(popRequests))) {}
            when(wait(delay(CLIENT_KNOBS->CHANGE_FEED_POP_TIMEOUT))) {
                wait(popChangeFeedBackup(cx, rangeID, version));
            }
        }
    } catch (Error& e) {
        if (e.code() != error_code_unknown_change_feed && e.code() != error_code_wrong_shard_server &&
            e.code() != error_code_all_alternatives_failed && e.code() != error_code_broken_promise &&
            e.code() != error_code_server_overloaded) {
            throw;
        }
        db->changeFeedCache.erase(rangeID);
        cx->invalidateCache({}, keys);
        wait(popChangeFeedBackup(cx, rangeID, version));
    }
    return Void();
}

Future<Void> DatabaseContext::popChangeFeedMutations(Key rangeID, Version version) {
    return popChangeFeedMutationsActor(Reference<DatabaseContext>::addRef(this), rangeID, version);
}

Reference<DatabaseContext::TransactionT> DatabaseContext::createTransaction() {
    return makeReference<ReadYourWritesTransaction>(Database(Reference<DatabaseContext>::addRef(this)));
}

// BlobGranule API.
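// Reads up to batchLimit active blob ranges intersecting `range` from the blobRangeKeys map. Note that
// krmGetRangesUnaligned returns boundary key-value pairs, so each active range spans results[i].key to
// results[i + 1].key.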
ACTOR Future<Standalone<VectorRef<KeyRangeRef>>> getBlobRanges(Transaction* tr, KeyRange range, int batchLimit) {
    state Standalone<VectorRef<KeyRangeRef>> blobRanges;
    state Key beginKey = range.begin;

    tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
    loop {
        state RangeResult results =
            wait(krmGetRangesUnaligned(tr, blobRangeKeys.begin, KeyRangeRef(beginKey, range.end), 2 * batchLimit + 2));

        blobRanges.arena().dependsOn(results.arena());
        for (int i = 0; i < results.size() - 1; i++) {
            if (isBlobRangeActive(results[i].value)) {
                blobRanges.push_back(blobRanges.arena(), KeyRangeRef(results[i].key, results[i + 1].key));
            }
            if (blobRanges.size() == batchLimit) {
                return blobRanges;
            }
        }

        if (!results.more) {
            return blobRanges;
        }
        beginKey = results.back().key;
    }
}
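// Registers a purge of blob granules over `range` at `purgeVersion` by writing a versionstamped entry into
// blobGranulePurgeKeys. The purge range must be aligned to blobbified range boundaries unless it covers
// whole ranges. Returns the purge key, which callers can pass to waitPurgeGranulesComplete to block until
// the purge has actually been executed.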
ACTOR Future<Key> purgeBlobGranulesActor(Reference<DatabaseContext> db,
                                         KeyRange range,
                                         Version purgeVersion,
                                         Optional<Reference<Tenant>> tenant,
                                         bool force) {
    state Database cx(db);
    state Transaction tr(cx);
    state Key purgeKey;
    state KeyRange purgeRange = range;
    tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);

    if (purgeVersion == latestVersion) {
        loop {
            try {
                Version _purgeVersion = wait(tr.getReadVersion());
                purgeVersion = _purgeVersion;
                break;
            } catch (Error& e) {
                wait(tr.onError(e));
            }
        }
        tr.reset();
    }
    if (purgeVersion <= 0) {
        TraceEvent("PurgeInvalidVersion").detail("Range", range).detail("Version", purgeVersion).detail("Force", force);
        throw unsupported_operation();
    }

    if (tenant.present()) {
        CODE_PROBE(true, "NativeAPI purgeBlobGranules has tenant");
        wait(tenant.get()->ready());
        purgeRange = purgeRange.withPrefix(tenant.get()->prefix());
    }

    loop {
        try {
            tr.setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
            tr.setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
            tr.setOption(FDBTransactionOptions::LOCK_AWARE);

            // must be aligned to blob range(s)
            state Future<Standalone<VectorRef<KeyRangeRef>>> blobbifiedBegin =
                getBlobRanges(&tr, KeyRangeRef(purgeRange.begin, keyAfter(purgeRange.begin)), 1);
            state Future<Standalone<VectorRef<KeyRangeRef>>> blobbifiedEnd =
                getBlobRanges(&tr, KeyRangeRef(purgeRange.end, keyAfter(purgeRange.end)), 1);
            wait(success(blobbifiedBegin) && success(blobbifiedEnd));
            // If there are no blob ranges on the boundary that's okay, as we allow purging of multiple full ranges.
            if ((!blobbifiedBegin.get().empty() && blobbifiedBegin.get().front().begin < purgeRange.begin) ||
                (!blobbifiedEnd.get().empty() && blobbifiedEnd.get().front().begin < purgeRange.end)) {
                TraceEvent("UnalignedPurge")
                    .detail("Range", purgeRange)
                    .detail("Version", purgeVersion)
                    .detail("Force", force);
                throw unsupported_operation();
            }

            Value purgeValue = blobGranulePurgeValueFor(purgeVersion, purgeRange, force);
            tr.atomicOp(addVersionStampAtEnd(blobGranulePurgeKeys.begin), purgeValue, MutationRef::SetVersionstampedKey);
            tr.set(blobGranulePurgeChangeKey, deterministicRandom()->randomUniqueID().toString());
            state Future<Standalone<StringRef>> fTrVs = tr.getVersionstamp();
            wait(tr.commit());
            Standalone<StringRef> vs = wait(fTrVs);
            purgeKey = blobGranulePurgeKeys.begin.withSuffix(vs);
            if (BG_REQUEST_DEBUG) {
                fmt::print("purgeBlobGranules for range [{0} - {1}) at version {2} registered {3}\n",
                           purgeRange.begin.printable(),
                           purgeRange.end.printable(),
                           purgeVersion,
                           purgeKey.printable());
            }
            break;
        } catch (Error& e) {
            if (BG_REQUEST_DEBUG) {
                fmt::print("purgeBlobGranules for range [{0} - {1}) at version {2} encountered error {3}\n",
                           purgeRange.begin.printable(),
                           purgeRange.end.printable(),
                           purgeVersion,
                           e.name());
            }
            wait(tr.onError(e));
        }
    }
    return purgeKey;
}

Future<Key> DatabaseContext::purgeBlobGranules(KeyRange range,
                                               Version purgeVersion,
                                               Optional<Reference<Tenant>> tenant,
                                               bool force) {
    return purgeBlobGranulesActor(Reference<DatabaseContext>::addRef(this), range, purgeVersion, tenant, force);
}

ACTOR Future<Void> waitPurgeGranulesCompleteActor(Reference<DatabaseContext> db, Key purgeKey) {
    state Database cx(db);
    state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(cx);
    loop {
        try {
            tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
            tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);
            Optional<Value> purgeVal = wait(tr->get(purgeKey));
            if (!purgeVal.present()) {
                if (BG_REQUEST_DEBUG) {
                    fmt::print("purgeBlobGranules for {0} succeeded\n", purgeKey.printable());
                }
                return Void();
            }
            if (BG_REQUEST_DEBUG) {
                fmt::print("purgeBlobGranules for {0} watching\n", purgeKey.printable());
            }
            state Future<Void> watchFuture = tr->watch(purgeKey);
            wait(tr->commit());
            wait(watchFuture);
            tr->reset();
        } catch (Error& e) {
            wait(tr->onError(e));
        }
    }
}

Future<Void> DatabaseContext::waitPurgeGranulesComplete(Key purgeKey) {
    return waitPurgeGranulesCompleteActor(Reference<DatabaseContext>::addRef(this), purgeKey);
}
ACTOR Future<bool> setBlobRangeActor(Reference<DatabaseContext> cx,
                                     KeyRange range,
                                     bool active,
                                     Optional<Reference<Tenant>> tenant) {
    state Database db(cx);
    state Reference<ReadYourWritesTransaction> tr = makeReference<ReadYourWritesTransaction>(db);

    if (tenant.present()) {
        wait(tenant.get()->ready());
        range = range.withPrefix(tenant.get()->prefix());
    }

    state Value value = active ? blobRangeActive : blobRangeInactive;
    if (active && (!g_network->isSimulated() || !g_simulator->willRestart) && BUGGIFY_WITH_PROB(0.1)) {
        // buggify to an arbitrary value, but only if the test isn't a restarting test that could downgrade to an
        // earlier version that doesn't support this
        int randLen = deterministicRandom()->randomInt(2, 20);
        value = StringRef(deterministicRandom()->randomAlphaNumeric(randLen));
    }
    Standalone<BlobRangeChangeLogRef> changeLog(BlobRangeChangeLogRef(range, value));
    state Value changeValue = blobRangeChangeLogValueFor(changeLog);

    loop {
        try {
            tr->setOption(FDBTransactionOptions::ACCESS_SYSTEM_KEYS);
            tr->setOption(FDBTransactionOptions::PRIORITY_SYSTEM_IMMEDIATE);

            Standalone<VectorRef<KeyRangeRef>> startBlobRanges = wait(getBlobRanges(&tr->getTransaction(), range, 1));

            if (active) {
                // Idempotent request.
                if (!startBlobRanges.empty()) {
                    return startBlobRanges.front().begin == range.begin && startBlobRanges.front().end == range.end;
                }
            } else {
                // An unblobbify request must be aligned to boundaries.
                // It is okay to unblobbify multiple regions all at once.
                if (startBlobRanges.empty()) {
                    // already unblobbified
                    return true;
                } else if (startBlobRanges.front().begin < range.begin) {
                    // there is a blob range at the beginning of the request that overlaps its start
                    return false;
                }
                // if the blob range does start at the specified key, we need to make sure the end is also a
                // boundary of a blob range
                Standalone<VectorRef<KeyRangeRef>> endBlobRanges =
                    wait(getBlobRanges(&tr->getTransaction(), singleKeyRange(range.end), 1));
                if (!endBlobRanges.empty() && endBlobRanges.front().begin < range.end) {
                    return false;
                }
            }

            tr->set(blobRangeChangeKey, deterministicRandom()->randomUniqueID().toString());
            // This is not coalescing because we want to keep each range logically separate.
            // FIXME: if not active, do coalescing - had issues
            wait(krmSetRange(tr, blobRangeKeys.begin, range, value));
            // RYWTransaction has issues with atomic op sizing when using addVersionStampAtEnd on old api versions
            tr->getTransaction().atomicOp(
                addVersionStampAtEnd(blobRangeChangeLogKeys.begin), changeValue, MutationRef::SetVersionstampedKey);
            wait(tr->commit());
            return true;
        } catch (Error& e) {
            wait(tr->onError(e));
        }
    }
}

ACTOR Future<bool> blobbifyRangeActor(Reference<DatabaseContext> cx,
                                      KeyRange range,
                                      bool doWait,
                                      Optional<Reference<Tenant>> tenant) {
    if (BG_REQUEST_DEBUG) {
        fmt::print("BlobbifyRange [{0} - {1}) ({2})\n", range.begin.printable(), range.end.printable(), doWait);
    }
    CODE_PROBE(tenant.present(), "NativeAPI blobbifyRange has tenant");
    state bool result = wait(setBlobRangeActor(cx, range, true, tenant));
    if (!doWait || !result) {
        return result;
    }
    // FIXME: add a blob worker verifyRange rpc call that just waits for the granule to become readable at any version
    loop {
        Version verifyVersion = wait(cx->verifyBlobRange(range, latestVersion, tenant));
        if (verifyVersion != invalidVersion) {
            if (BG_REQUEST_DEBUG) {
                fmt::print("BlobbifyRange [{0} - {1}) got complete @ {2}\n",
                           range.begin.printable(),
                           range.end.printable(),
                           verifyVersion);
            }
            return result;
        }
        wait(delay(0.1));
    }
}

Future<bool> DatabaseContext::blobbifyRange(KeyRange range, Optional<Reference<Tenant>> tenant) {
    return blobbifyRangeActor(Reference<DatabaseContext>::addRef(this), range, false, tenant);
}

Future<bool> DatabaseContext::blobbifyRangeBlocking(KeyRange range, Optional<Reference<Tenant>> tenant) {
    return blobbifyRangeActor(Reference<DatabaseContext>::addRef(this), range, true, tenant);
}

Future<bool> DatabaseContext::unblobbifyRange(KeyRange range, Optional<Reference<Tenant>> tenant) {
    return setBlobRangeActor(Reference<DatabaseContext>::addRef(this), range, false, tenant);
}
ACTOR Future<Standalone<VectorRef<KeyRangeRef>>> listBlobbifiedRangesActor(Reference<DatabaseContext> cx,
                                                                           KeyRange range,
                                                                           int rangeLimit,
                                                                           Optional<Reference<Tenant>> tenant) {
    state Database db(cx);
    state Transaction tr(db);
    state KeyRef tenantPrefix;
    state Standalone<VectorRef<KeyRangeRef>> blobRanges;

    if (tenant.present()) {
        CODE_PROBE(true, "NativeAPI listBlobbifiedRanges has tenant");
        wait(tenant.get()->ready());
        tenantPrefix = tenant.get()->prefix();
        range = range.withPrefix(tenantPrefix);
    }

    loop {
        try {
            wait(store(blobRanges, getBlobRanges(&tr, range, rangeLimit)));
            break;
        } catch (Error& e) {
            wait(tr.onError(e));
        }
    }

    if (!tenant.present()) {
        return blobRanges;
    }

    // Strip tenant prefix out.
    state Standalone<VectorRef<KeyRangeRef>> tenantBlobRanges;
    for (auto& blobRange : blobRanges) {
        // Filter out blob ranges that span tenants for some reason.
        if (!blobRange.begin.startsWith(tenantPrefix) || !blobRange.end.startsWith(tenantPrefix)) {
            TraceEvent("ListBlobbifiedRangeSpansTenants")
                .suppressFor(/*seconds=*/5)
                .detail("Tenant", tenant)
                .detail("Range", blobRange);
            continue;
        }
        tenantBlobRanges.push_back_deep(tenantBlobRanges.arena(), blobRange.removePrefix(tenantPrefix));
    }
    return tenantBlobRanges;
}

Future<Standalone<VectorRef<KeyRangeRef>>> DatabaseContext::listBlobbifiedRanges(KeyRange range,
                                                                                 int rangeLimit,
                                                                                 Optional<Reference<Tenant>> tenant) {
    return listBlobbifiedRangesActor(Reference<DatabaseContext>::addRef(this), range, rangeLimit, tenant);
}

ACTOR static Future<Standalone<VectorRef<ReadHotRangeWithMetrics>>> getHotRangeMetricsActor(
    Reference<DatabaseContext> db,
    StorageServerInterface ssi,
    ReadHotSubRangeRequest req) {
    ErrorOr<ReadHotSubRangeReply> fs = wait(ssi.getReadHotRanges.tryGetReply(req));
    if (fs.isError()) {
        fmt::print("Error({}): cannot get read hot metrics from storage server {}.\n",
                   fs.getError().what(),
                   ssi.address().toString());
        return Standalone<VectorRef<ReadHotRangeWithMetrics>>();
    } else {
        return fs.get().readHotRanges;
    }
}

Future<Standalone<VectorRef<ReadHotRangeWithMetrics>>> DatabaseContext::getHotRangeMetrics(
    StorageServerInterface ssi,
    const KeyRange& keys,
    ReadHotSubRangeRequest::SplitType type,
    int splitCount) {
    return getHotRangeMetricsActor(
        Reference<DatabaseContext>::addRef(this), ssi, ReadHotSubRangeRequest(keys, type, splitCount));
}

int64_t getMaxKeySize(KeyRef const& key) {
    return getMaxWriteKeySize(key, true);
}

int64_t getMaxReadKeySize(KeyRef const& key) {
    return getMaxKeySize(key);
}

int64_t getMaxWriteKeySize(KeyRef const& key, bool hasRawAccess) {
    int64_t tenantSize = hasRawAccess ? TenantAPI::PREFIX_SIZE : 0;
    return key.startsWith(systemKeys.begin) ? CLIENT_KNOBS->SYSTEM_KEY_SIZE_LIMIT
                                            : CLIENT_KNOBS->KEY_SIZE_LIMIT + tenantSize;
}

int64_t getMaxClearKeySize(KeyRef const& key) {
    return getMaxKeySize(key);
}

namespace NativeAPI {

ACTOR Future<std::vector<std::pair<StorageServerInterface, ProcessClass>>> getServerListAndProcessClasses(
    Transaction* tr) {
    state Future<std::vector<ProcessData>> workers = getWorkers(tr);
    state Future<RangeResult> serverList = tr->getRange(serverListKeys, CLIENT_KNOBS->TOO_MANY);
    wait(success(workers) && success(serverList));
    ASSERT(!serverList.get().more && serverList.get().size() < CLIENT_KNOBS->TOO_MANY);

    std::map<Optional<Standalone<StringRef>>, ProcessData> id_data;
    for (int i = 0; i < workers.get().size(); i++) {
        id_data[workers.get()[i].locality.processId()] = workers.get()[i];
    }

    std::vector<std::pair<StorageServerInterface, ProcessClass>> results;
    for (int i = 0; i < serverList.get().size(); i++) {
        auto ssi = decodeServerListValue(serverList.get()[i].value);
        results.emplace_back(ssi, id_data[ssi.locality.processId()].processClass);
    }
    return results;
}

} // namespace NativeAPI