HealthMetricsApi workload: only check when we've received metrics (#12636)

rdar://166184432

Sometimes we don't get enough data to compute all of the stats that this workload wants to see as non-zero.
Maintain a flag gotMetrics and do checks on metrics only if this flag is set.

Debugged by the obvious method of adding TraceEvents to see what was happening with this workload.

Also some minor TraceEvent updates and simplify a variable name.

20260113-232530-gglass-05eb3d48db99757b compressed=True data_size=35600940 duration=3977581 ended=100000 fail_fast=1000 max_runs=100000 pass=100000 priority=100 remaining=0 runtime=8:02:14 sanity=False started=100000 stopped=20260114-072744 submitted=20260113-232530 timeout=5400 username=gglass

* HealthMetricsApi workload: only check() aggressively when we have received full metrics

* HealthMetricsApi.actor.cpp: address review comments, and add a overall comment saying that this seems testable outside simulation
This commit is contained in:
gxglass
2026-01-14 14:55:13 -08:00
committed by GitHub
parent 20a5722935
commit b1848af90a

View File

@@ -23,7 +23,15 @@
#include "fdbserver/WorkerInterface.actor.h"
#include "flow/actorcompiler.h" // This must be the last #include.
// workload description
// NOTE: it might be simpler to test health metrics via something
// other than simulation. Testing equivalent to what this workload does can
// seemingly be obtained by a straight line test case that does
// the following:
// a) start a cluster
// b) do a few transactions
// c) call getHealthMetrics()
// d) ensure the returned metrics are non-zero.
// This workload can be attached to other workload to collect health information about the FDB cluster.
struct HealthMetricsApiWorkload : TestWorkload {
// Performance Metrics
@@ -46,6 +54,7 @@ struct HealthMetricsApiWorkload : TestWorkload {
// internal states
bool healthMetricsStoppedUpdating = false;
bool gotMetrics = false;
static constexpr auto NAME = "HealthMetricsApi";
HealthMetricsApiWorkload(WorkloadContext const& wcx) : TestWorkload(wcx) {
@@ -57,8 +66,9 @@ struct HealthMetricsApiWorkload : TestWorkload {
ACTOR static Future<Void> _setup(Database cx, HealthMetricsApiWorkload* self) {
if (!self->sendDetailedHealthMetrics) {
// Clear detailed health metrics that are already populated
wait(delay(2 * CLIENT_KNOBS->DETAILED_HEALTH_METRICS_MAX_STALENESS));
// Internally cached health metrics time out after this knob. Wait
// an extra second to avoid any off-by-1 ">" vs ">=" type issues.
wait(delay(1 + CLIENT_KNOBS->DETAILED_HEALTH_METRICS_MAX_STALENESS));
cx->healthMetrics.storageStats.clear();
cx->healthMetrics.tLogQueue.clear();
}
@@ -72,23 +82,36 @@ struct HealthMetricsApiWorkload : TestWorkload {
Future<Void> start(Database const& cx) override { return _start(cx, this); }
Future<bool> check(Database const& cx) override {
if (!gotMetrics) {
// It's not valid to fail a sanity check of metrics which have never been received.
// Yes, this encodes a blatant "got" vs "have" usage error. The intent is to show
// up on any case insensitive search for "gotmetrics".
TraceEvent("HealthMetricsCheckPassedBecauseWeDontGotMetrics");
return true;
}
if (healthMetricsStoppedUpdating) {
TraceEvent(SevError, "HealthMetricsStoppedUpdating").log();
return false;
}
bool correctHealthMetricsState = true;
if (worstStorageQueue == 0 || worstStorageDurabilityLag == 0 || worstTLogQueue == 0)
correctHealthMetricsState = false;
bool valid = true;
if (worstStorageQueue == 0 || worstStorageDurabilityLag == 0 || worstTLogQueue == 0) {
valid = false;
TraceEvent("HealthMetrics:valid_false_case1");
}
if (sendDetailedHealthMetrics) {
if (detailedWorstStorageQueue == 0 || detailedWorstStorageDurabilityLag == 0 ||
detailedWorstTLogQueue == 0 || detailedWorstCpuUsage == 0.0 || detailedWorstDiskUsage == 0.0)
correctHealthMetricsState = false;
detailedWorstTLogQueue == 0 || detailedWorstCpuUsage == 0.0 || detailedWorstDiskUsage == 0.0) {
valid = false;
TraceEvent("HealthMetrics:valid_false_case2");
}
} else {
if (detailedWorstStorageQueue != 0 || detailedWorstStorageDurabilityLag != 0 ||
detailedWorstTLogQueue != 0 || detailedWorstCpuUsage != 0.0 || detailedWorstDiskUsage != 0.0)
correctHealthMetricsState = false;
detailedWorstTLogQueue != 0 || detailedWorstCpuUsage != 0.0 || detailedWorstDiskUsage != 0.0) {
valid = false;
TraceEvent("HealthMetrics:valid_false_case3");
}
}
if (!correctHealthMetricsState) {
if (!valid) {
TraceEvent(SevError, "IncorrectHealthMetricsState")
.detail("WorstStorageQueue", worstStorageQueue)
.detail("WorstLimitingStorageQueue", worstLimitingStorageQueue)
@@ -102,7 +125,7 @@ struct HealthMetricsApiWorkload : TestWorkload {
.detail("DetailedWorstDiskUsage", detailedWorstDiskUsage)
.detail("SendingDetailedHealthMetrics", sendDetailedHealthMetrics);
}
return correctHealthMetricsState;
return valid;
}
void getMetrics(std::vector<PerfMetric>& m) override {
@@ -151,8 +174,9 @@ struct HealthMetricsApiWorkload : TestWorkload {
TraceEvent traceCpuUsage("CpuUsage");
TraceEvent traceDiskUsage("DiskUsage");
// update metrics
bool gotStorageStats = false;
for (const auto& ss : healthMetrics.storageStats) {
gotStorageStats = true;
auto storageStats = ss.second;
self->detailedWorstStorageQueue = std::max(self->detailedWorstStorageQueue, storageStats.storageQueue);
traceStorageQueue.detail(format("Storage-%s", ss.first.toString().c_str()), storageStats.storageQueue);
@@ -167,10 +191,16 @@ struct HealthMetricsApiWorkload : TestWorkload {
}
TraceEvent traceTLogQueue("TLogQueue");
traceTLogQueue.setMaxEventLength(10000);
bool gotTLogQueue = false;
for (const auto& ss : healthMetrics.tLogQueue) {
gotTLogQueue = true;
self->detailedWorstTLogQueue = std::max(self->detailedWorstTLogQueue, ss.second);
traceTLogQueue.detail(format("TLog-%s", ss.first.toString().c_str()), ss.second);
}
if (!self->gotMetrics && gotStorageStats && gotTLogQueue) {
TraceEvent("HealthMetricsGotFullResult");
self->gotMetrics = true;
}
};
}
};