feat(s3): improve observability for S3 operations and L0 retention (#996)

Co-authored-by: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
Cory LaNou
2026-01-20 07:33:35 -06:00
committed by GitHub
parent 4a3bff1857
commit c582db181c
3 changed files with 76 additions and 5 deletions

38
db.go
View File

@@ -1893,7 +1893,9 @@ func (db *DB) EnforceL0RetentionByTime(ctx context.Context) error {
return nil
}
db.Logger.Debug("enforcing l0 retention", "retention", db.L0Retention)
db.Logger.Debug("starting l0 retention enforcement", "retention", db.L0Retention)
dbName := filepath.Base(db.Path())
// Determine the highest TXID that has been compacted into L1.
itr, err := db.Replica.Client.LTXFiles(ctx, 1, 0, false)
@@ -1911,6 +1913,9 @@ func (db *DB) EnforceL0RetentionByTime(ctx context.Context) error {
return fmt.Errorf("close l1 iterator: %w", err)
}
if maxL1TXID == 0 {
internal.L0RetentionGaugeVec.WithLabelValues(dbName, "eligible").Set(0)
internal.L0RetentionGaugeVec.WithLabelValues(dbName, "not_compacted").Set(0)
internal.L0RetentionGaugeVec.WithLabelValues(dbName, "too_recent").Set(0)
return nil
}
@@ -1922,13 +1927,17 @@ func (db *DB) EnforceL0RetentionByTime(ctx context.Context) error {
defer itr.Close()
var (
deleted []*ltx.FileInfo
lastInfo *ltx.FileInfo
processedAll = true
deleted []*ltx.FileInfo
lastInfo *ltx.FileInfo
processedAll = true
totalFiles int
notCompactedCount int
tooRecentCount int
)
for itr.Next() {
info := itr.Item()
lastInfo = info
totalFiles++
createdAt := info.CreatedAt
if createdAt.IsZero() {
@@ -1943,11 +1952,21 @@ func (db *DB) EnforceL0RetentionByTime(ctx context.Context) error {
// L0 entries are ordered; once we reach a newer file we stop so we don't
// create gaps between retained files. VFS expects contiguous coverage.
processedAll = false
tooRecentCount++
break
}
if info.MaxTXID <= maxL1TXID {
deleted = append(deleted, info)
} else {
notCompactedCount++
}
}
// Count remaining files as too_recent if we stopped early
if !processedAll {
for itr.Next() {
tooRecentCount++
}
}
@@ -1956,6 +1975,17 @@ func (db *DB) EnforceL0RetentionByTime(ctx context.Context) error {
deleted = deleted[:len(deleted)-1]
}
internal.L0RetentionGaugeVec.WithLabelValues(dbName, "eligible").Set(float64(len(deleted)))
internal.L0RetentionGaugeVec.WithLabelValues(dbName, "not_compacted").Set(float64(notCompactedCount))
internal.L0RetentionGaugeVec.WithLabelValues(dbName, "too_recent").Set(float64(tooRecentCount))
db.Logger.Debug("l0 retention scan complete",
"total_l0_files", totalFiles,
"eligible_for_deletion", len(deleted),
"not_compacted_yet", notCompactedCount,
"too_recent", tooRecentCount,
"max_l1_txid", maxL1TXID)
if len(deleted) == 0 {
return nil
}

View File

@@ -148,4 +148,20 @@ var (
Name: "litestream_replica_operation_bytes",
Help: "The number of bytes used by replica operations",
}, []string{"replica_type", "operation"})
// OperationDurationHistogramVec records how long replica operations take, in
// seconds, partitioned by replica type and operation name. The explicit
// buckets span 10ms to 60s.
OperationDurationHistogramVec = promauto.NewHistogramVec(prometheus.HistogramOpts{
Name: "litestream_replica_operation_duration_seconds",
Help: "Duration of replica operations by type and operation",
Buckets: []float64{0.01, 0.05, 0.1, 0.5, 1, 5, 10, 30, 60},
}, []string{"replica_type", "operation"})
// OperationErrorCounterVec counts replica operation errors, partitioned by
// replica type, operation name, and an error code string.
// NOTE(review): the set of "code" values is whatever callers pass in, so
// label cardinality depends on the backend's error codes — confirm it is
// bounded.
OperationErrorCounterVec = promauto.NewCounterVec(prometheus.CounterOpts{
Name: "litestream_replica_operation_errors_total",
Help: "Number of replica operation errors by type, operation, and error code",
}, []string{"replica_type", "operation", "code"})
// L0RetentionGaugeVec reports a per-database count of L0 files for each
// retention status label.
// NOTE(review): this is a gauge but its name ends in "_total", a suffix
// Prometheus naming conventions reserve for counters. Renaming would break
// existing dashboards and alerts, so confirm before changing.
// NOTE(review): if callers derive the "db" label from the database file's
// base name, databases sharing a file name in different directories would
// write to the same series — verify against callers.
L0RetentionGaugeVec = promauto.NewGaugeVec(prometheus.GaugeOpts{
Name: "litestream_l0_retention_files_total",
Help: "Number of L0 files by status during retention enforcement",
}, []string{"db", "status"})
)

View File

@@ -978,10 +978,14 @@ func (c *ReplicaClient) DeleteLTXFiles(ctx context.Context, a []*ltx.FileInfo) e
c.logger.Debug("deleting ltx files batch", "count", n)
start := time.Now()
out, err := c.s3.DeleteObjects(ctx, &s3.DeleteObjectsInput{
Bucket: aws.String(c.Bucket),
Delete: &types.Delete{Objects: objIDs[:n]},
})
duration := time.Since(start)
internal.OperationDurationHistogramVec.WithLabelValues(ReplicaClientType, "DELETE").Observe(duration.Seconds())
if err != nil {
return fmt.Errorf("s3: delete batch of %d objects: %w", n, err)
}
@@ -990,9 +994,30 @@ func (c *ReplicaClient) DeleteLTXFiles(ctx context.Context, a []*ltx.FileInfo) e
if out != nil {
deleted = len(out.Deleted)
}
c.logger.Debug("delete batch completed", "requested", n, "deleted", deleted, "errors", len(out.Errors))
c.logger.Debug("delete batch completed",
"requested", n,
"deleted", deleted,
"errors", len(out.Errors),
"duration_ms", duration.Milliseconds())
internal.OperationTotalCounterVec.WithLabelValues(ReplicaClientType, "DELETE").Add(float64(deleted))
if len(out.Errors) > 0 {
for i, e := range out.Errors {
code := aws.ToString(e.Code)
internal.OperationErrorCounterVec.WithLabelValues(ReplicaClientType, "DELETE", code).Inc()
if i < 5 {
c.logger.Warn("delete object failed",
"key", aws.ToString(e.Key),
"code", code,
"message", aws.ToString(e.Message))
}
}
if len(out.Errors) > 5 {
c.logger.Warn("additional delete errors suppressed", "count", len(out.Errors)-5)
}
}
if err := deleteOutputError(out); err != nil {
return err
}