feat: Add optional post-compaction consistency verification (#1029)

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Ben Johnson
2026-01-21 14:25:23 -07:00
committed by GitHub
parent c582db181c
commit b474098624
6 changed files with 207 additions and 0 deletions

View File

@@ -231,6 +231,10 @@ type Config struct {
L0Retention *time.Duration `yaml:"l0-retention"`
L0RetentionCheckInterval *time.Duration `yaml:"l0-retention-check-interval"`
// Verify TXID consistency at destination level after each compaction.
// When enabled, logs warnings if gaps or overlaps are detected.
VerifyCompaction bool `yaml:"verify-compaction"`
// Heartbeat settings (global defaults)
HeartbeatURL string `yaml:"heartbeat-url"`
HeartbeatInterval *time.Duration `yaml:"heartbeat-interval"`

View File

@@ -250,6 +250,9 @@ func (c *ReplicateCommand) Run(ctx context.Context) (err error) {
if c.Config.ShutdownSyncInterval != nil {
c.Store.SetShutdownSyncInterval(*c.Config.ShutdownSyncInterval)
}
if c.Config.VerifyCompaction {
c.Store.SetVerifyCompaction(true)
}
if c.Config.HeartbeatURL != "" {
interval := litestream.DefaultHeartbeatInterval
if c.Config.HeartbeatInterval != nil {

View File

@@ -8,6 +8,7 @@ import (
"os"
"time"
"github.com/prometheus/client_golang/prometheus"
"github.com/superfly/ltx"
)
@@ -18,6 +19,15 @@ type Compactor struct {
client ReplicaClient
logger *slog.Logger
// VerifyCompaction enables post-compaction TXID consistency verification.
// When enabled, verifies that files at the destination level have
// contiguous TXID ranges after each compaction. Disabled by default.
VerifyCompaction bool
// CompactionVerifyErrorCounter is incremented when post-compaction
// verification fails. Optional; if nil, no metric is recorded.
CompactionVerifyErrorCounter prometheus.Counter
// LocalFileOpener optionally opens a local LTX file for compaction.
// If nil or returns os.ErrNotExist, falls back to remote.
// This is used by DB to prefer local files over remote for consistency.
@@ -164,9 +174,62 @@ func (c *Compactor) Compact(ctx context.Context, dstLevel int) (*ltx.FileInfo, e
c.CacheSetter(dstLevel, info)
}
// Verify level consistency if enabled
if c.VerifyCompaction {
if err := c.VerifyLevelConsistency(ctx, dstLevel); err != nil {
c.logger.Warn("post-compaction verification failed",
"level", dstLevel,
"error", err)
if c.CompactionVerifyErrorCounter != nil {
c.CompactionVerifyErrorCounter.Inc()
}
}
}
return info, nil
}
// VerifyLevelConsistency checks that LTX files at the given level have
// contiguous TXID ranges (prevMaxTXID + 1 == currMinTXID for consecutive files).
// Returns an error describing any gaps or overlaps found.
func (c *Compactor) VerifyLevelConsistency(ctx context.Context, level int) error {
	itr, err := c.client.LTXFiles(ctx, level, 0, false)
	if err != nil {
		return fmt.Errorf("fetch ltx files: %w", err)
	}
	defer itr.Close()

	// Track only the previous file's MaxTXID; the first file has no
	// predecessor to compare against.
	seenFirst := false
	var prevMaxTXID ltx.TXID
	for itr.Next() {
		curr := itr.Item()
		if !seenFirst {
			seenFirst = true
			prevMaxTXID = curr.MaxTXID
			continue
		}

		// Consecutive files must satisfy prev.MaxTXID + 1 == curr.MinTXID.
		want := prevMaxTXID + 1
		switch {
		case curr.MinTXID > want:
			return fmt.Errorf("TXID gap detected: prev.MaxTXID=%s, next.MinTXID=%s (expected %s)",
				prevMaxTXID, curr.MinTXID, want)
		case curr.MinTXID < want:
			return fmt.Errorf("TXID overlap detected: prev.MaxTXID=%s, next.MinTXID=%s",
				prevMaxTXID, curr.MinTXID)
		}
		prevMaxTXID = curr.MaxTXID
	}

	// Explicit close surfaces any iteration error; the deferred close above
	// only guards the early-return paths.
	if err := itr.Close(); err != nil {
		return fmt.Errorf("close iterator: %w", err)
	}
	return nil
}
// EnforceSnapshotRetention enforces retention of snapshot level files by timestamp.
// Files older than the retention duration are deleted (except the newest is always kept).
// Returns the minimum snapshot TXID still retained (useful for cascading retention to lower levels).

View File

@@ -321,6 +321,112 @@ func TestCompactor_EnforceSnapshotRetention(t *testing.T) {
})
}
func TestCompactor_VerifyLevelConsistency(t *testing.T) {
	t.Run("ContiguousFiles", func(t *testing.T) {
		rc := file.NewReplicaClient(t.TempDir())
		c := litestream.NewCompactor(rc, slog.Default())

		// Seed level 1 with back-to-back TXID ranges; verification must pass.
		createTestLTXFile(t, rc, 1, 1, 2)
		createTestLTXFile(t, rc, 1, 3, 5)
		createTestLTXFile(t, rc, 1, 6, 10)

		if err := c.VerifyLevelConsistency(context.Background(), 1); err != nil {
			t.Errorf("expected nil error for contiguous files, got: %v", err)
		}
	})

	t.Run("GapDetected", func(t *testing.T) {
		rc := file.NewReplicaClient(t.TempDir())
		c := litestream.NewCompactor(rc, slog.Default())

		// TXIDs 3-4 are missing between the two files.
		createTestLTXFile(t, rc, 1, 1, 2)
		createTestLTXFile(t, rc, 1, 5, 7) // gap: expected MinTXID=3, got 5

		err := c.VerifyLevelConsistency(context.Background(), 1)
		if err == nil {
			t.Error("expected error for gap in files, got nil")
		} else if !containsString(err.Error(), "gap") {
			t.Errorf("expected gap error, got: %v", err)
		}
	})

	t.Run("OverlapDetected", func(t *testing.T) {
		rc := file.NewReplicaClient(t.TempDir())
		c := litestream.NewCompactor(rc, slog.Default())

		// Second file's range re-covers TXIDs 3-5 from the first file.
		createTestLTXFile(t, rc, 1, 1, 5)
		createTestLTXFile(t, rc, 1, 3, 7) // overlap: expected MinTXID=6, got 3

		err := c.VerifyLevelConsistency(context.Background(), 1)
		if err == nil {
			t.Error("expected error for overlapping files, got nil")
		} else if !containsString(err.Error(), "overlap") {
			t.Errorf("expected overlap error, got: %v", err)
		}
	})

	t.Run("SingleFile", func(t *testing.T) {
		rc := file.NewReplicaClient(t.TempDir())
		c := litestream.NewCompactor(rc, slog.Default())

		// A lone file has nothing to conflict with; verification must pass.
		createTestLTXFile(t, rc, 1, 1, 5)

		if err := c.VerifyLevelConsistency(context.Background(), 1); err != nil {
			t.Errorf("expected nil error for single file, got: %v", err)
		}
	})

	t.Run("EmptyLevel", func(t *testing.T) {
		rc := file.NewReplicaClient(t.TempDir())
		c := litestream.NewCompactor(rc, slog.Default())

		// A level with no files at all must also pass verification.
		if err := c.VerifyLevelConsistency(context.Background(), 1); err != nil {
			t.Errorf("expected nil error for empty level, got: %v", err)
		}
	})
}
func TestCompactor_CompactWithVerification(t *testing.T) {
	t.Run("VerificationEnabled", func(t *testing.T) {
		rc := file.NewReplicaClient(t.TempDir())
		c := litestream.NewCompactor(rc, slog.Default())
		c.VerifyCompaction = true

		// Seed L0 with three contiguous single-transaction files.
		for txid := ltx.TXID(1); txid <= 3; txid++ {
			createTestLTXFile(t, rc, 0, txid, txid)
		}

		// Compaction into L1 should succeed and pass post-compaction verification.
		info, err := c.Compact(context.Background(), 1)
		if err != nil {
			t.Fatal(err)
		}
		if info.Level != 1 {
			t.Errorf("Level=%d, want 1", info.Level)
		}
		if info.MinTXID != 1 || info.MaxTXID != 3 {
			t.Errorf("TXID range=%d-%d, want 1-3", info.MinTXID, info.MaxTXID)
		}
	})
}
// containsString reports whether substr occurs within s.
func containsString(s, substr string) bool {
	return bytes.Index([]byte(s), []byte(substr)) != -1
}
// createTestLTXFile creates a minimal LTX file for testing.
func createTestLTXFile(t testing.TB, client litestream.ReplicaClient, level int, minTXID, maxTXID ltx.TXID) {
t.Helper()

14
db.go
View File

@@ -148,6 +148,11 @@ type DB struct {
// Minimum time to retain L0 files after they have been compacted into L1.
L0Retention time.Duration
// VerifyCompaction enables post-compaction TXID consistency verification.
// When enabled, verifies that files at the destination level have
// contiguous TXID ranges after each compaction.
VerifyCompaction bool
// Remote replica for the database.
// Must be set before calling Open().
Replica *Replica
@@ -209,6 +214,7 @@ func NewDB(path string) *DB {
db.compactor = NewCompactor(nil, db.Logger)
db.compactor.LocalFileOpener = db.openLocalLTXFile
db.compactor.LocalFileDeleter = db.deleteLocalLTXFile
db.compactor.CompactionVerifyErrorCounter = compactionVerifyErrorCounterVec.WithLabelValues(db.path)
db.compactor.CacheGetter = func(level int) (*ltx.FileInfo, bool) {
db.maxLTXFileInfos.Lock()
defer db.maxLTXFileInfos.Unlock()
@@ -433,6 +439,9 @@ func (db *DB) Open() (err error) {
db.opened = true
db.mu.Unlock()
// Apply verify compaction setting to the compactor
db.compactor.VerifyCompaction = db.VerifyCompaction
return nil
}
@@ -2275,4 +2284,9 @@ var (
Name: "litestream_checkpoint_seconds",
Help: "Time spent checkpointing WAL, in seconds",
}, []string{"db", "mode"})
compactionVerifyErrorCounterVec = promauto.NewCounterVec(prometheus.CounterOpts{
Name: "litestream_compaction_verify_error_count",
Help: "Number of post-compaction verification failures",
}, []string{"db"})
)

View File

@@ -80,6 +80,9 @@ type Store struct {
// If true, compaction is run in the background according to compaction levels.
CompactionMonitorEnabled bool
// If true, verify TXID consistency at destination level after each compaction.
VerifyCompaction bool
// Shutdown sync retry settings.
ShutdownSyncTimeout time.Duration
ShutdownSyncInterval time.Duration
@@ -114,6 +117,7 @@ func NewStore(dbs []*DB, levels CompactionLevels) *Store {
db.L0Retention = s.L0Retention
db.ShutdownSyncTimeout = s.ShutdownSyncTimeout
db.ShutdownSyncInterval = s.ShutdownSyncInterval
db.VerifyCompaction = s.VerifyCompaction
}
s.ctx, s.cancel = context.WithCancel(context.Background())
return s
@@ -212,6 +216,7 @@ func (s *Store) AddDB(db *DB) error {
db.L0Retention = s.L0Retention
db.ShutdownSyncTimeout = s.ShutdownSyncTimeout
db.ShutdownSyncInterval = s.ShutdownSyncInterval
db.VerifyCompaction = s.VerifyCompaction
// Open the database without holding the lock to avoid blocking other operations.
// The double-check pattern below handles the race condition.
@@ -366,6 +371,18 @@ func (s *Store) SetShutdownSyncInterval(d time.Duration) {
}
}
// SetVerifyCompaction updates the verify compaction flag and propagates it to
// all managed databases and their compactors.
func (s *Store) SetVerifyCompaction(v bool) {
	s.mu.Lock()
	defer s.mu.Unlock()

	s.VerifyCompaction = v

	// Push the flag to each DB and its compactor so it takes effect even for
	// databases that were already opened before this call.
	// NOTE(review): the compactor field is written while holding only the
	// store lock — confirm concurrent compactions read this flag safely.
	for i := range s.dbs {
		s.dbs[i].VerifyCompaction = v
		s.dbs[i].compactor.VerifyCompaction = v
	}
}
// SnapshotLevel returns a pseudo compaction level based on snapshot settings.
func (s *Store) SnapshotLevel() *CompactionLevel {
return &CompactionLevel{