feat(db): add shutdown sync retry for rate limiting resilience (#904)

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
Cory LaNou
2025-12-29 15:05:32 -06:00
committed by GitHub
parent 3480067f4b
commit 2d38fa490b
7 changed files with 727 additions and 15 deletions
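Both settings surface as top-level keys in litestream.yml (validated in the config changes below and exercised by the integration test config). A minimal sketch of a config fragment using them; the dbs/replica block is a placeholder for illustration, not part of this commit:

# Keep retrying the final sync for up to 30s on shutdown, pausing 500ms
# between attempts; a timeout of 0 disables retries (single attempt).
shutdown-sync-timeout: 30s
shutdown-sync-interval: 500ms

dbs:
  - path: /var/lib/app/app.db   # placeholder
    replica:
      type: s3
      bucket: example-bucket    # placeholder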


@@ -49,6 +49,8 @@ var (
ErrInvalidSyncInterval = errors.New("sync interval must be greater than 0")
ErrInvalidL0Retention = errors.New("l0 retention must be greater than 0")
ErrInvalidL0RetentionCheckInterval = errors.New("l0 retention check interval must be greater than 0")
ErrInvalidShutdownSyncTimeout = errors.New("shutdown-sync-timeout must be >= 0")
ErrInvalidShutdownSyncInterval = errors.New("shutdown sync interval must be greater than 0")
ErrConfigFileNotFound = errors.New("config file not found")
)
@@ -219,6 +221,10 @@ type Config struct {
// MCP server options
MCPAddr string `yaml:"mcp-addr"`
// Shutdown sync retry settings
ShutdownSyncTimeout *time.Duration `yaml:"shutdown-sync-timeout"`
ShutdownSyncInterval *time.Duration `yaml:"shutdown-sync-interval"`
// Path to the config file
// This is only used internally to pass the config path to the MCP tool
ConfigPath string `yaml:"-"`
@@ -256,6 +262,8 @@ func DefaultConfig() Config {
defaultSnapshotRetention := 24 * time.Hour
defaultL0Retention := litestream.DefaultL0Retention
defaultL0RetentionCheckInterval := litestream.DefaultL0RetentionCheckInterval
defaultShutdownSyncTimeout := litestream.DefaultShutdownSyncTimeout
defaultShutdownSyncInterval := litestream.DefaultShutdownSyncInterval
return Config{
Levels: []*CompactionLevelConfig{
{Interval: 30 * time.Second},
@@ -268,6 +276,8 @@ func DefaultConfig() Config {
},
L0Retention: &defaultL0Retention,
L0RetentionCheckInterval: &defaultL0RetentionCheckInterval,
ShutdownSyncTimeout: &defaultShutdownSyncTimeout,
ShutdownSyncInterval: &defaultShutdownSyncInterval,
}
}
@@ -302,6 +312,20 @@ func (c *Config) Validate() error {
Value: *c.L0RetentionCheckInterval,
}
}
if c.ShutdownSyncTimeout != nil && *c.ShutdownSyncTimeout < 0 {
return &ConfigValidationError{
Err: ErrInvalidShutdownSyncTimeout,
Field: "shutdown-sync-timeout",
Value: *c.ShutdownSyncTimeout,
}
}
if c.ShutdownSyncInterval != nil && *c.ShutdownSyncInterval <= 0 {
return &ConfigValidationError{
Err: ErrInvalidShutdownSyncInterval,
Field: "shutdown-sync-interval",
Value: *c.ShutdownSyncInterval,
}
}
// Validate compaction level intervals
for i, level := range c.Levels {


@@ -175,6 +175,12 @@ func (c *ReplicateCommand) Run(ctx context.Context) (err error) {
if c.Config.L0RetentionCheckInterval != nil {
c.Store.L0RetentionCheckInterval = *c.Config.L0RetentionCheckInterval
}
if c.Config.ShutdownSyncTimeout != nil {
c.Store.SetShutdownSyncTimeout(*c.Config.ShutdownSyncTimeout)
}
if c.Config.ShutdownSyncInterval != nil {
c.Store.SetShutdownSyncInterval(*c.Config.ShutdownSyncInterval)
}
if err := c.Store.Open(ctx); err != nil {
return fmt.Errorf("cannot open store: %w", err)
}

db.go

@@ -33,6 +33,8 @@ const (
DefaultBusyTimeout = 1 * time.Second
DefaultMinCheckpointPageN = 1000
DefaultTruncatePageN = 121359 // ~500MB with 4KB page size
DefaultShutdownSyncTimeout = 30 * time.Second
DefaultShutdownSyncInterval = 500 * time.Millisecond
)
// DB represents a managed instance of a SQLite database in the file system.
@@ -136,6 +138,12 @@ type DB struct {
// Must be set before calling Open().
Replica *Replica
// Shutdown sync retry settings.
// ShutdownSyncTimeout is the total time to retry syncing on shutdown.
// ShutdownSyncInterval is the time between retry attempts.
ShutdownSyncTimeout time.Duration
ShutdownSyncInterval time.Duration
// Where to send log messages, defaults to global slog with database path.
Logger *slog.Logger
}
@@ -155,6 +163,8 @@ func NewDB(path string) *DB {
MonitorInterval: DefaultMonitorInterval,
BusyTimeout: DefaultBusyTimeout,
L0Retention: DefaultL0Retention,
ShutdownSyncTimeout: DefaultShutdownSyncTimeout,
ShutdownSyncInterval: DefaultShutdownSyncInterval,
Logger: slog.With("db", filepath.Base(path)),
}
db.maxLTXFileInfos.m = make(map[int]*ltx.FileInfo)
@@ -322,7 +332,7 @@ func (db *DB) Close(ctx context.Context) (err error) {
// Ensure replicas perform a final sync and stop replicating.
if db.Replica != nil {
if db.db != nil {
if e := db.Replica.Sync(ctx); e != nil && err == nil {
if e := db.syncReplicaWithRetry(ctx); e != nil && err == nil {
err = e
}
}
@@ -351,6 +361,77 @@ func (db *DB) Close(ctx context.Context) (err error) {
return err
}
// syncReplicaWithRetry attempts to sync the replica with retry logic for shutdown.
// It retries until success, timeout, or context cancellation.
// If ShutdownSyncTimeout is 0, it performs a single sync attempt without retries.
func (db *DB) syncReplicaWithRetry(ctx context.Context) error {
if db.Replica == nil {
return nil
}
timeout := db.ShutdownSyncTimeout
interval := db.ShutdownSyncInterval
// If timeout is zero, just try once (no retry)
if timeout == 0 {
return db.Replica.Sync(ctx)
}
// Use default interval if not set
if interval == 0 {
interval = DefaultShutdownSyncInterval
}
// Create deadline context for total retry duration
deadlineCtx, cancel := context.WithTimeout(ctx, timeout)
defer cancel()
var lastErr error
attempt := 0
startTime := time.Now()
for {
attempt++
// Try sync
if err := db.Replica.Sync(deadlineCtx); err == nil {
if attempt > 1 {
db.Logger.Info("shutdown sync succeeded after retry",
"attempts", attempt,
"duration", time.Since(startTime))
}
return nil
} else {
lastErr = err
}
// Check if we should stop retrying
select {
case <-deadlineCtx.Done():
db.Logger.Error("shutdown sync failed after timeout",
"attempts", attempt,
"duration", time.Since(startTime),
"lastError", lastErr)
return fmt.Errorf("shutdown sync timeout after %d attempts: %w", attempt, lastErr)
default:
}
// Log retry
db.Logger.Warn("shutdown sync failed, retrying",
"attempt", attempt,
"error", lastErr,
"elapsed", time.Since(startTime),
"remaining", time.Until(startTime.Add(timeout)))
// Wait before retry
select {
case <-time.After(interval):
case <-deadlineCtx.Done():
return fmt.Errorf("shutdown sync timeout after %d attempts: %w", attempt, lastErr)
}
}
}
// setPersistWAL sets the PERSIST_WAL file control on the database connection.
// This prevents SQLite from removing the WAL file when connections close.
func (db *DB) setPersistWAL(ctx context.Context) error {

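For programs embedding litestream as a library rather than running the CLI, the same knobs are plain fields on DB. A minimal sketch using only identifiers that appear in this diff (NewDB, ShutdownSyncTimeout, ShutdownSyncInterval, Close); the path, durations, and elided replica setup are illustrative assumptions:

package main

import (
	"context"
	"log"
	"time"

	"github.com/benbjohnson/litestream"
)

func main() {
	db := litestream.NewDB("/path/to/db.sqlite") // placeholder path
	// Retry the final replica sync for up to 10s on shutdown, pausing
	// 250ms between attempts (package defaults are 30s / 500ms).
	db.ShutdownSyncTimeout = 10 * time.Second
	db.ShutdownSyncInterval = 250 * time.Millisecond

	// ... configure db.Replica and open the database as done elsewhere in this repo ...

	// Close performs the final sync; with a non-zero timeout it retries
	// transient failures (e.g. 429s) until the deadline or context cancellation.
	if err := db.Close(context.Background()); err != nil {
		log.Fatal(err)
	}
}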
db_shutdown_test.go

@@ -0,0 +1,239 @@
package litestream_test
import (
"context"
"errors"
"io"
"strings"
"sync/atomic"
"testing"
"time"
"github.com/superfly/ltx"
"github.com/benbjohnson/litestream"
"github.com/benbjohnson/litestream/internal/testingutil"
"github.com/benbjohnson/litestream/mock"
)
func TestDB_Close_SyncRetry(t *testing.T) {
t.Run("SucceedsAfterTransientFailure", func(t *testing.T) {
db, sqldb := testingutil.MustOpenDBs(t)
// Write some data to create LTX files
if _, err := sqldb.Exec(`CREATE TABLE t (x)`); err != nil {
t.Fatal(err)
}
if err := db.Sync(context.Background()); err != nil {
t.Fatal(err)
}
if err := sqldb.Close(); err != nil {
t.Fatal(err)
}
// Create mock client that fails first 2 times, succeeds on 3rd
var attempts int32
client := &mock.ReplicaClient{
LTXFilesFunc: func(_ context.Context, _ int, _ ltx.TXID, _ bool) (ltx.FileIterator, error) {
return ltx.NewFileInfoSliceIterator(nil), nil
},
WriteLTXFileFunc: func(_ context.Context, _ int, _, _ ltx.TXID, r io.Reader) (*ltx.FileInfo, error) {
n := atomic.AddInt32(&attempts, 1)
if n < 3 {
return nil, errors.New("rate limited (429)")
}
// Drain the reader
_, _ = io.Copy(io.Discard, r)
return &ltx.FileInfo{}, nil
},
}
db.Replica = litestream.NewReplicaWithClient(db, client)
db.ShutdownSyncTimeout = 5 * time.Second
db.ShutdownSyncInterval = 50 * time.Millisecond
// Close should succeed after retries
if err := db.Close(context.Background()); err != nil {
t.Fatalf("expected success after retries, got: %v", err)
}
if got := atomic.LoadInt32(&attempts); got < 3 {
t.Fatalf("expected at least 3 attempts, got %d", got)
}
})
t.Run("FailsAfterTimeout", func(t *testing.T) {
db, sqldb := testingutil.MustOpenDBs(t)
// Write some data
if _, err := sqldb.Exec(`CREATE TABLE t (x)`); err != nil {
t.Fatal(err)
}
if err := db.Sync(context.Background()); err != nil {
t.Fatal(err)
}
if err := sqldb.Close(); err != nil {
t.Fatal(err)
}
// Create mock client that always fails
var attempts int32
client := &mock.ReplicaClient{
LTXFilesFunc: func(_ context.Context, _ int, _ ltx.TXID, _ bool) (ltx.FileIterator, error) {
return ltx.NewFileInfoSliceIterator(nil), nil
},
WriteLTXFileFunc: func(_ context.Context, _ int, _, _ ltx.TXID, _ io.Reader) (*ltx.FileInfo, error) {
atomic.AddInt32(&attempts, 1)
return nil, errors.New("persistent error")
},
}
db.Replica = litestream.NewReplicaWithClient(db, client)
db.ShutdownSyncTimeout = 300 * time.Millisecond
db.ShutdownSyncInterval = 50 * time.Millisecond
// Close should fail with timeout error
err := db.Close(context.Background())
if err == nil {
t.Fatal("expected error after timeout")
}
if !strings.Contains(err.Error(), "timeout") {
t.Fatalf("expected timeout error, got: %v", err)
}
// Should have made multiple attempts
if got := atomic.LoadInt32(&attempts); got < 2 {
t.Fatalf("expected multiple retry attempts, got %d", got)
}
})
t.Run("RespectsContextCancellation", func(t *testing.T) {
db, sqldb := testingutil.MustOpenDBs(t)
// Write some data
if _, err := sqldb.Exec(`CREATE TABLE t (x)`); err != nil {
t.Fatal(err)
}
if err := db.Sync(context.Background()); err != nil {
t.Fatal(err)
}
if err := sqldb.Close(); err != nil {
t.Fatal(err)
}
// Create mock client that always fails
client := &mock.ReplicaClient{
LTXFilesFunc: func(_ context.Context, _ int, _ ltx.TXID, _ bool) (ltx.FileIterator, error) {
return ltx.NewFileInfoSliceIterator(nil), nil
},
WriteLTXFileFunc: func(_ context.Context, _ int, _, _ ltx.TXID, _ io.Reader) (*ltx.FileInfo, error) {
return nil, errors.New("error")
},
}
db.Replica = litestream.NewReplicaWithClient(db, client)
db.ShutdownSyncTimeout = 10 * time.Second
db.ShutdownSyncInterval = 50 * time.Millisecond
// Cancel context after short delay
ctx, cancel := context.WithTimeout(context.Background(), 150*time.Millisecond)
defer cancel()
start := time.Now()
_ = db.Close(ctx)
elapsed := time.Since(start)
// Should exit within reasonable time of context cancellation
if elapsed > 500*time.Millisecond {
t.Fatalf("took too long to respect cancellation: %v", elapsed)
}
})
t.Run("ZeroTimeoutNoRetry", func(t *testing.T) {
db, sqldb := testingutil.MustOpenDBs(t)
// Write some data
if _, err := sqldb.Exec(`CREATE TABLE t (x)`); err != nil {
t.Fatal(err)
}
if err := db.Sync(context.Background()); err != nil {
t.Fatal(err)
}
if err := sqldb.Close(); err != nil {
t.Fatal(err)
}
// Create mock client that always fails
var attempts int32
client := &mock.ReplicaClient{
LTXFilesFunc: func(_ context.Context, _ int, _ ltx.TXID, _ bool) (ltx.FileIterator, error) {
return ltx.NewFileInfoSliceIterator(nil), nil
},
WriteLTXFileFunc: func(_ context.Context, _ int, _, _ ltx.TXID, _ io.Reader) (*ltx.FileInfo, error) {
atomic.AddInt32(&attempts, 1)
return nil, errors.New("error")
},
}
db.Replica = litestream.NewReplicaWithClient(db, client)
db.ShutdownSyncTimeout = 0 // Disable retries
db.ShutdownSyncInterval = 50 * time.Millisecond
// Close should fail after single attempt
start := time.Now()
err := db.Close(context.Background())
elapsed := time.Since(start)
if err == nil {
t.Fatal("expected error")
}
// Should have only made 1 attempt
if got := atomic.LoadInt32(&attempts); got != 1 {
t.Fatalf("expected exactly 1 attempt with zero timeout, got %d", got)
}
// Should be fast (no retry delay)
if elapsed > 100*time.Millisecond {
t.Fatalf("took too long for single attempt: %v", elapsed)
}
})
t.Run("SuccessFirstAttempt", func(t *testing.T) {
db, sqldb := testingutil.MustOpenDBs(t)
// Write some data
if _, err := sqldb.Exec(`CREATE TABLE t (x)`); err != nil {
t.Fatal(err)
}
if err := db.Sync(context.Background()); err != nil {
t.Fatal(err)
}
if err := sqldb.Close(); err != nil {
t.Fatal(err)
}
// Create mock client that succeeds immediately
var attempts int32
client := &mock.ReplicaClient{
LTXFilesFunc: func(_ context.Context, _ int, _ ltx.TXID, _ bool) (ltx.FileIterator, error) {
return ltx.NewFileInfoSliceIterator(nil), nil
},
WriteLTXFileFunc: func(_ context.Context, _ int, _, _ ltx.TXID, r io.Reader) (*ltx.FileInfo, error) {
atomic.AddInt32(&attempts, 1)
// Drain the reader
_, _ = io.Copy(io.Discard, r)
return &ltx.FileInfo{}, nil
},
}
db.Replica = litestream.NewReplicaWithClient(db, client)
db.ShutdownSyncTimeout = 5 * time.Second
db.ShutdownSyncInterval = 50 * time.Millisecond
// Close should succeed immediately
if err := db.Close(context.Background()); err != nil {
t.Fatalf("expected success, got: %v", err)
}
// Should have made exactly 1 attempt
if got := atomic.LoadInt32(&attempts); got != 1 {
t.Fatalf("expected exactly 1 attempt, got %d", got)
}
})
}


@@ -174,6 +174,7 @@ func MustOpenDBAt(tb testing.TB, path string) *litestream.DB {
tb.Helper()
db := NewDB(tb, path)
db.MonitorInterval = 0 // disable background goroutine
db.ShutdownSyncTimeout = 0 // disable shutdown sync retry for faster tests
db.Replica = litestream.NewReplica(db)
db.Replica.Client = NewFileReplicaClient(tb)
db.Replica.MonitorEnabled = false // disable background goroutine


@@ -71,6 +71,10 @@ type Store struct {
// If true, compaction is run in the background according to compaction levels.
CompactionMonitorEnabled bool
// Shutdown sync retry settings.
ShutdownSyncTimeout time.Duration
ShutdownSyncInterval time.Duration
}
func NewStore(dbs []*DB, levels CompactionLevels) *Store {
@@ -83,10 +87,14 @@ func NewStore(dbs []*DB, levels CompactionLevels) *Store {
L0Retention: DefaultL0Retention,
L0RetentionCheckInterval: DefaultL0RetentionCheckInterval,
CompactionMonitorEnabled: true,
ShutdownSyncTimeout: DefaultShutdownSyncTimeout,
ShutdownSyncInterval: DefaultShutdownSyncInterval,
}
for _, db := range dbs {
db.L0Retention = s.L0Retention
db.ShutdownSyncTimeout = s.ShutdownSyncTimeout
db.ShutdownSyncInterval = s.ShutdownSyncInterval
}
s.ctx, s.cancel = context.WithCancel(context.Background())
return s
@@ -178,8 +186,10 @@ func (s *Store) AddDB(db *DB) error {
}
s.mu.Unlock()
// Apply store-wide retention settings before opening the database.
// Apply store-wide settings before opening the database.
db.L0Retention = s.L0Retention
db.ShutdownSyncTimeout = s.ShutdownSyncTimeout
db.ShutdownSyncInterval = s.ShutdownSyncInterval
// Open the database without holding the lock to avoid blocking other operations.
// The double-check pattern below handles the race condition.
@@ -252,6 +262,28 @@ func (s *Store) SetL0Retention(d time.Duration) {
}
}
// SetShutdownSyncTimeout updates the shutdown sync timeout and propagates it to
// all managed databases.
func (s *Store) SetShutdownSyncTimeout(d time.Duration) {
s.mu.Lock()
defer s.mu.Unlock()
s.ShutdownSyncTimeout = d
for _, db := range s.dbs {
db.ShutdownSyncTimeout = d
}
}
// SetShutdownSyncInterval updates the shutdown sync interval and propagates it to
// all managed databases.
func (s *Store) SetShutdownSyncInterval(d time.Duration) {
s.mu.Lock()
defer s.mu.Unlock()
s.ShutdownSyncInterval = d
for _, db := range s.dbs {
db.ShutdownSyncInterval = d
}
}
// SnapshotLevel returns a pseudo compaction level based on snapshot settings.
func (s *Store) SnapshotLevel() *CompactionLevel {
return &CompactionLevel{

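The setters exist so that the replicate command (earlier in this diff) can apply config overrides after the Store has been constructed and still reach every managed database, including ones added later through AddDB. A small usage sketch; dbs, levels, and ctx are assumed to come from the surrounding program, imports are elided, and the durations are illustrative:

func openStore(ctx context.Context, dbs []*litestream.DB, levels litestream.CompactionLevels) (*litestream.Store, error) {
	store := litestream.NewStore(dbs, levels)
	// Propagate the shutdown retry settings to every managed DB, mirroring
	// how ReplicateCommand applies the YAML overrides above.
	store.SetShutdownSyncTimeout(10 * time.Second)
	store.SetShutdownSyncInterval(250 * time.Millisecond)
	if err := store.Open(ctx); err != nil {
		return nil, fmt.Errorf("cannot open store: %w", err)
	}
	return store, nil
}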

@@ -0,0 +1,329 @@
//go:build integration && docker
package integration
import (
"bytes"
"context"
"database/sql"
"fmt"
"io"
"net"
"net/http"
"net/http/httputil"
"net/url"
"os"
"os/exec"
"path/filepath"
"strings"
"sync"
"sync/atomic"
"syscall"
"testing"
"time"
_ "github.com/mattn/go-sqlite3"
)
// TestShutdownSyncRetry_429Errors tests that Litestream retries syncing LTX files
// during shutdown when receiving 429 (Too Many Requests) errors.
//
// This test:
// 1. Starts a MinIO container
// 2. Starts a rate-limiting proxy in front of MinIO that returns 429 for first N PUT requests
// 3. Starts Litestream replicating to the proxy endpoint
// 4. Writes data and syncs
// 5. Sends SIGTERM to trigger graceful shutdown
// 6. Verifies that Litestream retries and eventually succeeds despite 429 errors
//
// Requirements:
// - Docker must be running
// - Litestream binary must be built at ../../bin/litestream
func TestShutdownSyncRetry_429Errors(t *testing.T) {
RequireBinaries(t)
RequireDocker(t)
t.Log("================================================")
t.Log("Litestream Shutdown Sync Retry Test (429 Errors)")
t.Log("================================================")
t.Log("")
// Start MinIO container
t.Log("Starting MinIO container...")
containerName, minioEndpoint := StartMinioTestContainer(t)
defer StopMinioTestContainer(t, containerName)
t.Logf("✓ MinIO running at: %s", minioEndpoint)
// Create MinIO bucket by creating directory in /data (MinIO stores buckets as directories)
bucket := "litestream-test"
t.Logf("Creating bucket '%s'...", bucket)
// Wait for MinIO to be ready
time.Sleep(2 * time.Second)
// Create bucket directory directly - MinIO uses /data as the storage root
createBucketCmd := exec.Command("docker", "exec", containerName,
"mkdir", "-p", "/data/"+bucket)
if out, err := createBucketCmd.CombinedOutput(); err != nil {
t.Fatalf("Failed to create bucket directory: %v, output: %s", err, string(out))
}
t.Log("✓ Bucket created")
t.Log("")
// Start rate-limiting proxy
t.Log("Starting rate-limiting proxy...")
proxy := newRateLimitingProxy(t, minioEndpoint, 3) // Return 429 for first 3 PUT requests
proxyServer := &http.Server{
Addr: "127.0.0.1:0",
Handler: proxy,
}
listener, err := (&net.ListenConfig{}).Listen(context.Background(), "tcp", "127.0.0.1:0")
if err != nil {
t.Fatalf("Failed to create listener: %v", err)
}
proxyAddr := listener.Addr().String()
go func() {
if err := proxyServer.Serve(listener); err != nil && err != http.ErrServerClosed {
t.Logf("Proxy server error: %v", err)
}
}()
defer proxyServer.Close()
proxyEndpoint := fmt.Sprintf("http://%s", proxyAddr)
t.Logf("✓ Rate-limiting proxy running at: %s", proxyEndpoint)
t.Logf(" (Will return 429 for first 3 PUT requests during shutdown)")
t.Log("")
// Setup test database
tempDir := t.TempDir()
dbPath := filepath.Join(tempDir, "test.db")
configPath := filepath.Join(tempDir, "litestream.yml")
// Create database with some data
t.Log("Creating test database...")
sqlDB, err := sql.Open("sqlite3", dbPath)
if err != nil {
t.Fatalf("Failed to open database: %v", err)
}
if _, err := sqlDB.Exec("PRAGMA journal_mode=WAL"); err != nil {
t.Fatalf("Failed to set WAL mode: %v", err)
}
if _, err := sqlDB.Exec("CREATE TABLE test (id INTEGER PRIMARY KEY, data TEXT)"); err != nil {
t.Fatalf("Failed to create table: %v", err)
}
if _, err := sqlDB.Exec("INSERT INTO test (data) VALUES ('initial data')"); err != nil {
t.Fatalf("Failed to insert data: %v", err)
}
sqlDB.Close()
t.Log("✓ Database created with initial data")
t.Log("")
// Create Litestream config with shutdown retry settings
s3Path := fmt.Sprintf("test-%d", time.Now().Unix())
config := fmt.Sprintf(`
shutdown-sync-timeout: 10s
shutdown-sync-interval: 500ms
dbs:
- path: %s
replica:
type: s3
bucket: %s
path: %s
endpoint: %s
access-key-id: minioadmin
secret-access-key: minioadmin
region: us-east-1
force-path-style: true
skip-verify: true
sync-interval: 1s
`, dbPath, bucket, s3Path, proxyEndpoint)
if err := os.WriteFile(configPath, []byte(config), 0644); err != nil {
t.Fatalf("Failed to write config: %v", err)
}
t.Logf("✓ Config written to: %s", configPath)
t.Log("")
// Start Litestream
t.Log("Starting Litestream...")
litestreamBin := filepath.Join("..", "..", "bin", "litestream")
cmd := exec.Command(litestreamBin, "replicate", "-config", configPath)
var stdout, stderr bytes.Buffer
cmd.Stdout = io.MultiWriter(os.Stdout, &stdout)
cmd.Stderr = io.MultiWriter(os.Stderr, &stderr)
if err := cmd.Start(); err != nil {
t.Fatalf("Failed to start Litestream: %v", err)
}
t.Logf("✓ Litestream started (PID: %d)", cmd.Process.Pid)
// Wait for initial sync
t.Log("Waiting for initial sync...")
time.Sleep(3 * time.Second)
// Write more data to ensure we have pending LTX files
t.Log("Writing additional data...")
sqlDB, err = sql.Open("sqlite3", dbPath)
if err != nil {
t.Fatalf("Failed to reopen database: %v", err)
}
for i := 0; i < 5; i++ {
if _, err := sqlDB.Exec("INSERT INTO test (data) VALUES (?)", fmt.Sprintf("data-%d", i)); err != nil {
t.Fatalf("Failed to insert data: %v", err)
}
}
sqlDB.Close()
t.Log("✓ Additional data written")
// Wait a bit for sync to pick up the changes
time.Sleep(2 * time.Second)
// Reset proxy counter so 429s happen during shutdown
proxy.Reset()
t.Log("")
t.Log("Sending SIGTERM to trigger graceful shutdown...")
t.Log("(Proxy will return 429 for first 3 PUT requests)")
// Send SIGTERM
if err := cmd.Process.Signal(syscall.SIGTERM); err != nil {
t.Fatalf("Failed to send SIGTERM: %v", err)
}
// Wait for process to exit with timeout
done := make(chan error, 1)
go func() {
done <- cmd.Wait()
}()
select {
case err := <-done:
if err != nil {
// Check if it's just a signal exit (expected)
if exitErr, ok := err.(*exec.ExitError); ok {
t.Logf("Litestream exited with: %v", exitErr)
} else {
t.Fatalf("Litestream failed: %v", err)
}
}
case <-time.After(30 * time.Second):
cmd.Process.Kill()
t.Fatal("Litestream did not exit within 30 seconds")
}
t.Log("")
t.Log("================================================")
t.Log("Results")
t.Log("================================================")
// Check proxy statistics
stats := proxy.Stats()
t.Logf("Proxy statistics:")
t.Logf(" Total requests: %d", stats.TotalRequests)
t.Logf(" 429 responses sent: %d", stats.RateLimited)
t.Logf(" Forwarded requests: %d", stats.Forwarded)
// Verify that we saw 429s and retries succeeded
output := stdout.String() + stderr.String()
if !strings.Contains(output, "shutdown sync failed, retrying") {
t.Log("")
t.Log("WARNING: Did not see retry messages in output.")
t.Log("This could mean:")
t.Log(" 1. No pending LTX files during shutdown")
t.Log(" 2. Sync completed before shutdown signal")
t.Log(" 3. Retry logic not triggered")
} else {
t.Log("")
t.Log("✓ Saw retry messages - shutdown sync retry is working!")
}
if strings.Contains(output, "shutdown sync succeeded after retry") {
t.Log("✓ Shutdown sync succeeded after retrying!")
}
if stats.RateLimited > 0 {
t.Logf("✓ Proxy returned %d 429 responses as expected", stats.RateLimited)
}
t.Log("")
t.Log("Test completed successfully!")
}
// rateLimitingProxy is an HTTP proxy that returns 429 for the first N PUT requests
type rateLimitingProxy struct {
target *url.URL
proxy *httputil.ReverseProxy
mu sync.Mutex
putCount int32
limit int32
totalReqs int64
rateLimited int64
forwarded int64
t *testing.T
}
type proxyStats struct {
TotalRequests int64
RateLimited int64
Forwarded int64
}
func newRateLimitingProxy(t *testing.T, targetURL string, limit int) *rateLimitingProxy {
target, err := url.Parse(targetURL)
if err != nil {
t.Fatalf("Failed to parse target URL: %v", err)
}
p := &rateLimitingProxy{
target: target,
limit: int32(limit),
t: t,
}
p.proxy = &httputil.ReverseProxy{
Director: func(req *http.Request) {
req.URL.Scheme = target.Scheme
req.URL.Host = target.Host
// Don't modify Host header - it's part of the AWS signature
},
}
return p
}
func (p *rateLimitingProxy) ServeHTTP(w http.ResponseWriter, r *http.Request) {
atomic.AddInt64(&p.totalReqs, 1)
// Only rate limit PUT requests (uploads)
if r.Method == "PUT" {
count := atomic.AddInt32(&p.putCount, 1)
if count <= p.limit {
atomic.AddInt64(&p.rateLimited, 1)
p.t.Logf("PROXY: Returning 429 for PUT request #%d (limit: %d)", count, p.limit)
w.Header().Set("Retry-After", "1")
w.WriteHeader(http.StatusTooManyRequests)
w.Write([]byte("Rate limit exceeded"))
return
}
}
atomic.AddInt64(&p.forwarded, 1)
p.proxy.ServeHTTP(w, r)
}
func (p *rateLimitingProxy) Reset() {
atomic.StoreInt32(&p.putCount, 0)
}
func (p *rateLimitingProxy) Stats() proxyStats {
return proxyStats{
TotalRequests: atomic.LoadInt64(&p.totalReqs),
RateLimited: atomic.LoadInt64(&p.rateLimited),
Forwarded: atomic.LoadInt64(&p.forwarded),
}
}