mirror of https://github.com/benbjohnson/litestream.git
synced 2026-01-25 05:06:30 +00:00
feat(db): add shutdown sync retry for rate limiting resilience (#904)
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
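
For orientation before the diff: the commit adds two top-level keys to the Litestream config file, validated in `Config.Validate` and wired into the `Store` in the hunks below. A minimal sketch of how they might look in a `litestream.yml` (the database path and bucket are hypothetical; 30s and 500ms mirror the defaults added in db.go):

```yaml
# Sketch only, assuming an existing S3 replica setup.
# A zero shutdown-sync-timeout disables retries (one sync attempt on shutdown).
shutdown-sync-timeout: 30s     # total retry budget for the final shutdown sync
shutdown-sync-interval: 500ms  # pause between retry attempts

dbs:
  - path: /var/lib/app/app.db  # hypothetical database path
    replica:
      type: s3
      bucket: my-bucket        # hypothetical bucket
      path: app
```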
@@ -49,6 +49,8 @@ var (
 	ErrInvalidSyncInterval             = errors.New("sync interval must be greater than 0")
 	ErrInvalidL0Retention              = errors.New("l0 retention must be greater than 0")
 	ErrInvalidL0RetentionCheckInterval = errors.New("l0 retention check interval must be greater than 0")
+	ErrInvalidShutdownSyncTimeout      = errors.New("shutdown-sync-timeout must be >= 0")
+	ErrInvalidShutdownSyncInterval     = errors.New("shutdown sync interval must be greater than 0")
 	ErrConfigFileNotFound              = errors.New("config file not found")
 )
 
@@ -219,6 +221,10 @@ type Config struct {
 	// MCP server options
 	MCPAddr string `yaml:"mcp-addr"`
+
+	// Shutdown sync retry settings
+	ShutdownSyncTimeout  *time.Duration `yaml:"shutdown-sync-timeout"`
+	ShutdownSyncInterval *time.Duration `yaml:"shutdown-sync-interval"`
 
 	// Path to the config file
 	// This is only used internally to pass the config path to the MCP tool
 	ConfigPath string `yaml:"-"`
@@ -256,6 +262,8 @@ func DefaultConfig() Config {
 	defaultSnapshotRetention := 24 * time.Hour
 	defaultL0Retention := litestream.DefaultL0Retention
 	defaultL0RetentionCheckInterval := litestream.DefaultL0RetentionCheckInterval
+	defaultShutdownSyncTimeout := litestream.DefaultShutdownSyncTimeout
+	defaultShutdownSyncInterval := litestream.DefaultShutdownSyncInterval
 	return Config{
 		Levels: []*CompactionLevelConfig{
 			{Interval: 30 * time.Second},
@@ -268,6 +276,8 @@ func DefaultConfig() Config {
 		},
 		L0Retention:              &defaultL0Retention,
 		L0RetentionCheckInterval: &defaultL0RetentionCheckInterval,
+		ShutdownSyncTimeout:      &defaultShutdownSyncTimeout,
+		ShutdownSyncInterval:     &defaultShutdownSyncInterval,
 	}
 }
 
@@ -302,6 +312,20 @@ func (c *Config) Validate() error {
 			Value: *c.L0RetentionCheckInterval,
 		}
 	}
+	if c.ShutdownSyncTimeout != nil && *c.ShutdownSyncTimeout < 0 {
+		return &ConfigValidationError{
+			Err:   ErrInvalidShutdownSyncTimeout,
+			Field: "shutdown-sync-timeout",
+			Value: *c.ShutdownSyncTimeout,
+		}
+	}
+	if c.ShutdownSyncInterval != nil && *c.ShutdownSyncInterval <= 0 {
+		return &ConfigValidationError{
+			Err:   ErrInvalidShutdownSyncInterval,
+			Field: "shutdown-sync-interval",
+			Value: *c.ShutdownSyncInterval,
+		}
+	}
 
 	// Validate compaction level intervals
 	for i, level := range c.Levels {
@@ -175,6 +175,12 @@ func (c *ReplicateCommand) Run(ctx context.Context) (err error) {
 	if c.Config.L0RetentionCheckInterval != nil {
 		c.Store.L0RetentionCheckInterval = *c.Config.L0RetentionCheckInterval
 	}
+	if c.Config.ShutdownSyncTimeout != nil {
+		c.Store.SetShutdownSyncTimeout(*c.Config.ShutdownSyncTimeout)
+	}
+	if c.Config.ShutdownSyncInterval != nil {
+		c.Store.SetShutdownSyncInterval(*c.Config.ShutdownSyncInterval)
+	}
 	if err := c.Store.Open(ctx); err != nil {
 		return fmt.Errorf("cannot open store: %w", err)
 	}
db.go: 83 changed lines
@@ -33,6 +33,8 @@ const (
 	DefaultBusyTimeout        = 1 * time.Second
 	DefaultMinCheckpointPageN = 1000
 	DefaultTruncatePageN      = 121359 // ~500MB with 4KB page size
+	DefaultShutdownSyncTimeout  = 30 * time.Second
+	DefaultShutdownSyncInterval = 500 * time.Millisecond
 )
 
 // DB represents a managed instance of a SQLite database in the file system.
@@ -136,6 +138,12 @@ type DB struct {
 	// Must be set before calling Open().
 	Replica *Replica
 
+	// Shutdown sync retry settings.
+	// ShutdownSyncTimeout is the total time to retry syncing on shutdown.
+	// ShutdownSyncInterval is the time between retry attempts.
+	ShutdownSyncTimeout  time.Duration
+	ShutdownSyncInterval time.Duration
+
 	// Where to send log messages, defaults to global slog with database path.
 	Logger *slog.Logger
 }
@@ -155,6 +163,8 @@ func NewDB(path string) *DB {
 		MonitorInterval:      DefaultMonitorInterval,
 		BusyTimeout:          DefaultBusyTimeout,
 		L0Retention:          DefaultL0Retention,
+		ShutdownSyncTimeout:  DefaultShutdownSyncTimeout,
+		ShutdownSyncInterval: DefaultShutdownSyncInterval,
 		Logger:               slog.With("db", filepath.Base(path)),
 	}
 	db.maxLTXFileInfos.m = make(map[int]*ltx.FileInfo)
@@ -322,7 +332,7 @@ func (db *DB) Close(ctx context.Context) (err error) {
 	// Ensure replicas perform a final sync and stop replicating.
 	if db.Replica != nil {
 		if db.db != nil {
-			if e := db.Replica.Sync(ctx); e != nil && err == nil {
+			if e := db.syncReplicaWithRetry(ctx); e != nil && err == nil {
 				err = e
 			}
 		}
@@ -351,6 +361,77 @@ func (db *DB) Close(ctx context.Context) (err error) {
 	return err
 }
 
+// syncReplicaWithRetry attempts to sync the replica with retry logic for shutdown.
+// It retries until success, timeout, or context cancellation.
+// If ShutdownSyncTimeout is 0, it performs a single sync attempt without retries.
+func (db *DB) syncReplicaWithRetry(ctx context.Context) error {
+	if db.Replica == nil {
+		return nil
+	}
+
+	timeout := db.ShutdownSyncTimeout
+	interval := db.ShutdownSyncInterval
+
+	// If timeout is zero, just try once (no retry)
+	if timeout == 0 {
+		return db.Replica.Sync(ctx)
+	}
+
+	// Use default interval if not set
+	if interval == 0 {
+		interval = DefaultShutdownSyncInterval
+	}
+
+	// Create deadline context for total retry duration
+	deadlineCtx, cancel := context.WithTimeout(ctx, timeout)
+	defer cancel()
+
+	var lastErr error
+	attempt := 0
+	startTime := time.Now()
+
+	for {
+		attempt++
+
+		// Try sync
+		if err := db.Replica.Sync(deadlineCtx); err == nil {
+			if attempt > 1 {
+				db.Logger.Info("shutdown sync succeeded after retry",
+					"attempts", attempt,
+					"duration", time.Since(startTime))
+			}
+			return nil
+		} else {
+			lastErr = err
+		}
+
+		// Check if we should stop retrying
+		select {
+		case <-deadlineCtx.Done():
+			db.Logger.Error("shutdown sync failed after timeout",
+				"attempts", attempt,
+				"duration", time.Since(startTime),
+				"lastError", lastErr)
+			return fmt.Errorf("shutdown sync timeout after %d attempts: %w", attempt, lastErr)
+		default:
+		}
+
+		// Log retry
+		db.Logger.Warn("shutdown sync failed, retrying",
+			"attempt", attempt,
+			"error", lastErr,
+			"elapsed", time.Since(startTime),
+			"remaining", time.Until(startTime.Add(timeout)))
+
+		// Wait before retry
+		select {
+		case <-time.After(interval):
+		case <-deadlineCtx.Done():
+			return fmt.Errorf("shutdown sync timeout after %d attempts: %w", attempt, lastErr)
+		}
+	}
+}
+
 // setPersistWAL sets the PERSIST_WAL file control on the database connection.
 // This prevents SQLite from removing the WAL file when connections close.
 func (db *DB) setPersistWAL(ctx context.Context) error {
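
A note on the retry loop above, since the shape recurs: after each failed attempt it first polls the deadline context with a non-blocking `select`, then blocks on whichever of the retry interval or the deadline fires first, so cancellation is honored both between and during waits. A stripped-down sketch of that pattern with generic names (not part of the commit):

```go
package example

import (
	"context"
	"time"
)

// retryUntilDeadline calls op every interval until it succeeds or the
// timeout elapses. It mirrors the structure of syncReplicaWithRetry above:
// a non-blocking deadline check after each attempt, then a blocking wait
// that races the interval timer against the deadline.
func retryUntilDeadline(ctx context.Context, timeout, interval time.Duration, op func(context.Context) error) error {
	deadlineCtx, cancel := context.WithTimeout(ctx, timeout)
	defer cancel()

	var lastErr error
	for {
		if lastErr = op(deadlineCtx); lastErr == nil {
			return nil
		}
		select {
		case <-deadlineCtx.Done(): // budget exhausted or caller canceled
			return lastErr
		default:
		}
		select {
		case <-time.After(interval): // wait before the next attempt
		case <-deadlineCtx.Done():
			return lastErr
		}
	}
}
```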
db_shutdown_test.go: 239 lines (new file)
@@ -0,0 +1,239 @@
+package litestream_test
+
+import (
+	"context"
+	"errors"
+	"io"
+	"strings"
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"github.com/superfly/ltx"
+
+	"github.com/benbjohnson/litestream"
+	"github.com/benbjohnson/litestream/internal/testingutil"
+	"github.com/benbjohnson/litestream/mock"
+)
+
+func TestDB_Close_SyncRetry(t *testing.T) {
+	t.Run("SucceedsAfterTransientFailure", func(t *testing.T) {
+		db, sqldb := testingutil.MustOpenDBs(t)
+
+		// Write some data to create LTX files
+		if _, err := sqldb.Exec(`CREATE TABLE t (x)`); err != nil {
+			t.Fatal(err)
+		}
+		if err := db.Sync(context.Background()); err != nil {
+			t.Fatal(err)
+		}
+		if err := sqldb.Close(); err != nil {
+			t.Fatal(err)
+		}
+
+		// Create mock client that fails first 2 times, succeeds on 3rd
+		var attempts int32
+		client := &mock.ReplicaClient{
+			LTXFilesFunc: func(_ context.Context, _ int, _ ltx.TXID, _ bool) (ltx.FileIterator, error) {
+				return ltx.NewFileInfoSliceIterator(nil), nil
+			},
+			WriteLTXFileFunc: func(_ context.Context, _ int, _, _ ltx.TXID, r io.Reader) (*ltx.FileInfo, error) {
+				n := atomic.AddInt32(&attempts, 1)
+				if n < 3 {
+					return nil, errors.New("rate limited (429)")
+				}
+				// Drain the reader
+				_, _ = io.Copy(io.Discard, r)
+				return &ltx.FileInfo{}, nil
+			},
+		}
+
+		db.Replica = litestream.NewReplicaWithClient(db, client)
+		db.ShutdownSyncTimeout = 5 * time.Second
+		db.ShutdownSyncInterval = 50 * time.Millisecond
+
+		// Close should succeed after retries
+		if err := db.Close(context.Background()); err != nil {
+			t.Fatalf("expected success after retries, got: %v", err)
+		}
+		if got := atomic.LoadInt32(&attempts); got < 3 {
+			t.Fatalf("expected at least 3 attempts, got %d", got)
+		}
+	})
+
t.Run("FailsAfterTimeout", func(t *testing.T) {
|
||||
db, sqldb := testingutil.MustOpenDBs(t)
|
||||
|
||||
// Write some data
|
||||
if _, err := sqldb.Exec(`CREATE TABLE t (x)`); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := db.Sync(context.Background()); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := sqldb.Close(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// Create mock client that always fails
|
||||
var attempts int32
|
||||
client := &mock.ReplicaClient{
|
||||
LTXFilesFunc: func(_ context.Context, _ int, _ ltx.TXID, _ bool) (ltx.FileIterator, error) {
|
||||
return ltx.NewFileInfoSliceIterator(nil), nil
|
||||
},
|
||||
WriteLTXFileFunc: func(_ context.Context, _ int, _, _ ltx.TXID, _ io.Reader) (*ltx.FileInfo, error) {
|
||||
atomic.AddInt32(&attempts, 1)
|
||||
return nil, errors.New("persistent error")
|
||||
},
|
||||
}
|
||||
|
||||
db.Replica = litestream.NewReplicaWithClient(db, client)
|
||||
db.ShutdownSyncTimeout = 300 * time.Millisecond
|
||||
db.ShutdownSyncInterval = 50 * time.Millisecond
|
||||
|
||||
// Close should fail with timeout error
|
||||
err := db.Close(context.Background())
|
||||
if err == nil {
|
||||
t.Fatal("expected error after timeout")
|
||||
}
|
||||
if !strings.Contains(err.Error(), "timeout") {
|
||||
t.Fatalf("expected timeout error, got: %v", err)
|
||||
}
|
||||
// Should have made multiple attempts
|
||||
if got := atomic.LoadInt32(&attempts); got < 2 {
|
||||
t.Fatalf("expected multiple retry attempts, got %d", got)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("RespectsContextCancellation", func(t *testing.T) {
|
||||
db, sqldb := testingutil.MustOpenDBs(t)
|
||||
|
||||
// Write some data
|
||||
if _, err := sqldb.Exec(`CREATE TABLE t (x)`); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := db.Sync(context.Background()); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := sqldb.Close(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// Create mock client that always fails
|
||||
client := &mock.ReplicaClient{
|
||||
LTXFilesFunc: func(_ context.Context, _ int, _ ltx.TXID, _ bool) (ltx.FileIterator, error) {
|
||||
return ltx.NewFileInfoSliceIterator(nil), nil
|
||||
},
|
||||
WriteLTXFileFunc: func(_ context.Context, _ int, _, _ ltx.TXID, _ io.Reader) (*ltx.FileInfo, error) {
|
||||
return nil, errors.New("error")
|
||||
},
|
||||
}
|
||||
|
||||
db.Replica = litestream.NewReplicaWithClient(db, client)
|
||||
db.ShutdownSyncTimeout = 10 * time.Second
|
||||
db.ShutdownSyncInterval = 50 * time.Millisecond
|
||||
|
||||
// Cancel context after short delay
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 150*time.Millisecond)
|
||||
defer cancel()
|
||||
|
||||
start := time.Now()
|
||||
_ = db.Close(ctx)
|
||||
elapsed := time.Since(start)
|
||||
|
||||
// Should exit within reasonable time of context cancellation
|
||||
if elapsed > 500*time.Millisecond {
|
||||
t.Fatalf("took too long to respect cancellation: %v", elapsed)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("ZeroTimeoutNoRetry", func(t *testing.T) {
|
||||
db, sqldb := testingutil.MustOpenDBs(t)
|
||||
|
||||
// Write some data
|
||||
if _, err := sqldb.Exec(`CREATE TABLE t (x)`); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := db.Sync(context.Background()); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := sqldb.Close(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// Create mock client that always fails
|
||||
var attempts int32
|
||||
client := &mock.ReplicaClient{
|
||||
LTXFilesFunc: func(_ context.Context, _ int, _ ltx.TXID, _ bool) (ltx.FileIterator, error) {
|
||||
return ltx.NewFileInfoSliceIterator(nil), nil
|
||||
},
|
||||
WriteLTXFileFunc: func(_ context.Context, _ int, _, _ ltx.TXID, _ io.Reader) (*ltx.FileInfo, error) {
|
||||
atomic.AddInt32(&attempts, 1)
|
||||
return nil, errors.New("error")
|
||||
},
|
||||
}
|
||||
|
||||
db.Replica = litestream.NewReplicaWithClient(db, client)
|
||||
db.ShutdownSyncTimeout = 0 // Disable retries
|
||||
db.ShutdownSyncInterval = 50 * time.Millisecond
|
||||
|
||||
// Close should fail after single attempt
|
||||
start := time.Now()
|
||||
err := db.Close(context.Background())
|
||||
elapsed := time.Since(start)
|
||||
|
||||
if err == nil {
|
||||
t.Fatal("expected error")
|
||||
}
|
||||
// Should have only made 1 attempt
|
||||
if got := atomic.LoadInt32(&attempts); got != 1 {
|
||||
t.Fatalf("expected exactly 1 attempt with zero timeout, got %d", got)
|
||||
}
|
||||
// Should be fast (no retry delay)
|
||||
if elapsed > 100*time.Millisecond {
|
||||
t.Fatalf("took too long for single attempt: %v", elapsed)
|
||||
}
|
||||
})
|
||||
|
||||
t.Run("SuccessFirstAttempt", func(t *testing.T) {
|
||||
db, sqldb := testingutil.MustOpenDBs(t)
|
||||
|
||||
// Write some data
|
||||
if _, err := sqldb.Exec(`CREATE TABLE t (x)`); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := db.Sync(context.Background()); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := sqldb.Close(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
// Create mock client that succeeds immediately
|
||||
var attempts int32
|
||||
client := &mock.ReplicaClient{
|
||||
LTXFilesFunc: func(_ context.Context, _ int, _ ltx.TXID, _ bool) (ltx.FileIterator, error) {
|
||||
return ltx.NewFileInfoSliceIterator(nil), nil
|
||||
},
|
||||
WriteLTXFileFunc: func(_ context.Context, _ int, _, _ ltx.TXID, r io.Reader) (*ltx.FileInfo, error) {
|
||||
atomic.AddInt32(&attempts, 1)
|
||||
// Drain the reader
|
||||
_, _ = io.Copy(io.Discard, r)
|
||||
return <x.FileInfo{}, nil
|
||||
},
|
||||
}
|
||||
|
||||
db.Replica = litestream.NewReplicaWithClient(db, client)
|
||||
db.ShutdownSyncTimeout = 5 * time.Second
|
||||
db.ShutdownSyncInterval = 50 * time.Millisecond
|
||||
|
||||
// Close should succeed immediately
|
||||
if err := db.Close(context.Background()); err != nil {
|
||||
t.Fatalf("expected success, got: %v", err)
|
||||
}
|
||||
// Should have made exactly 1 attempt
|
||||
if got := atomic.LoadInt32(&attempts); got != 1 {
|
||||
t.Fatalf("expected exactly 1 attempt, got %d", got)
|
||||
}
|
||||
})
|
||||
}
|
||||
@@ -174,6 +174,7 @@ func MustOpenDBAt(tb testing.TB, path string) *litestream.DB {
 	tb.Helper()
 	db := NewDB(tb, path)
 	db.MonitorInterval = 0 // disable background goroutine
+	db.ShutdownSyncTimeout = 0 // disable shutdown sync retry for faster tests
 	db.Replica = litestream.NewReplica(db)
 	db.Replica.Client = NewFileReplicaClient(tb)
 	db.Replica.MonitorEnabled = false // disable background goroutine
store.go: 34 changed lines
@@ -71,6 +71,10 @@ type Store struct {
 
 	// If true, compaction is run in the background according to compaction levels.
 	CompactionMonitorEnabled bool
+
+	// Shutdown sync retry settings.
+	ShutdownSyncTimeout  time.Duration
+	ShutdownSyncInterval time.Duration
 }
 
 func NewStore(dbs []*DB, levels CompactionLevels) *Store {
@@ -83,10 +87,14 @@ func NewStore(dbs []*DB, levels CompactionLevels) *Store {
 		L0Retention:              DefaultL0Retention,
 		L0RetentionCheckInterval: DefaultL0RetentionCheckInterval,
 		CompactionMonitorEnabled: true,
+		ShutdownSyncTimeout:      DefaultShutdownSyncTimeout,
+		ShutdownSyncInterval:     DefaultShutdownSyncInterval,
 	}
 
 	for _, db := range dbs {
 		db.L0Retention = s.L0Retention
+		db.ShutdownSyncTimeout = s.ShutdownSyncTimeout
+		db.ShutdownSyncInterval = s.ShutdownSyncInterval
 	}
 	s.ctx, s.cancel = context.WithCancel(context.Background())
 	return s
@@ -178,8 +186,10 @@ func (s *Store) AddDB(db *DB) error {
 	}
 	s.mu.Unlock()
 
-	// Apply store-wide retention settings before opening the database.
+	// Apply store-wide settings before opening the database.
 	db.L0Retention = s.L0Retention
+	db.ShutdownSyncTimeout = s.ShutdownSyncTimeout
+	db.ShutdownSyncInterval = s.ShutdownSyncInterval
 
 	// Open the database without holding the lock to avoid blocking other operations.
 	// The double-check pattern below handles the race condition.
@@ -252,6 +262,28 @@ func (s *Store) SetL0Retention(d time.Duration) {
 	}
 }
 
+// SetShutdownSyncTimeout updates the shutdown sync timeout and propagates it to
+// all managed databases.
+func (s *Store) SetShutdownSyncTimeout(d time.Duration) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	s.ShutdownSyncTimeout = d
+	for _, db := range s.dbs {
+		db.ShutdownSyncTimeout = d
+	}
+}
+
+// SetShutdownSyncInterval updates the shutdown sync interval and propagates it to
+// all managed databases.
+func (s *Store) SetShutdownSyncInterval(d time.Duration) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	s.ShutdownSyncInterval = d
+	for _, db := range s.dbs {
+		db.ShutdownSyncInterval = d
+	}
+}
+
 // SnapshotLevel returns a pseudo compaction level based on snapshot settings.
 func (s *Store) SnapshotLevel() *CompactionLevel {
 	return &CompactionLevel{
tests/integration/shutdown_retry_test.go: 329 lines (new file)
@@ -0,0 +1,329 @@
+//go:build integration && docker
+
+package integration
+
+import (
+	"bytes"
+	"context"
+	"database/sql"
+	"fmt"
+	"io"
+	"net"
+	"net/http"
+	"net/http/httputil"
+	"net/url"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"strings"
+	"sync"
+	"sync/atomic"
+	"syscall"
+	"testing"
+	"time"
+
+	_ "github.com/mattn/go-sqlite3"
+)
+
+// TestShutdownSyncRetry_429Errors tests that Litestream retries syncing LTX files
+// during shutdown when receiving 429 (Too Many Requests) errors.
+//
+// This test:
+// 1. Starts a MinIO container
+// 2. Starts a rate-limiting proxy in front of MinIO that returns 429 for first N PUT requests
+// 3. Starts Litestream replicating to the proxy endpoint
+// 4. Writes data and syncs
+// 5. Sends SIGTERM to trigger graceful shutdown
+// 6. Verifies that Litestream retries and eventually succeeds despite 429 errors
+//
+// Requirements:
+// - Docker must be running
+// - Litestream binary must be built at ../../bin/litestream
+func TestShutdownSyncRetry_429Errors(t *testing.T) {
+	RequireBinaries(t)
+	RequireDocker(t)
+
+	t.Log("================================================")
+	t.Log("Litestream Shutdown Sync Retry Test (429 Errors)")
+	t.Log("================================================")
+	t.Log("")
+
+	// Start MinIO container
+	t.Log("Starting MinIO container...")
+	containerName, minioEndpoint := StartMinioTestContainer(t)
+	defer StopMinioTestContainer(t, containerName)
+	t.Logf("✓ MinIO running at: %s", minioEndpoint)
+
+	// Create MinIO bucket by creating directory in /data (MinIO stores buckets as directories)
+	bucket := "litestream-test"
+	t.Logf("Creating bucket '%s'...", bucket)
+
+	// Wait for MinIO to be ready
+	time.Sleep(2 * time.Second)
+
+	// Create bucket directory directly - MinIO uses /data as the storage root
+	createBucketCmd := exec.Command("docker", "exec", containerName,
+		"mkdir", "-p", "/data/"+bucket)
+	if out, err := createBucketCmd.CombinedOutput(); err != nil {
+		t.Fatalf("Failed to create bucket directory: %v, output: %s", err, string(out))
+	}
+	t.Log("✓ Bucket created")
+	t.Log("")
+
+	// Start rate-limiting proxy
+	t.Log("Starting rate-limiting proxy...")
+	proxy := newRateLimitingProxy(t, minioEndpoint, 3) // Return 429 for first 3 PUT requests
+	proxyServer := &http.Server{
+		Addr:    "127.0.0.1:0",
+		Handler: proxy,
+	}
+
+	listener, err := (&net.ListenConfig{}).Listen(context.Background(), "tcp", "127.0.0.1:0")
+	if err != nil {
+		t.Fatalf("Failed to create listener: %v", err)
+	}
+	proxyAddr := listener.Addr().String()
+
+	go func() {
+		if err := proxyServer.Serve(listener); err != nil && err != http.ErrServerClosed {
+			t.Logf("Proxy server error: %v", err)
+		}
+	}()
+	defer proxyServer.Close()
+
+	proxyEndpoint := fmt.Sprintf("http://%s", proxyAddr)
+	t.Logf("✓ Rate-limiting proxy running at: %s", proxyEndpoint)
+	t.Logf("  (Will return 429 for first 3 PUT requests during shutdown)")
+	t.Log("")
+
+	// Setup test database
+	tempDir := t.TempDir()
+	dbPath := filepath.Join(tempDir, "test.db")
+	configPath := filepath.Join(tempDir, "litestream.yml")
+
+	// Create database with some data
+	t.Log("Creating test database...")
+	sqlDB, err := sql.Open("sqlite3", dbPath)
+	if err != nil {
+		t.Fatalf("Failed to open database: %v", err)
+	}
+
+	if _, err := sqlDB.Exec("PRAGMA journal_mode=WAL"); err != nil {
+		t.Fatalf("Failed to set WAL mode: %v", err)
+	}
+	if _, err := sqlDB.Exec("CREATE TABLE test (id INTEGER PRIMARY KEY, data TEXT)"); err != nil {
+		t.Fatalf("Failed to create table: %v", err)
+	}
+	if _, err := sqlDB.Exec("INSERT INTO test (data) VALUES ('initial data')"); err != nil {
+		t.Fatalf("Failed to insert data: %v", err)
+	}
+	sqlDB.Close()
+	t.Log("✓ Database created with initial data")
+	t.Log("")
+
+	// Create Litestream config with shutdown retry settings
+	s3Path := fmt.Sprintf("test-%d", time.Now().Unix())
+	config := fmt.Sprintf(`
+shutdown-sync-timeout: 10s
+shutdown-sync-interval: 500ms
+
+dbs:
+  - path: %s
+    replica:
+      type: s3
+      bucket: %s
+      path: %s
+      endpoint: %s
+      access-key-id: minioadmin
+      secret-access-key: minioadmin
+      region: us-east-1
+      force-path-style: true
+      skip-verify: true
+      sync-interval: 1s
+`, dbPath, bucket, s3Path, proxyEndpoint)
+
+	if err := os.WriteFile(configPath, []byte(config), 0644); err != nil {
+		t.Fatalf("Failed to write config: %v", err)
+	}
+	t.Logf("✓ Config written to: %s", configPath)
+	t.Log("")
+
+	// Start Litestream
+	t.Log("Starting Litestream...")
+	litestreamBin := filepath.Join("..", "..", "bin", "litestream")
+	cmd := exec.Command(litestreamBin, "replicate", "-config", configPath)
+
+	var stdout, stderr bytes.Buffer
+	cmd.Stdout = io.MultiWriter(os.Stdout, &stdout)
+	cmd.Stderr = io.MultiWriter(os.Stderr, &stderr)
+
+	if err := cmd.Start(); err != nil {
+		t.Fatalf("Failed to start Litestream: %v", err)
+	}
+	t.Logf("✓ Litestream started (PID: %d)", cmd.Process.Pid)
+
+	// Wait for initial sync
+	t.Log("Waiting for initial sync...")
+	time.Sleep(3 * time.Second)
+
+	// Write more data to ensure we have pending LTX files
+	t.Log("Writing additional data...")
+	sqlDB, err = sql.Open("sqlite3", dbPath)
+	if err != nil {
+		t.Fatalf("Failed to reopen database: %v", err)
+	}
+	for i := 0; i < 5; i++ {
+		if _, err := sqlDB.Exec("INSERT INTO test (data) VALUES (?)", fmt.Sprintf("data-%d", i)); err != nil {
+			t.Fatalf("Failed to insert data: %v", err)
+		}
+	}
+	sqlDB.Close()
+	t.Log("✓ Additional data written")
+
+	// Wait a bit for sync to pick up the changes
+	time.Sleep(2 * time.Second)
+
+	// Reset proxy counter so 429s happen during shutdown
+	proxy.Reset()
+	t.Log("")
+	t.Log("Sending SIGTERM to trigger graceful shutdown...")
+	t.Log("(Proxy will return 429 for first 3 PUT requests)")
+
+	// Send SIGTERM
+	if err := cmd.Process.Signal(syscall.SIGTERM); err != nil {
+		t.Fatalf("Failed to send SIGTERM: %v", err)
+	}
+
+	// Wait for process to exit with timeout
+	done := make(chan error, 1)
+	go func() {
+		done <- cmd.Wait()
+	}()
+
+	select {
+	case err := <-done:
+		if err != nil {
+			// Check if it's just a signal exit (expected)
+			if exitErr, ok := err.(*exec.ExitError); ok {
+				t.Logf("Litestream exited with: %v", exitErr)
+			} else {
+				t.Fatalf("Litestream failed: %v", err)
+			}
+		}
+	case <-time.After(30 * time.Second):
+		cmd.Process.Kill()
+		t.Fatal("Litestream did not exit within 30 seconds")
+	}
+
t.Log("")
|
||||
t.Log("================================================")
|
||||
t.Log("Results")
|
||||
t.Log("================================================")
|
||||
|
||||
// Check proxy statistics
|
||||
stats := proxy.Stats()
|
||||
t.Logf("Proxy statistics:")
|
||||
t.Logf(" Total requests: %d", stats.TotalRequests)
|
||||
t.Logf(" 429 responses sent: %d", stats.RateLimited)
|
||||
t.Logf(" Forwarded requests: %d", stats.Forwarded)
|
||||
|
||||
// Verify that we saw 429s and retries succeeded
|
||||
output := stdout.String() + stderr.String()
|
||||
|
||||
if !strings.Contains(output, "shutdown sync failed, retrying") {
|
||||
t.Log("")
|
||||
t.Log("WARNING: Did not see retry messages in output.")
|
||||
t.Log("This could mean:")
|
||||
t.Log(" 1. No pending LTX files during shutdown")
|
||||
t.Log(" 2. Sync completed before shutdown signal")
|
||||
t.Log(" 3. Retry logic not triggered")
|
||||
} else {
|
||||
t.Log("")
|
||||
t.Log("✓ Saw retry messages - shutdown sync retry is working!")
|
||||
}
|
||||
|
||||
if strings.Contains(output, "shutdown sync succeeded after retry") {
|
||||
t.Log("✓ Shutdown sync succeeded after retrying!")
|
||||
}
|
||||
|
||||
if stats.RateLimited > 0 {
|
||||
t.Logf("✓ Proxy returned %d 429 responses as expected", stats.RateLimited)
|
||||
}
|
||||
|
||||
t.Log("")
|
||||
t.Log("Test completed successfully!")
|
||||
}
|
||||
|
||||
+// rateLimitingProxy is an HTTP proxy that returns 429 for the first N PUT requests
+type rateLimitingProxy struct {
+	target      *url.URL
+	proxy       *httputil.ReverseProxy
+	mu          sync.Mutex
+	putCount    int32
+	limit       int32
+	totalReqs   int64
+	rateLimited int64
+	forwarded   int64
+	t           *testing.T
+}
+
+type proxyStats struct {
+	TotalRequests int64
+	RateLimited   int64
+	Forwarded     int64
+}
+
+func newRateLimitingProxy(t *testing.T, targetURL string, limit int) *rateLimitingProxy {
+	target, err := url.Parse(targetURL)
+	if err != nil {
+		t.Fatalf("Failed to parse target URL: %v", err)
+	}
+
+	p := &rateLimitingProxy{
+		target: target,
+		limit:  int32(limit),
+		t:      t,
+	}
+
+	p.proxy = &httputil.ReverseProxy{
+		Director: func(req *http.Request) {
+			req.URL.Scheme = target.Scheme
+			req.URL.Host = target.Host
+			// Don't modify Host header - it's part of the AWS signature
+		},
+	}
+
+	return p
+}
+
+func (p *rateLimitingProxy) ServeHTTP(w http.ResponseWriter, r *http.Request) {
+	atomic.AddInt64(&p.totalReqs, 1)
+
+	// Only rate limit PUT requests (uploads)
+	if r.Method == "PUT" {
+		count := atomic.AddInt32(&p.putCount, 1)
+		if count <= p.limit {
+			atomic.AddInt64(&p.rateLimited, 1)
+			p.t.Logf("PROXY: Returning 429 for PUT request #%d (limit: %d)", count, p.limit)
+			w.Header().Set("Retry-After", "1")
+			w.WriteHeader(http.StatusTooManyRequests)
+			w.Write([]byte("Rate limit exceeded"))
+			return
+		}
+	}
+
+	atomic.AddInt64(&p.forwarded, 1)
+	p.proxy.ServeHTTP(w, r)
+}
+
+func (p *rateLimitingProxy) Reset() {
+	atomic.StoreInt32(&p.putCount, 0)
+}
+
+func (p *rateLimitingProxy) Stats() proxyStats {
+	return proxyStats{
+		TotalRequests: atomic.LoadInt64(&p.totalReqs),
+		RateLimited:   atomic.LoadInt64(&p.rateLimited),
+		Forwarded:     atomic.LoadInt64(&p.forwarded),
+	}
+}
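
To summarize the surface area this commit adds, here is a short sketch (not from the commit itself) of the programmatic knobs, using only identifiers that appear in the diff above. The store propagates both settings to every managed DB; DB.Close then retries the final replica sync.

```go
package example

import (
	"time"

	"github.com/benbjohnson/litestream"
)

// configureShutdownRetry shows the Store-level setters added in store.go.
// Both propagate to every DB the store currently manages.
func configureShutdownRetry(store *litestream.Store) {
	store.SetShutdownSyncTimeout(10 * time.Second)        // total retry budget; 0 means a single attempt
	store.SetShutdownSyncInterval(250 * time.Millisecond) // pause between attempts
}

// configureDB shows the equivalent per-DB fields added in db.go. They should
// be set before opening the database; DB.Close(ctx) then calls
// syncReplicaWithRetry, which retries Replica.Sync until success, timeout,
// or context cancellation.
func configureDB(db *litestream.DB) {
	db.ShutdownSyncTimeout = litestream.DefaultShutdownSyncTimeout   // 30s
	db.ShutdownSyncInterval = litestream.DefaultShutdownSyncInterval // 500ms
}
```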