Mirror of https://github.com/benbjohnson/litestream.git, synced 2026-01-25 05:06:30 +00:00
fix(sync): add exponential backoff for error resilience (#931)
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
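The commit applies the same doubling-with-cap backoff in both db.go and replica.go. Below is a minimal standalone sketch of that pattern, for illustration only: nextBackoff and the interval values are hypothetical stand-ins, not code from this commit, which inlines the logic inside the monitor loops.

package main

import (
	"fmt"
	"time"
)

// nextBackoff mirrors the pattern the commit adds: start at the base interval
// on the first error, double on each subsequent error, and cap at max.
// (Hypothetical helper for illustration; the commit inlines this logic.)
func nextBackoff(current, base, max time.Duration) time.Duration {
	if current == 0 {
		return base
	}
	current *= 2
	if current > max {
		return max
	}
	return current
}

func main() {
	// Example: 1s base (stand-in for MonitorInterval/SyncInterval) and a
	// 5-minute cap (DefaultSyncBackoffMax in the diff).
	var backoff time.Duration
	for i := 0; i < 12; i++ {
		backoff = nextBackoff(backoff, time.Second, 5*time.Minute)
		fmt.Println(backoff) // 1s, 2s, 4s, ... capped at 5m0s
	}
}

On a successful sync the monitor loops reset this state to zero, so the next failure starts again from the base interval.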
db.go (72 changed lines)
@@ -35,6 +35,11 @@ const (
	DefaultTruncatePageN = 121359 // ~500MB with 4KB page size
	DefaultShutdownSyncTimeout = 30 * time.Second
	DefaultShutdownSyncInterval = 500 * time.Millisecond

	// Sync error backoff configuration.
	// When sync errors occur repeatedly (e.g., disk full), backoff doubles each time.
	DefaultSyncBackoffMax = 5 * time.Minute  // Maximum backoff between retries
	SyncErrorLogInterval  = 30 * time.Second // Rate-limit repeated error logging
)

// DB represents a managed instance of a SQLite database in the file system.
@@ -853,6 +858,19 @@ func isSQLiteBusyError(err error) bool {
		strings.Contains(errStr, "SQLITE_BUSY")
}

// isDiskFullError returns true if the error indicates disk space issues.
// This includes "no space left on device" (ENOSPC) and "disk quota exceeded" (EDQUOT).
func isDiskFullError(err error) bool {
	if err == nil {
		return false
	}
	errStr := strings.ToLower(err.Error())
	return strings.Contains(errStr, "no space left on device") ||
		strings.Contains(errStr, "disk quota exceeded") ||
		strings.Contains(errStr, "enospc") ||
		strings.Contains(errStr, "edquot")
}

// walFileSize returns the size of the WAL file in bytes.
func (db *DB) walFileSize() (int64, error) {
	fi, err := os.Stat(db.WALPath())
@@ -1993,6 +2011,9 @@ func (db *DB) EnforceRetentionByTXID(ctx context.Context, level int, txID ltx.TX
// When a change is detected, we call Sync() which performs full verification
// and replication. The cost per tick is just one stat() call plus a 32-byte
// read, which is negligible.
//
// Implements exponential backoff on repeated sync errors to prevent disk churn
// when persistent errors (like disk full) occur. See issue #927.
func (db *DB) monitor() {
	ticker := time.NewTicker(db.MonitorInterval)
	defer ticker.Stop()
@@ -2001,6 +2022,11 @@ func (db *DB) monitor() {
	var lastWALSize int64
	var lastWALHeader []byte

	// Backoff state for error handling.
	var backoff time.Duration
	var lastLogTime time.Time
	var consecutiveErrs int

	for {
		// Wait for ticker or context close.
		select {
@@ -2009,6 +2035,15 @@ func (db *DB) monitor() {
		case <-ticker.C:
		}

		// If in backoff mode, wait additional time before retrying.
		if backoff > 0 {
			select {
			case <-db.ctx.Done():
				return
			case <-time.After(backoff):
			}
		}

		// Check if WAL has changed before doing expensive sync.
		walPath := db.WALPath()
		fi, err := os.Stat(walPath)
@@ -2045,9 +2080,44 @@ func (db *DB) monitor() {
		lastWALHeader = walHeader

		if err := db.Sync(db.ctx); err != nil && !errors.Is(err, context.Canceled) {
			db.Logger.Error("sync error", "error", err)
			consecutiveErrs++

			// Exponential backoff: MonitorInterval -> 2x -> 4x -> ... -> max
			if backoff == 0 {
				backoff = db.MonitorInterval
			} else {
				backoff *= 2
				if backoff > DefaultSyncBackoffMax {
					backoff = DefaultSyncBackoffMax
				}
			}

			// Log with rate limiting to avoid log spam during persistent errors.
			if time.Since(lastLogTime) >= SyncErrorLogInterval {
				db.Logger.Error("sync error",
					"error", err,
					"consecutive_errors", consecutiveErrs,
					"backoff", backoff)
				lastLogTime = time.Now()
			}

			// Try to clean up stale temp files after persistent disk errors.
			if isDiskFullError(err) && consecutiveErrs >= 3 {
				db.Logger.Warn("attempting temp file cleanup due to persistent disk errors")
				if cleanupErr := removeTmpFiles(db.metaPath); cleanupErr != nil {
					db.Logger.Error("temp file cleanup failed", "error", cleanupErr)
				}
			}
			continue
		}

		// Success - reset backoff and error counter.
		if consecutiveErrs > 0 {
			db.Logger.Info("sync recovered", "previous_errors", consecutiveErrs)
		}
		backoff = 0
		consecutiveErrs = 0
	}
}

// CRC64 returns a CRC-64 ISO checksum of the database and its current position.
@@ -1058,3 +1058,106 @@ func TestDB_Monitor_DetectsSaltChangeAfterRestart(t *testing.T) {
		t.Log("Monitor correctly detected changes after WAL salt reset")
	}
}

// TestIsDiskFullError tests the disk full error detection helper.
func TestIsDiskFullError(t *testing.T) {
	tests := []struct {
		name     string
		err      error
		expected bool
	}{
		{
			name:     "nil error",
			err:      nil,
			expected: false,
		},
		{
			name:     "no space left on device",
			err:      errors.New("write /tmp/file: no space left on device"),
			expected: true,
		},
		{
			name:     "No Space Left On Device (uppercase)",
			err:      errors.New("No Space Left On Device"),
			expected: true,
		},
		{
			name:     "disk quota exceeded",
			err:      errors.New("write: disk quota exceeded"),
			expected: true,
		},
		{
			name:     "ENOSPC",
			err:      errors.New("ENOSPC: cannot write file"),
			expected: true,
		},
		{
			name:     "EDQUOT",
			err:      errors.New("error EDQUOT while writing"),
			expected: true,
		},
		{
			name:     "regular error",
			err:      errors.New("connection refused"),
			expected: false,
		},
		{
			name:     "permission denied",
			err:      errors.New("permission denied"),
			expected: false,
		},
		{
			name:     "wrapped disk full error",
			err:      fmt.Errorf("sync failed: %w", errors.New("no space left on device")),
			expected: true,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result := isDiskFullError(tt.err)
			if result != tt.expected {
				t.Errorf("isDiskFullError(%v) = %v, want %v", tt.err, result, tt.expected)
			}
		})
	}
}

// TestIsSQLiteBusyError tests the SQLite busy error detection helper.
func TestIsSQLiteBusyError(t *testing.T) {
	tests := []struct {
		name     string
		err      error
		expected bool
	}{
		{
			name:     "nil error",
			err:      nil,
			expected: false,
		},
		{
			name:     "database is locked",
			err:      errors.New("database is locked"),
			expected: true,
		},
		{
			name:     "SQLITE_BUSY",
			err:      errors.New("SQLITE_BUSY: cannot commit"),
			expected: true,
		},
		{
			name:     "regular error",
			err:      errors.New("connection refused"),
			expected: false,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result := isSQLiteBusyError(tt.err)
			if result != tt.expected {
				t.Errorf("isSQLiteBusyError(%v) = %v, want %v", tt.err, result, tt.expected)
			}
		})
	}
}
replica.go (43 changed lines)
@@ -303,6 +303,8 @@ func (r *Replica) deleteBeforeTXID(ctx context.Context, level int, txID ltx.TXID
*/

// monitor runs in a separate goroutine and continuously replicates the DB.
// Implements exponential backoff on repeated sync errors to prevent log spam
// and reduce load when persistent errors occur. See issue #927.
func (r *Replica) monitor(ctx context.Context) {
	ticker := time.NewTicker(r.SyncInterval)
	defer ticker.Stop()
@@ -312,6 +314,10 @@ func (r *Replica) monitor(ctx context.Context) {
	close(ch)
	var notify <-chan struct{} = ch

	var backoff time.Duration
	var lastLogTime time.Time
	var consecutiveErrs int

	for initial := true; ; initial = false {
		// Enforce a minimum time between synchronization.
		if !initial {
@@ -322,6 +328,15 @@ func (r *Replica) monitor(ctx context.Context) {
			}
		}

		// If in backoff mode, wait additional time before retrying.
		if backoff > 0 {
			select {
			case <-ctx.Done():
				return
			case <-time.After(backoff):
			}
		}

		// Wait for changes to the database.
		select {
		case <-ctx.Done():
@@ -336,10 +351,36 @@ func (r *Replica) monitor(ctx context.Context) {
		if err := r.Sync(ctx); err != nil {
			// Don't log context cancellation errors during shutdown
			if !errors.Is(err, context.Canceled) && !errors.Is(err, context.DeadlineExceeded) {
				r.Logger().Error("monitor error", "error", err)
				consecutiveErrs++

				// Exponential backoff: SyncInterval -> 2x -> 4x -> ... -> max
				if backoff == 0 {
					backoff = r.SyncInterval
				} else {
					backoff *= 2
					if backoff > DefaultSyncBackoffMax {
						backoff = DefaultSyncBackoffMax
					}
				}

				// Log with rate limiting to avoid log spam during persistent errors.
				if time.Since(lastLogTime) >= SyncErrorLogInterval {
					r.Logger().Error("monitor error",
						"error", err,
						"consecutive_errors", consecutiveErrs,
						"backoff", backoff)
					lastLogTime = time.Now()
				}
			}
			continue
		}

		// Success - reset backoff and error counter.
		if consecutiveErrs > 0 {
			r.Logger().Info("replica sync recovered", "previous_errors", consecutiveErrs)
		}
		backoff = 0
		consecutiveErrs = 0
	}
}