fix(sync): add exponential backoff for error resilience (#931)

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>

Author: Cory LaNou
Date: 2026-01-06 13:30:48 -06:00
Committed by: GitHub
Parent: 9c39382a3c
Commit: d24abaf707

3 changed files with 216 additions and 2 deletions

db.go

@@ -35,6 +35,11 @@ const (
DefaultTruncatePageN = 121359 // ~500MB with 4KB page size
DefaultShutdownSyncTimeout = 30 * time.Second
DefaultShutdownSyncInterval = 500 * time.Millisecond
// Sync error backoff configuration.
// When sync errors occur repeatedly (e.g., disk full), backoff doubles each time.
DefaultSyncBackoffMax = 5 * time.Minute // Maximum backoff between retries
SyncErrorLogInterval = 30 * time.Second // Rate-limit repeated error logging
)
// DB represents a managed instance of a SQLite database in the file system.
@@ -853,6 +858,19 @@ func isSQLiteBusyError(err error) bool {
strings.Contains(errStr, "SQLITE_BUSY")
}
// isDiskFullError returns true if the error indicates disk space issues.
// This includes "no space left on device" (ENOSPC) and "disk quota exceeded" (EDQUOT).
func isDiskFullError(err error) bool {
if err == nil {
return false
}
errStr := strings.ToLower(err.Error())
return strings.Contains(errStr, "no space left on device") ||
strings.Contains(errStr, "disk quota exceeded") ||
strings.Contains(errStr, "enospc") ||
strings.Contains(errStr, "edquot")
}
// walFileSize returns the size of the WAL file in bytes.
func (db *DB) walFileSize() (int64, error) {
fi, err := os.Stat(db.WALPath())
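A string match keeps isDiskFullError dependency-free, but where the error chain preserves the underlying syscall error (as *os.PathError does), errno comparison is a possible complement. A minimal sketch, not part of this change; the helper name is made up, and EDQUOT is POSIX-specific so it may not be defined on every platform:

// Sketch only: errno-based disk-full detection as a complement to the
// string match above. Works when the chain wraps a syscall.Errno.
package main

import (
	"errors"
	"fmt"
	"os"
	"syscall"
)

func isDiskFullErrno(err error) bool {
	return errors.Is(err, syscall.ENOSPC) || errors.Is(err, syscall.EDQUOT)
}

func main() {
	err := fmt.Errorf("sync failed: %w",
		&os.PathError{Op: "write", Path: "/tmp/db-wal", Err: syscall.ENOSPC})
	fmt.Println(isDiskFullErrno(err)) // true
}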
@@ -1993,6 +2011,9 @@ func (db *DB) EnforceRetentionByTXID(ctx context.Context, level int, txID ltx.TX
// When a change is detected, we call Sync() which performs full verification
// and replication. The cost per tick is just one stat() call plus a 32-byte
// read, which is negligible.
//
// Implements exponential backoff on repeated sync errors to prevent disk churn
// when persistent errors (like disk full) occur. See issue #927.
func (db *DB) monitor() {
ticker := time.NewTicker(db.MonitorInterval)
defer ticker.Stop()
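The "one stat() call plus a 32-byte read" above refers to the SQLite WAL header: its first 32 bytes hold the magic number, format version, page size, checkpoint sequence, two salt values, and two checksum words. The salts and checkpoint sequence change when the WAL is reset, while ordinary appends show up as a size change. A rough sketch of that kind of cheap change probe, for illustration only (walChanged is a hypothetical name; assumes the bytes, io, and os packages):

// Illustrative sketch: detect WAL changes with one stat plus a 32-byte
// header read, comparing size and header bytes against the previous tick.
func walChanged(walPath string, lastSize int64, lastHeader []byte) (bool, []byte, int64, error) {
	fi, err := os.Stat(walPath)
	if os.IsNotExist(err) {
		return false, lastHeader, 0, nil // no WAL yet, nothing to sync
	} else if err != nil {
		return false, lastHeader, lastSize, err
	}

	f, err := os.Open(walPath)
	if err != nil {
		return false, lastHeader, lastSize, err
	}
	defer f.Close()

	hdr := make([]byte, 32)
	if _, err := io.ReadFull(f, hdr); err != nil {
		return false, lastHeader, lastSize, err
	}

	return fi.Size() != lastSize || !bytes.Equal(hdr, lastHeader), hdr, fi.Size(), nil
}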
@@ -2001,6 +2022,11 @@ func (db *DB) monitor() {
var lastWALSize int64
var lastWALHeader []byte
// Backoff state for error handling.
var backoff time.Duration
var lastLogTime time.Time
var consecutiveErrs int
for {
// Wait for ticker or context close.
select {
@@ -2009,6 +2035,15 @@ func (db *DB) monitor() {
case <-ticker.C:
}
// If in backoff mode, wait additional time before retrying.
if backoff > 0 {
select {
case <-db.ctx.Done():
return
case <-time.After(backoff):
}
}
// Check if WAL has changed before doing expensive sync.
walPath := db.WALPath()
fi, err := os.Stat(walPath)
@@ -2045,9 +2080,44 @@ func (db *DB) monitor() {
lastWALHeader = walHeader
if err := db.Sync(db.ctx); err != nil && !errors.Is(err, context.Canceled) {
db.Logger.Error("sync error", "error", err)
consecutiveErrs++
// Exponential backoff: MonitorInterval -> 2x -> 4x -> ... -> max
if backoff == 0 {
backoff = db.MonitorInterval
} else {
backoff *= 2
if backoff > DefaultSyncBackoffMax {
backoff = DefaultSyncBackoffMax
}
}
// Log with rate limiting to avoid log spam during persistent errors.
if time.Since(lastLogTime) >= SyncErrorLogInterval {
db.Logger.Error("sync error",
"error", err,
"consecutive_errors", consecutiveErrs,
"backoff", backoff)
lastLogTime = time.Now()
}
// Try to clean up stale temp files after persistent disk errors.
if isDiskFullError(err) && consecutiveErrs >= 3 {
db.Logger.Warn("attempting temp file cleanup due to persistent disk errors")
if cleanupErr := removeTmpFiles(db.metaPath); cleanupErr != nil {
db.Logger.Error("temp file cleanup failed", "error", cleanupErr)
}
}
continue
}
// Success - reset backoff and error counter.
if consecutiveErrs > 0 {
db.Logger.Info("sync recovered", "previous_errors", consecutiveErrs)
}
backoff = 0
consecutiveErrs = 0
}
}
// CRC64 returns a CRC-64 ISO checksum of the database and its current position.
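The doubling-and-cap logic above is duplicated verbatim in the replica monitor further down. As a sketch only (the commit keeps it inline in both loops), it could be factored into a small helper; with, say, a 1-second monitor interval the waits would run 1s, 2s, 4s, ... 256s and then hold at the 5-minute DefaultSyncBackoffMax from roughly the tenth consecutive failure on:

// Hypothetical refactor sketch, not part of this commit: the shared
// doubling-and-cap step as a standalone helper (uses the time package).
func nextBackoff(cur, base, max time.Duration) time.Duration {
	if cur == 0 {
		return base // first failure: start at the base interval
	}
	cur *= 2
	if cur > max {
		return max // clamp at DefaultSyncBackoffMax
	}
	return cur
}

// The loop bodies would then reduce to:
//   backoff = nextBackoff(backoff, db.MonitorInterval, DefaultSyncBackoffMax)
//   backoff = nextBackoff(backoff, r.SyncInterval, DefaultSyncBackoffMax)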


@@ -1058,3 +1058,106 @@ func TestDB_Monitor_DetectsSaltChangeAfterRestart(t *testing.T) {
t.Log("Monitor correctly detected changes after WAL salt reset")
}
}
// TestIsDiskFullError tests the disk full error detection helper.
func TestIsDiskFullError(t *testing.T) {
tests := []struct {
name string
err error
expected bool
}{
{
name: "nil error",
err: nil,
expected: false,
},
{
name: "no space left on device",
err: errors.New("write /tmp/file: no space left on device"),
expected: true,
},
{
name: "No Space Left On Device (uppercase)",
err: errors.New("No Space Left On Device"),
expected: true,
},
{
name: "disk quota exceeded",
err: errors.New("write: disk quota exceeded"),
expected: true,
},
{
name: "ENOSPC",
err: errors.New("ENOSPC: cannot write file"),
expected: true,
},
{
name: "EDQUOT",
err: errors.New("error EDQUOT while writing"),
expected: true,
},
{
name: "regular error",
err: errors.New("connection refused"),
expected: false,
},
{
name: "permission denied",
err: errors.New("permission denied"),
expected: false,
},
{
name: "wrapped disk full error",
err: fmt.Errorf("sync failed: %w", errors.New("no space left on device")),
expected: true,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := isDiskFullError(tt.err)
if result != tt.expected {
t.Errorf("isDiskFullError(%v) = %v, want %v", tt.err, result, tt.expected)
}
})
}
}
// TestIsSQLiteBusyError tests the SQLite busy error detection helper.
func TestIsSQLiteBusyError(t *testing.T) {
tests := []struct {
name string
err error
expected bool
}{
{
name: "nil error",
err: nil,
expected: false,
},
{
name: "database is locked",
err: errors.New("database is locked"),
expected: true,
},
{
name: "SQLITE_BUSY",
err: errors.New("SQLITE_BUSY: cannot commit"),
expected: true,
},
{
name: "regular error",
err: errors.New("connection refused"),
expected: false,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := isSQLiteBusyError(tt.err)
if result != tt.expected {
t.Errorf("isSQLiteBusyError(%v) = %v, want %v", tt.err, result, tt.expected)
}
})
}
}
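If the doubling step were extracted into a helper like the nextBackoff sketch shown after the db.go hunks (hypothetical; the commit keeps the logic inline in the monitor loops), its progression and cap could be covered in the same table-driven style:

// Hypothetical test for the nextBackoff sketch; not part of this commit.
func TestNextBackoff(t *testing.T) {
	base := time.Second
	max := 5 * time.Minute

	tests := []struct {
		name     string
		cur      time.Duration
		expected time.Duration
	}{
		{name: "first failure starts at base", cur: 0, expected: time.Second},
		{name: "doubles on repeat failures", cur: 4 * time.Second, expected: 8 * time.Second},
		{name: "clamps at max", cur: 4 * time.Minute, expected: 5 * time.Minute},
		{name: "stays at max", cur: 5 * time.Minute, expected: 5 * time.Minute},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			if got := nextBackoff(tt.cur, base, max); got != tt.expected {
				t.Errorf("nextBackoff(%v) = %v, want %v", tt.cur, got, tt.expected)
			}
		})
	}
}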


@@ -303,6 +303,8 @@ func (r *Replica) deleteBeforeTXID(ctx context.Context, level int, txID ltx.TXID
*/
// monitor runs in a separate goroutine and continuously replicates the DB.
// Implements exponential backoff on repeated sync errors to prevent log spam
// and reduce load when persistent errors occur. See issue #927.
func (r *Replica) monitor(ctx context.Context) {
ticker := time.NewTicker(r.SyncInterval)
defer ticker.Stop()
@@ -312,6 +314,10 @@ func (r *Replica) monitor(ctx context.Context) {
close(ch)
var notify <-chan struct{} = ch
var backoff time.Duration
var lastLogTime time.Time
var consecutiveErrs int
for initial := true; ; initial = false {
// Enforce a minimum time between synchronization.
if !initial {
@@ -322,6 +328,15 @@ func (r *Replica) monitor(ctx context.Context) {
}
}
// If in backoff mode, wait additional time before retrying.
if backoff > 0 {
select {
case <-ctx.Done():
return
case <-time.After(backoff):
}
}
// Wait for changes to the database.
select {
case <-ctx.Done():
@@ -336,10 +351,36 @@ func (r *Replica) monitor(ctx context.Context) {
if err := r.Sync(ctx); err != nil {
// Don't log context cancellation errors during shutdown
if !errors.Is(err, context.Canceled) && !errors.Is(err, context.DeadlineExceeded) {
r.Logger().Error("monitor error", "error", err)
consecutiveErrs++
// Exponential backoff: SyncInterval -> 2x -> 4x -> ... -> max
if backoff == 0 {
backoff = r.SyncInterval
} else {
backoff *= 2
if backoff > DefaultSyncBackoffMax {
backoff = DefaultSyncBackoffMax
}
}
// Log with rate limiting to avoid log spam during persistent errors.
if time.Since(lastLogTime) >= SyncErrorLogInterval {
r.Logger().Error("monitor error",
"error", err,
"consecutive_errors", consecutiveErrs,
"backoff", backoff)
lastLogTime = time.Now()
}
}
continue
}
// Success - reset backoff and error counter.
if consecutiveErrs > 0 {
r.Logger().Info("replica sync recovered", "previous_errors", consecutiveErrs)
}
backoff = 0
consecutiveErrs = 0
}
}
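Note that the backoff sleep is added on top of each loop's normal pacing (the MonitorInterval ticker in db.go, the SyncInterval/notify wait in replica.go), so the effective gap between attempts is the regular interval plus the backoff. A small illustration of how that gap grows while errors persist, assuming a hypothetical 1-second SyncInterval and the 5-minute cap:

// Illustration only: effective wait between replica sync attempts under
// persistent errors, assuming a hypothetical 1s SyncInterval and the 5m cap.
package main

import (
	"fmt"
	"time"
)

func main() {
	const (
		syncInterval = time.Second
		backoffMax   = 5 * time.Minute
	)
	var backoff time.Duration
	for failure := 1; failure <= 12; failure++ {
		if backoff == 0 {
			backoff = syncInterval
		} else if backoff *= 2; backoff > backoffMax {
			backoff = backoffMax
		}
		fmt.Printf("after failure %2d: wait %v (interval) + %v (backoff)\n",
			failure, syncInterval, backoff)
	}
}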