Mirror of https://github.com/benbjohnson/litestream.git, synced 2026-01-25 05:06:30 +00:00
fix(sync): add exponential backoff for error resilience (#931)
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
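The commit applies the same doubling-with-cap backoff in both db.go and replica.go. Below is a minimal standalone sketch of that pattern, for illustration only: nextBackoff and the interval values are hypothetical stand-ins, not code from this commit, which inlines the logic inside the monitor loops.

package main

import (
	"fmt"
	"time"
)

// nextBackoff mirrors the pattern the commit adds: start at the base interval
// on the first error, double on each subsequent error, and cap at max.
// (Hypothetical helper for illustration; the commit inlines this logic.)
func nextBackoff(current, base, max time.Duration) time.Duration {
	if current == 0 {
		return base
	}
	current *= 2
	if current > max {
		return max
	}
	return current
}

func main() {
	// Example: 1s base (stand-in for MonitorInterval/SyncInterval) and a
	// 5-minute cap (DefaultSyncBackoffMax in the diff).
	var backoff time.Duration
	for i := 0; i < 12; i++ {
		backoff = nextBackoff(backoff, time.Second, 5*time.Minute)
		fmt.Println(backoff) // 1s, 2s, 4s, ... capped at 5m0s
	}
}

On a successful sync the monitor loops reset this state to zero, so the next failure starts again from the base interval.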
db.go (72 changed lines)
@@ -35,6 +35,11 @@ const (
	DefaultTruncatePageN = 121359 // ~500MB with 4KB page size
	DefaultShutdownSyncTimeout = 30 * time.Second
	DefaultShutdownSyncInterval = 500 * time.Millisecond

	// Sync error backoff configuration.
	// When sync errors occur repeatedly (e.g., disk full), backoff doubles each time.
	DefaultSyncBackoffMax = 5 * time.Minute  // Maximum backoff between retries
	SyncErrorLogInterval  = 30 * time.Second // Rate-limit repeated error logging
)

// DB represents a managed instance of a SQLite database in the file system.
@@ -853,6 +858,19 @@ func isSQLiteBusyError(err error) bool {
		strings.Contains(errStr, "SQLITE_BUSY")
}

// isDiskFullError returns true if the error indicates disk space issues.
// This includes "no space left on device" (ENOSPC) and "disk quota exceeded" (EDQUOT).
func isDiskFullError(err error) bool {
	if err == nil {
		return false
	}
	errStr := strings.ToLower(err.Error())
	return strings.Contains(errStr, "no space left on device") ||
		strings.Contains(errStr, "disk quota exceeded") ||
		strings.Contains(errStr, "enospc") ||
		strings.Contains(errStr, "edquot")
}

// walFileSize returns the size of the WAL file in bytes.
func (db *DB) walFileSize() (int64, error) {
	fi, err := os.Stat(db.WALPath())
@@ -1993,6 +2011,9 @@ func (db *DB) EnforceRetentionByTXID(ctx context.Context, level int, txID ltx.TX
// When a change is detected, we call Sync() which performs full verification
// and replication. The cost per tick is just one stat() call plus a 32-byte
// read, which is negligible.
//
// Implements exponential backoff on repeated sync errors to prevent disk churn
// when persistent errors (like disk full) occur. See issue #927.
func (db *DB) monitor() {
	ticker := time.NewTicker(db.MonitorInterval)
	defer ticker.Stop()
@@ -2001,6 +2022,11 @@ func (db *DB) monitor() {
	var lastWALSize int64
	var lastWALHeader []byte

	// Backoff state for error handling.
	var backoff time.Duration
	var lastLogTime time.Time
	var consecutiveErrs int

	for {
		// Wait for ticker or context close.
		select {
@@ -2009,6 +2035,15 @@ func (db *DB) monitor() {
		case <-ticker.C:
		}

		// If in backoff mode, wait additional time before retrying.
		if backoff > 0 {
			select {
			case <-db.ctx.Done():
				return
			case <-time.After(backoff):
			}
		}

		// Check if WAL has changed before doing expensive sync.
		walPath := db.WALPath()
		fi, err := os.Stat(walPath)
@@ -2045,9 +2080,44 @@ func (db *DB) monitor() {
		lastWALHeader = walHeader

		if err := db.Sync(db.ctx); err != nil && !errors.Is(err, context.Canceled) {
			db.Logger.Error("sync error", "error", err)
			consecutiveErrs++

			// Exponential backoff: MonitorInterval -> 2x -> 4x -> ... -> max
			if backoff == 0 {
				backoff = db.MonitorInterval
			} else {
				backoff *= 2
				if backoff > DefaultSyncBackoffMax {
					backoff = DefaultSyncBackoffMax
				}
			}

			// Log with rate limiting to avoid log spam during persistent errors.
			if time.Since(lastLogTime) >= SyncErrorLogInterval {
				db.Logger.Error("sync error",
					"error", err,
					"consecutive_errors", consecutiveErrs,
					"backoff", backoff)
				lastLogTime = time.Now()
			}

			// Try to clean up stale temp files after persistent disk errors.
			if isDiskFullError(err) && consecutiveErrs >= 3 {
				db.Logger.Warn("attempting temp file cleanup due to persistent disk errors")
				if cleanupErr := removeTmpFiles(db.metaPath); cleanupErr != nil {
					db.Logger.Error("temp file cleanup failed", "error", cleanupErr)
				}
			}
			continue
		}

		// Success - reset backoff and error counter.
		if consecutiveErrs > 0 {
			db.Logger.Info("sync recovered", "previous_errors", consecutiveErrs)
		}
		backoff = 0
		consecutiveErrs = 0
	}
}

// CRC64 returns a CRC-64 ISO checksum of the database and its current position.
@@ -1058,3 +1058,106 @@ func TestDB_Monitor_DetectsSaltChangeAfterRestart(t *testing.T) {
		t.Log("Monitor correctly detected changes after WAL salt reset")
	}
}

// TestIsDiskFullError tests the disk full error detection helper.
func TestIsDiskFullError(t *testing.T) {
	tests := []struct {
		name     string
		err      error
		expected bool
	}{
		{
			name:     "nil error",
			err:      nil,
			expected: false,
		},
		{
			name:     "no space left on device",
			err:      errors.New("write /tmp/file: no space left on device"),
			expected: true,
		},
		{
			name:     "No Space Left On Device (uppercase)",
			err:      errors.New("No Space Left On Device"),
			expected: true,
		},
		{
			name:     "disk quota exceeded",
			err:      errors.New("write: disk quota exceeded"),
			expected: true,
		},
		{
			name:     "ENOSPC",
			err:      errors.New("ENOSPC: cannot write file"),
			expected: true,
		},
		{
			name:     "EDQUOT",
			err:      errors.New("error EDQUOT while writing"),
			expected: true,
		},
		{
			name:     "regular error",
			err:      errors.New("connection refused"),
			expected: false,
		},
		{
			name:     "permission denied",
			err:      errors.New("permission denied"),
			expected: false,
		},
		{
			name:     "wrapped disk full error",
			err:      fmt.Errorf("sync failed: %w", errors.New("no space left on device")),
			expected: true,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result := isDiskFullError(tt.err)
			if result != tt.expected {
				t.Errorf("isDiskFullError(%v) = %v, want %v", tt.err, result, tt.expected)
			}
		})
	}
}

// TestIsSQLiteBusyError tests the SQLite busy error detection helper.
func TestIsSQLiteBusyError(t *testing.T) {
	tests := []struct {
		name     string
		err      error
		expected bool
	}{
		{
			name:     "nil error",
			err:      nil,
			expected: false,
		},
		{
			name:     "database is locked",
			err:      errors.New("database is locked"),
			expected: true,
		},
		{
			name:     "SQLITE_BUSY",
			err:      errors.New("SQLITE_BUSY: cannot commit"),
			expected: true,
		},
		{
			name:     "regular error",
			err:      errors.New("connection refused"),
			expected: false,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result := isSQLiteBusyError(tt.err)
			if result != tt.expected {
				t.Errorf("isSQLiteBusyError(%v) = %v, want %v", tt.err, result, tt.expected)
			}
		})
	}
}
replica.go (43 changed lines)
@@ -303,6 +303,8 @@ func (r *Replica) deleteBeforeTXID(ctx context.Context, level int, txID ltx.TXID
*/

// monitor runs in a separate goroutine and continuously replicates the DB.
// Implements exponential backoff on repeated sync errors to prevent log spam
// and reduce load when persistent errors occur. See issue #927.
func (r *Replica) monitor(ctx context.Context) {
	ticker := time.NewTicker(r.SyncInterval)
	defer ticker.Stop()
@@ -312,6 +314,10 @@ func (r *Replica) monitor(ctx context.Context) {
	close(ch)
	var notify <-chan struct{} = ch

	var backoff time.Duration
	var lastLogTime time.Time
	var consecutiveErrs int

	for initial := true; ; initial = false {
		// Enforce a minimum time between synchronization.
		if !initial {
@@ -322,6 +328,15 @@ func (r *Replica) monitor(ctx context.Context) {
			}
		}

		// If in backoff mode, wait additional time before retrying.
		if backoff > 0 {
			select {
			case <-ctx.Done():
				return
			case <-time.After(backoff):
			}
		}

		// Wait for changes to the database.
		select {
		case <-ctx.Done():
@@ -336,10 +351,36 @@ func (r *Replica) monitor(ctx context.Context) {
		if err := r.Sync(ctx); err != nil {
			// Don't log context cancellation errors during shutdown
			if !errors.Is(err, context.Canceled) && !errors.Is(err, context.DeadlineExceeded) {
				r.Logger().Error("monitor error", "error", err)
				consecutiveErrs++

				// Exponential backoff: SyncInterval -> 2x -> 4x -> ... -> max
				if backoff == 0 {
					backoff = r.SyncInterval
				} else {
					backoff *= 2
					if backoff > DefaultSyncBackoffMax {
						backoff = DefaultSyncBackoffMax
					}
				}

				// Log with rate limiting to avoid log spam during persistent errors.
				if time.Since(lastLogTime) >= SyncErrorLogInterval {
					r.Logger().Error("monitor error",
						"error", err,
						"consecutive_errors", consecutiveErrs,
						"backoff", backoff)
					lastLogTime = time.Now()
				}
			}
			continue
		}

		// Success - reset backoff and error counter.
		if consecutiveErrs > 0 {
			r.Logger().Info("replica sync recovered", "previous_errors", consecutiveErrs)
		}
		backoff = 0
		consecutiveErrs = 0
	}
}