Files
beads/cmd/bd/daemon_sync_state.go
Steven Syrek ba832c4f37 fix(daemon): Eliminate race condition in sync state load-modify-save (#1139)
The RecordSyncFailure and ResetBackoffOnDaemonStart functions had a
TOCTOU (time-of-check-time-of-use) race condition. They called
LoadSyncState (which locks, reads, unlocks) then modified the state,
then called SaveSyncState (which locks, writes, unlocks).

Between LoadSyncState returning and SaveSyncState being called, another
goroutine could load the old state, modify it, and save it - then this
goroutine's save would overwrite those changes.

Fix: Create internal unlocked helper functions (loadSyncStateUnlocked,
saveSyncStateUnlocked) and have RecordSyncFailure and
ResetBackoffOnDaemonStart hold the lock for the entire load-modify-save
operation.

Co-authored-by: Steven Syrek <steven.syrek@deepl.com>
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-19 10:08:41 -08:00

184 lines
4.9 KiB
Go

package main
import (
"encoding/json"
"os"
"path/filepath"
"sync"
"time"
)
// SyncState tracks daemon sync health for backoff and user hints.
// Stored in .beads/sync-state.json (gitignored, local-only).
type SyncState struct {
LastFailure time.Time `json:"last_failure,omitempty"`
FailureCount int `json:"failure_count"`
BackoffUntil time.Time `json:"backoff_until,omitempty"`
NeedsManualSync bool `json:"needs_manual_sync"`
FailureReason string `json:"failure_reason,omitempty"`
}
const (
syncStateFile = "sync-state.json"
// Backoff schedule: 30s, 1m, 2m, 5m, 10m, 30m (cap)
maxBackoffDuration = 30 * time.Minute
// Clear stale state after 24 hours
staleStateThreshold = 24 * time.Hour
)
var (
// backoffSchedule defines the exponential backoff durations
backoffSchedule = []time.Duration{
30 * time.Second,
1 * time.Minute,
2 * time.Minute,
5 * time.Minute,
10 * time.Minute,
30 * time.Minute,
}
// syncStateMu protects concurrent access to sync state file
syncStateMu sync.Mutex
)
// LoadSyncState loads the sync state from .beads/sync-state.json.
// Returns empty state if file doesn't exist or is stale.
func LoadSyncState(beadsDir string) SyncState {
syncStateMu.Lock()
defer syncStateMu.Unlock()
return loadSyncStateUnlocked(beadsDir)
}
// loadSyncStateUnlocked loads sync state without acquiring lock.
// Caller must hold syncStateMu.
func loadSyncStateUnlocked(beadsDir string) SyncState {
statePath := filepath.Join(beadsDir, syncStateFile)
data, err := os.ReadFile(statePath) // #nosec G304 - path constructed from beadsDir
if err != nil {
return SyncState{}
}
var state SyncState
if err := json.Unmarshal(data, &state); err != nil {
return SyncState{}
}
// Clear stale state (older than 24h with no recent failures)
if !state.LastFailure.IsZero() && time.Since(state.LastFailure) > staleStateThreshold {
_ = os.Remove(statePath)
return SyncState{}
}
return state
}
// SaveSyncState saves the sync state to .beads/sync-state.json.
func SaveSyncState(beadsDir string, state SyncState) error {
syncStateMu.Lock()
defer syncStateMu.Unlock()
return saveSyncStateUnlocked(beadsDir, state)
}
// saveSyncStateUnlocked saves sync state without acquiring lock.
// Caller must hold syncStateMu.
func saveSyncStateUnlocked(beadsDir string, state SyncState) error {
statePath := filepath.Join(beadsDir, syncStateFile)
// If state is empty/reset, remove the file
if state.FailureCount == 0 && !state.NeedsManualSync {
_ = os.Remove(statePath)
return nil
}
data, err := json.MarshalIndent(state, "", " ")
if err != nil {
return err
}
return os.WriteFile(statePath, data, 0600)
}
// ClearSyncState removes the sync state file.
func ClearSyncState(beadsDir string) error {
syncStateMu.Lock()
defer syncStateMu.Unlock()
statePath := filepath.Join(beadsDir, syncStateFile)
err := os.Remove(statePath)
if os.IsNotExist(err) {
return nil
}
return err
}
// RecordSyncFailure updates the sync state after a failure.
// Returns the duration until next retry.
// Thread-safe: holds lock for entire load-modify-save operation to prevent races.
func RecordSyncFailure(beadsDir string, reason string) time.Duration {
syncStateMu.Lock()
defer syncStateMu.Unlock()
state := loadSyncStateUnlocked(beadsDir)
state.LastFailure = time.Now()
state.FailureCount++
state.FailureReason = reason
// Calculate backoff duration
backoffIndex := state.FailureCount - 1
if backoffIndex >= len(backoffSchedule) {
backoffIndex = len(backoffSchedule) - 1
}
backoff := backoffSchedule[backoffIndex]
state.BackoffUntil = time.Now().Add(backoff)
// Mark as needing manual sync after 3 failures (likely a conflict)
if state.FailureCount >= 3 {
state.NeedsManualSync = true
}
_ = saveSyncStateUnlocked(beadsDir, state)
return backoff
}
// RecordSyncSuccess clears the sync state after a successful sync.
func RecordSyncSuccess(beadsDir string) {
_ = ClearSyncState(beadsDir)
}
// ShouldSkipSync returns true if we're still in the backoff period.
func ShouldSkipSync(beadsDir string) bool {
state := LoadSyncState(beadsDir)
if state.BackoffUntil.IsZero() {
return false
}
return time.Now().Before(state.BackoffUntil)
}
// ResetBackoffOnDaemonStart resets backoff counters when daemon starts,
// but preserves NeedsManualSync flag so hints still show.
// This allows a fresh start while keeping user informed of conflicts.
// Thread-safe: holds lock for entire load-modify-save operation to prevent races.
func ResetBackoffOnDaemonStart(beadsDir string) {
syncStateMu.Lock()
defer syncStateMu.Unlock()
state := loadSyncStateUnlocked(beadsDir)
// Nothing to reset
if state.FailureCount == 0 && !state.NeedsManualSync {
return
}
// Reset backoff but preserve NeedsManualSync
needsManual := state.NeedsManualSync
reason := state.FailureReason
state = SyncState{
NeedsManualSync: needsManual,
FailureReason: reason,
}
_ = saveSyncStateUnlocked(beadsDir, state)
}