From b51c0101ba670850151bd90226b4ce90c06a1a27 Mon Sep 17 00:00:00 2001 From: Steven Syrek Date: Fri, 16 Jan 2026 23:36:59 +0100 Subject: [PATCH] Fix stack overflow in acquireStartLock due to infinite recursion (#1131) The acquireStartLock function would recursively call itself after attempting to remove a stale lock file. If os.Remove failed (due to permissions, race conditions, etc.), the error was silently ignored with `_`, causing infinite recursion until the 1GB stack limit was exceeded. Changes: - Convert recursive calls to a bounded retry loop (max 3 attempts) - Check removeFileFn return value before retrying - Apply same fix to handleStaleLock which had the same issue - Add test to verify function returns false when remove fails Fixes the stack overflow crash that occurred when running any bd command with a stale or problematic lock file. Co-authored-by: Steven Syrek Co-authored-by: Claude Opus 4.5 --- cmd/bd/daemon_autostart.go | 47 ++++++++++++++++++++-------- cmd/bd/daemon_autostart_unit_test.go | 29 +++++++++++++++++ 2 files changed, 63 insertions(+), 13 deletions(-) diff --git a/cmd/bd/daemon_autostart.go b/cmd/bd/daemon_autostart.go index 213b1618..f9b5d225 100644 --- a/cmd/bd/daemon_autostart.go +++ b/cmd/bd/daemon_autostart.go @@ -220,17 +220,29 @@ func acquireStartLock(lockPath, socketPath string) bool { return false } - // nolint:gosec // G304: lockPath is derived from secure beads directory - lockFile, err := os.OpenFile(lockPath, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0600) - if err != nil { + // Bounded retry loop to prevent infinite recursion when lock cleanup fails + const maxRetries = 3 + for attempt := 0; attempt < maxRetries; attempt++ { + // nolint:gosec // G304: lockPath is derived from secure beads directory + lockFile, err := os.OpenFile(lockPath, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0600) + if err == nil { + // Successfully acquired lock + _, _ = fmt.Fprintf(lockFile, "%d\n", os.Getpid()) + _ = lockFile.Close() // Best-effort close during startup + return true + } + // Lock file exists - check if daemon is actually starting lockPID, pidErr := readPIDFromFile(lockPath) if pidErr != nil || !isPIDAlive(lockPID) { // Stale lock from crashed process - clean up immediately (avoids 5s wait) debugLog("startlock is stale (PID %d dead or unreadable), cleaning up", lockPID) - _ = os.Remove(lockPath) - // Retry lock acquisition after cleanup - return acquireStartLock(lockPath, socketPath) + if rmErr := removeFileFn(lockPath); rmErr != nil { + debugLog("failed to remove stale lock file: %v", rmErr) + return false // Can't acquire lock if we can't clean up + } + // Continue to next iteration to retry lock acquisition + continue } // PID is alive - but is daemon actually running/starting? @@ -239,8 +251,12 @@ func acquireStartLock(lockPath, socketPath string) bool { if running, _ := lockfile.TryDaemonLock(beadsDir); !running { // Daemon lock not held - the start attempt failed or process was reused debugLog("startlock PID %d alive but daemon lock not held, cleaning up", lockPID) - _ = os.Remove(lockPath) - return acquireStartLock(lockPath, socketPath) + if rmErr := removeFileFn(lockPath); rmErr != nil { + debugLog("failed to remove orphaned lock file: %v", rmErr) + return false // Can't acquire lock if we can't clean up + } + // Continue to next iteration to retry lock acquisition + continue } // Daemon lock is held - daemon is legitimately starting, wait for socket @@ -251,9 +267,8 @@ func acquireStartLock(lockPath, socketPath string) bool { return handleStaleLock(lockPath, socketPath) } - _, _ = fmt.Fprintf(lockFile, "%d\n", os.Getpid()) - _ = lockFile.Close() // Best-effort close during startup - return true + debugLog("failed to acquire start lock after %d attempts", maxRetries) + return false } func handleStaleLock(lockPath, socketPath string) bool { @@ -262,7 +277,10 @@ func handleStaleLock(lockPath, socketPath string) bool { // Check if PID is dead if err != nil || !isPIDAlive(lockPID) { debugLog("lock is stale (PID %d dead or unreadable), removing and retrying", lockPID) - _ = os.Remove(lockPath) + if rmErr := removeFileFn(lockPath); rmErr != nil { + debugLog("failed to remove stale lock in handleStaleLock: %v", rmErr) + return false + } return tryAutoStartDaemon(socketPath) } @@ -270,7 +288,10 @@ func handleStaleLock(lockPath, socketPath string) bool { beadsDir := filepath.Dir(dbPath) if running, _ := lockfile.TryDaemonLock(beadsDir); !running { debugLog("lock PID %d alive but daemon lock not held, removing and retrying", lockPID) - _ = os.Remove(lockPath) + if rmErr := removeFileFn(lockPath); rmErr != nil { + debugLog("failed to remove orphaned lock in handleStaleLock: %v", rmErr) + return false + } return tryAutoStartDaemon(socketPath) } diff --git a/cmd/bd/daemon_autostart_unit_test.go b/cmd/bd/daemon_autostart_unit_test.go index f9ec9e35..83e1ad9f 100644 --- a/cmd/bd/daemon_autostart_unit_test.go +++ b/cmd/bd/daemon_autostart_unit_test.go @@ -145,6 +145,35 @@ func TestDaemonAutostart_AcquireStartLock_CreatesMissingDir(t *testing.T) { } } +func TestDaemonAutostart_AcquireStartLock_FailsWhenRemoveFails(t *testing.T) { + // This test verifies that acquireStartLock returns false (instead of + // recursing infinitely) when os.Remove fails on a stale lock file. + // See: https://github.com/steveyegge/beads/issues/XXX + + oldRemove := removeFileFn + defer func() { removeFileFn = oldRemove }() + + // Stub removeFileFn to always fail + removeFileFn = func(path string) error { + return os.ErrPermission + } + + tmpDir := t.TempDir() + lockPath := filepath.Join(tmpDir, "bd.sock.startlock") + socketPath := filepath.Join(tmpDir, "bd.sock") + + // Create a stale lock file with PID 0 (will be detected as dead) + if err := os.WriteFile(lockPath, []byte("0\n"), 0o600); err != nil { + t.Fatalf("WriteFile: %v", err) + } + + // acquireStartLock should return false since it can't remove the stale lock + // Previously, this would cause infinite recursion and stack overflow + if acquireStartLock(lockPath, socketPath) { + t.Fatalf("expected acquireStartLock to fail when remove fails") + } +} + func TestDaemonAutostart_SocketHealthAndReadiness(t *testing.T) { socketPath, cleanup := startTestRPCServer(t) defer cleanup()