From b51c0101ba670850151bd90226b4ce90c06a1a27 Mon Sep 17 00:00:00 2001
From: Steven Syrek <sjsyrek@users.noreply.github.com>
Date: Fri, 16 Jan 2026 23:36:59 +0100
Subject: [PATCH] Fix stack overflow in acquireStartLock due to infinite
 recursion (#1131)

The acquireStartLock function would recursively call itself after
attempting to remove a stale lock file. If os.Remove failed (due to
permissions, race conditions, etc.), the error was silently ignored
with `_`, causing infinite recursion until the 1GB stack limit was
exceeded.

Changes:
- Convert recursive calls to a bounded retry loop (max 3 attempts)
- Check removeFileFn return value before retrying
- Apply same fix to handleStaleLock which had the same issue
- Add test to verify function returns false when remove fails

Fixes the stack overflow crash that occurred when running any bd
command with a stale or problematic lock file.

Co-authored-by: Steven Syrek <steven.syrek@deepl.com>
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
---
 cmd/bd/daemon_autostart.go           | 47 ++++++++++++++++++++--------
 cmd/bd/daemon_autostart_unit_test.go | 29 +++++++++++++++++
 2 files changed, 63 insertions(+), 13 deletions(-)

diff --git a/cmd/bd/daemon_autostart.go b/cmd/bd/daemon_autostart.go
index 213b1618..f9b5d225 100644
--- a/cmd/bd/daemon_autostart.go
+++ b/cmd/bd/daemon_autostart.go
@@ -220,17 +220,29 @@ func acquireStartLock(lockPath, socketPath string) bool {
 		return false
 	}
 
-	// nolint:gosec // G304: lockPath is derived from secure beads directory
-	lockFile, err := os.OpenFile(lockPath, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0600)
-	if err != nil {
+	// Bounded retry loop to prevent infinite recursion when lock cleanup fails
+	const maxRetries = 3
+	for attempt := 0; attempt < maxRetries; attempt++ {
+		// nolint:gosec // G304: lockPath is derived from secure beads directory
+		lockFile, err := os.OpenFile(lockPath, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0600)
+		if err == nil {
+			// Successfully acquired lock
+			_, _ = fmt.Fprintf(lockFile, "%d\n", os.Getpid())
+			_ = lockFile.Close() // Best-effort close during startup
+			return true
+		}
+
 		// Lock file exists - check if daemon is actually starting
 		lockPID, pidErr := readPIDFromFile(lockPath)
 		if pidErr != nil || !isPIDAlive(lockPID) {
 			// Stale lock from crashed process - clean up immediately (avoids 5s wait)
 			debugLog("startlock is stale (PID %d dead or unreadable), cleaning up", lockPID)
-			_ = os.Remove(lockPath)
-			// Retry lock acquisition after cleanup
-			return acquireStartLock(lockPath, socketPath)
+			if rmErr := removeFileFn(lockPath); rmErr != nil {
+				debugLog("failed to remove stale lock file: %v", rmErr)
+				return false // Can't acquire lock if we can't clean up
+			}
+			// Continue to next iteration to retry lock acquisition
+			continue
 		}
 
 		// PID is alive - but is daemon actually running/starting?
@@ -239,8 +251,12 @@ func acquireStartLock(lockPath, socketPath string) bool {
 		if running, _ := lockfile.TryDaemonLock(beadsDir); !running {
 			// Daemon lock not held - the start attempt failed or process was reused
 			debugLog("startlock PID %d alive but daemon lock not held, cleaning up", lockPID)
-			_ = os.Remove(lockPath)
-			return acquireStartLock(lockPath, socketPath)
+			if rmErr := removeFileFn(lockPath); rmErr != nil {
+				debugLog("failed to remove orphaned lock file: %v", rmErr)
+				return false // Can't acquire lock if we can't clean up
+			}
+			// Continue to next iteration to retry lock acquisition
+			continue
 		}
 
 		// Daemon lock is held - daemon is legitimately starting, wait for socket
@@ -251,9 +267,8 @@ func acquireStartLock(lockPath, socketPath string) bool {
 		return handleStaleLock(lockPath, socketPath)
 	}
 
-	_, _ = fmt.Fprintf(lockFile, "%d\n", os.Getpid())
-	_ = lockFile.Close() // Best-effort close during startup
-	return true
+	debugLog("failed to acquire start lock after %d attempts", maxRetries)
+	return false
 }
 
 func handleStaleLock(lockPath, socketPath string) bool {
@@ -262,7 +277,10 @@ func handleStaleLock(lockPath, socketPath string) bool {
 	// Check if PID is dead
 	if err != nil || !isPIDAlive(lockPID) {
 		debugLog("lock is stale (PID %d dead or unreadable), removing and retrying", lockPID)
-		_ = os.Remove(lockPath)
+		if rmErr := removeFileFn(lockPath); rmErr != nil {
+			debugLog("failed to remove stale lock in handleStaleLock: %v", rmErr)
+			return false
+		}
 		return tryAutoStartDaemon(socketPath)
 	}
 
@@ -270,7 +288,10 @@ func handleStaleLock(lockPath, socketPath string) bool {
 	beadsDir := filepath.Dir(dbPath)
 	if running, _ := lockfile.TryDaemonLock(beadsDir); !running {
 		debugLog("lock PID %d alive but daemon lock not held, removing and retrying", lockPID)
-		_ = os.Remove(lockPath)
+		if rmErr := removeFileFn(lockPath); rmErr != nil {
+			debugLog("failed to remove orphaned lock in handleStaleLock: %v", rmErr)
+			return false
+		}
 		return tryAutoStartDaemon(socketPath)
 	}
 
diff --git a/cmd/bd/daemon_autostart_unit_test.go b/cmd/bd/daemon_autostart_unit_test.go
index f9ec9e35..83e1ad9f 100644
--- a/cmd/bd/daemon_autostart_unit_test.go
+++ b/cmd/bd/daemon_autostart_unit_test.go
@@ -145,6 +145,35 @@ func TestDaemonAutostart_AcquireStartLock_CreatesMissingDir(t *testing.T) {
 	}
 }
 
+func TestDaemonAutostart_AcquireStartLock_FailsWhenRemoveFails(t *testing.T) {
+	// This test verifies that acquireStartLock returns false (instead of
+	// recursing infinitely) when os.Remove fails on a stale lock file.
+	// See: https://github.com/steveyegge/beads/issues/XXX
+
+	oldRemove := removeFileFn
+	defer func() { removeFileFn = oldRemove }()
+
+	// Stub removeFileFn to always fail
+	removeFileFn = func(path string) error {
+		return os.ErrPermission
+	}
+
+	tmpDir := t.TempDir()
+	lockPath := filepath.Join(tmpDir, "bd.sock.startlock")
+	socketPath := filepath.Join(tmpDir, "bd.sock")
+
+	// Create a stale lock file with PID 0 (will be detected as dead)
+	if err := os.WriteFile(lockPath, []byte("0\n"), 0o600); err != nil {
+		t.Fatalf("WriteFile: %v", err)
+	}
+
+	// acquireStartLock should return false since it can't remove the stale lock
+	// Previously, this would cause infinite recursion and stack overflow
+	if acquireStartLock(lockPath, socketPath) {
+		t.Fatalf("expected acquireStartLock to fail when remove fails")
+	}
+}
+
 func TestDaemonAutostart_SocketHealthAndReadiness(t *testing.T) {
 	socketPath, cleanup := startTestRPCServer(t)
 	defer cleanup()