Fix stack overflow in acquireStartLock due to infinite recursion (#1131)
The acquireStartLock function would recursively call itself after attempting to remove a stale lock file. If os.Remove failed (due to permissions, race conditions, etc.), the error was silently ignored with `_`, causing infinite recursion until the 1GB stack limit was exceeded.

Changes:
- Convert recursive calls to a bounded retry loop (max 3 attempts)
- Check removeFileFn return value before retrying
- Apply the same fix to handleStaleLock, which had the same issue
- Add a test to verify the function returns false when remove fails

Fixes the stack overflow crash that occurred when running any bd command with a stale or problematic lock file.

Co-authored-by: Steven Syrek <steven.syrek@deepl.com> Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -220,17 +220,29 @@ func acquireStartLock(lockPath, socketPath string) bool {
|
|||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
// nolint:gosec // G304: lockPath is derived from secure beads directory
|
// Bounded retry loop to prevent infinite recursion when lock cleanup fails
|
||||||
lockFile, err := os.OpenFile(lockPath, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0600)
|
const maxRetries = 3
|
||||||
if err != nil {
|
for attempt := 0; attempt < maxRetries; attempt++ {
|
||||||
|
// nolint:gosec // G304: lockPath is derived from secure beads directory
|
||||||
|
lockFile, err := os.OpenFile(lockPath, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0600)
|
||||||
|
if err == nil {
|
||||||
|
// Successfully acquired lock
|
||||||
|
_, _ = fmt.Fprintf(lockFile, "%d\n", os.Getpid())
|
||||||
|
_ = lockFile.Close() // Best-effort close during startup
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
|
||||||
// Lock file exists - check if daemon is actually starting
|
// Lock file exists - check if daemon is actually starting
|
||||||
lockPID, pidErr := readPIDFromFile(lockPath)
|
lockPID, pidErr := readPIDFromFile(lockPath)
|
||||||
if pidErr != nil || !isPIDAlive(lockPID) {
|
if pidErr != nil || !isPIDAlive(lockPID) {
|
||||||
// Stale lock from crashed process - clean up immediately (avoids 5s wait)
|
// Stale lock from crashed process - clean up immediately (avoids 5s wait)
|
||||||
debugLog("startlock is stale (PID %d dead or unreadable), cleaning up", lockPID)
|
debugLog("startlock is stale (PID %d dead or unreadable), cleaning up", lockPID)
|
||||||
_ = os.Remove(lockPath)
|
if rmErr := removeFileFn(lockPath); rmErr != nil {
|
||||||
// Retry lock acquisition after cleanup
|
debugLog("failed to remove stale lock file: %v", rmErr)
|
||||||
return acquireStartLock(lockPath, socketPath)
|
return false // Can't acquire lock if we can't clean up
|
||||||
|
}
|
||||||
|
// Continue to next iteration to retry lock acquisition
|
||||||
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
// PID is alive - but is daemon actually running/starting?
|
// PID is alive - but is daemon actually running/starting?
|
||||||
@@ -239,8 +251,12 @@ func acquireStartLock(lockPath, socketPath string) bool {
|
|||||||
if running, _ := lockfile.TryDaemonLock(beadsDir); !running {
|
if running, _ := lockfile.TryDaemonLock(beadsDir); !running {
|
||||||
// Daemon lock not held - the start attempt failed or process was reused
|
// Daemon lock not held - the start attempt failed or process was reused
|
||||||
debugLog("startlock PID %d alive but daemon lock not held, cleaning up", lockPID)
|
debugLog("startlock PID %d alive but daemon lock not held, cleaning up", lockPID)
|
||||||
_ = os.Remove(lockPath)
|
if rmErr := removeFileFn(lockPath); rmErr != nil {
|
||||||
return acquireStartLock(lockPath, socketPath)
|
debugLog("failed to remove orphaned lock file: %v", rmErr)
|
||||||
|
return false // Can't acquire lock if we can't clean up
|
||||||
|
}
|
||||||
|
// Continue to next iteration to retry lock acquisition
|
||||||
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
// Daemon lock is held - daemon is legitimately starting, wait for socket
|
// Daemon lock is held - daemon is legitimately starting, wait for socket
|
||||||
@@ -251,9 +267,8 @@ func acquireStartLock(lockPath, socketPath string) bool {
|
|||||||
return handleStaleLock(lockPath, socketPath)
|
return handleStaleLock(lockPath, socketPath)
|
||||||
}
|
}
|
||||||
|
|
||||||
_, _ = fmt.Fprintf(lockFile, "%d\n", os.Getpid())
|
debugLog("failed to acquire start lock after %d attempts", maxRetries)
|
||||||
_ = lockFile.Close() // Best-effort close during startup
|
return false
|
||||||
return true
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func handleStaleLock(lockPath, socketPath string) bool {
|
func handleStaleLock(lockPath, socketPath string) bool {
|
||||||
@@ -262,7 +277,10 @@ func handleStaleLock(lockPath, socketPath string) bool {
|
|||||||
// Check if PID is dead
|
// Check if PID is dead
|
||||||
if err != nil || !isPIDAlive(lockPID) {
|
if err != nil || !isPIDAlive(lockPID) {
|
||||||
debugLog("lock is stale (PID %d dead or unreadable), removing and retrying", lockPID)
|
debugLog("lock is stale (PID %d dead or unreadable), removing and retrying", lockPID)
|
||||||
_ = os.Remove(lockPath)
|
if rmErr := removeFileFn(lockPath); rmErr != nil {
|
||||||
|
debugLog("failed to remove stale lock in handleStaleLock: %v", rmErr)
|
||||||
|
return false
|
||||||
|
}
|
||||||
return tryAutoStartDaemon(socketPath)
|
return tryAutoStartDaemon(socketPath)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -270,7 +288,10 @@ func handleStaleLock(lockPath, socketPath string) bool {
|
|||||||
beadsDir := filepath.Dir(dbPath)
|
beadsDir := filepath.Dir(dbPath)
|
||||||
if running, _ := lockfile.TryDaemonLock(beadsDir); !running {
|
if running, _ := lockfile.TryDaemonLock(beadsDir); !running {
|
||||||
debugLog("lock PID %d alive but daemon lock not held, removing and retrying", lockPID)
|
debugLog("lock PID %d alive but daemon lock not held, removing and retrying", lockPID)
|
||||||
_ = os.Remove(lockPath)
|
if rmErr := removeFileFn(lockPath); rmErr != nil {
|
||||||
|
debugLog("failed to remove orphaned lock in handleStaleLock: %v", rmErr)
|
||||||
|
return false
|
||||||
|
}
|
||||||
return tryAutoStartDaemon(socketPath)
|
return tryAutoStartDaemon(socketPath)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -145,6 +145,35 @@ func TestDaemonAutostart_AcquireStartLock_CreatesMissingDir(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestDaemonAutostart_AcquireStartLock_FailsWhenRemoveFails(t *testing.T) {
|
||||||
|
// This test verifies that acquireStartLock returns false (instead of
|
||||||
|
// recursing infinitely) when os.Remove fails on a stale lock file.
|
||||||
|
	// See: https://github.com/steveyegge/beads/pull/1131
|
||||||
|
|
||||||
|
oldRemove := removeFileFn
|
||||||
|
defer func() { removeFileFn = oldRemove }()
|
||||||
|
|
||||||
|
// Stub removeFileFn to always fail
|
||||||
|
removeFileFn = func(path string) error {
|
||||||
|
return os.ErrPermission
|
||||||
|
}
|
||||||
|
|
||||||
|
tmpDir := t.TempDir()
|
||||||
|
lockPath := filepath.Join(tmpDir, "bd.sock.startlock")
|
||||||
|
socketPath := filepath.Join(tmpDir, "bd.sock")
|
||||||
|
|
||||||
|
// Create a stale lock file with PID 0 (will be detected as dead)
|
||||||
|
if err := os.WriteFile(lockPath, []byte("0\n"), 0o600); err != nil {
|
||||||
|
t.Fatalf("WriteFile: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// acquireStartLock should return false since it can't remove the stale lock
|
||||||
|
// Previously, this would cause infinite recursion and stack overflow
|
||||||
|
if acquireStartLock(lockPath, socketPath) {
|
||||||
|
t.Fatalf("expected acquireStartLock to fail when remove fails")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestDaemonAutostart_SocketHealthAndReadiness(t *testing.T) {
|
func TestDaemonAutostart_SocketHealthAndReadiness(t *testing.T) {
|
||||||
socketPath, cleanup := startTestRPCServer(t)
|
socketPath, cleanup := startTestRPCServer(t)
|
||||||
defer cleanup()
|
defer cleanup()
|
||||||
|
|||||||
Reference in New Issue
Block a user