Fix stack overflow in acquireStartLock due to infinite recursion (#1131)

The acquireStartLock function would recursively call itself after
attempting to remove a stale lock file. If os.Remove failed (due to
permissions, race conditions, etc.), the error was silently ignored
with `_`, causing infinite recursion until the 1GB stack limit was
exceeded.

Changes:
- Convert recursive calls to a bounded retry loop (max 3 attempts)
- Check removeFileFn return value before retrying
- Apply same fix to handleStaleLock which had the same issue
- Add test to verify function returns false when remove fails

Fixes the stack overflow crash that occurred when running any bd
command with a stale or problematic lock file.

Co-authored-by: Steven Syrek <steven.syrek@deepl.com>
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Steven Syrek
2026-01-16 23:36:59 +01:00
committed by GitHub
parent 3d813c2143
commit b51c0101ba
2 changed files with 63 additions and 13 deletions

View File

@@ -220,17 +220,29 @@ func acquireStartLock(lockPath, socketPath string) bool {
return false
}
// nolint:gosec // G304: lockPath is derived from secure beads directory
lockFile, err := os.OpenFile(lockPath, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0600)
if err != nil {
// Bounded retry loop to prevent infinite recursion when lock cleanup fails
const maxRetries = 3
for attempt := 0; attempt < maxRetries; attempt++ {
// nolint:gosec // G304: lockPath is derived from secure beads directory
lockFile, err := os.OpenFile(lockPath, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0600)
if err == nil {
// Successfully acquired lock
_, _ = fmt.Fprintf(lockFile, "%d\n", os.Getpid())
_ = lockFile.Close() // Best-effort close during startup
return true
}
// Lock file exists - check if daemon is actually starting
lockPID, pidErr := readPIDFromFile(lockPath)
if pidErr != nil || !isPIDAlive(lockPID) {
// Stale lock from crashed process - clean up immediately (avoids 5s wait)
debugLog("startlock is stale (PID %d dead or unreadable), cleaning up", lockPID)
_ = os.Remove(lockPath)
// Retry lock acquisition after cleanup
return acquireStartLock(lockPath, socketPath)
if rmErr := removeFileFn(lockPath); rmErr != nil {
debugLog("failed to remove stale lock file: %v", rmErr)
return false // Can't acquire lock if we can't clean up
}
// Continue to next iteration to retry lock acquisition
continue
}
// PID is alive - but is daemon actually running/starting?
@@ -239,8 +251,12 @@ func acquireStartLock(lockPath, socketPath string) bool {
if running, _ := lockfile.TryDaemonLock(beadsDir); !running {
// Daemon lock not held - the start attempt failed or process was reused
debugLog("startlock PID %d alive but daemon lock not held, cleaning up", lockPID)
_ = os.Remove(lockPath)
return acquireStartLock(lockPath, socketPath)
if rmErr := removeFileFn(lockPath); rmErr != nil {
debugLog("failed to remove orphaned lock file: %v", rmErr)
return false // Can't acquire lock if we can't clean up
}
// Continue to next iteration to retry lock acquisition
continue
}
// Daemon lock is held - daemon is legitimately starting, wait for socket
@@ -251,9 +267,8 @@ func acquireStartLock(lockPath, socketPath string) bool {
return handleStaleLock(lockPath, socketPath)
}
_, _ = fmt.Fprintf(lockFile, "%d\n", os.Getpid())
_ = lockFile.Close() // Best-effort close during startup
return true
debugLog("failed to acquire start lock after %d attempts", maxRetries)
return false
}
func handleStaleLock(lockPath, socketPath string) bool {
@@ -262,7 +277,10 @@ func handleStaleLock(lockPath, socketPath string) bool {
// Check if PID is dead
if err != nil || !isPIDAlive(lockPID) {
debugLog("lock is stale (PID %d dead or unreadable), removing and retrying", lockPID)
_ = os.Remove(lockPath)
if rmErr := removeFileFn(lockPath); rmErr != nil {
debugLog("failed to remove stale lock in handleStaleLock: %v", rmErr)
return false
}
return tryAutoStartDaemon(socketPath)
}
@@ -270,7 +288,10 @@ func handleStaleLock(lockPath, socketPath string) bool {
beadsDir := filepath.Dir(dbPath)
if running, _ := lockfile.TryDaemonLock(beadsDir); !running {
debugLog("lock PID %d alive but daemon lock not held, removing and retrying", lockPID)
_ = os.Remove(lockPath)
if rmErr := removeFileFn(lockPath); rmErr != nil {
debugLog("failed to remove orphaned lock in handleStaleLock: %v", rmErr)
return false
}
return tryAutoStartDaemon(socketPath)
}