fix: eliminate 5+ second delays from stale daemon.lock files (bd-htwx)

Root cause: When a bd daemon crashes, its daemon.lock file remains with
the old PID. If that PID gets reused by an unrelated process, the code
would wait 5 seconds for a socket that will never appear.

Fix: Use flock-based check as authoritative source for daemon liveness.
The OS releases flocks when a process dies, so this is immune to PID reuse.

Changes:
- handleExistingSocket: Check daemon flock before waiting for socket
- acquireStartLock: Verify daemon lock is held before waiting
- handleStaleLock: Use flock check to detect stale startlocks
- lockfile/process_*.go: Add pid <= 0 check to prevent false positives
  (PID 0 signals process group on Unix, not a specific process)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
beads/crew/emma
2025-12-31 13:03:49 -08:00
committed by Steve Yegge
parent e2121a2e07
commit 21de43538d
3 changed files with 46 additions and 11 deletions

View File

@@ -11,6 +11,7 @@ import (
"github.com/steveyegge/beads/internal/config"
"github.com/steveyegge/beads/internal/debug"
"github.com/steveyegge/beads/internal/lockfile"
"github.com/steveyegge/beads/internal/rpc"
"github.com/steveyegge/beads/internal/ui"
)
@@ -217,7 +218,7 @@ func acquireStartLock(lockPath, socketPath string) bool {
// nolint:gosec // G304: lockPath is derived from secure beads directory
lockFile, err := os.OpenFile(lockPath, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0600)
if err != nil {
// Lock file exists - check if it's from a dead process (stale) or alive daemon
// Lock file exists - check if daemon is actually starting
lockPID, pidErr := readPIDFromFile(lockPath)
if pidErr != nil || !isPIDAlive(lockPID) {
// Stale lock from crashed process - clean up immediately (avoids 5s wait)
@@ -227,7 +228,17 @@ func acquireStartLock(lockPath, socketPath string) bool {
return acquireStartLock(lockPath, socketPath)
}
// PID is alive - daemon is legitimately starting, wait for socket to be ready
// PID is alive - but is daemon actually running/starting?
// Use flock-based check as authoritative source (immune to PID reuse)
beadsDir := filepath.Dir(socketPath)
if running, _ := lockfile.TryDaemonLock(beadsDir); !running {
// Daemon lock not held - the start attempt failed or process was reused
debugLog("startlock PID %d alive but daemon lock not held, cleaning up", lockPID)
_ = os.Remove(lockPath)
return acquireStartLock(lockPath, socketPath)
}
// Daemon lock is held - daemon is legitimately starting, wait for socket
debugLog("another process (PID %d) is starting daemon, waiting for readiness", lockPID)
if waitForSocketReadiness(socketPath, 5*time.Second) {
return true
@@ -242,11 +253,24 @@ func acquireStartLock(lockPath, socketPath string) bool {
func handleStaleLock(lockPath, socketPath string) bool {
lockPID, err := readPIDFromFile(lockPath)
if err == nil && !isPIDAlive(lockPID) {
debugLog("lock is stale (PID %d dead), removing and retrying", lockPID)
// Check if PID is dead
if err != nil || !isPIDAlive(lockPID) {
debugLog("lock is stale (PID %d dead or unreadable), removing and retrying", lockPID)
_ = os.Remove(lockPath)
return tryAutoStartDaemon(socketPath)
}
// PID is alive - but check daemon lock as authoritative source (immune to PID reuse)
beadsDir := filepath.Dir(socketPath)
if running, _ := lockfile.TryDaemonLock(beadsDir); !running {
debugLog("lock PID %d alive but daemon lock not held, removing and retrying", lockPID)
_ = os.Remove(lockPath)
return tryAutoStartDaemon(socketPath)
}
// Daemon lock is held - daemon is genuinely running but socket isn't ready
// This shouldn't happen normally, but don't clean up a legitimate lock
return false
}
@@ -260,19 +284,24 @@ func handleExistingSocket(socketPath string) bool {
return true
}
pidFile := getPIDFileForSocket(socketPath)
if pidFile != "" {
if pid, err := readPIDFromFile(pidFile); err == nil && isPIDAlive(pid) {
debugLog("daemon PID %d alive, waiting for socket", pid)
return waitForSocketReadiness(socketPath, 5*time.Second)
}
// Use flock-based check as authoritative source (immune to PID reuse)
// If daemon lock is not held, daemon is definitely dead regardless of PID file
beadsDir := filepath.Dir(socketPath)
if running, pid := lockfile.TryDaemonLock(beadsDir); running {
debugLog("daemon lock held (PID %d), waiting for socket", pid)
return waitForSocketReadiness(socketPath, 5*time.Second)
}
debugLog("socket is stale, cleaning up")
// Lock not held - daemon is dead, clean up stale artifacts
debugLog("socket is stale (daemon lock not held), cleaning up")
_ = os.Remove(socketPath) // Best-effort cleanup, file may not exist
pidFile := getPIDFileForSocket(socketPath)
if pidFile != "" {
_ = os.Remove(pidFile) // Best-effort cleanup, file may not exist
}
// Also clean up daemon.lock file (contains stale metadata)
lockFile := filepath.Join(beadsDir, "daemon.lock")
_ = os.Remove(lockFile) // Best-effort cleanup
return false
}