fix: eliminate 5+ second delays from stale daemon.lock files (bd-htwx)

Root cause: When a bd daemon crashes, its daemon.lock file remains with
the old PID. If that PID gets reused by an unrelated process, the code
would wait 5 seconds for a socket that will never appear.

Fix: Use flock-based check as authoritative source for daemon liveness.
The OS releases flocks when a process dies, so this is immune to PID reuse.

Changes:
- handleExistingSocket: Check daemon flock before waiting for socket
- acquireStartLock: Verify daemon lock is held before waiting
- handleStaleLock: Use flock check to detect stale startlocks
- lockfile/process_*.go: Add pid <= 0 check to prevent false positives
  (PID 0 signals process group on Unix, not a specific process)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
beads/crew/emma
2025-12-31 13:03:49 -08:00
committed by Steve Yegge
parent e2121a2e07
commit 21de43538d
3 changed files with 46 additions and 11 deletions

View File

@@ -11,6 +11,7 @@ import (
"github.com/steveyegge/beads/internal/config" "github.com/steveyegge/beads/internal/config"
"github.com/steveyegge/beads/internal/debug" "github.com/steveyegge/beads/internal/debug"
"github.com/steveyegge/beads/internal/lockfile"
"github.com/steveyegge/beads/internal/rpc" "github.com/steveyegge/beads/internal/rpc"
"github.com/steveyegge/beads/internal/ui" "github.com/steveyegge/beads/internal/ui"
) )
@@ -217,7 +218,7 @@ func acquireStartLock(lockPath, socketPath string) bool {
// nolint:gosec // G304: lockPath is derived from secure beads directory // nolint:gosec // G304: lockPath is derived from secure beads directory
lockFile, err := os.OpenFile(lockPath, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0600) lockFile, err := os.OpenFile(lockPath, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0600)
if err != nil { if err != nil {
// Lock file exists - check if it's from a dead process (stale) or alive daemon // Lock file exists - check if daemon is actually starting
lockPID, pidErr := readPIDFromFile(lockPath) lockPID, pidErr := readPIDFromFile(lockPath)
if pidErr != nil || !isPIDAlive(lockPID) { if pidErr != nil || !isPIDAlive(lockPID) {
// Stale lock from crashed process - clean up immediately (avoids 5s wait) // Stale lock from crashed process - clean up immediately (avoids 5s wait)
@@ -227,7 +228,17 @@ func acquireStartLock(lockPath, socketPath string) bool {
return acquireStartLock(lockPath, socketPath) return acquireStartLock(lockPath, socketPath)
} }
// PID is alive - daemon is legitimately starting, wait for socket to be ready // PID is alive - but is daemon actually running/starting?
// Use flock-based check as authoritative source (immune to PID reuse)
beadsDir := filepath.Dir(socketPath)
if running, _ := lockfile.TryDaemonLock(beadsDir); !running {
// Daemon lock not held - the start attempt failed or process was reused
debugLog("startlock PID %d alive but daemon lock not held, cleaning up", lockPID)
_ = os.Remove(lockPath)
return acquireStartLock(lockPath, socketPath)
}
// Daemon lock is held - daemon is legitimately starting, wait for socket
debugLog("another process (PID %d) is starting daemon, waiting for readiness", lockPID) debugLog("another process (PID %d) is starting daemon, waiting for readiness", lockPID)
if waitForSocketReadiness(socketPath, 5*time.Second) { if waitForSocketReadiness(socketPath, 5*time.Second) {
return true return true
@@ -242,11 +253,24 @@ func acquireStartLock(lockPath, socketPath string) bool {
func handleStaleLock(lockPath, socketPath string) bool { func handleStaleLock(lockPath, socketPath string) bool {
lockPID, err := readPIDFromFile(lockPath) lockPID, err := readPIDFromFile(lockPath)
if err == nil && !isPIDAlive(lockPID) {
debugLog("lock is stale (PID %d dead), removing and retrying", lockPID) // Check if PID is dead
if err != nil || !isPIDAlive(lockPID) {
debugLog("lock is stale (PID %d dead or unreadable), removing and retrying", lockPID)
_ = os.Remove(lockPath) _ = os.Remove(lockPath)
return tryAutoStartDaemon(socketPath) return tryAutoStartDaemon(socketPath)
} }
// PID is alive - but check daemon lock as authoritative source (immune to PID reuse)
beadsDir := filepath.Dir(socketPath)
if running, _ := lockfile.TryDaemonLock(beadsDir); !running {
debugLog("lock PID %d alive but daemon lock not held, removing and retrying", lockPID)
_ = os.Remove(lockPath)
return tryAutoStartDaemon(socketPath)
}
// Daemon lock is held - daemon is genuinely running but socket isn't ready
// This shouldn't happen normally, but don't clean up a legitimate lock
return false return false
} }
@@ -260,19 +284,24 @@ func handleExistingSocket(socketPath string) bool {
return true return true
} }
pidFile := getPIDFileForSocket(socketPath) // Use flock-based check as authoritative source (immune to PID reuse)
if pidFile != "" { // If daemon lock is not held, daemon is definitely dead regardless of PID file
if pid, err := readPIDFromFile(pidFile); err == nil && isPIDAlive(pid) { beadsDir := filepath.Dir(socketPath)
debugLog("daemon PID %d alive, waiting for socket", pid) if running, pid := lockfile.TryDaemonLock(beadsDir); running {
return waitForSocketReadiness(socketPath, 5*time.Second) debugLog("daemon lock held (PID %d), waiting for socket", pid)
} return waitForSocketReadiness(socketPath, 5*time.Second)
} }
debugLog("socket is stale, cleaning up") // Lock not held - daemon is dead, clean up stale artifacts
debugLog("socket is stale (daemon lock not held), cleaning up")
_ = os.Remove(socketPath) // Best-effort cleanup, file may not exist _ = os.Remove(socketPath) // Best-effort cleanup, file may not exist
pidFile := getPIDFileForSocket(socketPath)
if pidFile != "" { if pidFile != "" {
_ = os.Remove(pidFile) // Best-effort cleanup, file may not exist _ = os.Remove(pidFile) // Best-effort cleanup, file may not exist
} }
// Also clean up daemon.lock file (contains stale metadata)
lockFile := filepath.Join(beadsDir, "daemon.lock")
_ = os.Remove(lockFile) // Best-effort cleanup
return false return false
} }

View File

@@ -8,5 +8,8 @@ import (
// isProcessRunning checks if a process with the given PID is running // isProcessRunning checks if a process with the given PID is running
func isProcessRunning(pid int) bool { func isProcessRunning(pid int) bool {
if pid <= 0 {
return false // Invalid PID (0 would signal our process group, not a specific process)
}
return syscall.Kill(pid, 0) == nil return syscall.Kill(pid, 0) == nil
} }

View File

@@ -10,6 +10,9 @@ const stillActive = 259
// isProcessRunning checks if a process with the given PID is running // isProcessRunning checks if a process with the given PID is running
func isProcessRunning(pid int) bool { func isProcessRunning(pid int) bool {
if pid <= 0 {
return false // Invalid PID
}
handle, err := windows.OpenProcess(windows.PROCESS_QUERY_LIMITED_INFORMATION, false, uint32(pid)) handle, err := windows.OpenProcess(windows.PROCESS_QUERY_LIMITED_INFORMATION, false, uint32(pid))
if err != nil { if err != nil {
return false return false