fix: eliminate 5+ second delays from stale daemon.lock files (bd-htwx)
Root cause: When a bd daemon crashes, its daemon.lock file remains with the old PID. If that PID gets reused by an unrelated process, the code would wait 5 seconds for a socket that will never appear. Fix: Use flock-based check as authoritative source for daemon liveness. The OS releases flocks when a process dies, so this is immune to PID reuse. Changes: - handleExistingSocket: Check daemon flock before waiting for socket - acquireStartLock: Verify daemon lock is held before waiting - handleStaleLock: Use flock check to detect stale startlocks - lockfile/process_*.go: Add pid <= 0 check to prevent false positives (PID 0 signals process group on Unix, not a specific process) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
committed by
Steve Yegge
parent
e2121a2e07
commit
21de43538d
@@ -11,6 +11,7 @@ import (
|
||||
|
||||
"github.com/steveyegge/beads/internal/config"
|
||||
"github.com/steveyegge/beads/internal/debug"
|
||||
"github.com/steveyegge/beads/internal/lockfile"
|
||||
"github.com/steveyegge/beads/internal/rpc"
|
||||
"github.com/steveyegge/beads/internal/ui"
|
||||
)
|
||||
@@ -217,7 +218,7 @@ func acquireStartLock(lockPath, socketPath string) bool {
|
||||
// nolint:gosec // G304: lockPath is derived from secure beads directory
|
||||
lockFile, err := os.OpenFile(lockPath, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0600)
|
||||
if err != nil {
|
||||
// Lock file exists - check if it's from a dead process (stale) or alive daemon
|
||||
// Lock file exists - check if daemon is actually starting
|
||||
lockPID, pidErr := readPIDFromFile(lockPath)
|
||||
if pidErr != nil || !isPIDAlive(lockPID) {
|
||||
// Stale lock from crashed process - clean up immediately (avoids 5s wait)
|
||||
@@ -227,7 +228,17 @@ func acquireStartLock(lockPath, socketPath string) bool {
|
||||
return acquireStartLock(lockPath, socketPath)
|
||||
}
|
||||
|
||||
// PID is alive - daemon is legitimately starting, wait for socket to be ready
|
||||
// PID is alive - but is daemon actually running/starting?
|
||||
// Use flock-based check as authoritative source (immune to PID reuse)
|
||||
beadsDir := filepath.Dir(socketPath)
|
||||
if running, _ := lockfile.TryDaemonLock(beadsDir); !running {
|
||||
// Daemon lock not held - the start attempt failed or process was reused
|
||||
debugLog("startlock PID %d alive but daemon lock not held, cleaning up", lockPID)
|
||||
_ = os.Remove(lockPath)
|
||||
return acquireStartLock(lockPath, socketPath)
|
||||
}
|
||||
|
||||
// Daemon lock is held - daemon is legitimately starting, wait for socket
|
||||
debugLog("another process (PID %d) is starting daemon, waiting for readiness", lockPID)
|
||||
if waitForSocketReadiness(socketPath, 5*time.Second) {
|
||||
return true
|
||||
@@ -242,11 +253,24 @@ func acquireStartLock(lockPath, socketPath string) bool {
|
||||
|
||||
func handleStaleLock(lockPath, socketPath string) bool {
|
||||
lockPID, err := readPIDFromFile(lockPath)
|
||||
if err == nil && !isPIDAlive(lockPID) {
|
||||
debugLog("lock is stale (PID %d dead), removing and retrying", lockPID)
|
||||
|
||||
// Check if PID is dead
|
||||
if err != nil || !isPIDAlive(lockPID) {
|
||||
debugLog("lock is stale (PID %d dead or unreadable), removing and retrying", lockPID)
|
||||
_ = os.Remove(lockPath)
|
||||
return tryAutoStartDaemon(socketPath)
|
||||
}
|
||||
|
||||
// PID is alive - but check daemon lock as authoritative source (immune to PID reuse)
|
||||
beadsDir := filepath.Dir(socketPath)
|
||||
if running, _ := lockfile.TryDaemonLock(beadsDir); !running {
|
||||
debugLog("lock PID %d alive but daemon lock not held, removing and retrying", lockPID)
|
||||
_ = os.Remove(lockPath)
|
||||
return tryAutoStartDaemon(socketPath)
|
||||
}
|
||||
|
||||
// Daemon lock is held - daemon is genuinely running but socket isn't ready
|
||||
// This shouldn't happen normally, but don't clean up a legitimate lock
|
||||
return false
|
||||
}
|
||||
|
||||
@@ -260,19 +284,24 @@ func handleExistingSocket(socketPath string) bool {
|
||||
return true
|
||||
}
|
||||
|
||||
pidFile := getPIDFileForSocket(socketPath)
|
||||
if pidFile != "" {
|
||||
if pid, err := readPIDFromFile(pidFile); err == nil && isPIDAlive(pid) {
|
||||
debugLog("daemon PID %d alive, waiting for socket", pid)
|
||||
return waitForSocketReadiness(socketPath, 5*time.Second)
|
||||
}
|
||||
// Use flock-based check as authoritative source (immune to PID reuse)
|
||||
// If daemon lock is not held, daemon is definitely dead regardless of PID file
|
||||
beadsDir := filepath.Dir(socketPath)
|
||||
if running, pid := lockfile.TryDaemonLock(beadsDir); running {
|
||||
debugLog("daemon lock held (PID %d), waiting for socket", pid)
|
||||
return waitForSocketReadiness(socketPath, 5*time.Second)
|
||||
}
|
||||
|
||||
debugLog("socket is stale, cleaning up")
|
||||
// Lock not held - daemon is dead, clean up stale artifacts
|
||||
debugLog("socket is stale (daemon lock not held), cleaning up")
|
||||
_ = os.Remove(socketPath) // Best-effort cleanup, file may not exist
|
||||
pidFile := getPIDFileForSocket(socketPath)
|
||||
if pidFile != "" {
|
||||
_ = os.Remove(pidFile) // Best-effort cleanup, file may not exist
|
||||
}
|
||||
// Also clean up daemon.lock file (contains stale metadata)
|
||||
lockFile := filepath.Join(beadsDir, "daemon.lock")
|
||||
_ = os.Remove(lockFile) // Best-effort cleanup
|
||||
return false
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user