fix: eliminate 5+ second delays from stale daemon.lock files (bd-htwx)
Root cause: When a bd daemon crashes, its daemon.lock file remains with the old PID. If that PID gets reused by an unrelated process, the code would wait 5 seconds for a socket that will never appear. Fix: Use flock-based check as authoritative source for daemon liveness. The OS releases flocks when a process dies, so this is immune to PID reuse. Changes: - handleExistingSocket: Check daemon flock before waiting for socket - acquireStartLock: Verify daemon lock is held before waiting - handleStaleLock: Use flock check to detect stale startlocks - lockfile/process_*.go: Add pid <= 0 check to prevent false positives (PID 0 signals process group on Unix, not a specific process) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
committed by
Steve Yegge
parent
e2121a2e07
commit
21de43538d
@@ -11,6 +11,7 @@ import (
|
|||||||
|
|
||||||
"github.com/steveyegge/beads/internal/config"
|
"github.com/steveyegge/beads/internal/config"
|
||||||
"github.com/steveyegge/beads/internal/debug"
|
"github.com/steveyegge/beads/internal/debug"
|
||||||
|
"github.com/steveyegge/beads/internal/lockfile"
|
||||||
"github.com/steveyegge/beads/internal/rpc"
|
"github.com/steveyegge/beads/internal/rpc"
|
||||||
"github.com/steveyegge/beads/internal/ui"
|
"github.com/steveyegge/beads/internal/ui"
|
||||||
)
|
)
|
||||||
@@ -217,7 +218,7 @@ func acquireStartLock(lockPath, socketPath string) bool {
|
|||||||
// nolint:gosec // G304: lockPath is derived from secure beads directory
|
// nolint:gosec // G304: lockPath is derived from secure beads directory
|
||||||
lockFile, err := os.OpenFile(lockPath, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0600)
|
lockFile, err := os.OpenFile(lockPath, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0600)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
// Lock file exists - check if it's from a dead process (stale) or alive daemon
|
// Lock file exists - check if daemon is actually starting
|
||||||
lockPID, pidErr := readPIDFromFile(lockPath)
|
lockPID, pidErr := readPIDFromFile(lockPath)
|
||||||
if pidErr != nil || !isPIDAlive(lockPID) {
|
if pidErr != nil || !isPIDAlive(lockPID) {
|
||||||
// Stale lock from crashed process - clean up immediately (avoids 5s wait)
|
// Stale lock from crashed process - clean up immediately (avoids 5s wait)
|
||||||
@@ -227,7 +228,17 @@ func acquireStartLock(lockPath, socketPath string) bool {
|
|||||||
return acquireStartLock(lockPath, socketPath)
|
return acquireStartLock(lockPath, socketPath)
|
||||||
}
|
}
|
||||||
|
|
||||||
// PID is alive - daemon is legitimately starting, wait for socket to be ready
|
// PID is alive - but is daemon actually running/starting?
|
||||||
|
// Use flock-based check as authoritative source (immune to PID reuse)
|
||||||
|
beadsDir := filepath.Dir(socketPath)
|
||||||
|
if running, _ := lockfile.TryDaemonLock(beadsDir); !running {
|
||||||
|
// Daemon lock not held - the start attempt failed or process was reused
|
||||||
|
debugLog("startlock PID %d alive but daemon lock not held, cleaning up", lockPID)
|
||||||
|
_ = os.Remove(lockPath)
|
||||||
|
return acquireStartLock(lockPath, socketPath)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Daemon lock is held - daemon is legitimately starting, wait for socket
|
||||||
debugLog("another process (PID %d) is starting daemon, waiting for readiness", lockPID)
|
debugLog("another process (PID %d) is starting daemon, waiting for readiness", lockPID)
|
||||||
if waitForSocketReadiness(socketPath, 5*time.Second) {
|
if waitForSocketReadiness(socketPath, 5*time.Second) {
|
||||||
return true
|
return true
|
||||||
@@ -242,11 +253,24 @@ func acquireStartLock(lockPath, socketPath string) bool {
|
|||||||
|
|
||||||
func handleStaleLock(lockPath, socketPath string) bool {
|
func handleStaleLock(lockPath, socketPath string) bool {
|
||||||
lockPID, err := readPIDFromFile(lockPath)
|
lockPID, err := readPIDFromFile(lockPath)
|
||||||
if err == nil && !isPIDAlive(lockPID) {
|
|
||||||
debugLog("lock is stale (PID %d dead), removing and retrying", lockPID)
|
// Check if PID is dead
|
||||||
|
if err != nil || !isPIDAlive(lockPID) {
|
||||||
|
debugLog("lock is stale (PID %d dead or unreadable), removing and retrying", lockPID)
|
||||||
_ = os.Remove(lockPath)
|
_ = os.Remove(lockPath)
|
||||||
return tryAutoStartDaemon(socketPath)
|
return tryAutoStartDaemon(socketPath)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// PID is alive - but check daemon lock as authoritative source (immune to PID reuse)
|
||||||
|
beadsDir := filepath.Dir(socketPath)
|
||||||
|
if running, _ := lockfile.TryDaemonLock(beadsDir); !running {
|
||||||
|
debugLog("lock PID %d alive but daemon lock not held, removing and retrying", lockPID)
|
||||||
|
_ = os.Remove(lockPath)
|
||||||
|
return tryAutoStartDaemon(socketPath)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Daemon lock is held - daemon is genuinely running but socket isn't ready
|
||||||
|
// This shouldn't happen normally, but don't clean up a legitimate lock
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -260,19 +284,24 @@ func handleExistingSocket(socketPath string) bool {
|
|||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
|
|
||||||
pidFile := getPIDFileForSocket(socketPath)
|
// Use flock-based check as authoritative source (immune to PID reuse)
|
||||||
if pidFile != "" {
|
// If daemon lock is not held, daemon is definitely dead regardless of PID file
|
||||||
if pid, err := readPIDFromFile(pidFile); err == nil && isPIDAlive(pid) {
|
beadsDir := filepath.Dir(socketPath)
|
||||||
debugLog("daemon PID %d alive, waiting for socket", pid)
|
if running, pid := lockfile.TryDaemonLock(beadsDir); running {
|
||||||
return waitForSocketReadiness(socketPath, 5*time.Second)
|
debugLog("daemon lock held (PID %d), waiting for socket", pid)
|
||||||
}
|
return waitForSocketReadiness(socketPath, 5*time.Second)
|
||||||
}
|
}
|
||||||
|
|
||||||
debugLog("socket is stale, cleaning up")
|
// Lock not held - daemon is dead, clean up stale artifacts
|
||||||
|
debugLog("socket is stale (daemon lock not held), cleaning up")
|
||||||
_ = os.Remove(socketPath) // Best-effort cleanup, file may not exist
|
_ = os.Remove(socketPath) // Best-effort cleanup, file may not exist
|
||||||
|
pidFile := getPIDFileForSocket(socketPath)
|
||||||
if pidFile != "" {
|
if pidFile != "" {
|
||||||
_ = os.Remove(pidFile) // Best-effort cleanup, file may not exist
|
_ = os.Remove(pidFile) // Best-effort cleanup, file may not exist
|
||||||
}
|
}
|
||||||
|
// Also clean up daemon.lock file (contains stale metadata)
|
||||||
|
lockFile := filepath.Join(beadsDir, "daemon.lock")
|
||||||
|
_ = os.Remove(lockFile) // Best-effort cleanup
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -8,5 +8,8 @@ import (
|
|||||||
|
|
||||||
// isProcessRunning checks if a process with the given PID is running
|
// isProcessRunning checks if a process with the given PID is running
|
||||||
func isProcessRunning(pid int) bool {
|
func isProcessRunning(pid int) bool {
|
||||||
|
if pid <= 0 {
|
||||||
|
return false // Invalid PID (0 would signal our process group, not a specific process)
|
||||||
|
}
|
||||||
return syscall.Kill(pid, 0) == nil
|
return syscall.Kill(pid, 0) == nil
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -10,6 +10,9 @@ const stillActive = 259
|
|||||||
|
|
||||||
// isProcessRunning checks if a process with the given PID is running
|
// isProcessRunning checks if a process with the given PID is running
|
||||||
func isProcessRunning(pid int) bool {
|
func isProcessRunning(pid int) bool {
|
||||||
|
if pid <= 0 {
|
||||||
|
return false // Invalid PID
|
||||||
|
}
|
||||||
handle, err := windows.OpenProcess(windows.PROCESS_QUERY_LIMITED_INFORMATION, false, uint32(pid))
|
handle, err := windows.OpenProcess(windows.PROCESS_QUERY_LIMITED_INFORMATION, false, uint32(pid))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return false
|
return false
|
||||||
|
|||||||
Reference in New Issue
Block a user