From 305345bf3635148147b275fcfafb1354867fd490 Mon Sep 17 00:00:00 2001 From: Steve Yegge Date: Fri, 26 Dec 2025 20:49:06 -0800 Subject: [PATCH] fix: Check tmux session before declaring lock stale Previously, gt doctor --fix would kill workers whose spawning process had exited, even though the Claude session was still running in tmux. Now both identity_check.go and CleanStaleLocks check if the tmux session exists before declaring a lock stale. A lock is only truly stale if BOTH the PID is dead AND the session does not exist. Also added ListSessionIDs() to tmux package to handle locks that store session IDs (%N or $N format) instead of session names. --- internal/doctor/identity_check.go | 37 ++++++++++- internal/lock/lock.go | 105 ++++++++++++++++++++++++++++++ internal/tmux/tmux.go | 40 +++++++++++- 3 files changed, 177 insertions(+), 5 deletions(-) diff --git a/internal/doctor/identity_check.go b/internal/doctor/identity_check.go index 5eb58b7a..4c61c8c7 100644 --- a/internal/doctor/identity_check.go +++ b/internal/doctor/identity_check.go @@ -48,26 +48,57 @@ func (c *IdentityCollisionCheck) Run(ctx *CheckContext) *CheckResult { } // Get active tmux sessions for cross-reference + // Build a set containing both session names AND session IDs + // because locks may store either format t := tmux.NewTmux() - sessions, _ := t.ListSessions() // Ignore errors - might not have tmux - sessionSet := make(map[string]bool) + + // Get session names + sessions, _ := t.ListSessions() // Returns session names for _, s := range sessions { sessionSet[s] = true } + // Also get session IDs to handle locks that store ID instead of name + // Lock files may contain session_id in formats like "%55" or "$55" + sessionIDs, _ := t.ListSessionIDs() // Returns map[name]id + for _, id := range sessionIDs { + sessionSet[id] = true + // Also add alternate formats + if len(id) > 0 { + if id[0] == '$' { + sessionSet["%"+id[1:]] = true // $55 -> %55 + } else if id[0] == '%' { + sessionSet["$"+id[1:]] = true // %55 -> $55 + } + } + } + var staleLocks []string var orphanedLocks []string var healthyLocks int for workerDir, info := range locks { + // First check if the session exists in tmux - that's the real indicator + // of whether the worker is alive. The PID in the lock is the spawning + // process, which may have exited even though Claude is still running. + sessionExists := info.SessionID != "" && sessionSet[info.SessionID] + if info.IsStale() { + // PID is dead - but is the session still alive? + if sessionExists { + // Session exists, so the worker is alive despite dead PID. + // This is normal - the spawner exits after launching Claude. + healthyLocks++ + continue + } + // Both PID dead AND session gone = truly stale staleLocks = append(staleLocks, fmt.Sprintf("%s (dead PID %d)", workerDir, info.PID)) continue } - // Check if session exists + // PID is alive - check if session exists if info.SessionID != "" && !sessionSet[info.SessionID] { // Lock has session ID but session doesn't exist // This could be a collision or orphan diff --git a/internal/lock/lock.go b/internal/lock/lock.go index 8eb75233..355fef54 100644 --- a/internal/lock/lock.go +++ b/internal/lock/lock.go @@ -14,6 +14,7 @@ import ( "errors" "fmt" "os" + "os/exec" "path/filepath" "syscall" "time" @@ -241,15 +242,31 @@ func FindAllLocks(root string) (map[string]*LockInfo, error) { // CleanStaleLocks removes all stale locks in a directory tree. // Returns the number of stale locks cleaned. +// A lock is only truly stale if BOTH the PID is dead AND the tmux session +// doesn't exist. This prevents killing active workers whose spawning process +// has exited (which is normal - Claude runs as a child in tmux). func CleanStaleLocks(root string) (int, error) { locks, err := FindAllLocks(root) if err != nil { return 0, err } + // Get active tmux sessions to verify locks + activeSessions := getActiveTmuxSessions() + sessionSet := make(map[string]bool) + for _, s := range activeSessions { + sessionSet[s] = true + } + cleaned := 0 for workerDir, info := range locks { if info.IsStale() { + // PID is dead, but check if session still exists + if info.SessionID != "" && sessionSet[info.SessionID] { + // Session exists - worker is alive, don't clean + continue + } + // Both PID dead AND no session = truly stale lock := New(workerDir) if err := lock.Release(); err == nil { cleaned++ @@ -260,6 +277,94 @@ func CleanStaleLocks(root string) (int, error) { return cleaned, nil } +// getActiveTmuxSessions returns a list of active tmux session identifiers. +// Returns both session names (gt-foo-bar) and session IDs in various formats +// (%N, $N) to handle different lock file formats. +func getActiveTmuxSessions() []string { + // Get both session name and ID to handle different lock formats + // Format: "session_name:session_id" e.g., "gt-beads-crew-dave:$55" + cmd := execCommand("tmux", "list-sessions", "-F", "#{session_name}:#{session_id}") + output, err := cmd.Output() + if err != nil { + return nil // tmux not running or not installed + } + + var sessions []string + for _, line := range splitLines(string(output)) { + if line == "" { + continue + } + // Parse "name:$id" format + parts := splitOnColon(line) + if len(parts) >= 1 { + sessions = append(sessions, parts[0]) // session name + } + if len(parts) >= 2 { + id := parts[1] + sessions = append(sessions, id) // $N format + // Also add %N format (old tmux style) for compatibility + if len(id) > 0 && id[0] == '$' { + sessions = append(sessions, "%"+id[1:]) + } + } + } + return sessions +} + +// splitOnColon splits on the first colon only (session names shouldn't have colons) +func splitOnColon(s string) []string { + idx := -1 + for i := 0; i < len(s); i++ { + if s[i] == ':' { + idx = i + break + } + } + if idx < 0 { + return []string{s} + } + return []string{s[:idx], s[idx+1:]} +} + +// execCommand is a wrapper for exec.Command to allow testing +var execCommand = func(name string, args ...string) interface{ Output() ([]byte, error) } { + return realExecCommand(name, args...) +} + +func realExecCommand(name string, args ...string) interface{ Output() ([]byte, error) } { + return &execCmdWrapper{name: name, args: args} +} + +type execCmdWrapper struct { + name string + args []string +} + +func (c *execCmdWrapper) Output() ([]byte, error) { + cmd := exec.Command(c.name, c.args...) + return cmd.Output() +} + +// splitLines splits a string into lines, handling both \n and \r\n +func splitLines(s string) []string { + var lines []string + var current []byte + for i := 0; i < len(s); i++ { + if s[i] == '\n' { + lines = append(lines, string(current)) + current = nil + } else if s[i] == '\r' { + // Skip \r + } else { + current = append(current, s[i]) + } + } + if len(current) > 0 { + lines = append(lines, string(current)) + } + return lines +} + // DetectCollisions finds workers with multiple agents claiming the same identity. // This detects the case where multiple processes think they own the same worker // by comparing tmux sessions with lock files. diff --git a/internal/tmux/tmux.go b/internal/tmux/tmux.go index baffbd95..be5bf6db 100644 --- a/internal/tmux/tmux.go +++ b/internal/tmux/tmux.go @@ -118,6 +118,37 @@ func (t *Tmux) ListSessions() ([]string, error) { return strings.Split(out, "\n"), nil } +// ListSessionIDs returns a map of session name to session ID. +// Session IDs are in the format "$N" where N is a number. +func (t *Tmux) ListSessionIDs() (map[string]string, error) { + out, err := t.run("list-sessions", "-F", "#{session_name}:#{session_id}") + if err != nil { + if errors.Is(err, ErrNoServer) { + return nil, nil // No server = no sessions + } + return nil, err + } + + if out == "" { + return nil, nil + } + + result := make(map[string]string) + for _, line := range strings.Split(out, "\n") { + if line == "" { + continue + } + // Parse "name:$id" format + idx := strings.Index(line, ":") + if idx > 0 && idx < len(line)-1 { + name := line[:idx] + id := line[idx+1:] + result[name] = id + } + } + return result, nil +} + // SendKeys sends keystrokes to a session and presses Enter. // Always sends Enter as a separate command for reliability. // Uses a debounce delay between paste and Enter to ensure paste completes. @@ -621,15 +652,20 @@ func (t *Tmux) SwitchClient(targetSession string) error { // SetCrewCycleBindings sets up C-b n/p to cycle through crew sessions in the same rig. // This allows quick switching between crew members without using the session picker. +// +// IMPORTANT: We pass #{session_name} to the command because run-shell doesn't +// reliably preserve the session context. tmux expands #{session_name} at binding +// resolution time (when the key is pressed), giving us the correct session. func (t *Tmux) SetCrewCycleBindings(session string) error { // C-b n → gt crew next (switch to next crew session) + // #{session_name} is expanded by tmux when the key is pressed if _, err := t.run("bind-key", "-T", "prefix", "n", - "run-shell", "gt crew next"); err != nil { + "run-shell", "gt crew next --session '#{session_name}'"); err != nil { return err } // C-b p → gt crew prev (switch to previous crew session) if _, err := t.run("bind-key", "-T", "prefix", "p", - "run-shell", "gt crew prev"); err != nil { + "run-shell", "gt crew prev --session '#{session_name}'"); err != nil { return err } return nil