fix: Check tmux session before declaring lock stale
Previously, gt doctor --fix would kill workers whose spawning process had exited, even though the Claude session was still running in tmux. Now both identity_check.go and CleanStaleLocks check if the tmux session exists before declaring a lock stale. A lock is only truly stale if BOTH the PID is dead AND the session does not exist. Also added ListSessionIDs() to tmux package to handle locks that store session IDs (%N or $N format) instead of session names.
This commit is contained in:
@@ -48,26 +48,57 @@ func (c *IdentityCollisionCheck) Run(ctx *CheckContext) *CheckResult {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Get active tmux sessions for cross-reference
|
// Get active tmux sessions for cross-reference
|
||||||
|
// Build a set containing both session names AND session IDs
|
||||||
|
// because locks may store either format
|
||||||
t := tmux.NewTmux()
|
t := tmux.NewTmux()
|
||||||
sessions, _ := t.ListSessions() // Ignore errors - might not have tmux
|
|
||||||
|
|
||||||
sessionSet := make(map[string]bool)
|
sessionSet := make(map[string]bool)
|
||||||
|
|
||||||
|
// Get session names
|
||||||
|
sessions, _ := t.ListSessions() // Returns session names
|
||||||
for _, s := range sessions {
|
for _, s := range sessions {
|
||||||
sessionSet[s] = true
|
sessionSet[s] = true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Also get session IDs to handle locks that store ID instead of name
|
||||||
|
// Lock files may contain session_id in formats like "%55" or "$55"
|
||||||
|
sessionIDs, _ := t.ListSessionIDs() // Returns map[name]id
|
||||||
|
for _, id := range sessionIDs {
|
||||||
|
sessionSet[id] = true
|
||||||
|
// Also add alternate formats
|
||||||
|
if len(id) > 0 {
|
||||||
|
if id[0] == '$' {
|
||||||
|
sessionSet["%"+id[1:]] = true // $55 -> %55
|
||||||
|
} else if id[0] == '%' {
|
||||||
|
sessionSet["$"+id[1:]] = true // %55 -> $55
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
var staleLocks []string
|
var staleLocks []string
|
||||||
var orphanedLocks []string
|
var orphanedLocks []string
|
||||||
var healthyLocks int
|
var healthyLocks int
|
||||||
|
|
||||||
for workerDir, info := range locks {
|
for workerDir, info := range locks {
|
||||||
|
// First check if the session exists in tmux - that's the real indicator
|
||||||
|
// of whether the worker is alive. The PID in the lock is the spawning
|
||||||
|
// process, which may have exited even though Claude is still running.
|
||||||
|
sessionExists := info.SessionID != "" && sessionSet[info.SessionID]
|
||||||
|
|
||||||
if info.IsStale() {
|
if info.IsStale() {
|
||||||
|
// PID is dead - but is the session still alive?
|
||||||
|
if sessionExists {
|
||||||
|
// Session exists, so the worker is alive despite dead PID.
|
||||||
|
// This is normal - the spawner exits after launching Claude.
|
||||||
|
healthyLocks++
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// Both PID dead AND session gone = truly stale
|
||||||
staleLocks = append(staleLocks,
|
staleLocks = append(staleLocks,
|
||||||
fmt.Sprintf("%s (dead PID %d)", workerDir, info.PID))
|
fmt.Sprintf("%s (dead PID %d)", workerDir, info.PID))
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check if session exists
|
// PID is alive - check if session exists
|
||||||
if info.SessionID != "" && !sessionSet[info.SessionID] {
|
if info.SessionID != "" && !sessionSet[info.SessionID] {
|
||||||
// Lock has session ID but session doesn't exist
|
// Lock has session ID but session doesn't exist
|
||||||
// This could be a collision or orphan
|
// This could be a collision or orphan
|
||||||
|
|||||||
@@ -14,6 +14,7 @@ import (
|
|||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
|
"os/exec"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"syscall"
|
"syscall"
|
||||||
"time"
|
"time"
|
||||||
@@ -241,15 +242,31 @@ func FindAllLocks(root string) (map[string]*LockInfo, error) {
|
|||||||
|
|
||||||
// CleanStaleLocks removes all stale locks in a directory tree.
|
// CleanStaleLocks removes all stale locks in a directory tree.
|
||||||
// Returns the number of stale locks cleaned.
|
// Returns the number of stale locks cleaned.
|
||||||
|
// A lock is only truly stale if BOTH the PID is dead AND the tmux session
|
||||||
|
// doesn't exist. This prevents killing active workers whose spawning process
|
||||||
|
// has exited (which is normal - Claude runs as a child in tmux).
|
||||||
func CleanStaleLocks(root string) (int, error) {
|
func CleanStaleLocks(root string) (int, error) {
|
||||||
locks, err := FindAllLocks(root)
|
locks, err := FindAllLocks(root)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return 0, err
|
return 0, err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Get active tmux sessions to verify locks
|
||||||
|
activeSessions := getActiveTmuxSessions()
|
||||||
|
sessionSet := make(map[string]bool)
|
||||||
|
for _, s := range activeSessions {
|
||||||
|
sessionSet[s] = true
|
||||||
|
}
|
||||||
|
|
||||||
cleaned := 0
|
cleaned := 0
|
||||||
for workerDir, info := range locks {
|
for workerDir, info := range locks {
|
||||||
if info.IsStale() {
|
if info.IsStale() {
|
||||||
|
// PID is dead, but check if session still exists
|
||||||
|
if info.SessionID != "" && sessionSet[info.SessionID] {
|
||||||
|
// Session exists - worker is alive, don't clean
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// Both PID dead AND no session = truly stale
|
||||||
lock := New(workerDir)
|
lock := New(workerDir)
|
||||||
if err := lock.Release(); err == nil {
|
if err := lock.Release(); err == nil {
|
||||||
cleaned++
|
cleaned++
|
||||||
@@ -260,6 +277,94 @@ func CleanStaleLocks(root string) (int, error) {
|
|||||||
return cleaned, nil
|
return cleaned, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// getActiveTmuxSessions returns a list of active tmux session identifiers.
|
||||||
|
// Returns both session names (gt-foo-bar) and session IDs in various formats
|
||||||
|
// (%N, $N) to handle different lock file formats.
|
||||||
|
func getActiveTmuxSessions() []string {
|
||||||
|
// Get both session name and ID to handle different lock formats
|
||||||
|
// Format: "session_name:session_id" e.g., "gt-beads-crew-dave:$55"
|
||||||
|
cmd := execCommand("tmux", "list-sessions", "-F", "#{session_name}:#{session_id}")
|
||||||
|
output, err := cmd.Output()
|
||||||
|
if err != nil {
|
||||||
|
return nil // tmux not running or not installed
|
||||||
|
}
|
||||||
|
|
||||||
|
var sessions []string
|
||||||
|
for _, line := range splitLines(string(output)) {
|
||||||
|
if line == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// Parse "name:$id" format
|
||||||
|
parts := splitOnColon(line)
|
||||||
|
if len(parts) >= 1 {
|
||||||
|
sessions = append(sessions, parts[0]) // session name
|
||||||
|
}
|
||||||
|
if len(parts) >= 2 {
|
||||||
|
id := parts[1]
|
||||||
|
sessions = append(sessions, id) // $N format
|
||||||
|
// Also add %N format (old tmux style) for compatibility
|
||||||
|
if len(id) > 0 && id[0] == '$' {
|
||||||
|
sessions = append(sessions, "%"+id[1:])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return sessions
|
||||||
|
}
|
||||||
|
|
||||||
|
// splitOnColon splits on the first colon only (session names shouldn't have colons)
|
||||||
|
func splitOnColon(s string) []string {
|
||||||
|
idx := -1
|
||||||
|
for i := 0; i < len(s); i++ {
|
||||||
|
if s[i] == ':' {
|
||||||
|
idx = i
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if idx < 0 {
|
||||||
|
return []string{s}
|
||||||
|
}
|
||||||
|
return []string{s[:idx], s[idx+1:]}
|
||||||
|
}
|
||||||
|
|
||||||
|
// execCommand is a wrapper for exec.Command to allow testing
|
||||||
|
var execCommand = func(name string, args ...string) interface{ Output() ([]byte, error) } {
|
||||||
|
return realExecCommand(name, args...)
|
||||||
|
}
|
||||||
|
|
||||||
|
func realExecCommand(name string, args ...string) interface{ Output() ([]byte, error) } {
|
||||||
|
return &execCmdWrapper{name: name, args: args}
|
||||||
|
}
|
||||||
|
|
||||||
|
type execCmdWrapper struct {
|
||||||
|
name string
|
||||||
|
args []string
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *execCmdWrapper) Output() ([]byte, error) {
|
||||||
|
cmd := exec.Command(c.name, c.args...)
|
||||||
|
return cmd.Output()
|
||||||
|
}
|
||||||
|
|
||||||
|
// splitLines splits a string into lines, handling both \n and \r\n
|
||||||
|
func splitLines(s string) []string {
|
||||||
|
var lines []string
|
||||||
|
var current []byte
|
||||||
|
for i := 0; i < len(s); i++ {
|
||||||
|
if s[i] == '\n' {
|
||||||
|
lines = append(lines, string(current))
|
||||||
|
current = nil
|
||||||
|
} else if s[i] == '\r' {
|
||||||
|
// Skip \r
|
||||||
|
} else {
|
||||||
|
current = append(current, s[i])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(current) > 0 {
|
||||||
|
lines = append(lines, string(current))
|
||||||
|
}
|
||||||
|
return lines
|
||||||
|
}
|
||||||
|
|
||||||
// DetectCollisions finds workers with multiple agents claiming the same identity.
|
// DetectCollisions finds workers with multiple agents claiming the same identity.
|
||||||
// This detects the case where multiple processes think they own the same worker
|
// This detects the case where multiple processes think they own the same worker
|
||||||
// by comparing tmux sessions with lock files.
|
// by comparing tmux sessions with lock files.
|
||||||
|
|||||||
@@ -118,6 +118,37 @@ func (t *Tmux) ListSessions() ([]string, error) {
|
|||||||
return strings.Split(out, "\n"), nil
|
return strings.Split(out, "\n"), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ListSessionIDs returns a map of session name to session ID.
|
||||||
|
// Session IDs are in the format "$N" where N is a number.
|
||||||
|
func (t *Tmux) ListSessionIDs() (map[string]string, error) {
|
||||||
|
out, err := t.run("list-sessions", "-F", "#{session_name}:#{session_id}")
|
||||||
|
if err != nil {
|
||||||
|
if errors.Is(err, ErrNoServer) {
|
||||||
|
return nil, nil // No server = no sessions
|
||||||
|
}
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
|
||||||
|
if out == "" {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
result := make(map[string]string)
|
||||||
|
for _, line := range strings.Split(out, "\n") {
|
||||||
|
if line == "" {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
// Parse "name:$id" format
|
||||||
|
idx := strings.Index(line, ":")
|
||||||
|
if idx > 0 && idx < len(line)-1 {
|
||||||
|
name := line[:idx]
|
||||||
|
id := line[idx+1:]
|
||||||
|
result[name] = id
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result, nil
|
||||||
|
}
|
||||||
|
|
||||||
// SendKeys sends keystrokes to a session and presses Enter.
|
// SendKeys sends keystrokes to a session and presses Enter.
|
||||||
// Always sends Enter as a separate command for reliability.
|
// Always sends Enter as a separate command for reliability.
|
||||||
// Uses a debounce delay between paste and Enter to ensure paste completes.
|
// Uses a debounce delay between paste and Enter to ensure paste completes.
|
||||||
@@ -621,15 +652,20 @@ func (t *Tmux) SwitchClient(targetSession string) error {
|
|||||||
|
|
||||||
// SetCrewCycleBindings sets up C-b n/p to cycle through crew sessions in the same rig.
|
// SetCrewCycleBindings sets up C-b n/p to cycle through crew sessions in the same rig.
|
||||||
// This allows quick switching between crew members without using the session picker.
|
// This allows quick switching between crew members without using the session picker.
|
||||||
|
//
|
||||||
|
// IMPORTANT: We pass #{session_name} to the command because run-shell doesn't
|
||||||
|
// reliably preserve the session context. tmux expands #{session_name} at binding
|
||||||
|
// resolution time (when the key is pressed), giving us the correct session.
|
||||||
func (t *Tmux) SetCrewCycleBindings(session string) error {
|
func (t *Tmux) SetCrewCycleBindings(session string) error {
|
||||||
// C-b n → gt crew next (switch to next crew session)
|
// C-b n → gt crew next (switch to next crew session)
|
||||||
|
// #{session_name} is expanded by tmux when the key is pressed
|
||||||
if _, err := t.run("bind-key", "-T", "prefix", "n",
|
if _, err := t.run("bind-key", "-T", "prefix", "n",
|
||||||
"run-shell", "gt crew next"); err != nil {
|
"run-shell", "gt crew next --session '#{session_name}'"); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
// C-b p → gt crew prev (switch to previous crew session)
|
// C-b p → gt crew prev (switch to previous crew session)
|
||||||
if _, err := t.run("bind-key", "-T", "prefix", "p",
|
if _, err := t.run("bind-key", "-T", "prefix", "p",
|
||||||
"run-shell", "gt crew prev"); err != nil {
|
"run-shell", "gt crew prev --session '#{session_name}'"); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
|
|||||||
Reference in New Issue
Block a user