Crew workers are human-managed and should never be auto-killed by gt doctor --fix. This adds defense-in-depth protection: 1. OrphanSessionCheck.Fix() now skips any session matching the gt-<rig>-crew-<name> pattern 2. OrphanProcessCheck.Fix() now checks if a process has a crew session pane as an ancestor before killing it Even if detection fails (like the pgrep bug we just fixed), crew sessions and their processes will be protected. Generated with Claude Code Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
546 lines
13 KiB
Go
546 lines
13 KiB
Go
package doctor
|
|
|
|
import (
|
|
"fmt"
|
|
"os"
|
|
"os/exec"
|
|
"path/filepath"
|
|
"regexp"
|
|
"strings"
|
|
|
|
"github.com/steveyegge/gastown/internal/tmux"
|
|
)
|
|
|
|
// OrphanSessionCheck detects orphaned tmux sessions that don't match
|
|
// the expected Gas Town session naming patterns.
|
|
type OrphanSessionCheck struct {
|
|
FixableCheck
|
|
orphanSessions []string // Cached during Run for use in Fix
|
|
}
|
|
|
|
// NewOrphanSessionCheck creates a new orphan session check.
|
|
func NewOrphanSessionCheck() *OrphanSessionCheck {
|
|
return &OrphanSessionCheck{
|
|
FixableCheck: FixableCheck{
|
|
BaseCheck: BaseCheck{
|
|
CheckName: "orphan-sessions",
|
|
CheckDescription: "Detect orphaned tmux sessions",
|
|
},
|
|
},
|
|
}
|
|
}
|
|
|
|
// Run checks for orphaned Gas Town tmux sessions.
|
|
func (c *OrphanSessionCheck) Run(ctx *CheckContext) *CheckResult {
|
|
t := tmux.NewTmux()
|
|
|
|
sessions, err := t.ListSessions()
|
|
if err != nil {
|
|
return &CheckResult{
|
|
Name: c.Name(),
|
|
Status: StatusWarning,
|
|
Message: "Could not list tmux sessions",
|
|
Details: []string{err.Error()},
|
|
}
|
|
}
|
|
|
|
if len(sessions) == 0 {
|
|
return &CheckResult{
|
|
Name: c.Name(),
|
|
Status: StatusOK,
|
|
Message: "No tmux sessions found",
|
|
}
|
|
}
|
|
|
|
// Get list of valid rigs
|
|
validRigs := c.getValidRigs(ctx.TownRoot)
|
|
|
|
// Check each session
|
|
var orphans []string
|
|
var validCount int
|
|
|
|
for _, session := range sessions {
|
|
if session == "" {
|
|
continue
|
|
}
|
|
|
|
// Only check gt-* sessions (Gas Town sessions)
|
|
if !strings.HasPrefix(session, "gt-") {
|
|
continue
|
|
}
|
|
|
|
if c.isValidSession(session, validRigs) {
|
|
validCount++
|
|
} else {
|
|
orphans = append(orphans, session)
|
|
}
|
|
}
|
|
|
|
// Cache orphans for Fix
|
|
c.orphanSessions = orphans
|
|
|
|
if len(orphans) == 0 {
|
|
return &CheckResult{
|
|
Name: c.Name(),
|
|
Status: StatusOK,
|
|
Message: fmt.Sprintf("All %d Gas Town sessions are valid", validCount),
|
|
}
|
|
}
|
|
|
|
details := make([]string, len(orphans))
|
|
for i, session := range orphans {
|
|
details[i] = fmt.Sprintf("Orphan: %s", session)
|
|
}
|
|
|
|
return &CheckResult{
|
|
Name: c.Name(),
|
|
Status: StatusWarning,
|
|
Message: fmt.Sprintf("Found %d orphaned session(s)", len(orphans)),
|
|
Details: details,
|
|
FixHint: "Run 'gt doctor --fix' to kill orphaned sessions",
|
|
}
|
|
}
|
|
|
|
// Fix kills all orphaned sessions, except crew sessions which are protected.
|
|
func (c *OrphanSessionCheck) Fix(ctx *CheckContext) error {
|
|
if len(c.orphanSessions) == 0 {
|
|
return nil
|
|
}
|
|
|
|
t := tmux.NewTmux()
|
|
var lastErr error
|
|
|
|
for _, session := range c.orphanSessions {
|
|
// SAFEGUARD: Never auto-kill crew sessions.
|
|
// Crew workers are human-managed and require explicit action.
|
|
if isCrewSession(session) {
|
|
continue
|
|
}
|
|
if err := t.KillSession(session); err != nil {
|
|
lastErr = err
|
|
}
|
|
}
|
|
|
|
return lastErr
|
|
}
|
|
|
|
// isCrewSession returns true if the session name matches the crew pattern.
|
|
// Crew sessions are gt-<rig>-crew-<name> and are protected from auto-cleanup.
|
|
func isCrewSession(session string) bool {
|
|
// Pattern: gt-<rig>-crew-<name>
|
|
// Example: gt-gastown-crew-joe
|
|
parts := strings.Split(session, "-")
|
|
if len(parts) >= 4 && parts[0] == "gt" && parts[2] == "crew" {
|
|
return true
|
|
}
|
|
return false
|
|
}
|
|
|
|
// getValidRigs returns a list of valid rig names from the workspace.
|
|
func (c *OrphanSessionCheck) getValidRigs(townRoot string) []string {
|
|
var rigs []string
|
|
|
|
// Read rigs.json if it exists
|
|
rigsPath := filepath.Join(townRoot, "mayor", "rigs.json")
|
|
if _, err := os.Stat(rigsPath); err == nil {
|
|
// For simplicity, just scan directories at town root that look like rigs
|
|
entries, err := os.ReadDir(townRoot)
|
|
if err == nil {
|
|
for _, entry := range entries {
|
|
if entry.IsDir() && entry.Name() != "mayor" && entry.Name() != ".beads" && !strings.HasPrefix(entry.Name(), ".") {
|
|
// Check if it looks like a rig (has polecats/ or crew/ directory)
|
|
polecatsDir := filepath.Join(townRoot, entry.Name(), "polecats")
|
|
crewDir := filepath.Join(townRoot, entry.Name(), "crew")
|
|
if _, err := os.Stat(polecatsDir); err == nil {
|
|
rigs = append(rigs, entry.Name())
|
|
} else if _, err := os.Stat(crewDir); err == nil {
|
|
rigs = append(rigs, entry.Name())
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return rigs
|
|
}
|
|
|
|
// isValidSession checks if a session name matches expected Gas Town patterns.
|
|
// Valid patterns:
|
|
// - gt-mayor
|
|
// - gt-deacon
|
|
// - gt-<rig>-witness
|
|
// - gt-<rig>-refinery
|
|
// - gt-<rig>-<polecat> (where polecat is any name)
|
|
//
|
|
// Note: We can't verify polecat names without reading state, so we're permissive.
|
|
func (c *OrphanSessionCheck) isValidSession(session string, validRigs []string) bool {
|
|
// gt-mayor is always valid
|
|
if session == "gt-mayor" {
|
|
return true
|
|
}
|
|
|
|
// gt-deacon is always valid
|
|
if session == "gt-deacon" {
|
|
return true
|
|
}
|
|
|
|
// For rig-specific sessions, extract rig name
|
|
// Pattern: gt-<rig>-<role>
|
|
parts := strings.SplitN(session, "-", 3)
|
|
if len(parts) < 3 {
|
|
// Invalid format - must be gt-<rig>-<something>
|
|
return false
|
|
}
|
|
|
|
rigName := parts[1]
|
|
|
|
// Check if this rig exists
|
|
rigFound := false
|
|
for _, r := range validRigs {
|
|
if r == rigName {
|
|
rigFound = true
|
|
break
|
|
}
|
|
}
|
|
|
|
if !rigFound {
|
|
// Unknown rig - this is an orphan
|
|
return false
|
|
}
|
|
|
|
role := parts[2]
|
|
|
|
// witness and refinery are valid roles
|
|
if role == "witness" || role == "refinery" {
|
|
return true
|
|
}
|
|
|
|
// Any other name is assumed to be a polecat or crew member
|
|
// We can't easily verify without reading state, so accept it
|
|
return true
|
|
}
|
|
|
|
// OrphanProcessCheck detects orphaned Claude/claude-code processes
|
|
// that are not associated with a Gas Town tmux session.
|
|
type OrphanProcessCheck struct {
|
|
FixableCheck
|
|
orphanPIDs []int // Cached during Run for use in Fix
|
|
}
|
|
|
|
// NewOrphanProcessCheck creates a new orphan process check.
|
|
func NewOrphanProcessCheck() *OrphanProcessCheck {
|
|
return &OrphanProcessCheck{
|
|
FixableCheck: FixableCheck{
|
|
BaseCheck: BaseCheck{
|
|
CheckName: "orphan-processes",
|
|
CheckDescription: "Detect orphaned Claude processes",
|
|
},
|
|
},
|
|
}
|
|
}
|
|
|
|
// Run checks for orphaned Claude processes.
|
|
func (c *OrphanProcessCheck) Run(ctx *CheckContext) *CheckResult {
|
|
// Get list of tmux session PIDs
|
|
tmuxPIDs, err := c.getTmuxSessionPIDs()
|
|
if err != nil {
|
|
return &CheckResult{
|
|
Name: c.Name(),
|
|
Status: StatusWarning,
|
|
Message: "Could not get tmux session info",
|
|
Details: []string{err.Error()},
|
|
}
|
|
}
|
|
|
|
// Find Claude processes
|
|
claudeProcs, err := c.findClaudeProcesses()
|
|
if err != nil {
|
|
return &CheckResult{
|
|
Name: c.Name(),
|
|
Status: StatusWarning,
|
|
Message: "Could not list Claude processes",
|
|
Details: []string{err.Error()},
|
|
}
|
|
}
|
|
|
|
if len(claudeProcs) == 0 {
|
|
return &CheckResult{
|
|
Name: c.Name(),
|
|
Status: StatusOK,
|
|
Message: "No Claude processes found",
|
|
}
|
|
}
|
|
|
|
// Check which Claude processes are orphaned
|
|
var orphans []processInfo
|
|
var validCount int
|
|
|
|
for _, proc := range claudeProcs {
|
|
if c.isOrphanProcess(proc, tmuxPIDs) {
|
|
orphans = append(orphans, proc)
|
|
} else {
|
|
validCount++
|
|
}
|
|
}
|
|
|
|
// Cache orphan PIDs for Fix
|
|
c.orphanPIDs = make([]int, len(orphans))
|
|
for i, p := range orphans {
|
|
c.orphanPIDs[i] = p.pid
|
|
}
|
|
|
|
if len(orphans) == 0 {
|
|
return &CheckResult{
|
|
Name: c.Name(),
|
|
Status: StatusOK,
|
|
Message: fmt.Sprintf("All %d Claude processes have valid parents", validCount),
|
|
}
|
|
}
|
|
|
|
details := make([]string, len(orphans))
|
|
for i, proc := range orphans {
|
|
details[i] = fmt.Sprintf("PID %d: %s (parent: %d)", proc.pid, proc.cmd, proc.ppid)
|
|
}
|
|
|
|
return &CheckResult{
|
|
Name: c.Name(),
|
|
Status: StatusWarning,
|
|
Message: fmt.Sprintf("Found %d orphaned Claude process(es)", len(orphans)),
|
|
Details: details,
|
|
FixHint: "Run 'gt doctor --fix' to kill orphaned processes",
|
|
}
|
|
}
|
|
|
|
// Fix kills orphaned processes, with safeguards for crew sessions.
|
|
func (c *OrphanProcessCheck) Fix(ctx *CheckContext) error {
|
|
if len(c.orphanPIDs) == 0 {
|
|
return nil
|
|
}
|
|
|
|
// SAFEGUARD: Get crew session pane PIDs to avoid killing crew processes.
|
|
// Even if a process appears orphaned, if its parent is a crew session pane,
|
|
// we should not kill it (the detection might be wrong).
|
|
crewPanePIDs := c.getCrewSessionPanePIDs()
|
|
|
|
var lastErr error
|
|
for _, pid := range c.orphanPIDs {
|
|
// Check if this process has a crew session ancestor
|
|
if c.hasCrewAncestor(pid, crewPanePIDs) {
|
|
// Skip - this process might belong to a crew session
|
|
continue
|
|
}
|
|
|
|
proc, err := os.FindProcess(pid)
|
|
if err != nil {
|
|
lastErr = err
|
|
continue
|
|
}
|
|
if err := proc.Signal(os.Interrupt); err != nil {
|
|
// Try SIGKILL if SIGINT fails
|
|
if killErr := proc.Kill(); killErr != nil {
|
|
lastErr = killErr
|
|
}
|
|
}
|
|
}
|
|
|
|
return lastErr
|
|
}
|
|
|
|
// getCrewSessionPanePIDs returns pane PIDs for all crew sessions.
|
|
func (c *OrphanProcessCheck) getCrewSessionPanePIDs() map[int]bool {
|
|
pids := make(map[int]bool)
|
|
|
|
t := tmux.NewTmux()
|
|
sessions, err := t.ListSessions()
|
|
if err != nil {
|
|
return pids
|
|
}
|
|
|
|
for _, session := range sessions {
|
|
if !isCrewSession(session) {
|
|
continue
|
|
}
|
|
// Get pane PIDs for this crew session
|
|
out, err := exec.Command("tmux", "list-panes", "-t", session, "-F", "#{pane_pid}").Output()
|
|
if err != nil {
|
|
continue
|
|
}
|
|
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
|
var pid int
|
|
if _, err := fmt.Sscanf(line, "%d", &pid); err == nil {
|
|
pids[pid] = true
|
|
}
|
|
}
|
|
}
|
|
|
|
return pids
|
|
}
|
|
|
|
// hasCrewAncestor checks if a process has a crew session pane as an ancestor.
|
|
func (c *OrphanProcessCheck) hasCrewAncestor(pid int, crewPanePIDs map[int]bool) bool {
|
|
if len(crewPanePIDs) == 0 {
|
|
return false
|
|
}
|
|
|
|
// Walk up the process tree
|
|
currentPID := pid
|
|
visited := make(map[int]bool)
|
|
|
|
for currentPID > 1 && !visited[currentPID] {
|
|
visited[currentPID] = true
|
|
|
|
// Check if this PID is a crew pane
|
|
if crewPanePIDs[currentPID] {
|
|
return true
|
|
}
|
|
|
|
// Get parent PID
|
|
out, err := exec.Command("ps", "-p", fmt.Sprintf("%d", currentPID), "-o", "ppid=").Output()
|
|
if err != nil {
|
|
break
|
|
}
|
|
|
|
var ppid int
|
|
if _, err := fmt.Sscanf(strings.TrimSpace(string(out)), "%d", &ppid); err != nil {
|
|
break
|
|
}
|
|
currentPID = ppid
|
|
}
|
|
|
|
return false
|
|
}
|
|
|
|
type processInfo struct {
|
|
pid int
|
|
ppid int
|
|
cmd string
|
|
}
|
|
|
|
// getTmuxSessionPIDs returns PIDs of all tmux server processes and pane shell PIDs.
|
|
func (c *OrphanProcessCheck) getTmuxSessionPIDs() (map[int]bool, error) {
|
|
// Get tmux server PID and all pane PIDs
|
|
pids := make(map[int]bool)
|
|
|
|
// Find tmux server processes using ps instead of pgrep.
|
|
// pgrep -x tmux is unreliable on macOS - it often misses the actual server.
|
|
// We use ps with awk to find processes where comm is exactly "tmux".
|
|
out, err := exec.Command("sh", "-c", `ps ax -o pid,comm | awk '$2 == "tmux" || $2 ~ /\/tmux$/ { print $1 }'`).Output()
|
|
if err != nil {
|
|
// No tmux server running
|
|
return pids, nil
|
|
}
|
|
|
|
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
|
var pid int
|
|
if _, err := fmt.Sscanf(line, "%d", &pid); err == nil {
|
|
pids[pid] = true
|
|
}
|
|
}
|
|
|
|
// Also get shell PIDs inside tmux panes
|
|
t := tmux.NewTmux()
|
|
sessions, _ := t.ListSessions()
|
|
for _, session := range sessions {
|
|
// Get pane PIDs for this session
|
|
out, err := exec.Command("tmux", "list-panes", "-t", session, "-F", "#{pane_pid}").Output()
|
|
if err != nil {
|
|
continue
|
|
}
|
|
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
|
var pid int
|
|
if _, err := fmt.Sscanf(line, "%d", &pid); err == nil {
|
|
pids[pid] = true
|
|
}
|
|
}
|
|
}
|
|
|
|
return pids, nil
|
|
}
|
|
|
|
// findClaudeProcesses finds all running claude/claude-code CLI processes.
|
|
// Excludes Claude.app desktop application and its helpers.
|
|
func (c *OrphanProcessCheck) findClaudeProcesses() ([]processInfo, error) {
|
|
var procs []processInfo
|
|
|
|
// Use ps to find claude processes
|
|
// Look for both "claude" and "claude-code" in command
|
|
out, err := exec.Command("ps", "-eo", "pid,ppid,comm").Output()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Regex to match claude CLI processes (not Claude.app)
|
|
// Match: "claude" or paths ending in "/claude"
|
|
claudePattern := regexp.MustCompile(`(?i)(^claude$|/claude$)`)
|
|
|
|
// Pattern to exclude Claude.app and related desktop processes
|
|
excludePattern := regexp.MustCompile(`(?i)(Claude\.app|claude-native|chrome-native)`)
|
|
|
|
for _, line := range strings.Split(string(out), "\n") {
|
|
fields := strings.Fields(line)
|
|
if len(fields) < 3 {
|
|
continue
|
|
}
|
|
|
|
// Check if command matches claude CLI
|
|
cmd := strings.Join(fields[2:], " ")
|
|
|
|
// Skip desktop app processes
|
|
if excludePattern.MatchString(cmd) {
|
|
continue
|
|
}
|
|
|
|
// Only match CLI claude processes
|
|
if !claudePattern.MatchString(cmd) {
|
|
continue
|
|
}
|
|
|
|
var pid, ppid int
|
|
if _, err := fmt.Sscanf(fields[0], "%d", &pid); err != nil {
|
|
continue
|
|
}
|
|
if _, err := fmt.Sscanf(fields[1], "%d", &ppid); err != nil {
|
|
continue
|
|
}
|
|
|
|
procs = append(procs, processInfo{
|
|
pid: pid,
|
|
ppid: ppid,
|
|
cmd: cmd,
|
|
})
|
|
}
|
|
|
|
return procs, nil
|
|
}
|
|
|
|
// isOrphanProcess checks if a Claude process is orphaned.
|
|
// A process is orphaned if its parent (or ancestor) is not a tmux session.
|
|
func (c *OrphanProcessCheck) isOrphanProcess(proc processInfo, tmuxPIDs map[int]bool) bool {
|
|
// Walk up the process tree looking for a tmux parent
|
|
currentPPID := proc.ppid
|
|
visited := make(map[int]bool)
|
|
|
|
for currentPPID > 1 && !visited[currentPPID] {
|
|
visited[currentPPID] = true
|
|
|
|
// Check if this is a tmux process
|
|
if tmuxPIDs[currentPPID] {
|
|
return false // Has tmux ancestor, not orphaned
|
|
}
|
|
|
|
// Get parent's parent
|
|
out, err := exec.Command("ps", "-p", fmt.Sprintf("%d", currentPPID), "-o", "ppid=").Output()
|
|
if err != nil {
|
|
break
|
|
}
|
|
|
|
var nextPPID int
|
|
if _, err := fmt.Sscanf(strings.TrimSpace(string(out)), "%d", &nextPPID); err != nil {
|
|
break
|
|
}
|
|
currentPPID = nextPPID
|
|
}
|
|
|
|
return true // No tmux ancestor found
|
|
}
|