When Claude sessions were terminated using KillSession(), bash subprocesses spawned by Claude's Bash tool could survive because they ignore SIGHUP. This caused zombie processes to accumulate over time. Changed all critical session termination paths to use KillSessionWithProcesses() which explicitly kills all descendant processes before terminating the session. Fixes: gt-ew3tk Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
477 lines
13 KiB
Go
477 lines
13 KiB
Go
package doctor
|
|
|
|
import (
|
|
"fmt"
|
|
"os"
|
|
"os/exec"
|
|
"path/filepath"
|
|
"strings"
|
|
|
|
"github.com/steveyegge/gastown/internal/events"
|
|
"github.com/steveyegge/gastown/internal/session"
|
|
"github.com/steveyegge/gastown/internal/tmux"
|
|
)
|
|
|
|
// OrphanSessionCheck detects orphaned tmux sessions that don't match
|
|
// the expected Gas Town session naming patterns.
|
|
type OrphanSessionCheck struct {
|
|
FixableCheck
|
|
sessionLister SessionLister
|
|
orphanSessions []string // Cached during Run for use in Fix
|
|
}
|
|
|
|
// SessionLister abstracts tmux session listing for testing.
|
|
type SessionLister interface {
|
|
ListSessions() ([]string, error)
|
|
}
|
|
|
|
type realSessionLister struct {
|
|
t *tmux.Tmux
|
|
}
|
|
|
|
func (r *realSessionLister) ListSessions() ([]string, error) {
|
|
return r.t.ListSessions()
|
|
}
|
|
|
|
// NewOrphanSessionCheck creates a new orphan session check.
|
|
func NewOrphanSessionCheck() *OrphanSessionCheck {
|
|
return &OrphanSessionCheck{
|
|
FixableCheck: FixableCheck{
|
|
BaseCheck: BaseCheck{
|
|
CheckName: "orphan-sessions",
|
|
CheckDescription: "Detect orphaned tmux sessions",
|
|
CheckCategory: CategoryCleanup,
|
|
},
|
|
},
|
|
}
|
|
}
|
|
|
|
// NewOrphanSessionCheckWithSessionLister creates a check with a custom session lister (for testing).
|
|
func NewOrphanSessionCheckWithSessionLister(lister SessionLister) *OrphanSessionCheck {
|
|
check := NewOrphanSessionCheck()
|
|
check.sessionLister = lister
|
|
return check
|
|
}
|
|
|
|
// Run checks for orphaned Gas Town tmux sessions.
|
|
func (c *OrphanSessionCheck) Run(ctx *CheckContext) *CheckResult {
|
|
lister := c.sessionLister
|
|
if lister == nil {
|
|
lister = &realSessionLister{t: tmux.NewTmux()}
|
|
}
|
|
|
|
sessions, err := lister.ListSessions()
|
|
if err != nil {
|
|
return &CheckResult{
|
|
Name: c.Name(),
|
|
Status: StatusWarning,
|
|
Message: "Could not list tmux sessions",
|
|
Details: []string{err.Error()},
|
|
}
|
|
}
|
|
|
|
if len(sessions) == 0 {
|
|
return &CheckResult{
|
|
Name: c.Name(),
|
|
Status: StatusOK,
|
|
Message: "No tmux sessions found",
|
|
}
|
|
}
|
|
|
|
// Get list of valid rigs
|
|
validRigs := c.getValidRigs(ctx.TownRoot)
|
|
|
|
// Get session names for mayor/deacon
|
|
mayorSession := session.MayorSessionName()
|
|
deaconSession := session.DeaconSessionName()
|
|
|
|
// Check each session
|
|
var orphans []string
|
|
var validCount int
|
|
|
|
for _, sess := range sessions {
|
|
if sess == "" {
|
|
continue
|
|
}
|
|
|
|
// Only check gt-* sessions (Gas Town sessions)
|
|
if !strings.HasPrefix(sess, "gt-") {
|
|
continue
|
|
}
|
|
|
|
if c.isValidSession(sess, validRigs, mayorSession, deaconSession) {
|
|
validCount++
|
|
} else {
|
|
orphans = append(orphans, sess)
|
|
}
|
|
}
|
|
|
|
// Cache orphans for Fix
|
|
c.orphanSessions = orphans
|
|
|
|
if len(orphans) == 0 {
|
|
return &CheckResult{
|
|
Name: c.Name(),
|
|
Status: StatusOK,
|
|
Message: fmt.Sprintf("All %d Gas Town sessions are valid", validCount),
|
|
}
|
|
}
|
|
|
|
details := make([]string, len(orphans))
|
|
for i, session := range orphans {
|
|
details[i] = fmt.Sprintf("Orphan: %s", session)
|
|
}
|
|
|
|
return &CheckResult{
|
|
Name: c.Name(),
|
|
Status: StatusWarning,
|
|
Message: fmt.Sprintf("Found %d orphaned session(s)", len(orphans)),
|
|
Details: details,
|
|
FixHint: "Run 'gt doctor --fix' to kill orphaned sessions",
|
|
}
|
|
}
|
|
|
|
// Fix kills all orphaned sessions, except crew sessions which are protected.
|
|
func (c *OrphanSessionCheck) Fix(ctx *CheckContext) error {
|
|
if len(c.orphanSessions) == 0 {
|
|
return nil
|
|
}
|
|
|
|
t := tmux.NewTmux()
|
|
var lastErr error
|
|
|
|
for _, sess := range c.orphanSessions {
|
|
// SAFEGUARD: Never auto-kill crew sessions.
|
|
// Crew workers are human-managed and require explicit action.
|
|
if isCrewSession(sess) {
|
|
continue
|
|
}
|
|
// Log pre-death event for crash investigation (before killing)
|
|
_ = events.LogFeed(events.TypeSessionDeath, sess,
|
|
events.SessionDeathPayload(sess, "unknown", "orphan cleanup", "gt doctor"))
|
|
// Use KillSessionWithProcesses to ensure all descendant processes are killed.
|
|
if err := t.KillSessionWithProcesses(sess); err != nil {
|
|
lastErr = err
|
|
}
|
|
}
|
|
|
|
return lastErr
|
|
}
|
|
|
|
// isCrewSession returns true if the session name matches the crew pattern.
|
|
// Crew sessions are gt-<rig>-crew-<name> and are protected from auto-cleanup.
|
|
func isCrewSession(session string) bool {
|
|
// Pattern: gt-<rig>-crew-<name>
|
|
// Example: gt-gastown-crew-joe
|
|
parts := strings.Split(session, "-")
|
|
if len(parts) >= 4 && parts[0] == "gt" && parts[2] == "crew" {
|
|
return true
|
|
}
|
|
return false
|
|
}
|
|
|
|
// getValidRigs returns a list of valid rig names from the workspace.
|
|
func (c *OrphanSessionCheck) getValidRigs(townRoot string) []string {
|
|
var rigs []string
|
|
|
|
// Read rigs.json if it exists
|
|
rigsPath := filepath.Join(townRoot, "mayor", "rigs.json")
|
|
if _, err := os.Stat(rigsPath); err == nil {
|
|
// For simplicity, just scan directories at town root that look like rigs
|
|
entries, err := os.ReadDir(townRoot)
|
|
if err == nil {
|
|
for _, entry := range entries {
|
|
if entry.IsDir() && entry.Name() != "mayor" && entry.Name() != ".beads" && !strings.HasPrefix(entry.Name(), ".") {
|
|
// Check if it looks like a rig (has polecats/ or crew/ directory)
|
|
polecatsDir := filepath.Join(townRoot, entry.Name(), "polecats")
|
|
crewDir := filepath.Join(townRoot, entry.Name(), "crew")
|
|
if _, err := os.Stat(polecatsDir); err == nil {
|
|
rigs = append(rigs, entry.Name())
|
|
} else if _, err := os.Stat(crewDir); err == nil {
|
|
rigs = append(rigs, entry.Name())
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return rigs
|
|
}
|
|
|
|
// isValidSession checks if a session name matches expected Gas Town patterns.
|
|
// Valid patterns:
|
|
// - gt-{town}-mayor (dynamic based on town name)
|
|
// - gt-{town}-deacon (dynamic based on town name)
|
|
// - gt-<rig>-witness
|
|
// - gt-<rig>-refinery
|
|
// - gt-<rig>-<polecat> (where polecat is any name)
|
|
//
|
|
// Note: We can't verify polecat names without reading state, so we're permissive.
|
|
func (c *OrphanSessionCheck) isValidSession(sess string, validRigs []string, mayorSession, deaconSession string) bool {
|
|
// Mayor session is always valid (dynamic name based on town)
|
|
if mayorSession != "" && sess == mayorSession {
|
|
return true
|
|
}
|
|
|
|
// Deacon session is always valid (dynamic name based on town)
|
|
if deaconSession != "" && sess == deaconSession {
|
|
return true
|
|
}
|
|
|
|
// For rig-specific sessions, extract rig name
|
|
// Pattern: gt-<rig>-<role>
|
|
parts := strings.SplitN(sess, "-", 3)
|
|
if len(parts) < 3 {
|
|
// Invalid format - must be gt-<rig>-<something>
|
|
return false
|
|
}
|
|
|
|
rigName := parts[1]
|
|
|
|
// Check if this rig exists
|
|
rigFound := false
|
|
for _, r := range validRigs {
|
|
if r == rigName {
|
|
rigFound = true
|
|
break
|
|
}
|
|
}
|
|
|
|
if !rigFound {
|
|
// Unknown rig - this is an orphan
|
|
return false
|
|
}
|
|
|
|
role := parts[2]
|
|
|
|
// witness and refinery are valid roles
|
|
if role == "witness" || role == "refinery" {
|
|
return true
|
|
}
|
|
|
|
// Any other name is assumed to be a polecat or crew member
|
|
// We can't easily verify without reading state, so accept it
|
|
return true
|
|
}
|
|
|
|
// OrphanProcessCheck detects runtime processes that are not
|
|
// running inside a tmux session. These may be user's personal sessions
|
|
// or legitimately orphaned processes from crashed Gas Town sessions.
|
|
// This check is informational only - it does not auto-fix since we cannot
|
|
// distinguish user sessions from orphaned Gas Town processes.
|
|
type OrphanProcessCheck struct {
|
|
BaseCheck
|
|
}
|
|
|
|
// NewOrphanProcessCheck creates a new orphan process check.
|
|
func NewOrphanProcessCheck() *OrphanProcessCheck {
|
|
return &OrphanProcessCheck{
|
|
BaseCheck: BaseCheck{
|
|
CheckName: "orphan-processes",
|
|
CheckDescription: "Detect runtime processes outside tmux",
|
|
CheckCategory: CategoryCleanup,
|
|
},
|
|
}
|
|
}
|
|
|
|
// Run checks for runtime processes running outside tmux.
|
|
func (c *OrphanProcessCheck) Run(ctx *CheckContext) *CheckResult {
|
|
// Get list of tmux session PIDs
|
|
tmuxPIDs, err := c.getTmuxSessionPIDs()
|
|
if err != nil {
|
|
return &CheckResult{
|
|
Name: c.Name(),
|
|
Status: StatusWarning,
|
|
Message: "Could not get tmux session info",
|
|
Details: []string{err.Error()},
|
|
}
|
|
}
|
|
|
|
// Find runtime processes
|
|
runtimeProcs, err := c.findRuntimeProcesses()
|
|
if err != nil {
|
|
return &CheckResult{
|
|
Name: c.Name(),
|
|
Status: StatusWarning,
|
|
Message: "Could not list runtime processes",
|
|
Details: []string{err.Error()},
|
|
}
|
|
}
|
|
|
|
if len(runtimeProcs) == 0 {
|
|
return &CheckResult{
|
|
Name: c.Name(),
|
|
Status: StatusOK,
|
|
Message: "No runtime processes found",
|
|
}
|
|
}
|
|
|
|
// Check which runtime processes are outside tmux
|
|
var outsideTmux []processInfo
|
|
var insideTmux int
|
|
|
|
for _, proc := range runtimeProcs {
|
|
if c.isOrphanProcess(proc, tmuxPIDs) {
|
|
outsideTmux = append(outsideTmux, proc)
|
|
} else {
|
|
insideTmux++
|
|
}
|
|
}
|
|
|
|
if len(outsideTmux) == 0 {
|
|
return &CheckResult{
|
|
Name: c.Name(),
|
|
Status: StatusOK,
|
|
Message: fmt.Sprintf("All %d runtime processes are inside tmux", insideTmux),
|
|
}
|
|
}
|
|
|
|
details := make([]string, 0, len(outsideTmux)+2)
|
|
details = append(details, "These may be your personal sessions or orphaned Gas Town processes.")
|
|
details = append(details, "Verify these are expected before manually killing any:")
|
|
for _, proc := range outsideTmux {
|
|
details = append(details, fmt.Sprintf(" PID %d: %s (parent: %d)", proc.pid, proc.cmd, proc.ppid))
|
|
}
|
|
|
|
return &CheckResult{
|
|
Name: c.Name(),
|
|
Status: StatusWarning,
|
|
Message: fmt.Sprintf("Found %d runtime process(es) running outside tmux", len(outsideTmux)),
|
|
Details: details,
|
|
}
|
|
}
|
|
|
|
type processInfo struct {
|
|
pid int
|
|
ppid int
|
|
cmd string
|
|
}
|
|
|
|
// getTmuxSessionPIDs returns PIDs of all tmux server processes and pane shell PIDs.
|
|
func (c *OrphanProcessCheck) getTmuxSessionPIDs() (map[int]bool, error) { //nolint:unparam // error return kept for future use
|
|
// Get tmux server PID and all pane PIDs
|
|
pids := make(map[int]bool)
|
|
|
|
// Find tmux server processes using ps instead of pgrep.
|
|
// pgrep -x tmux is unreliable on macOS - it often misses the actual server.
|
|
// We use ps with awk to find processes where comm is exactly "tmux".
|
|
out, err := exec.Command("sh", "-c", `ps ax -o pid,comm | awk '$2 == "tmux" || $2 ~ /\/tmux$/ { print $1 }'`).Output()
|
|
if err != nil {
|
|
// No tmux server running
|
|
return pids, nil
|
|
}
|
|
|
|
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
|
var pid int
|
|
if _, err := fmt.Sscanf(line, "%d", &pid); err == nil {
|
|
pids[pid] = true
|
|
}
|
|
}
|
|
|
|
// Also get shell PIDs inside tmux panes
|
|
t := tmux.NewTmux()
|
|
sessions, _ := t.ListSessions()
|
|
for _, session := range sessions {
|
|
// Get pane PIDs for this session
|
|
out, err := exec.Command("tmux", "list-panes", "-t", session, "-F", "#{pane_pid}").Output()
|
|
if err != nil {
|
|
continue
|
|
}
|
|
for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") {
|
|
var pid int
|
|
if _, err := fmt.Sscanf(line, "%d", &pid); err == nil {
|
|
pids[pid] = true
|
|
}
|
|
}
|
|
}
|
|
|
|
return pids, nil
|
|
}
|
|
|
|
// findRuntimeProcesses finds Gas Town Claude processes (those with --dangerously-skip-permissions).
|
|
// Only detects processes started by Gas Town, not user's personal Claude sessions.
|
|
func (c *OrphanProcessCheck) findRuntimeProcesses() ([]processInfo, error) {
|
|
var procs []processInfo
|
|
|
|
// Use ps with args to get full command line (needed to check for Gas Town signature)
|
|
out, err := exec.Command("ps", "-eo", "pid,ppid,args").Output()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
for _, line := range strings.Split(string(out), "\n") {
|
|
fields := strings.Fields(line)
|
|
if len(fields) < 3 {
|
|
continue
|
|
}
|
|
|
|
// Extract command name (without path)
|
|
cmd := fields[2]
|
|
if idx := strings.LastIndex(cmd, "/"); idx >= 0 {
|
|
cmd = cmd[idx+1:]
|
|
}
|
|
|
|
// Only match claude/codex processes, not tmux or other launchers
|
|
// (tmux command line may contain --dangerously-skip-permissions as part of the launched command)
|
|
if cmd != "claude" && cmd != "claude-code" && cmd != "codex" {
|
|
continue
|
|
}
|
|
|
|
// Get full args
|
|
args := strings.Join(fields[2:], " ")
|
|
|
|
// Only match Gas Town Claude processes (have --dangerously-skip-permissions)
|
|
// This excludes user's personal Claude sessions
|
|
if !strings.Contains(args, "--dangerously-skip-permissions") {
|
|
continue
|
|
}
|
|
|
|
var pid, ppid int
|
|
if _, err := fmt.Sscanf(fields[0], "%d", &pid); err != nil {
|
|
continue
|
|
}
|
|
if _, err := fmt.Sscanf(fields[1], "%d", &ppid); err != nil {
|
|
continue
|
|
}
|
|
|
|
procs = append(procs, processInfo{
|
|
pid: pid,
|
|
ppid: ppid,
|
|
cmd: args,
|
|
})
|
|
}
|
|
|
|
return procs, nil
|
|
}
|
|
|
|
// isOrphanProcess checks if a runtime process is orphaned.
|
|
// A process is orphaned if its parent (or ancestor) is not a tmux session.
|
|
func (c *OrphanProcessCheck) isOrphanProcess(proc processInfo, tmuxPIDs map[int]bool) bool {
|
|
// Walk up the process tree looking for a tmux parent
|
|
currentPPID := proc.ppid
|
|
visited := make(map[int]bool)
|
|
|
|
for currentPPID > 1 && !visited[currentPPID] {
|
|
visited[currentPPID] = true
|
|
|
|
// Check if this is a tmux process
|
|
if tmuxPIDs[currentPPID] {
|
|
return false // Has tmux ancestor, not orphaned
|
|
}
|
|
|
|
// Get parent's parent
|
|
out, err := exec.Command("ps", "-p", fmt.Sprintf("%d", currentPPID), "-o", "ppid=").Output() //nolint:gosec // G204: PID is numeric from internal state
|
|
if err != nil {
|
|
break
|
|
}
|
|
|
|
var nextPPID int
|
|
if _, err := fmt.Sscanf(strings.TrimSpace(string(out)), "%d", &nextPPID); err != nil {
|
|
break
|
|
}
|
|
currentPPID = nextPPID
|
|
}
|
|
|
|
return true // No tmux ancestor found
|
|
}
|