diff --git a/.beads/formulas/mol-deacon-patrol.formula.toml b/.beads/formulas/mol-deacon-patrol.formula.toml index de62749c..1c357490 100644 --- a/.beads/formulas/mol-deacon-patrol.formula.toml +++ b/.beads/formulas/mol-deacon-patrol.formula.toml @@ -84,10 +84,46 @@ Callbacks may spawn new polecats, update issue state, or trigger other actions. **Hygiene principle**: Archive messages after they're fully processed. Keep inbox near-empty - only unprocessed items should remain.""" +[[steps]] +id = "orphan-process-cleanup" +title = "Clean up orphaned claude subagent processes" +needs = ["inbox-check"] +description = """ +Clean up orphaned claude subagent processes. + +Claude Code's Task tool spawns subagent processes that sometimes don't clean up +properly after completion. These accumulate and consume significant memory. + +**Detection method:** +Orphaned processes have no controlling terminal (TTY = "?"). Legitimate claude +instances in terminals have a TTY like "pts/0". + +**Run cleanup:** +```bash +gt deacon cleanup-orphans +``` + +This command: +1. Lists processes with `ps -eo` (PID, TTY, command, elapsed time) and keeps claude/codex ones +2. Filters for TTY = "?" (no controlling terminal) and an age of at least 60 seconds +3. Sends SIGTERM to each newly found orphan; a later run escalates to SIGKILL if it is still alive after 60 seconds +4. Reports each process that was signaled (and any that survived SIGKILL) + +**Why this is safe:** +- Processes in terminals (your personal sessions) have a TTY - they won't be touched +- Only kills claude/codex processes that have no controlling terminal and are at least 60 seconds old +- These orphans are children of the tmux server with no TTY, indicating they're + detached subagents that failed to exit + +**If cleanup fails:** +Log the error but continue patrol - this is best-effort cleanup. + +**Exit criteria:** Orphan cleanup attempted (success or logged failure).""" + [[steps]] id = "trigger-pending-spawns" title = "Nudge newly spawned polecats" -needs = ["inbox-check"] +needs = ["orphan-process-cleanup"] description = """ Nudge newly spawned polecats that are ready for input. 
diff --git a/internal/cmd/deacon.go b/internal/cmd/deacon.go index 8767882e..e0720d08 100644 --- a/internal/cmd/deacon.go +++ b/internal/cmd/deacon.go @@ -22,6 +22,7 @@ import ( "github.com/steveyegge/gastown/internal/session" "github.com/steveyegge/gastown/internal/style" "github.com/steveyegge/gastown/internal/tmux" + "github.com/steveyegge/gastown/internal/util" "github.com/steveyegge/gastown/internal/workspace" ) @@ -236,6 +237,27 @@ This removes the pause file and allows the Deacon to work normally.`, RunE: runDeaconResume, } +var deaconCleanupOrphansCmd = &cobra.Command{ + Use: "cleanup-orphans", + Short: "Clean up orphaned claude subagent processes", + Long: `Clean up orphaned claude subagent processes. + +Claude Code's Task tool spawns subagent processes that sometimes don't clean up +properly after completion. These accumulate and consume significant memory. + +Detection is based on TTY column: processes with TTY "?" have no controlling +terminal. Legitimate claude instances in terminals have a TTY like "pts/0". + +This is safe because: +- Processes in terminals (your personal sessions) have a TTY - won't be touched +- Only kills processes that have no controlling terminal +- These orphans are children of the tmux server with no TTY + +Example: + gt deacon cleanup-orphans`, + RunE: runDeaconCleanupOrphans, +} + var ( triggerTimeout time.Duration @@ -270,6 +292,7 @@ func init() { deaconCmd.AddCommand(deaconStaleHooksCmd) deaconCmd.AddCommand(deaconPauseCmd) deaconCmd.AddCommand(deaconResumeCmd) + deaconCmd.AddCommand(deaconCleanupOrphansCmd) // Flags for trigger-pending deaconTriggerPendingCmd.Flags().DurationVar(&triggerTimeout, "timeout", 2*time.Second, @@ -1105,3 +1128,54 @@ func runDeaconResume(cmd *cobra.Command, args []string) error { return nil } + +// runDeaconCleanupOrphans cleans up orphaned claude subagent processes. 
+func runDeaconCleanupOrphans(cmd *cobra.Command, args []string) error { + // Scan first so the orphan count can be reported before any signal is sent + orphans, err := util.FindOrphanedClaudeProcesses() + if err != nil { + return fmt.Errorf("finding orphaned processes: %w", err) + } + + if len(orphans) == 0 { + fmt.Printf("%s No orphaned claude processes found\n", style.Dim.Render("○")) + return nil + } + + fmt.Printf("%s Found %d orphaned claude process(es)\n", style.Bold.Render("●"), len(orphans)) + + // Cleanup rescans on its own and escalates SIGTERM to SIGKILL across runs; its errors are only warned about + results, err := util.CleanupOrphanedClaudeProcesses() + if err != nil { + style.PrintWarning("cleanup had errors: %v", err) + } + + // Report results (terminated is tallied but not shown in the summary line) + var terminated, escalated, unkillable int + for _, r := range results { + switch r.Signal { + case "SIGTERM": + fmt.Printf(" %s Sent SIGTERM to PID %d (%s)\n", style.Bold.Render("→"), r.Process.PID, r.Process.Cmd) + terminated++ + case "SIGKILL": + fmt.Printf(" %s Escalated to SIGKILL for PID %d (%s)\n", style.Bold.Render("!"), r.Process.PID, r.Process.Cmd) + escalated++ + case "UNKILLABLE": + fmt.Printf(" %s WARNING: PID %d (%s) survived SIGKILL\n", style.Bold.Render("⚠"), r.Process.PID, r.Process.Cmd) + unkillable++ + } + } + + if len(results) > 0 { + summary := fmt.Sprintf("Processed %d orphan(s)", len(results)) + if escalated > 0 { + summary += fmt.Sprintf(" (%d escalated to SIGKILL)", escalated) + } + if unkillable > 0 { + summary += fmt.Sprintf(" (%d unkillable)", unkillable) + } + fmt.Printf("%s %s\n", style.Bold.Render("✓"), summary) + } + + return nil +} diff --git a/internal/daemon/daemon.go b/internal/daemon/daemon.go index a498a780..3f3b6a6b 100755 --- a/internal/daemon/daemon.go +++ b/internal/daemon/daemon.go @@ -28,6 +28,7 @@ import ( "github.com/steveyegge/gastown/internal/rig" "github.com/steveyegge/gastown/internal/session" "github.com/steveyegge/gastown/internal/tmux" + "github.com/steveyegge/gastown/internal/util" "github.com/steveyegge/gastown/internal/wisp" "github.com/steveyegge/gastown/internal/witness" 
) @@ -268,6 +269,11 @@ func (d *Daemon) heartbeat(state *State) { // This validates tmux sessions are still alive for polecats with work-on-hook d.checkPolecatSessionHealth() + // 12. Clean up orphaned claude subagent processes (memory leak prevention) + // These are Task tool subagents that didn't clean up after completion. + // This is a safety net - Deacon patrol also does this more frequently. + d.cleanupOrphanedProcesses() + // Update state state.LastHeartbeat = time.Now() state.HeartbeatCount++ @@ -980,3 +986,26 @@ Manual intervention may be required.`, d.logger.Printf("Warning: failed to notify witness of crashed polecat: %v", err) } } + +// cleanupOrphanedProcesses signals orphaned claude/codex subagent processes and logs the outcome. +// Detection (TTY "?" means no controlling terminal) and signal escalation live in util.CleanupOrphanedClaudeProcesses. +// On error it logs a warning and returns early, so results returned alongside that error are not logged. +// This is a safety net fallback - Deacon patrol also runs this cleanup. +func (d *Daemon) cleanupOrphanedProcesses() { + results, err := util.CleanupOrphanedClaudeProcesses() + if err != nil { + d.logger.Printf("Warning: orphan process cleanup failed: %v", err) + return + } + + if len(results) > 0 { + d.logger.Printf("Orphan cleanup: processed %d process(es)", len(results)) + for _, r := range results { + if r.Signal == "UNKILLABLE" { + d.logger.Printf(" WARNING: PID %d (%s) survived SIGKILL", r.Process.PID, r.Process.Cmd) + } else { + d.logger.Printf(" Sent %s to PID %d (%s)", r.Signal, r.Process.PID, r.Process.Cmd) + } + } + } +} diff --git a/internal/util/orphan.go b/internal/util/orphan.go new file mode 100644 index 00000000..4f809566 --- /dev/null +++ b/internal/util/orphan.go @@ -0,0 +1,332 @@ +//go:build !windows + +package util + +import ( + "bufio" + "fmt" + "os" + "os/exec" + "path/filepath" + "strconv" + "strings" + "syscall" + "time" +) + +// minOrphanAge is the minimum age (in seconds) a process must be before +// we consider it orphaned. 
// This minimum age prevents race conditions with newly spawned processes and
// avoids killing legitimate short-lived subagents.
const minOrphanAge = 60

// sigkillGracePeriod is how long (in seconds) we wait after sending SIGTERM
// before escalating to SIGKILL. A process that was sent SIGTERM and is still
// around after this period gets SIGKILL on the next cleanup cycle.
const sigkillGracePeriod = 60

// orphanStateFile returns the path of the state file that tracks the PIDs we
// have already signaled. It lives in $XDG_RUNTIME_DIR when that is set and in
// /tmp otherwise.
//
// NOTE(review): the /tmp fallback is a fixed, predictable name in a shared
// directory; consider a per-user location.
func orphanStateFile() string {
	dir := os.Getenv("XDG_RUNTIME_DIR")
	if dir == "" {
		dir = "/tmp"
	}
	return filepath.Join(dir, "gastown-orphan-state")
}

// signalState records which signal was last sent to a PID and when.
type signalState struct {
	Signal    string    // "SIGTERM" or "SIGKILL"
	Timestamp time.Time // When the signal was sent
}

// loadOrphanState reads the state file and returns the recorded signal state
// for every tracked PID that still exists. Malformed records, non-positive
// PIDs and records for dead processes are dropped. A missing or unreadable
// file yields an empty map.
func loadOrphanState() map[int]signalState {
	state := make(map[int]signalState)

	f, err := os.Open(orphanStateFile())
	if err != nil {
		return state // No state yet (or unreadable): start fresh.
	}
	defer f.Close()

	scanner := bufio.NewScanner(f)
	for scanner.Scan() {
		// Each record is "<pid> <signal> <unix-seconds>".
		parts := strings.Fields(scanner.Text())
		if len(parts) != 3 {
			continue
		}
		pid, err := strconv.Atoi(parts[0])
		if err != nil || pid <= 0 {
			// pid <= 0 addresses process groups in kill(2), never a
			// single tracked process, so such a record is bogus.
			continue
		}
		ts, err := strconv.ParseInt(parts[2], 10, 64)
		if err != nil {
			continue
		}

		// Only keep entries whose process is still around.
		if processExists(pid) {
			state[pid] = signalState{Signal: parts[1], Timestamp: time.Unix(ts, 0)}
		}
	}
	// scanner.Err() is deliberately not surfaced: state loading is
	// best-effort and a partial read only delays escalation by one cycle.

	return state
}

// saveOrphanState overwrites the state file with one "<pid> <signal> <unix>"
// record per entry. It returns the first error from creating, writing,
// flushing or closing the file, so a short write is not silently lost.
func saveOrphanState(state map[int]signalState) (err error) {
	f, err := os.Create(orphanStateFile())
	if err != nil {
		return err
	}
	defer func() {
		// Close can report a deferred write failure; keep the first error.
		if cerr := f.Close(); err == nil {
			err = cerr
		}
	}()

	w := bufio.NewWriter(f)
	for pid, s := range state {
		if _, werr := fmt.Fprintf(w, "%d %s %d\n", pid, s.Signal, s.Timestamp.Unix()); werr != nil {
			return werr
		}
	}
	return w.Flush()
}

// processExists reports whether a process with the given PID is still running.
// It probes with signal 0; EPERM counts as "exists" because the process is
// there but owned by someone else. Non-positive PIDs are never reported as
// existing: kill(2) would interpret them as process-group targets.
func processExists(pid int) bool {
	if pid <= 0 {
		return false
	}
	err := syscall.Kill(pid, 0)
	return err == nil || err == syscall.EPERM
}

// parseEtime parses the ps etime format into seconds.
// Format: [[DD-]HH:]MM:SS
// Examples: "01:23" (83s), "01:02:03" (3723s), "2-01:02:03" (176523s)
//
// Surrounding whitespace is ignored. An error is returned when a component is
// not an integer, is negative, or the number of ':'-separated components is
// not two or three.
func parseEtime(etime string) (int, error) {
	etime = strings.TrimSpace(etime)

	// Optional days component (DD-...).
	days := 0
	if idx := strings.Index(etime, "-"); idx != -1 {
		d, err := parseEtimeField("days", etime[:idx])
		if err != nil {
			return 0, err
		}
		days = d
		etime = etime[idx+1:]
	}

	parts := strings.Split(etime, ":")
	if len(parts) != 2 && len(parts) != 3 {
		return 0, fmt.Errorf("unexpected etime format: %s", etime)
	}

	// Right-align the components into hours/minutes/seconds so MM:SS simply
	// leaves hours at zero.
	names := [3]string{"hours", "minutes", "seconds"}
	var hms [3]int
	offset := len(hms) - len(parts)
	for i, p := range parts {
		v, err := parseEtimeField(names[offset+i], p)
		if err != nil {
			return 0, err
		}
		hms[offset+i] = v
	}

	return days*86400 + hms[0]*3600 + hms[1]*60 + hms[2], nil
}

// parseEtimeField converts one etime component to a non-negative int, naming
// the component in the error.
func parseEtimeField(name, s string) (int, error) {
	v, err := strconv.Atoi(s)
	if err != nil {
		return 0, fmt.Errorf("parsing %s: %w", name, err)
	}
	if v < 0 {
		return 0, fmt.Errorf("parsing %s: negative value %d", name, v)
	}
	return v, nil
}

// OrphanedProcess represents a claude process running without a controlling terminal.
type OrphanedProcess struct {
	PID int
	Cmd string
	Age int // Age in seconds
}

// FindOrphanedClaudeProcesses finds claude/codex processes without a
// controlling terminal. These are typically subagent processes spawned by
// Claude Code's Task tool that didn't clean up properly after completion.
//
// Detection is based on the TTY column of ps: "?" (Linux procps) or "??"
// (BSD/macOS) means no controlling terminal, so claude instances running in a
// terminal are never reported. Processes younger than minOrphanAge seconds are
// skipped to avoid racing with subagents that were only just spawned.
//
// It returns an error only when ps itself cannot be run; unparsable ps lines
// are skipped.
func FindOrphanedClaudeProcesses() ([]OrphanedProcess, error) {
	// etime is elapsed time in [[DD-]HH:]MM:SS format.
	out, err := exec.Command("ps", "-eo", "pid,tty,comm,etime").Output()
	if err != nil {
		return nil, fmt.Errorf("listing processes: %w", err)
	}
	return parsePsOrphans(string(out)), nil
}

// parsePsOrphans extracts orphaned claude/codex processes from the output of
// `ps -eo pid,tty,comm,etime`.
func parsePsOrphans(psOutput string) []OrphanedProcess {
	var orphans []OrphanedProcess
	for _, line := range strings.Split(psOutput, "\n") {
		fields := strings.Fields(line)
		if len(fields) < 4 {
			continue
		}

		pid, err := strconv.Atoi(fields[0])
		if err != nil {
			continue // Header line or invalid PID
		}
		if !hasNoTTY(fields[1]) {
			continue
		}

		// comm may contain spaces (e.g. a full executable path), so take
		// etime from the last column and everything in between as comm.
		// A zombie shown as "claude <defunct>" does not match and is skipped.
		last := len(fields) - 1
		comm := strings.Join(fields[2:last], " ")
		if !isAgentCommand(comm) {
			continue
		}

		// Skip unparsable ages and processes younger than minOrphanAge.
		age, err := parseEtime(fields[last])
		if err != nil || age < minOrphanAge {
			continue
		}

		orphans = append(orphans, OrphanedProcess{PID: pid, Cmd: comm, Age: age})
	}
	return orphans
}

// hasNoTTY reports whether a ps TTY column value means "no controlling
// terminal": "?" on Linux, "??" on BSD/macOS.
func hasNoTTY(tty string) bool {
	return tty == "?" || tty == "??"
}

// isAgentCommand reports whether comm names a claude or codex executable.
// Only the base name is compared, case-insensitively, because some ps
// implementations print the full executable path in the comm column.
func isAgentCommand(comm string) bool {
	switch strings.ToLower(filepath.Base(comm)) {
	case "claude", "claude-code", "codex":
		return true
	}
	return false
}

// CleanupResult describes what happened to an orphaned process.
type CleanupResult struct {
	Process OrphanedProcess
	Signal  string // "SIGTERM", "SIGKILL", or "UNKILLABLE"
	Error   error
}

// CleanupOrphanedClaudeProcesses finds and kills orphaned claude/codex processes.
//
// Uses a state machine to escalate signals:
//  1. First encounter → SIGTERM, record in state file
//  2. Next cycle, still alive after grace period → SIGKILL, update state
//  3. Next cycle, still alive after SIGKILL → log as unkillable, remove from state
//
// Returns the list of cleanup results and any error encountered.
+func CleanupOrphanedClaudeProcesses() ([]CleanupResult, error) { + orphans, err := FindOrphanedClaudeProcesses() + if err != nil { + return nil, err + } + + // Load previous state + state := loadOrphanState() + now := time.Now() + + var results []CleanupResult + var lastErr error + + // Track which PIDs we're still working on + activeOrphans := make(map[int]bool) + for _, o := range orphans { + activeOrphans[o.PID] = true + } + + // First pass: check state for PIDs that died (cleanup) or need escalation + for pid, s := range state { + if !activeOrphans[pid] { + // Process died, remove from state + delete(state, pid) + continue + } + + // Process still alive - check if we need to escalate + elapsed := now.Sub(s.Timestamp).Seconds() + + if s.Signal == "SIGKILL" { + // Already sent SIGKILL and it's still alive - unkillable + results = append(results, CleanupResult{ + Process: OrphanedProcess{PID: pid, Cmd: "claude"}, + Signal: "UNKILLABLE", + Error: fmt.Errorf("process %d survived SIGKILL", pid), + }) + delete(state, pid) // Remove from tracking, nothing more we can do + delete(activeOrphans, pid) + continue + } + + if s.Signal == "SIGTERM" && elapsed >= float64(sigkillGracePeriod) { + // Sent SIGTERM but still alive after grace period - escalate to SIGKILL + if err := syscall.Kill(pid, syscall.SIGKILL); err != nil { + if err != syscall.ESRCH { + lastErr = fmt.Errorf("SIGKILL PID %d: %w", pid, err) + } + delete(state, pid) + delete(activeOrphans, pid) + continue + } + state[pid] = signalState{Signal: "SIGKILL", Timestamp: now} + results = append(results, CleanupResult{ + Process: OrphanedProcess{PID: pid, Cmd: "claude"}, + Signal: "SIGKILL", + }) + delete(activeOrphans, pid) + } + // If SIGTERM was recent, leave it alone - check again next cycle + } + + // Second pass: send SIGTERM to new orphans not yet in state + for _, orphan := range orphans { + if !activeOrphans[orphan.PID] { + continue // Already handled above + } + if _, exists := state[orphan.PID]; exists { + 
continue // Already in state, waiting for grace period + } + + // New orphan - send SIGTERM + if err := syscall.Kill(orphan.PID, syscall.SIGTERM); err != nil { + if err != syscall.ESRCH { + lastErr = fmt.Errorf("SIGTERM PID %d: %w", orphan.PID, err) + } + continue + } + state[orphan.PID] = signalState{Signal: "SIGTERM", Timestamp: now} + results = append(results, CleanupResult{ + Process: orphan, + Signal: "SIGTERM", + }) + } + + // Save updated state + if err := saveOrphanState(state); err != nil { + if lastErr == nil { + lastErr = fmt.Errorf("saving orphan state: %w", err) + } + } + + return results, lastErr +} diff --git a/internal/util/orphan_test.go b/internal/util/orphan_test.go new file mode 100644 index 00000000..77804724 --- /dev/null +++ b/internal/util/orphan_test.go @@ -0,0 +1,81 @@ +//go:build !windows + +package util + +import ( + "testing" +) + +func TestParseEtime(t *testing.T) { + tests := []struct { + input string + expected int + wantErr bool + }{ + // MM:SS format + {"00:30", 30, false}, + {"01:00", 60, false}, + {"01:23", 83, false}, + {"59:59", 3599, false}, + + // HH:MM:SS format + {"00:01:00", 60, false}, + {"01:00:00", 3600, false}, + {"01:02:03", 3723, false}, + {"23:59:59", 86399, false}, + + // DD-HH:MM:SS format + {"1-00:00:00", 86400, false}, + {"2-01:02:03", 176523, false}, + {"7-12:30:45", 649845, false}, + + // Edge cases + {"00:00", 0, false}, + {"0-00:00:00", 0, false}, + } + + for _, tt := range tests { + t.Run(tt.input, func(t *testing.T) { + got, err := parseEtime(tt.input) + if (err != nil) != tt.wantErr { + t.Errorf("parseEtime(%q) error = %v, wantErr %v", tt.input, err, tt.wantErr) + return + } + if got != tt.expected { + t.Errorf("parseEtime(%q) = %d, want %d", tt.input, got, tt.expected) + } + }) + } +} + +func TestFindOrphanedClaudeProcesses(t *testing.T) { + // This is a live test that checks for orphaned processes on the current system. + // It should not fail - just return whatever orphans exist (likely none in CI). 
+ orphans, err := FindOrphanedClaudeProcesses() + if err != nil { + t.Fatalf("FindOrphanedClaudeProcesses() error = %v", err) + } + + // Log what we found (useful for debugging) + t.Logf("Found %d orphaned claude processes", len(orphans)) + for _, o := range orphans { + t.Logf(" PID %d: %s", o.PID, o.Cmd) + } +} + +func TestFindOrphanedClaudeProcesses_IgnoresTerminalProcesses(t *testing.T) { + // This test verifies that the function only returns processes without TTY. + // We can't easily mock ps output, but we can verify that if we're running + // this test in a terminal, our own process tree isn't flagged. + orphans, err := FindOrphanedClaudeProcesses() + if err != nil { + t.Fatalf("FindOrphanedClaudeProcesses() error = %v", err) + } + + // If we're running in a terminal (typical test scenario), verify that + // any orphans found genuinely have no TTY. We can't verify they're NOT + // in the list since we control the test process, but we can log for inspection. + for _, o := range orphans { + t.Logf("Orphan found: PID %d (%s) - verify this has TTY=? in 'ps aux'", o.PID, o.Cmd) + } +}