From bebf425ac5b970bd87bb6bca276fad9e0125aa76 Mon Sep 17 00:00:00 2001 From: capable Date: Thu, 22 Jan 2026 21:40:49 -0800 Subject: [PATCH] fix(polecat): kill orphaned Claude processes when nuking polecats MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When polecats are nuked, Claude child processes could survive and become orphans, leading to memory exhaustion (observed: 142 orphaned processes consuming ~56GB RAM). This commit: 1. Increases the SIGTERM→SIGKILL grace period from 100ms to 2s to give processes time to clean up gracefully 2. Adds orphan cleanup to `gt polecat nuke` that runs after session termination to catch any processes that escaped 3. Adds a new `gt cleanup` command for manual orphan removal The orphan detection uses aggressive tmux session verification to find ALL Claude processes not in any active session, not just those with PPID=1. Fixes: gh-736 Co-Authored-By: Claude Opus 4.5 --- internal/cmd/cleanup.go | 127 ++++++++++++++++++++++++++++++++++++++++ internal/cmd/polecat.go | 40 +++++++++++++ internal/tmux/tmux.go | 27 +++++---- 3 files changed, 183 insertions(+), 11 deletions(-) create mode 100644 internal/cmd/cleanup.go diff --git a/internal/cmd/cleanup.go b/internal/cmd/cleanup.go new file mode 100644 index 00000000..0d5f13a1 --- /dev/null +++ b/internal/cmd/cleanup.go @@ -0,0 +1,127 @@ +package cmd + +import ( + "fmt" + + "github.com/spf13/cobra" + "github.com/steveyegge/gastown/internal/style" + "github.com/steveyegge/gastown/internal/util" +) + +var ( + cleanupDryRun bool + cleanupForce bool +) + +var cleanupCmd = &cobra.Command{ + Use: "cleanup", + GroupID: GroupWork, + Short: "Clean up orphaned Claude processes", + Long: `Clean up orphaned Claude processes that survived session termination. + +This command finds and kills Claude processes that are not associated with +any active Gas Town tmux session. These orphans can accumulate when: +- Polecat sessions are killed without proper cleanup +- Claude spawns subagent processes that outlive their parent +- Network or system issues interrupt normal shutdown + +Uses aggressive tmux session verification to detect ALL orphaned processes, +not just those with PPID=1. + +Examples: + gt cleanup # Clean up orphans with confirmation + gt cleanup --dry-run # Show what would be killed + gt cleanup --force # Kill without confirmation`, + RunE: runCleanup, +} + +func init() { + cleanupCmd.Flags().BoolVar(&cleanupDryRun, "dry-run", false, "Show what would be killed without killing") + cleanupCmd.Flags().BoolVarP(&cleanupForce, "force", "f", false, "Kill without confirmation") + + rootCmd.AddCommand(cleanupCmd) +} + +func runCleanup(cmd *cobra.Command, args []string) error { + // Find orphaned processes using aggressive zombie detection + zombies, err := util.FindZombieClaudeProcesses() + if err != nil { + return fmt.Errorf("finding orphaned processes: %w", err) + } + + if len(zombies) == 0 { + fmt.Printf("%s No orphaned Claude processes found\n", style.Bold.Render("✓")) + return nil + } + + // Show what we found + fmt.Printf("%s Found %d orphaned Claude process(es):\n\n", style.Warning.Render("⚠"), len(zombies)) + for _, z := range zombies { + ageStr := formatProcessAgeCleanup(z.Age) + fmt.Printf(" %s %s (age: %s, tty: %s)\n", + style.Bold.Render(fmt.Sprintf("PID %d", z.PID)), + z.Cmd, + style.Dim.Render(ageStr), + z.TTY) + } + fmt.Println() + + if cleanupDryRun { + fmt.Printf("%s Dry run - no processes killed\n", style.Dim.Render("ℹ")) + return nil + } + + // Confirm unless --force + if !cleanupForce { + fmt.Printf("Kill these %d process(es)? [y/N] ", len(zombies)) + var response string + _, _ = fmt.Scanln(&response) + if response != "y" && response != "Y" && response != "yes" && response != "Yes" { + fmt.Println("Aborted") + return nil + } + } + + // Kill the processes using the standard cleanup function + results, err := util.CleanupZombieClaudeProcesses() + if err != nil { + return fmt.Errorf("cleaning up processes: %w", err) + } + + // Report results + var killed, escalated int + for _, r := range results { + switch r.Signal { + case "SIGTERM": + fmt.Printf(" %s PID %d sent SIGTERM\n", style.Success.Render("✓"), r.Process.PID) + killed++ + case "SIGKILL": + fmt.Printf(" %s PID %d sent SIGKILL (didn't respond to SIGTERM)\n", style.Warning.Render("⚠"), r.Process.PID) + killed++ + case "UNKILLABLE": + fmt.Printf(" %s PID %d survived SIGKILL\n", style.Error.Render("✗"), r.Process.PID) + escalated++ + } + } + + fmt.Printf("\n%s Cleaned up %d process(es)", style.Bold.Render("✓"), killed) + if escalated > 0 { + fmt.Printf(", %d unkillable", escalated) + } + fmt.Println() + + return nil +} + +// formatProcessAgeCleanup formats seconds into a human-readable age string +func formatProcessAgeCleanup(seconds int) string { + if seconds < 60 { + return fmt.Sprintf("%ds", seconds) + } + if seconds < 3600 { + return fmt.Sprintf("%dm%ds", seconds/60, seconds%60) + } + hours := seconds / 3600 + mins := (seconds % 3600) / 60 + return fmt.Sprintf("%dh%dm", hours, mins) +} diff --git a/internal/cmd/polecat.go b/internal/cmd/polecat.go index 74b2051a..76c7ddc3 100644 --- a/internal/cmd/polecat.go +++ b/internal/cmd/polecat.go @@ -18,6 +18,7 @@ import ( "github.com/steveyegge/gastown/internal/runtime" "github.com/steveyegge/gastown/internal/style" "github.com/steveyegge/gastown/internal/tmux" + "github.com/steveyegge/gastown/internal/util" ) // Polecat command flags @@ -1268,6 +1269,12 @@ func runPolecatNuke(cmd *cobra.Command, args []string) error { fmt.Printf("\n%s Nuked %d polecat(s).\n", style.SuccessPrefix, nuked) } + // Final cleanup: Kill any orphaned Claude processes that escaped the session termination. + // This catches processes that called setsid() or were reparented during session shutdown. + if !polecatNukeDryRun { + cleanupOrphanedProcesses() + } + if len(nukeErrors) > 0 { return fmt.Errorf("%d nuke(s) failed", len(nukeErrors)) } @@ -1275,6 +1282,39 @@ func runPolecatNuke(cmd *cobra.Command, args []string) error { return nil } +// cleanupOrphanedProcesses kills Claude processes that survived session termination. +// Uses aggressive zombie detection via tmux session verification. +func cleanupOrphanedProcesses() { + results, err := util.CleanupZombieClaudeProcesses() + if err != nil { + // Non-fatal: log and continue + fmt.Printf(" %s orphan cleanup check failed: %v\n", style.Dim.Render("○"), err) + return + } + + if len(results) == 0 { + return + } + + // Report what was cleaned up + var killed, escalated int + for _, r := range results { + switch r.Signal { + case "SIGTERM", "SIGKILL": + killed++ + case "UNKILLABLE": + escalated++ + } + } + + if killed > 0 { + fmt.Printf(" %s cleaned up %d orphaned process(es)\n", style.Success.Render("✓"), killed) + } + if escalated > 0 { + fmt.Printf(" %s %d process(es) survived SIGKILL (unkillable)\n", style.Warning.Render("⚠"), escalated) + } +} + func runPolecatStale(cmd *cobra.Command, args []string) error { rigName := args[0] mgr, r, err := getPolecatManager(rigName) diff --git a/internal/tmux/tmux.go b/internal/tmux/tmux.go index 726ea312..b07447cc 100644 --- a/internal/tmux/tmux.go +++ b/internal/tmux/tmux.go @@ -147,6 +147,11 @@ func (t *Tmux) KillSession(name string) error { return err } +// processKillGracePeriod is how long to wait after SIGTERM before sending SIGKILL. +// 2 seconds gives processes time to clean up gracefully. The previous 100ms was too short +// and caused Claude processes to become orphans when they couldn't shut down in time. +const processKillGracePeriod = 2 * time.Second + // KillSessionWithProcesses explicitly kills all processes in a session before terminating it. // This prevents orphan processes that survive tmux kill-session due to SIGHUP being ignored. // @@ -154,7 +159,7 @@ func (t *Tmux) KillSession(name string) error { // 1. Get the pane's main process PID // 2. Find all descendant processes recursively (not just direct children) // 3. Send SIGTERM to all descendants (deepest first) -// 4. Wait 100ms for graceful shutdown +// 4. Wait 2s for graceful shutdown // 5. Send SIGKILL to any remaining descendants // 6. Kill the tmux session // @@ -176,8 +181,8 @@ func (t *Tmux) KillSessionWithProcesses(name string) error { _ = exec.Command("kill", "-TERM", dpid).Run() } - // Wait for graceful shutdown - time.Sleep(100 * time.Millisecond) + // Wait for graceful shutdown (2s gives processes time to clean up) + time.Sleep(processKillGracePeriod) // Send SIGKILL to any remaining descendants for _, dpid := range descendants { @@ -186,7 +191,7 @@ func (t *Tmux) KillSessionWithProcesses(name string) error { // Kill the pane process itself (may have called setsid() and detached) _ = exec.Command("kill", "-TERM", pid).Run() - time.Sleep(100 * time.Millisecond) + time.Sleep(processKillGracePeriod) _ = exec.Command("kill", "-KILL", pid).Run() } @@ -235,8 +240,8 @@ func (t *Tmux) KillSessionWithProcessesExcluding(name string, excludePIDs []stri _ = exec.Command("kill", "-TERM", dpid).Run() } - // Wait for graceful shutdown - time.Sleep(100 * time.Millisecond) + // Wait for graceful shutdown (2s gives processes time to clean up) + time.Sleep(processKillGracePeriod) // Send SIGKILL to any remaining non-excluded descendants for _, dpid := range filtered { @@ -247,7 +252,7 @@ func (t *Tmux) KillSessionWithProcessesExcluding(name string, excludePIDs []stri // Only if not excluded if !exclude[pid] { _ = exec.Command("kill", "-TERM", pid).Run() - time.Sleep(100 * time.Millisecond) + time.Sleep(processKillGracePeriod) _ = exec.Command("kill", "-KILL", pid).Run() } } @@ -291,7 +296,7 @@ func getAllDescendants(pid string) []string { // 1. Get the pane's main process PID // 2. Find all descendant processes recursively (not just direct children) // 3. Send SIGTERM to all descendants (deepest first) -// 4. Wait 100ms for graceful shutdown +// 4. Wait 2s for graceful shutdown // 5. Send SIGKILL to any remaining descendants // 6. Kill the pane process itself // @@ -316,8 +321,8 @@ func (t *Tmux) KillPaneProcesses(pane string) error { _ = exec.Command("kill", "-TERM", dpid).Run() } - // Wait for graceful shutdown - time.Sleep(100 * time.Millisecond) + // Wait for graceful shutdown (2s gives processes time to clean up) + time.Sleep(processKillGracePeriod) // Send SIGKILL to any remaining descendants for _, dpid := range descendants { @@ -327,7 +332,7 @@ func (t *Tmux) KillPaneProcesses(pane string) error { // Kill the pane process itself (may have called setsid() and detached, // or may have no children like Claude Code) _ = exec.Command("kill", "-TERM", pid).Run() - time.Sleep(100 * time.Millisecond) + time.Sleep(processKillGracePeriod) _ = exec.Command("kill", "-KILL", pid).Run() return nil