fix(polecat): kill orphaned Claude processes when nuking polecats

When polecats are nuked, Claude child processes could survive and become
orphans, leading to memory exhaustion (observed: 142 orphaned processes
consuming ~56GB RAM).

This commit:
1. Increases the SIGTERM→SIGKILL grace period from 100ms to 2s to give
   processes time to clean up gracefully
2. Adds orphan cleanup to `gt polecat nuke` that runs after session
   termination to catch any processes that escaped
3. Adds a new `gt cleanup` command for manual orphan removal

The orphan detection uses aggressive tmux session verification to find
ALL Claude processes not in any active session, not just those with
PPID=1.

Fixes: gh-736

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
capable
2026-01-22 21:40:49 -08:00
committed by beads/crew/emma
parent ee5221889f
commit bebf425ac5
3 changed files with 183 additions and 11 deletions

127
internal/cmd/cleanup.go Normal file
View File

@@ -0,0 +1,127 @@
package cmd
import (
"fmt"
"github.com/spf13/cobra"
"github.com/steveyegge/gastown/internal/style"
"github.com/steveyegge/gastown/internal/util"
)
var (
cleanupDryRun bool
cleanupForce bool
)
var cleanupCmd = &cobra.Command{
Use: "cleanup",
GroupID: GroupWork,
Short: "Clean up orphaned Claude processes",
Long: `Clean up orphaned Claude processes that survived session termination.
This command finds and kills Claude processes that are not associated with
any active Gas Town tmux session. These orphans can accumulate when:
- Polecat sessions are killed without proper cleanup
- Claude spawns subagent processes that outlive their parent
- Network or system issues interrupt normal shutdown
Uses aggressive tmux session verification to detect ALL orphaned processes,
not just those with PPID=1.
Examples:
gt cleanup # Clean up orphans with confirmation
gt cleanup --dry-run # Show what would be killed
gt cleanup --force # Kill without confirmation`,
RunE: runCleanup,
}
func init() {
cleanupCmd.Flags().BoolVar(&cleanupDryRun, "dry-run", false, "Show what would be killed without killing")
cleanupCmd.Flags().BoolVarP(&cleanupForce, "force", "f", false, "Kill without confirmation")
rootCmd.AddCommand(cleanupCmd)
}
func runCleanup(cmd *cobra.Command, args []string) error {
// Find orphaned processes using aggressive zombie detection
zombies, err := util.FindZombieClaudeProcesses()
if err != nil {
return fmt.Errorf("finding orphaned processes: %w", err)
}
if len(zombies) == 0 {
fmt.Printf("%s No orphaned Claude processes found\n", style.Bold.Render("✓"))
return nil
}
// Show what we found
fmt.Printf("%s Found %d orphaned Claude process(es):\n\n", style.Warning.Render("⚠"), len(zombies))
for _, z := range zombies {
ageStr := formatProcessAgeCleanup(z.Age)
fmt.Printf(" %s %s (age: %s, tty: %s)\n",
style.Bold.Render(fmt.Sprintf("PID %d", z.PID)),
z.Cmd,
style.Dim.Render(ageStr),
z.TTY)
}
fmt.Println()
if cleanupDryRun {
fmt.Printf("%s Dry run - no processes killed\n", style.Dim.Render(""))
return nil
}
// Confirm unless --force
if !cleanupForce {
fmt.Printf("Kill these %d process(es)? [y/N] ", len(zombies))
var response string
_, _ = fmt.Scanln(&response)
if response != "y" && response != "Y" && response != "yes" && response != "Yes" {
fmt.Println("Aborted")
return nil
}
}
// Kill the processes using the standard cleanup function
results, err := util.CleanupZombieClaudeProcesses()
if err != nil {
return fmt.Errorf("cleaning up processes: %w", err)
}
// Report results
var killed, escalated int
for _, r := range results {
switch r.Signal {
case "SIGTERM":
fmt.Printf(" %s PID %d sent SIGTERM\n", style.Success.Render("✓"), r.Process.PID)
killed++
case "SIGKILL":
fmt.Printf(" %s PID %d sent SIGKILL (didn't respond to SIGTERM)\n", style.Warning.Render("⚠"), r.Process.PID)
killed++
case "UNKILLABLE":
fmt.Printf(" %s PID %d survived SIGKILL\n", style.Error.Render("✗"), r.Process.PID)
escalated++
}
}
fmt.Printf("\n%s Cleaned up %d process(es)", style.Bold.Render("✓"), killed)
if escalated > 0 {
fmt.Printf(", %d unkillable", escalated)
}
fmt.Println()
return nil
}
// formatProcessAgeCleanup formats seconds into a human-readable age string
func formatProcessAgeCleanup(seconds int) string {
if seconds < 60 {
return fmt.Sprintf("%ds", seconds)
}
if seconds < 3600 {
return fmt.Sprintf("%dm%ds", seconds/60, seconds%60)
}
hours := seconds / 3600
mins := (seconds % 3600) / 60
return fmt.Sprintf("%dh%dm", hours, mins)
}

View File

@@ -18,6 +18,7 @@ import (
"github.com/steveyegge/gastown/internal/runtime"
"github.com/steveyegge/gastown/internal/style"
"github.com/steveyegge/gastown/internal/tmux"
"github.com/steveyegge/gastown/internal/util"
)
// Polecat command flags
@@ -1268,6 +1269,12 @@ func runPolecatNuke(cmd *cobra.Command, args []string) error {
fmt.Printf("\n%s Nuked %d polecat(s).\n", style.SuccessPrefix, nuked)
}
// Final cleanup: Kill any orphaned Claude processes that escaped the session termination.
// This catches processes that called setsid() or were reparented during session shutdown.
if !polecatNukeDryRun {
cleanupOrphanedProcesses()
}
if len(nukeErrors) > 0 {
return fmt.Errorf("%d nuke(s) failed", len(nukeErrors))
}
@@ -1275,6 +1282,39 @@ func runPolecatNuke(cmd *cobra.Command, args []string) error {
return nil
}
// cleanupOrphanedProcesses kills Claude processes that survived session termination.
// Uses aggressive zombie detection via tmux session verification.
func cleanupOrphanedProcesses() {
results, err := util.CleanupZombieClaudeProcesses()
if err != nil {
// Non-fatal: log and continue
fmt.Printf(" %s orphan cleanup check failed: %v\n", style.Dim.Render("○"), err)
return
}
if len(results) == 0 {
return
}
// Report what was cleaned up
var killed, escalated int
for _, r := range results {
switch r.Signal {
case "SIGTERM", "SIGKILL":
killed++
case "UNKILLABLE":
escalated++
}
}
if killed > 0 {
fmt.Printf(" %s cleaned up %d orphaned process(es)\n", style.Success.Render("✓"), killed)
}
if escalated > 0 {
fmt.Printf(" %s %d process(es) survived SIGKILL (unkillable)\n", style.Warning.Render("⚠"), escalated)
}
}
func runPolecatStale(cmd *cobra.Command, args []string) error {
rigName := args[0]
mgr, r, err := getPolecatManager(rigName)

View File

@@ -147,6 +147,11 @@ func (t *Tmux) KillSession(name string) error {
return err
}
// processKillGracePeriod is how long to wait after SIGTERM before sending SIGKILL.
// 2 seconds gives processes time to clean up gracefully. The previous 100ms was too short
// and caused Claude processes to become orphans when they couldn't shut down in time.
const processKillGracePeriod = 2 * time.Second
// KillSessionWithProcesses explicitly kills all processes in a session before terminating it.
// This prevents orphan processes that survive tmux kill-session due to SIGHUP being ignored.
//
@@ -154,7 +159,7 @@ func (t *Tmux) KillSession(name string) error {
// 1. Get the pane's main process PID
// 2. Find all descendant processes recursively (not just direct children)
// 3. Send SIGTERM to all descendants (deepest first)
// 4. Wait 100ms for graceful shutdown
// 4. Wait 2s for graceful shutdown
// 5. Send SIGKILL to any remaining descendants
// 6. Kill the tmux session
//
@@ -176,8 +181,8 @@ func (t *Tmux) KillSessionWithProcesses(name string) error {
_ = exec.Command("kill", "-TERM", dpid).Run()
}
// Wait for graceful shutdown
time.Sleep(100 * time.Millisecond)
// Wait for graceful shutdown (2s gives processes time to clean up)
time.Sleep(processKillGracePeriod)
// Send SIGKILL to any remaining descendants
for _, dpid := range descendants {
@@ -186,7 +191,7 @@ func (t *Tmux) KillSessionWithProcesses(name string) error {
// Kill the pane process itself (may have called setsid() and detached)
_ = exec.Command("kill", "-TERM", pid).Run()
time.Sleep(100 * time.Millisecond)
time.Sleep(processKillGracePeriod)
_ = exec.Command("kill", "-KILL", pid).Run()
}
@@ -235,8 +240,8 @@ func (t *Tmux) KillSessionWithProcessesExcluding(name string, excludePIDs []stri
_ = exec.Command("kill", "-TERM", dpid).Run()
}
// Wait for graceful shutdown
time.Sleep(100 * time.Millisecond)
// Wait for graceful shutdown (2s gives processes time to clean up)
time.Sleep(processKillGracePeriod)
// Send SIGKILL to any remaining non-excluded descendants
for _, dpid := range filtered {
@@ -247,7 +252,7 @@ func (t *Tmux) KillSessionWithProcessesExcluding(name string, excludePIDs []stri
// Only if not excluded
if !exclude[pid] {
_ = exec.Command("kill", "-TERM", pid).Run()
time.Sleep(100 * time.Millisecond)
time.Sleep(processKillGracePeriod)
_ = exec.Command("kill", "-KILL", pid).Run()
}
}
@@ -291,7 +296,7 @@ func getAllDescendants(pid string) []string {
// 1. Get the pane's main process PID
// 2. Find all descendant processes recursively (not just direct children)
// 3. Send SIGTERM to all descendants (deepest first)
// 4. Wait 100ms for graceful shutdown
// 4. Wait 2s for graceful shutdown
// 5. Send SIGKILL to any remaining descendants
// 6. Kill the pane process itself
//
@@ -316,8 +321,8 @@ func (t *Tmux) KillPaneProcesses(pane string) error {
_ = exec.Command("kill", "-TERM", dpid).Run()
}
// Wait for graceful shutdown
time.Sleep(100 * time.Millisecond)
// Wait for graceful shutdown (2s gives processes time to clean up)
time.Sleep(processKillGracePeriod)
// Send SIGKILL to any remaining descendants
for _, dpid := range descendants {
@@ -327,7 +332,7 @@ func (t *Tmux) KillPaneProcesses(pane string) error {
// Kill the pane process itself (may have called setsid() and detached,
// or may have no children like Claude Code)
_ = exec.Command("kill", "-TERM", pid).Run()
time.Sleep(100 * time.Millisecond)
time.Sleep(processKillGracePeriod)
_ = exec.Command("kill", "-KILL", pid).Run()
return nil