fix(polecat): kill orphaned Claude processes when nuking polecats

When polecats are nuked, Claude child processes could survive and become
orphans, leading to memory exhaustion (observed: 142 orphaned processes
consuming ~56GB RAM).

This commit:
1. Increases the SIGTERM→SIGKILL grace period from 100ms to 2s to give
   processes time to clean up gracefully
2. Adds orphan cleanup to `gt polecat nuke` that runs after session
   termination to catch any processes that escaped
3. Adds a new `gt cleanup` command for manual orphan removal

The orphan detection uses aggressive tmux session verification to find
ALL Claude processes not in any active session, not just those with
PPID=1.

Fixes: gh-736

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
capable
2026-01-22 21:40:49 -08:00
committed by beads/crew/emma
parent ee5221889f
commit bebf425ac5
3 changed files with 183 additions and 11 deletions

127
internal/cmd/cleanup.go Normal file
View File

@@ -0,0 +1,127 @@
package cmd
import (
"fmt"
"github.com/spf13/cobra"
"github.com/steveyegge/gastown/internal/style"
"github.com/steveyegge/gastown/internal/util"
)
var (
cleanupDryRun bool
cleanupForce bool
)
var cleanupCmd = &cobra.Command{
Use: "cleanup",
GroupID: GroupWork,
Short: "Clean up orphaned Claude processes",
Long: `Clean up orphaned Claude processes that survived session termination.
This command finds and kills Claude processes that are not associated with
any active Gas Town tmux session. These orphans can accumulate when:
- Polecat sessions are killed without proper cleanup
- Claude spawns subagent processes that outlive their parent
- Network or system issues interrupt normal shutdown
Uses aggressive tmux session verification to detect ALL orphaned processes,
not just those with PPID=1.
Examples:
gt cleanup # Clean up orphans with confirmation
gt cleanup --dry-run # Show what would be killed
gt cleanup --force # Kill without confirmation`,
RunE: runCleanup,
}
func init() {
cleanupCmd.Flags().BoolVar(&cleanupDryRun, "dry-run", false, "Show what would be killed without killing")
cleanupCmd.Flags().BoolVarP(&cleanupForce, "force", "f", false, "Kill without confirmation")
rootCmd.AddCommand(cleanupCmd)
}
func runCleanup(cmd *cobra.Command, args []string) error {
// Find orphaned processes using aggressive zombie detection
zombies, err := util.FindZombieClaudeProcesses()
if err != nil {
return fmt.Errorf("finding orphaned processes: %w", err)
}
if len(zombies) == 0 {
fmt.Printf("%s No orphaned Claude processes found\n", style.Bold.Render("✓"))
return nil
}
// Show what we found
fmt.Printf("%s Found %d orphaned Claude process(es):\n\n", style.Warning.Render("⚠"), len(zombies))
for _, z := range zombies {
ageStr := formatProcessAgeCleanup(z.Age)
fmt.Printf(" %s %s (age: %s, tty: %s)\n",
style.Bold.Render(fmt.Sprintf("PID %d", z.PID)),
z.Cmd,
style.Dim.Render(ageStr),
z.TTY)
}
fmt.Println()
if cleanupDryRun {
fmt.Printf("%s Dry run - no processes killed\n", style.Dim.Render(""))
return nil
}
// Confirm unless --force
if !cleanupForce {
fmt.Printf("Kill these %d process(es)? [y/N] ", len(zombies))
var response string
_, _ = fmt.Scanln(&response)
if response != "y" && response != "Y" && response != "yes" && response != "Yes" {
fmt.Println("Aborted")
return nil
}
}
// Kill the processes using the standard cleanup function
results, err := util.CleanupZombieClaudeProcesses()
if err != nil {
return fmt.Errorf("cleaning up processes: %w", err)
}
// Report results
var killed, escalated int
for _, r := range results {
switch r.Signal {
case "SIGTERM":
fmt.Printf(" %s PID %d sent SIGTERM\n", style.Success.Render("✓"), r.Process.PID)
killed++
case "SIGKILL":
fmt.Printf(" %s PID %d sent SIGKILL (didn't respond to SIGTERM)\n", style.Warning.Render("⚠"), r.Process.PID)
killed++
case "UNKILLABLE":
fmt.Printf(" %s PID %d survived SIGKILL\n", style.Error.Render("✗"), r.Process.PID)
escalated++
}
}
fmt.Printf("\n%s Cleaned up %d process(es)", style.Bold.Render("✓"), killed)
if escalated > 0 {
fmt.Printf(", %d unkillable", escalated)
}
fmt.Println()
return nil
}
// formatProcessAgeCleanup formats seconds into a human-readable age string
func formatProcessAgeCleanup(seconds int) string {
if seconds < 60 {
return fmt.Sprintf("%ds", seconds)
}
if seconds < 3600 {
return fmt.Sprintf("%dm%ds", seconds/60, seconds%60)
}
hours := seconds / 3600
mins := (seconds % 3600) / 60
return fmt.Sprintf("%dh%dm", hours, mins)
}

View File

@@ -18,6 +18,7 @@ import (
"github.com/steveyegge/gastown/internal/runtime"
"github.com/steveyegge/gastown/internal/style"
"github.com/steveyegge/gastown/internal/tmux"
"github.com/steveyegge/gastown/internal/util"
)
// Polecat command flags
@@ -1268,6 +1269,12 @@ func runPolecatNuke(cmd *cobra.Command, args []string) error {
fmt.Printf("\n%s Nuked %d polecat(s).\n", style.SuccessPrefix, nuked)
}
// Final cleanup: Kill any orphaned Claude processes that escaped the session termination.
// This catches processes that called setsid() or were reparented during session shutdown.
if !polecatNukeDryRun {
cleanupOrphanedProcesses()
}
if len(nukeErrors) > 0 {
return fmt.Errorf("%d nuke(s) failed", len(nukeErrors))
}
@@ -1275,6 +1282,39 @@ func runPolecatNuke(cmd *cobra.Command, args []string) error {
return nil
}
// cleanupOrphanedProcesses kills Claude processes that survived session termination.
// Uses aggressive zombie detection via tmux session verification.
func cleanupOrphanedProcesses() {
results, err := util.CleanupZombieClaudeProcesses()
if err != nil {
// Non-fatal: log and continue
fmt.Printf(" %s orphan cleanup check failed: %v\n", style.Dim.Render("○"), err)
return
}
if len(results) == 0 {
return
}
// Report what was cleaned up
var killed, escalated int
for _, r := range results {
switch r.Signal {
case "SIGTERM", "SIGKILL":
killed++
case "UNKILLABLE":
escalated++
}
}
if killed > 0 {
fmt.Printf(" %s cleaned up %d orphaned process(es)\n", style.Success.Render("✓"), killed)
}
if escalated > 0 {
fmt.Printf(" %s %d process(es) survived SIGKILL (unkillable)\n", style.Warning.Render("⚠"), escalated)
}
}
func runPolecatStale(cmd *cobra.Command, args []string) error {
rigName := args[0]
mgr, r, err := getPolecatManager(rigName)