fix(polecat): kill orphaned Claude processes when nuking polecats

When polecats are nuked, Claude child processes could survive and become
orphans, leading to memory exhaustion (observed: 142 orphaned processes
consuming ~56GB RAM).

This commit:
1. Increases the SIGTERM→SIGKILL grace period from 100ms to 2s to give
   processes time to clean up gracefully
2. Adds orphan cleanup to `gt polecat nuke` that runs after session
   termination to catch any processes that escaped
3. Adds a new `gt cleanup` command for manual orphan removal

The orphan detection uses aggressive tmux session verification to find
ALL Claude processes not in any active session, not just those with
PPID=1.

Fixes: gh-736

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
capable
2026-01-22 21:40:49 -08:00
committed by beads/crew/emma
parent ee5221889f
commit bebf425ac5
3 changed files with 183 additions and 11 deletions

View File

@@ -18,6 +18,7 @@ import (
"github.com/steveyegge/gastown/internal/runtime"
"github.com/steveyegge/gastown/internal/style"
"github.com/steveyegge/gastown/internal/tmux"
"github.com/steveyegge/gastown/internal/util"
)
// Polecat command flags
@@ -1268,6 +1269,12 @@ func runPolecatNuke(cmd *cobra.Command, args []string) error {
fmt.Printf("\n%s Nuked %d polecat(s).\n", style.SuccessPrefix, nuked)
}
// Final cleanup: Kill any orphaned Claude processes that escaped the session termination.
// This catches processes that called setsid() or were reparented during session shutdown.
if !polecatNukeDryRun {
cleanupOrphanedProcesses()
}
if len(nukeErrors) > 0 {
return fmt.Errorf("%d nuke(s) failed", len(nukeErrors))
}
@@ -1275,6 +1282,39 @@ func runPolecatNuke(cmd *cobra.Command, args []string) error {
return nil
}
// cleanupOrphanedProcesses runs util.CleanupZombieClaudeProcesses and prints a
// short summary of the outcome to stdout.
//
// The sweep is best-effort: if the helper returns an error, the error is
// printed and the function returns without propagating it, so the caller's
// own result is unaffected. Results whose Signal is "SIGTERM" or "SIGKILL" are
// tallied as cleaned up; those marked "UNKILLABLE" are tallied separately and
// reported as a warning. Any other Signal value is ignored, and nothing is
// printed when both tallies are zero.
func cleanupOrphanedProcesses() {
	found, sweepErr := util.CleanupZombieClaudeProcesses()
	if sweepErr != nil {
		// Deliberately non-fatal: report the failure and move on.
		fmt.Printf(" %s orphan cleanup check failed: %v\n", style.Dim.Render("○"), sweepErr)
		return
	}

	// Tally outcomes. An empty result set leaves both counters at zero, which
	// in turn suppresses all output below.
	reaped := 0
	survivors := 0
	for _, proc := range found {
		if proc.Signal == "SIGTERM" || proc.Signal == "SIGKILL" {
			reaped++
		} else if proc.Signal == "UNKILLABLE" {
			survivors++
		}
	}

	if reaped > 0 {
		fmt.Printf(" %s cleaned up %d orphaned process(es)\n", style.Success.Render("✓"), reaped)
	}
	if survivors > 0 {
		fmt.Printf(" %s %d process(es) survived SIGKILL (unkillable)\n", style.Warning.Render("⚠"), survivors)
	}
}
func runPolecatStale(cmd *cobra.Command, args []string) error {
rigName := args[0]
mgr, r, err := getPolecatManager(rigName)