fix(polecat): kill orphaned Claude processes when nuking polecats
When polecats are nuked, Claude child processes could survive and become
orphans, leading to memory exhaustion (observed: 142 orphaned processes
consuming ~56GB RAM).

This commit:

1. Increases the SIGTERM→SIGKILL grace period from 100ms to 2s to give
   processes time to clean up gracefully
2. Adds orphan cleanup to `gt polecat nuke` that runs after session
   termination to catch any processes that escaped
3. Adds a new `gt cleanup` command for manual orphan removal

The orphan detection uses aggressive tmux session verification to find ALL
Claude processes not in any active session, not just those with PPID=1.

Fixes: gh-736

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
127
internal/cmd/cleanup.go
Normal file
127
internal/cmd/cleanup.go
Normal file
@@ -0,0 +1,127 @@
|
||||
package cmd
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/spf13/cobra"
|
||||
"github.com/steveyegge/gastown/internal/style"
|
||||
"github.com/steveyegge/gastown/internal/util"
|
||||
)
|
||||
|
||||
// Flags for the cleanup command.
var (
	// cleanupDryRun backs --dry-run: list the orphans but do not kill them.
	cleanupDryRun bool
	// cleanupForce backs --force/-f: kill without asking for confirmation.
	cleanupForce bool
)
|
||||
|
||||
var cleanupCmd = &cobra.Command{
|
||||
Use: "cleanup",
|
||||
GroupID: GroupWork,
|
||||
Short: "Clean up orphaned Claude processes",
|
||||
Long: `Clean up orphaned Claude processes that survived session termination.
|
||||
|
||||
This command finds and kills Claude processes that are not associated with
|
||||
any active Gas Town tmux session. These orphans can accumulate when:
|
||||
- Polecat sessions are killed without proper cleanup
|
||||
- Claude spawns subagent processes that outlive their parent
|
||||
- Network or system issues interrupt normal shutdown
|
||||
|
||||
Uses aggressive tmux session verification to detect ALL orphaned processes,
|
||||
not just those with PPID=1.
|
||||
|
||||
Examples:
|
||||
gt cleanup # Clean up orphans with confirmation
|
||||
gt cleanup --dry-run # Show what would be killed
|
||||
gt cleanup --force # Kill without confirmation`,
|
||||
RunE: runCleanup,
|
||||
}
|
||||
|
||||
func init() {
|
||||
cleanupCmd.Flags().BoolVar(&cleanupDryRun, "dry-run", false, "Show what would be killed without killing")
|
||||
cleanupCmd.Flags().BoolVarP(&cleanupForce, "force", "f", false, "Kill without confirmation")
|
||||
|
||||
rootCmd.AddCommand(cleanupCmd)
|
||||
}
|
||||
|
||||
func runCleanup(cmd *cobra.Command, args []string) error {
|
||||
// Find orphaned processes using aggressive zombie detection
|
||||
zombies, err := util.FindZombieClaudeProcesses()
|
||||
if err != nil {
|
||||
return fmt.Errorf("finding orphaned processes: %w", err)
|
||||
}
|
||||
|
||||
if len(zombies) == 0 {
|
||||
fmt.Printf("%s No orphaned Claude processes found\n", style.Bold.Render("✓"))
|
||||
return nil
|
||||
}
|
||||
|
||||
// Show what we found
|
||||
fmt.Printf("%s Found %d orphaned Claude process(es):\n\n", style.Warning.Render("⚠"), len(zombies))
|
||||
for _, z := range zombies {
|
||||
ageStr := formatProcessAgeCleanup(z.Age)
|
||||
fmt.Printf(" %s %s (age: %s, tty: %s)\n",
|
||||
style.Bold.Render(fmt.Sprintf("PID %d", z.PID)),
|
||||
z.Cmd,
|
||||
style.Dim.Render(ageStr),
|
||||
z.TTY)
|
||||
}
|
||||
fmt.Println()
|
||||
|
||||
if cleanupDryRun {
|
||||
fmt.Printf("%s Dry run - no processes killed\n", style.Dim.Render("ℹ"))
|
||||
return nil
|
||||
}
|
||||
|
||||
// Confirm unless --force
|
||||
if !cleanupForce {
|
||||
fmt.Printf("Kill these %d process(es)? [y/N] ", len(zombies))
|
||||
var response string
|
||||
_, _ = fmt.Scanln(&response)
|
||||
if response != "y" && response != "Y" && response != "yes" && response != "Yes" {
|
||||
fmt.Println("Aborted")
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
// Kill the processes using the standard cleanup function
|
||||
results, err := util.CleanupZombieClaudeProcesses()
|
||||
if err != nil {
|
||||
return fmt.Errorf("cleaning up processes: %w", err)
|
||||
}
|
||||
|
||||
// Report results
|
||||
var killed, escalated int
|
||||
for _, r := range results {
|
||||
switch r.Signal {
|
||||
case "SIGTERM":
|
||||
fmt.Printf(" %s PID %d sent SIGTERM\n", style.Success.Render("✓"), r.Process.PID)
|
||||
killed++
|
||||
case "SIGKILL":
|
||||
fmt.Printf(" %s PID %d sent SIGKILL (didn't respond to SIGTERM)\n", style.Warning.Render("⚠"), r.Process.PID)
|
||||
killed++
|
||||
case "UNKILLABLE":
|
||||
fmt.Printf(" %s PID %d survived SIGKILL\n", style.Error.Render("✗"), r.Process.PID)
|
||||
escalated++
|
||||
}
|
||||
}
|
||||
|
||||
fmt.Printf("\n%s Cleaned up %d process(es)", style.Bold.Render("✓"), killed)
|
||||
if escalated > 0 {
|
||||
fmt.Printf(", %d unkillable", escalated)
|
||||
}
|
||||
fmt.Println()
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// formatProcessAgeCleanup formats an age in seconds as a short human-readable
// string: "42s" below one minute, "3m7s" below one hour, and "2h5m" otherwise
// (seconds are dropped once hours are shown). Negative input is treated as 0
// so a bad age never renders as something like "-5s".
func formatProcessAgeCleanup(seconds int) string {
	if seconds < 0 {
		seconds = 0
	}

	switch {
	case seconds < 60:
		return fmt.Sprintf("%ds", seconds)
	case seconds < 3600:
		return fmt.Sprintf("%dm%ds", seconds/60, seconds%60)
	default:
		hours := seconds / 3600
		mins := (seconds % 3600) / 60
		return fmt.Sprintf("%dh%dm", hours, mins)
	}
}
|
||||
@@ -18,6 +18,7 @@ import (
|
||||
"github.com/steveyegge/gastown/internal/runtime"
|
||||
"github.com/steveyegge/gastown/internal/style"
|
||||
"github.com/steveyegge/gastown/internal/tmux"
|
||||
"github.com/steveyegge/gastown/internal/util"
|
||||
)
|
||||
|
||||
// Polecat command flags
|
||||
@@ -1268,6 +1269,12 @@ func runPolecatNuke(cmd *cobra.Command, args []string) error {
|
||||
fmt.Printf("\n%s Nuked %d polecat(s).\n", style.SuccessPrefix, nuked)
|
||||
}
|
||||
|
||||
// Final cleanup: Kill any orphaned Claude processes that escaped the session termination.
|
||||
// This catches processes that called setsid() or were reparented during session shutdown.
|
||||
if !polecatNukeDryRun {
|
||||
cleanupOrphanedProcesses()
|
||||
}
|
||||
|
||||
if len(nukeErrors) > 0 {
|
||||
return fmt.Errorf("%d nuke(s) failed", len(nukeErrors))
|
||||
}
|
||||
@@ -1275,6 +1282,39 @@ func runPolecatNuke(cmd *cobra.Command, args []string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// cleanupOrphanedProcesses kills Claude processes that survived session termination.
|
||||
// Uses aggressive zombie detection via tmux session verification.
|
||||
func cleanupOrphanedProcesses() {
|
||||
results, err := util.CleanupZombieClaudeProcesses()
|
||||
if err != nil {
|
||||
// Non-fatal: log and continue
|
||||
fmt.Printf(" %s orphan cleanup check failed: %v\n", style.Dim.Render("○"), err)
|
||||
return
|
||||
}
|
||||
|
||||
if len(results) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
// Report what was cleaned up
|
||||
var killed, escalated int
|
||||
for _, r := range results {
|
||||
switch r.Signal {
|
||||
case "SIGTERM", "SIGKILL":
|
||||
killed++
|
||||
case "UNKILLABLE":
|
||||
escalated++
|
||||
}
|
||||
}
|
||||
|
||||
if killed > 0 {
|
||||
fmt.Printf(" %s cleaned up %d orphaned process(es)\n", style.Success.Render("✓"), killed)
|
||||
}
|
||||
if escalated > 0 {
|
||||
fmt.Printf(" %s %d process(es) survived SIGKILL (unkillable)\n", style.Warning.Render("⚠"), escalated)
|
||||
}
|
||||
}
|
||||
|
||||
func runPolecatStale(cmd *cobra.Command, args []string) error {
|
||||
rigName := args[0]
|
||||
mgr, r, err := getPolecatManager(rigName)
|
||||
|
||||
@@ -147,6 +147,11 @@ func (t *Tmux) KillSession(name string) error {
|
||||
return err
|
||||
}
|
||||
|
||||
// processKillGracePeriod is how long to wait after SIGTERM before sending SIGKILL.
// 2 seconds gives processes time to clean up gracefully. The previous 100ms was too short
// and caused Claude processes to become orphans when they couldn't shut down in time.
const processKillGracePeriod = 2 * time.Second
|
||||
|
||||
// KillSessionWithProcesses explicitly kills all processes in a session before terminating it.
|
||||
// This prevents orphan processes that survive tmux kill-session due to SIGHUP being ignored.
|
||||
//
|
||||
@@ -154,7 +159,7 @@ func (t *Tmux) KillSession(name string) error {
|
||||
// 1. Get the pane's main process PID
|
||||
// 2. Find all descendant processes recursively (not just direct children)
|
||||
// 3. Send SIGTERM to all descendants (deepest first)
|
||||
// 4. Wait 100ms for graceful shutdown
|
||||
// 4. Wait 2s for graceful shutdown
|
||||
// 5. Send SIGKILL to any remaining descendants
|
||||
// 6. Kill the tmux session
|
||||
//
|
||||
@@ -176,8 +181,8 @@ func (t *Tmux) KillSessionWithProcesses(name string) error {
|
||||
_ = exec.Command("kill", "-TERM", dpid).Run()
|
||||
}
|
||||
|
||||
// Wait for graceful shutdown
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
// Wait for graceful shutdown (2s gives processes time to clean up)
|
||||
time.Sleep(processKillGracePeriod)
|
||||
|
||||
// Send SIGKILL to any remaining descendants
|
||||
for _, dpid := range descendants {
|
||||
@@ -186,7 +191,7 @@ func (t *Tmux) KillSessionWithProcesses(name string) error {
|
||||
|
||||
// Kill the pane process itself (may have called setsid() and detached)
|
||||
_ = exec.Command("kill", "-TERM", pid).Run()
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
time.Sleep(processKillGracePeriod)
|
||||
_ = exec.Command("kill", "-KILL", pid).Run()
|
||||
}
|
||||
|
||||
@@ -235,8 +240,8 @@ func (t *Tmux) KillSessionWithProcessesExcluding(name string, excludePIDs []stri
|
||||
_ = exec.Command("kill", "-TERM", dpid).Run()
|
||||
}
|
||||
|
||||
// Wait for graceful shutdown
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
// Wait for graceful shutdown (2s gives processes time to clean up)
|
||||
time.Sleep(processKillGracePeriod)
|
||||
|
||||
// Send SIGKILL to any remaining non-excluded descendants
|
||||
for _, dpid := range filtered {
|
||||
@@ -247,7 +252,7 @@ func (t *Tmux) KillSessionWithProcessesExcluding(name string, excludePIDs []stri
|
||||
// Only if not excluded
|
||||
if !exclude[pid] {
|
||||
_ = exec.Command("kill", "-TERM", pid).Run()
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
time.Sleep(processKillGracePeriod)
|
||||
_ = exec.Command("kill", "-KILL", pid).Run()
|
||||
}
|
||||
}
|
||||
@@ -291,7 +296,7 @@ func getAllDescendants(pid string) []string {
|
||||
// 1. Get the pane's main process PID
|
||||
// 2. Find all descendant processes recursively (not just direct children)
|
||||
// 3. Send SIGTERM to all descendants (deepest first)
|
||||
// 4. Wait 100ms for graceful shutdown
|
||||
// 4. Wait 2s for graceful shutdown
|
||||
// 5. Send SIGKILL to any remaining descendants
|
||||
// 6. Kill the pane process itself
|
||||
//
|
||||
@@ -316,8 +321,8 @@ func (t *Tmux) KillPaneProcesses(pane string) error {
|
||||
_ = exec.Command("kill", "-TERM", dpid).Run()
|
||||
}
|
||||
|
||||
// Wait for graceful shutdown
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
// Wait for graceful shutdown (2s gives processes time to clean up)
|
||||
time.Sleep(processKillGracePeriod)
|
||||
|
||||
// Send SIGKILL to any remaining descendants
|
||||
for _, dpid := range descendants {
|
||||
@@ -327,7 +332,7 @@ func (t *Tmux) KillPaneProcesses(pane string) error {
|
||||
// Kill the pane process itself (may have called setsid() and detached,
|
||||
// or may have no children like Claude Code)
|
||||
_ = exec.Command("kill", "-TERM", pid).Run()
|
||||
time.Sleep(100 * time.Millisecond)
|
||||
time.Sleep(processKillGracePeriod)
|
||||
_ = exec.Command("kill", "-KILL", pid).Run()
|
||||
|
||||
return nil
|
||||
|
||||
Reference in New Issue
Block a user