feat: Add automatic orphaned claude process cleanup (#588)
* feat: Add automatic orphaned claude process cleanup

Claude Code's Task tool spawns subagent processes that sometimes don't clean up properly after completion. These accumulate and consume significant memory (observed: 17 processes using ~6GB RAM).

This change adds automatic cleanup in two places:

1. **Deacon patrol** (primary): New patrol step "orphan-process-cleanup" runs `gt deacon cleanup-orphans` early in each cycle. More responsive (~30s).
2. **Daemon heartbeat** (fallback): Runs cleanup every 3 minutes as safety net when deacon is down.

Detection uses TTY column - processes with TTY "?" have no controlling terminal. This is safe because:

- Processes in terminals (user sessions) have a TTY like "pts/0" - untouched
- Only kills processes with no controlling terminal
- Orphaned subagents are children of tmux server with no TTY

New files:

- internal/util/orphan.go: FindOrphanedClaudeProcesses, CleanupOrphanedClaudeProcesses
- internal/util/orphan_test.go: Tests for orphan detection

New command:

- `gt deacon cleanup-orphans`: Manual/patrol-triggered cleanup

Fixes #587

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* fix(orphan): add Windows build tag and minimum age check

Addresses review feedback on PR #588:

1. Add //go:build !windows to orphan.go and orphan_test.go
   - The code uses Unix-specific syscalls (SIGTERM, ESRCH) and ps command options that don't exist on Windows
2. Add minimum age check (60 seconds) to prevent false positives
   - Prevents race conditions with newly spawned subagents
   - Addresses reviewer concern about cron/systemd processes
   - Uses portable etime format instead of Linux-only etimes
3. Add parseEtime helper with comprehensive tests
   - Parses [[DD-]HH:]MM:SS format (works on both Linux and macOS)
   - etimes (seconds) is Linux-specific, etime is portable

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

* fix(orphan): add proper SIGTERM→SIGKILL escalation with state tracking

Previous approach used process age which doesn't work: a Task subagent runs without TTY from birth, so a long-running legitimate subagent that later fails to exit would be immediately SIGKILLed without trying SIGTERM.

New approach uses a state file to track signal history:

1. First encounter → SIGTERM, record PID + timestamp in state file
2. Next cycle (after 60s grace period) → if still alive, SIGKILL
3. Next cycle → if survived SIGKILL, log as unkillable and remove

State file: $XDG_RUNTIME_DIR/gastown-orphan-state (or /tmp/)
Format: "<pid> <signal> <unix_timestamp>" per line

The state file is automatically cleaned up:

- Dead processes removed on load
- Unkillable processes removed after logging

Also updates callers to use new CleanupResult type which includes the signal sent (SIGTERM, SIGKILL, or UNKILLABLE).

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -22,6 +22,7 @@ import (
|
||||
"github.com/steveyegge/gastown/internal/session"
|
||||
"github.com/steveyegge/gastown/internal/style"
|
||||
"github.com/steveyegge/gastown/internal/tmux"
|
||||
"github.com/steveyegge/gastown/internal/util"
|
||||
"github.com/steveyegge/gastown/internal/workspace"
|
||||
)
|
||||
|
||||
@@ -236,6 +237,27 @@ This removes the pause file and allows the Deacon to work normally.`,
|
||||
RunE: runDeaconResume,
|
||||
}
|
||||
|
||||
// deaconCleanupOrphansCmd defines the "cleanup-orphans" subcommand. Its
// handler is runDeaconCleanupOrphans, and its long help text describes the
// TTY-based rule used to tell orphaned claude subagent processes apart from
// claude instances running in a terminal.
var deaconCleanupOrphansCmd = &cobra.Command{
|
||||
Use: "cleanup-orphans",
|
||||
Short: "Clean up orphaned claude subagent processes",
|
||||
Long: `Clean up orphaned claude subagent processes.
|
||||
|
||||
Claude Code's Task tool spawns subagent processes that sometimes don't clean up
|
||||
properly after completion. These accumulate and consume significant memory.
|
||||
|
||||
Detection is based on TTY column: processes with TTY "?" have no controlling
|
||||
terminal. Legitimate claude instances in terminals have a TTY like "pts/0".
|
||||
|
||||
This is safe because:
|
||||
- Processes in terminals (your personal sessions) have a TTY - won't be touched
|
||||
- Only kills processes that have no controlling terminal
|
||||
- These orphans are children of the tmux server with no TTY
|
||||
|
||||
Example:
|
||||
gt deacon cleanup-orphans`,
|
||||
RunE: runDeaconCleanupOrphans,
|
||||
}
|
||||
|
||||
var (
|
||||
triggerTimeout time.Duration
|
||||
|
||||
@@ -270,6 +292,7 @@ func init() {
|
||||
deaconCmd.AddCommand(deaconStaleHooksCmd)
|
||||
deaconCmd.AddCommand(deaconPauseCmd)
|
||||
deaconCmd.AddCommand(deaconResumeCmd)
|
||||
deaconCmd.AddCommand(deaconCleanupOrphansCmd)
|
||||
|
||||
// Flags for trigger-pending
|
||||
deaconTriggerPendingCmd.Flags().DurationVar(&triggerTimeout, "timeout", 2*time.Second,
|
||||
@@ -1105,3 +1128,54 @@ func runDeaconResume(cmd *cobra.Command, args []string) error {
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// runDeaconCleanupOrphans cleans up orphaned claude subagent processes.
|
||||
func runDeaconCleanupOrphans(cmd *cobra.Command, args []string) error {
|
||||
// First, find orphans
|
||||
orphans, err := util.FindOrphanedClaudeProcesses()
|
||||
if err != nil {
|
||||
return fmt.Errorf("finding orphaned processes: %w", err)
|
||||
}
|
||||
|
||||
if len(orphans) == 0 {
|
||||
fmt.Printf("%s No orphaned claude processes found\n", style.Dim.Render("○"))
|
||||
return nil
|
||||
}
|
||||
|
||||
fmt.Printf("%s Found %d orphaned claude process(es)\n", style.Bold.Render("●"), len(orphans))
|
||||
|
||||
// Process them with signal escalation
|
||||
results, err := util.CleanupOrphanedClaudeProcesses()
|
||||
if err != nil {
|
||||
style.PrintWarning("cleanup had errors: %v", err)
|
||||
}
|
||||
|
||||
// Report results
|
||||
var terminated, escalated, unkillable int
|
||||
for _, r := range results {
|
||||
switch r.Signal {
|
||||
case "SIGTERM":
|
||||
fmt.Printf(" %s Sent SIGTERM to PID %d (%s)\n", style.Bold.Render("→"), r.Process.PID, r.Process.Cmd)
|
||||
terminated++
|
||||
case "SIGKILL":
|
||||
fmt.Printf(" %s Escalated to SIGKILL for PID %d (%s)\n", style.Bold.Render("!"), r.Process.PID, r.Process.Cmd)
|
||||
escalated++
|
||||
case "UNKILLABLE":
|
||||
fmt.Printf(" %s WARNING: PID %d (%s) survived SIGKILL\n", style.Bold.Render("⚠"), r.Process.PID, r.Process.Cmd)
|
||||
unkillable++
|
||||
}
|
||||
}
|
||||
|
||||
if len(results) > 0 {
|
||||
summary := fmt.Sprintf("Processed %d orphan(s)", len(results))
|
||||
if escalated > 0 {
|
||||
summary += fmt.Sprintf(" (%d escalated to SIGKILL)", escalated)
|
||||
}
|
||||
if unkillable > 0 {
|
||||
summary += fmt.Sprintf(" (%d unkillable)", unkillable)
|
||||
}
|
||||
fmt.Printf("%s %s\n", style.Bold.Render("✓"), summary)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user