fix(shutdown): fix session counter bug and add --cleanup-orphans flag (#759)

## Problems Fixed

1. **False reporting**: `gt shutdown` reported "0 sessions stopped" even when
   all 5 sessions were successfully terminated
2. **Orphaned processes**: No way to clean up Claude processes left behind by
   crashed/interrupted sessions

## Root Causes

1. **Counter bug**: `killSessionsInOrder()` only incremented the counter when
   `KillSessionWithProcesses()` returned no error. However, this function can
   return an error even after successfully killing all processes (e.g., when
   the session auto-closes after its processes die, the final `kill-session`
   command fails with "session not found").

2. **No orphan cleanup**: While `internal/util/orphan.go` provides orphan
   detection infrastructure, it wasn't integrated into the shutdown workflow.

## Solutions

1. **Fix counter logic**: Modified `killSessionsInOrder()` to verify session
   termination by checking if the session still exists after the kill attempt,
   rather than relying solely on the error return value. This correctly counts
   sessions that were terminated even if the kill command returned an error.

2. **Add `--cleanup-orphans` flag**: Integrated orphan cleanup with a simple
   synchronous approach:
   - Finds Claude/codex processes without a controlling terminal (TTY)
   - Filters out processes younger than 60 seconds (avoids race conditions)
   - Excludes processes belonging to active Gas Town tmux sessions
   - Sends SIGTERM to all orphans
   - Waits for configurable grace period (default 60s)
   - Sends SIGKILL to any that survived SIGTERM

3. **Add `--cleanup-orphans-grace-secs` flag**: Allows users to configure the
   grace period between SIGTERM and SIGKILL (default 60 seconds).

## Design Choice: Synchronous vs. Persistent State

The orphan cleanup uses a **synchronous wait approach** rather than the
persistent state machine approach in `util.CleanupOrphanedClaudeProcesses()`:

**Synchronous approach (this PR):**
- Send SIGTERM → Wait N seconds → Send SIGKILL (all in one invocation)
- Simpler to understand and debug
- User sees immediate results
- No persistent state file to manage

**Persistent state approach (util.CleanupOrphanedClaudeProcesses):**
- First run: SIGTERM → save state
- Second run (60s later): Check state → SIGKILL
- Requires multiple invocations
- Persists state in `/tmp/gastown-orphan-state`

The synchronous approach is more appropriate for `gt shutdown` where users
expect immediate cleanup, while the persistent approach is better suited for
periodic cleanup daemons.

## Testing

Before fix:
```
Sessions to stop: gt-boot, gt-pgqueue-refinery, gt-pgqueue-witness, hq-deacon, hq-mayor
✓ Gas Town shutdown complete (0 sessions stopped)  ← Bug
```

After fix:
```
Sessions to stop: gt-boot, gt-pgqueue-refinery, gt-pgqueue-witness, hq-deacon, hq-mayor
✓ hq-deacon stopped
✓ gt-boot stopped
✓ gt-pgqueue-refinery stopped
✓ gt-pgqueue-witness stopped
✓ hq-mayor stopped
Cleaning up orphaned Claude processes...
→ PID 267916: sent SIGTERM (waiting 60s before SIGKILL)
 Waiting 60 seconds for processes to terminate gracefully...
✓ 1 process(es) terminated gracefully from SIGTERM
✓ All processes cleaned up successfully
✓ Gas Town shutdown complete (5 sessions stopped)  ← Fixed
```

All sessions verified terminated via `tmux ls`.

Co-authored-by: Roland Tritsch <roland@ailtir.com>
Co-authored-by: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
Roland Tritsch
2026-01-21 04:23:30 +00:00
committed by GitHub
parent 7c2f9687ec
commit f4072e58cc

View File

@@ -9,6 +9,7 @@ import (
"strings"
"sync"
"sync/atomic"
"syscall"
"time"
"github.com/spf13/cobra"
@@ -25,23 +26,26 @@ import (
"github.com/steveyegge/gastown/internal/session"
"github.com/steveyegge/gastown/internal/style"
"github.com/steveyegge/gastown/internal/tmux"
"github.com/steveyegge/gastown/internal/util"
"github.com/steveyegge/gastown/internal/witness"
"github.com/steveyegge/gastown/internal/workspace"
)
var (
startAll bool
startAgentOverride string
startCrewRig string
startCrewAccount string
startCrewAgentOverride string
shutdownGraceful bool
shutdownWait int
shutdownAll bool
shutdownForce bool
shutdownYes bool
shutdownPolecatsOnly bool
shutdownNuclear bool
startAll bool
startAgentOverride string
startCrewRig string
startCrewAccount string
startCrewAgentOverride string
shutdownGraceful bool
shutdownWait int
shutdownAll bool
shutdownForce bool
shutdownYes bool
shutdownPolecatsOnly bool
shutdownNuclear bool
shutdownCleanupOrphans bool
shutdownCleanupOrphansGrace int
)
var startCmd = &cobra.Command{
@@ -90,7 +94,9 @@ Shutdown levels (progressively more aggressive):
Use --force or --yes to skip confirmation prompt.
Use --graceful to allow agents time to save state before killing.
Use --nuclear to force cleanup even if polecats have uncommitted work (DANGER).`,
Use --nuclear to force cleanup even if polecats have uncommitted work (DANGER).
Use --cleanup-orphans to kill orphaned Claude processes (TTY-less, older than 60s).
Use --cleanup-orphans-grace-secs to set the grace period (default 60s).`,
RunE: runShutdown,
}
@@ -137,6 +143,10 @@ func init() {
"Only stop polecats (minimal shutdown)")
shutdownCmd.Flags().BoolVar(&shutdownNuclear, "nuclear", false,
"Force cleanup even if polecats have uncommitted work (DANGER: may lose work)")
shutdownCmd.Flags().BoolVar(&shutdownCleanupOrphans, "cleanup-orphans", false,
"Clean up orphaned Claude processes (TTY-less processes older than 60s)")
shutdownCmd.Flags().IntVar(&shutdownCleanupOrphansGrace, "cleanup-orphans-grace-secs", 60,
"Grace period in seconds between SIGTERM and SIGKILL when cleaning orphans (default 60)")
rootCmd.AddCommand(startCmd)
rootCmd.AddCommand(shutdownCmd)
@@ -563,14 +573,20 @@ func runGracefulShutdown(t *tmux.Tmux, gtSessions []string, townRoot string) err
deaconSession := getDeaconSessionName()
stopped := killSessionsInOrder(t, gtSessions, mayorSession, deaconSession)
// Phase 5: Cleanup polecat worktrees and branches
fmt.Printf("\nPhase 5: Cleaning up polecats...\n")
// Phase 5: Cleanup orphaned Claude processes if requested
if shutdownCleanupOrphans {
fmt.Printf("\nPhase 5: Cleaning up orphaned Claude processes...\n")
cleanupOrphanedClaude(shutdownCleanupOrphansGrace)
}
// Phase 6: Cleanup polecat worktrees and branches
fmt.Printf("\nPhase 6: Cleaning up polecats...\n")
if townRoot != "" {
cleanupPolecats(townRoot)
}
// Phase 6: Stop the daemon
fmt.Printf("\nPhase 6: Stopping daemon...\n")
// Phase 7: Stop the daemon
fmt.Printf("\nPhase 7: Stopping daemon...\n")
if townRoot != "" {
stopDaemonIfRunning(townRoot)
}
@@ -587,6 +603,13 @@ func runImmediateShutdown(t *tmux.Tmux, gtSessions []string, townRoot string) er
deaconSession := getDeaconSessionName()
stopped := killSessionsInOrder(t, gtSessions, mayorSession, deaconSession)
// Cleanup orphaned Claude processes if requested
if shutdownCleanupOrphans {
fmt.Println()
fmt.Println("Cleaning up orphaned Claude processes...")
cleanupOrphanedClaude(shutdownCleanupOrphansGrace)
}
// Cleanup polecat worktrees and branches
if townRoot != "" {
fmt.Println()
@@ -612,6 +635,9 @@ func runImmediateShutdown(t *tmux.Tmux, gtSessions []string, townRoot string) er
// 2. Everything except Mayor
// 3. Mayor last
// mayorSession and deaconSession are the dynamic session names for the current town.
//
// Returns the count of sessions that were successfully stopped (verified by checking
// if the session no longer exists after the kill attempt).
func killSessionsInOrder(t *tmux.Tmux, sessions []string, mayorSession, deaconSession string) int {
stopped := 0
@@ -625,10 +651,31 @@ func killSessionsInOrder(t *tmux.Tmux, sessions []string, mayorSession, deaconSe
return false
}
// Helper to kill a session and verify it was stopped
killAndVerify := func(sess string) bool {
// Check if session exists before attempting to kill
exists, _ := t.HasSession(sess)
if !exists {
return false // Session already gone
}
// Attempt to kill the session and its processes
_ = t.KillSessionWithProcesses(sess)
// Verify the session is actually gone (ignore error, check existence)
// KillSessionWithProcesses might return an error even if it successfully
// killed the processes and the session auto-closed
stillExists, _ := t.HasSession(sess)
if !stillExists {
fmt.Printf(" %s %s stopped\n", style.Bold.Render("✓"), sess)
return true
}
return false
}
// 1. Stop Deacon first
if inList(deaconSession) {
if err := t.KillSessionWithProcesses(deaconSession); err == nil {
fmt.Printf(" %s %s stopped\n", style.Bold.Render("✓"), deaconSession)
if killAndVerify(deaconSession) {
stopped++
}
}
@@ -638,16 +685,14 @@ func killSessionsInOrder(t *tmux.Tmux, sessions []string, mayorSession, deaconSe
if sess == deaconSession || sess == mayorSession {
continue
}
if err := t.KillSessionWithProcesses(sess); err == nil {
fmt.Printf(" %s %s stopped\n", style.Bold.Render("✓"), sess)
if killAndVerify(sess) {
stopped++
}
}
// 3. Stop Mayor last
if inList(mayorSession) {
if err := t.KillSessionWithProcesses(mayorSession); err == nil {
fmt.Printf(" %s %s stopped\n", style.Bold.Render("✓"), mayorSession)
if killAndVerify(mayorSession) {
stopped++
}
}
@@ -920,3 +965,79 @@ func startCrewMember(rigName, crewName, townRoot string) error {
return nil
}
// cleanupOrphanedClaude finds and kills orphaned Claude processes with a grace period.
// This is a simpler synchronous implementation that:
// 1. Finds orphaned processes (TTY-less, older than 60s, not in Gas Town sessions)
// 2. Sends SIGTERM to all of them
// 3. Waits for the grace period
// 4. Sends SIGKILL to any that are still alive
func cleanupOrphanedClaude(graceSecs int) {
// Find orphaned processes
orphans, err := util.FindOrphanedClaudeProcesses()
if err != nil {
fmt.Printf(" %s Warning: %v\n", style.Bold.Render("⚠"), err)
return
}
if len(orphans) == 0 {
fmt.Printf(" %s No orphaned processes found\n", style.Dim.Render("○"))
return
}
// Send SIGTERM to all orphans
var termPIDs []int
for _, orphan := range orphans {
if err := syscall.Kill(orphan.PID, syscall.SIGTERM); err != nil {
if err != syscall.ESRCH {
fmt.Printf(" %s PID %d: failed to send SIGTERM: %v\n",
style.Bold.Render("⚠"), orphan.PID, err)
}
continue
}
termPIDs = append(termPIDs, orphan.PID)
fmt.Printf(" %s PID %d: sent SIGTERM (waiting %ds before SIGKILL)\n",
style.Bold.Render("→"), orphan.PID, graceSecs)
}
if len(termPIDs) == 0 {
return
}
// Wait for grace period
fmt.Printf(" %s Waiting %d seconds for processes to terminate gracefully...\n",
style.Dim.Render("⏳"), graceSecs)
time.Sleep(time.Duration(graceSecs) * time.Second)
// Check which processes are still alive and send SIGKILL
var killedCount, alreadyDeadCount int
for _, pid := range termPIDs {
// Check if process still exists
if err := syscall.Kill(pid, 0); err != nil {
// Process is gone (either died from SIGTERM or doesn't exist)
alreadyDeadCount++
continue
}
// Process still alive - send SIGKILL
if err := syscall.Kill(pid, syscall.SIGKILL); err != nil {
if err != syscall.ESRCH {
fmt.Printf(" %s PID %d: failed to send SIGKILL: %v\n",
style.Bold.Render("⚠"), pid, err)
}
continue
}
killedCount++
fmt.Printf(" %s PID %d: sent SIGKILL (did not respond to SIGTERM)\n",
style.Bold.Render("✓"), pid)
}
if alreadyDeadCount > 0 {
fmt.Printf(" %s %d process(es) terminated gracefully from SIGTERM\n",
style.Bold.Render("✓"), alreadyDeadCount)
}
if killedCount == 0 && alreadyDeadCount > 0 {
fmt.Printf(" %s All processes cleaned up successfully\n",
style.Bold.Render("✓"))
}
}