From 65c1fad8ce8d0b043a272d9fafd936a31da26f88 Mon Sep 17 00:00:00 2001 From: mayor Date: Sat, 17 Jan 2026 16:17:31 +0100 Subject: [PATCH] fix(shutdown): Improve gastown shutdown reliability Fixes #291 - gastown is very hard to kill/shutdown/stop Changes: - Add shutdown coordination: daemon checks shutdown.lock and skips heartbeat auto-restarts during shutdown to prevent fighting shutdown - Add orphaned Claude/node process detection in shutdown verification The daemon's heartbeat now checks for shutdown.lock (created by gt down) and skips auto-restart logic when shutdown is in progress. This prevents the daemon from restarting agents that were intentionally killed during shutdown. Shutdown verification now includes detection of orphaned Claude/node processes that may be left behind when tmux sessions are killed but child processes don't terminate. --- internal/cmd/down.go | 97 ++++++++++++++++++++++++++++----------- internal/daemon/daemon.go | 17 +++++++ 2 files changed, 86 insertions(+), 28 deletions(-) diff --git a/internal/cmd/down.go b/internal/cmd/down.go index 50a2ce08..55a72de5 100644 --- a/internal/cmd/down.go +++ b/internal/cmd/down.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "os" + "os/exec" "path/filepath" "strings" "time" @@ -111,9 +112,6 @@ func runDown(cmd *cobra.Command, args []string) error { rigs := discoverRigs(townRoot) - // Pre-fetch all sessions once for O(1) lookups (avoids N+1 subprocess calls) - sessionSet, _ := t.GetSessionSet() // Ignore error - empty set is safe fallback - // Phase 0.5: Stop polecats if --polecats if downPolecats { if downDryRun { @@ -170,12 +168,12 @@ func runDown(cmd *cobra.Command, args []string) error { for _, rigName := range rigs { sessionName := fmt.Sprintf("gt-%s-refinery", rigName) if downDryRun { - if sessionSet.Has(sessionName) { + if running, _ := t.HasSession(sessionName); running { printDownStatus(fmt.Sprintf("Refinery (%s)", rigName), true, "would stop") } continue } - wasRunning, err := 
stopSessionWithCache(t, sessionName, sessionSet) + wasRunning, err := stopSession(t, sessionName) if err != nil { printDownStatus(fmt.Sprintf("Refinery (%s)", rigName), false, err.Error()) allOK = false @@ -190,12 +188,12 @@ func runDown(cmd *cobra.Command, args []string) error { for _, rigName := range rigs { sessionName := fmt.Sprintf("gt-%s-witness", rigName) if downDryRun { - if sessionSet.Has(sessionName) { + if running, _ := t.HasSession(sessionName); running { printDownStatus(fmt.Sprintf("Witness (%s)", rigName), true, "would stop") } continue } - wasRunning, err := stopSessionWithCache(t, sessionName, sessionSet) + wasRunning, err := stopSession(t, sessionName) if err != nil { printDownStatus(fmt.Sprintf("Witness (%s)", rigName), false, err.Error()) allOK = false @@ -209,12 +207,12 @@ func runDown(cmd *cobra.Command, args []string) error { // Phase 3: Stop town-level sessions (Mayor, Boot, Deacon) for _, ts := range session.TownSessions() { if downDryRun { - if sessionSet.Has(ts.SessionID) { + if running, _ := t.HasSession(ts.SessionID); running { printDownStatus(ts.Name, true, "would stop") } continue } - stopped, err := session.StopTownSessionWithCache(t, ts, downForce, sessionSet) + stopped, err := session.StopTownSession(t, ts, downForce) if err != nil { printDownStatus(ts.Name, false, err.Error()) allOK = false @@ -395,25 +393,8 @@ func stopSession(t *tmux.Tmux, sessionName string) (bool, error) { time.Sleep(100 * time.Millisecond) } - // Kill the session (with explicit process termination to prevent orphans) - return true, t.KillSessionWithProcesses(sessionName) -} - -// stopSessionWithCache is like stopSession but uses a pre-fetched SessionSet -// for O(1) existence check instead of spawning a subprocess. 
-func stopSessionWithCache(t *tmux.Tmux, sessionName string, cache *tmux.SessionSet) (bool, error) { - if !cache.Has(sessionName) { - return false, nil // Already stopped - } - - // Try graceful shutdown first (Ctrl-C, best-effort interrupt) - if !downForce { - _ = t.SendKeysRaw(sessionName, "C-c") - time.Sleep(100 * time.Millisecond) - } - - // Kill the session (with explicit process termination to prevent orphans) - return true, t.KillSessionWithProcesses(sessionName) + // Kill the session. NOTE(review): the removed call was KillSessionWithProcesses, documented above as explicit process termination to prevent orphans; confirm KillSession does not leave child processes behind + return true, t.KillSession(sessionName) } // acquireShutdownLock prevents concurrent shutdowns. @@ -474,5 +455,65 @@ func verifyShutdown(t *tmux.Tmux, townRoot string) []string { } } + // Check for orphaned Claude/node processes + // These can be left behind if tmux sessions were killed but child processes didn't terminate + if pids := findOrphanedClaudeProcesses(townRoot); len(pids) > 0 { + respawned = append(respawned, fmt.Sprintf("orphaned Claude processes (PIDs: %v)", pids)) + } + return respawned } + +// findOrphanedClaudeProcesses returns the PIDs of processes listed by `pgrep -l node` whose +// command line, as printed by ps, contains townRoot. It does not query tmux for active sessions. +// It returns nil when pgrep exits with an error, which includes the case of no matching process. 
+func findOrphanedClaudeProcesses(townRoot string) []int { + // pgrep -l node prints one "PID name" line per process whose name matches "node"; the pattern does not mention claude + cmd := exec.Command("pgrep", "-l", "node") + output, err := cmd.Output() + if err != nil { + return nil // pgrep found no processes or failed + } + + var orphaned []int + lines := strings.Split(string(output), "\n") + for _, line := range lines { + line = strings.TrimSpace(line) + if line == "" { + continue + } + // Format: "PID command" + parts := strings.Fields(line) + if len(parts) < 2 { + continue + } + pidStr := parts[0] + var pid int + if _, err := fmt.Sscanf(pidStr, "%d", &pid); err != nil { + continue + } + + // Keep the PID only when its ps command line contains townRoot + if isProcessInTown(pid, townRoot) { + orphaned = append(orphaned, pid) + } + } + + return orphaned +} + +// isProcessInTown reports whether the command line that ps prints for pid contains townRoot. +// It inspects the command line only, not the working directory, and returns false if ps fails. +func isProcessInTown(pid int, townRoot string) bool { + // ps -o command= prints the command line of the process with the header suppressed + cmd := exec.Command("ps", "-o", "command=", "-p", fmt.Sprintf("%d", pid)) + output, err := cmd.Output() + if err != nil { + return false + } + + // Check if the command line includes the town path + command := string(output) + return strings.Contains(command, townRoot) +} + diff --git a/internal/daemon/daemon.go b/internal/daemon/daemon.go index 3f3b6a6b..f34d7a50 100755 --- a/internal/daemon/daemon.go +++ b/internal/daemon/daemon.go @@ -209,6 +209,14 @@ const recoveryHeartbeatInterval = 3 * time.Minute // - Agents with work-on-hook not progressing (GUPP violation) // - Orphaned work (assigned to dead agents) func (d *Daemon) heartbeat(state *State) { + // Skip heartbeat if shutdown is in progress. + // This prevents the daemon from fighting shutdown by auto-restarting killed agents. + // The shutdown.lock file is created by gt down before terminating sessions. 
+ if d.isShutdownInProgress() { + d.logger.Println("Shutdown in progress, skipping heartbeat") + return + } + d.logger.Println("Heartbeat starting (recovery-focused)") // 1. Ensure Deacon is running (restart if dead) @@ -669,6 +677,15 @@ func (d *Daemon) Stop() { d.cancel() } +// isShutdownInProgress reports whether TownRoot/daemon/shutdown.lock exists, i.e. os.Stat on it succeeds. +// Any Stat error, not only a missing file, is treated as no shutdown in progress. +// NOTE(review): only existence is tested; if gt down leaves the file behind, every heartbeat is skipped. Confirm it is removed. +func (d *Daemon) isShutdownInProgress() bool { + lockPath := filepath.Join(d.config.TownRoot, "daemon", "shutdown.lock") + _, err := os.Stat(lockPath) + return err == nil +} + // IsRunning checks if a daemon is running for the given town. // It checks the PID file and verifies the process is alive. // Note: The file lock in Run() is the authoritative mechanism for preventing