From 3442471a93a322f9d529daef40678aee5e5f2b06 Mon Sep 17 00:00:00 2001 From: Basit Mustafa Date: Sat, 24 Jan 2026 22:45:12 -0700 Subject: [PATCH] fix(orphan): protect all tmux sessions, not just Gas Town ones (#924) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add hung-session-detection step to deacon patrol Detects and surgically recovers Gas Town sessions where Claude API call is stuck indefinitely. These appear "running" (tmux session exists) but aren't processing work. Safety checks (ALL must pass before recovery): 1. Session matches Gas Town pattern exactly (gt-*-witness, etc) 2. Session shows waiting state (Clauding/Deciphering/etc) 3. Duration >30min AND (zero tokens OR duration >2hrs) 4. NOT showing active tool execution (⏺ markers) This closes a gap where existing zombie-scan only catches processes not in tmux sessions. Co-Authored-By: Claude * fix(orphan): protect all tmux sessions, not just Gas Town ones The orphan cleanup was killing Claude processes in user's personal tmux sessions (e.g., "loomtown", "yaad") because only sessions with gt-* or hq-* prefixes were protected. Changes: - Renamed getGasTownSessionPIDs() to getTmuxSessionPIDs() - Now protects ALL tmux sessions regardless of name prefix - Updated variable names for clarity (gasTownPIDs -> protectedPIDs) The TTY="?" check is not reliable during certain operations (startup, session transitions), so explicit protection of all tmux sessions is necessary to prevent killing user's personal Claude instances. 
Fixes #923 Co-Authored-By: Claude --------- Co-authored-by: mayor Co-authored-by: Claude --- .../formulas/mol-deacon-patrol.formula.toml | 106 +++++++++++++++++- internal/util/orphan.go | 34 +++--- 2 files changed, 124 insertions(+), 16 deletions(-) diff --git a/internal/formula/formulas/mol-deacon-patrol.formula.toml b/internal/formula/formulas/mol-deacon-patrol.formula.toml index f293c2b3..c60be7b4 100644 --- a/internal/formula/formulas/mol-deacon-patrol.formula.toml +++ b/internal/formula/formulas/mol-deacon-patrol.formula.toml @@ -419,10 +419,114 @@ gt mail send mayor/ -s "Health: unresponsive" \\ Reset unresponsive_cycles to 0 when component responds normally.""" +[[steps]] +id = "hung-session-detection" +title = "Detect and recover hung Gas Town sessions (SURGICAL)" +needs = ["health-scan"] +description = """ +Detect and surgically recover hung Gas Town sessions where the Claude API call is stuck. + +A hung session appears "running" (tmux session exists, Claude process exists) but +the API call has been stuck indefinitely. This breaks patrol chains - if witness +hangs, refinery never gets nudged about new MRs. 
+ +**Why existing checks miss this:** +- zombie-scan only catches processes not in tmux sessions +- gt status shows "running" if tmux session exists +- Nudges queue but never get processed (Claude can't respond) + +## SURGICAL TARGETING + +**ONLY these session patterns are valid targets:** +- `gt-<rig>-witness` (e.g., gt-kalshi-witness, gt-horizon-witness) +- `gt-<rig>-refinery` (e.g., gt-kalshi-refinery) +- `hq-deacon` + +**NEVER touch sessions that don't match these patterns exactly.** + +## DETECTION (All checks must pass) + +For each Gas Town session, capture output and verify ALL of these: + +```bash +# Step 1: Get session output +output=$(tmux capture-pane -t <session> -p 2>/dev/null | tail -10) +``` + +**Check 1: Session is in waiting state** +Must see one of: `Clauding`, `Deciphering`, `Marinating`, `Finagling`, `thinking` +```bash +echo "$output" | grep -qiE 'Clauding|Deciphering|Marinating|Finagling|thinking' +``` + +**Check 2: Duration exceeds threshold (30+ minutes)** +Parse duration from output like "21h 35m 20s" or "45m 30s": +```bash +# Extract hours and minutes +hours=$(echo "$output" | grep -oE '[0-9]+h' | head -1 | tr -d 'h') +minutes=$(echo "$output" | grep -oE '[0-9]+m' | head -1 | tr -d 'm') +total_minutes=$((${hours:-0} * 60 + ${minutes:-0})) +# Threshold: 30 minutes minimum +[ "$total_minutes" -ge 30 ] +``` + +**Check 3: Zero tokens received (definite hang) OR very long duration (>2 hours)** +```bash +# Definite hang: zero tokens received +echo "$output" | grep -qE '↓ 0 tokens' +# OR extremely long duration (>2 hours = 120 minutes) +[ "$total_minutes" -ge 120 ] +``` + +**Check 4: NOT showing active tool execution** +Active sessions show tool markers (⏺). 
If present, session is actually working: +```bash +# If tool markers present in recent output, DO NOT kill +echo "$output" | grep -qE '⏺|Read|Write|Bash|Edit' && continue +``` + +## RECOVERY (Only after ALL checks pass) + +**Log the action first:** +```bash +echo "[$(date)] RECOVERING HUNG: <session> (${hours}h ${minutes}m, waiting state)" >> $GT_ROOT/logs/hung-sessions.log +``` + +**Kill and restart based on session type:** + +For witness: +```bash +tmux kill-session -t gt-<rig>-witness 2>/dev/null +gt witness start <rig> +``` + +For refinery: +```bash +tmux kill-session -t gt-<rig>-refinery 2>/dev/null +gt refinery restart <rig> +``` + +For deacon (self-recovery - use with caution): +```bash +# Deacon detecting itself is hung is a paradox +# Only kill if another deacon instance exists or human confirmed +gt mail send mayor/ -s "DEACON SELF-HUNG DETECTED" -m "Deacon appears hung. Human intervention required." +``` + +## VERIFICATION + +After restart, verify new session is healthy: +```bash +sleep 5 +tmux has-session -t <session> && echo "Session restarted successfully" +``` + +**Exit criteria:** All hung Gas Town sessions detected and recovered (or escalated if recovery failed).""" + [[steps]] id = "zombie-scan" title = "Detect zombie polecats (NO KILL AUTHORITY)" -needs = ["health-scan"] +needs = ["hung-session-detection"] description = """ Defense-in-depth DETECTION of zombie polecats that Witness should have cleaned. diff --git a/internal/util/orphan.go b/internal/util/orphan.go index c0343542..08480b1a 100644 --- a/internal/util/orphan.go +++ b/internal/util/orphan.go @@ -19,27 +19,31 @@ import ( // processes and avoids killing legitimate short-lived subagents. const minOrphanAge = 60 -// getGasTownSessionPIDs returns a set of PIDs belonging to valid Gas Town tmux sessions. -// This prevents killing Claude processes that are part of witness/refinery/deacon sessions +// getTmuxSessionPIDs returns a set of PIDs belonging to ANY tmux session. 
+// This prevents killing Claude processes that are running in tmux sessions, // even if they temporarily show TTY "?" during startup or session transitions. -func getGasTownSessionPIDs() map[int]bool { +// +// CRITICAL: We protect ALL tmux sessions, not just Gas Town ones (gt-*, hq-*). +// User's personal Claude sessions (e.g., in sessions named "loomtown", "yaad") +// must never be killed by orphan cleanup. The TTY="?" check is not reliable +// during certain operations, so we must explicitly protect all tmux processes. +func getTmuxSessionPIDs() map[int]bool { pids := make(map[int]bool) - // Get list of Gas Town tmux sessions (gt-* and hq-*) + // Get list of ALL tmux sessions (not just gt-*/hq-*) out, err := exec.Command("tmux", "list-sessions", "-F", "#{session_name}").Output() if err != nil { return pids // tmux not available or no sessions } - var gasTownSessions []string - for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") { - if strings.HasPrefix(line, "gt-") || strings.HasPrefix(line, "hq-") { - gasTownSessions = append(gasTownSessions, line) - } - } + // Protect ALL sessions - user's personal sessions are just as important + sessions := strings.Split(strings.TrimSpace(string(out)), "\n") - // For each Gas Town session, get the PIDs of processes in its panes - for _, session := range gasTownSessions { + // For each session, get the PIDs of processes in its panes + for _, session := range sessions { + if session == "" { + continue + } out, err := exec.Command("tmux", "list-panes", "-t", session, "-F", "#{pane_pid}").Output() if err != nil { continue @@ -285,7 +289,7 @@ type OrphanedProcess struct { func FindOrphanedClaudeProcesses() ([]OrphanedProcess, error) { // Get PIDs belonging to valid Gas Town tmux sessions. // These should not be killed even if they show TTY "?" during startup. 
- gasTownPIDs := getGasTownSessionPIDs() + protectedPIDs := getTmuxSessionPIDs() // Use ps to get PID, TTY, command, and elapsed time for all processes // TTY "?" indicates no controlling terminal @@ -326,7 +330,7 @@ func FindOrphanedClaudeProcesses() ([]OrphanedProcess, error) { // Skip processes that belong to valid Gas Town tmux sessions. // This prevents killing witnesses/refineries/deacon during startup // when they may temporarily show TTY "?". - if gasTownPIDs[pid] { + if protectedPIDs[pid] { continue } @@ -375,7 +379,7 @@ type ZombieProcess struct { // This is the definitive zombie check because it verifies against tmux reality. func FindZombieClaudeProcesses() ([]ZombieProcess, error) { // Get ALL valid PIDs (panes + their children) from active tmux sessions - validPIDs := getGasTownSessionPIDs() + validPIDs := getTmuxSessionPIDs() // SAFETY CHECK: If no valid PIDs found, tmux might be down or no sessions exist. // Returning empty is safer than marking all Claude processes as zombies.