diff --git a/internal/formula/formulas/mol-deacon-patrol.formula.toml b/internal/formula/formulas/mol-deacon-patrol.formula.toml index f293c2b3..c60be7b4 100644 --- a/internal/formula/formulas/mol-deacon-patrol.formula.toml +++ b/internal/formula/formulas/mol-deacon-patrol.formula.toml @@ -419,10 +419,114 @@ gt mail send mayor/ -s "Health: unresponsive" \\ Reset unresponsive_cycles to 0 when component responds normally.""" +[[steps]] +id = "hung-session-detection" +title = "Detect and recover hung Gas Town sessions (SURGICAL)" +needs = ["health-scan"] +description = """ +Detect and surgically recover hung Gas Town sessions where the Claude API call is stuck. + +A hung session appears "running" (tmux session exists, Claude process exists) but +the API call has been stuck indefinitely. This breaks patrol chains - if witness +hangs, refinery never gets nudged about new MRs. + +**Why existing checks miss this:** +- zombie-scan only catches processes not in tmux sessions +- gt status shows "running" if tmux session exists +- Nudges queue but never get processed (Claude can't respond) + +## SURGICAL TARGETING + +**ONLY these session patterns are valid targets:** +- `gt-<rig>-witness` (e.g., gt-kalshi-witness, gt-horizon-witness) +- `gt-<rig>-refinery` (e.g., gt-kalshi-refinery) +- `hq-deacon` + +**NEVER touch sessions that don't match these patterns exactly.** + +## DETECTION (All checks must pass) + +For each Gas Town session, capture output and verify ALL of these: + +```bash +# Step 1: Get session output +output=$(tmux capture-pane -t <session> -p 2>/dev/null | tail -10) +``` + +**Check 1: Session is in waiting state** +Must see one of: `Clauding`, `Deciphering`, `Marinating`, `Finagling`, `thinking` +```bash +echo "$output" | grep -qiE 'Clauding|Deciphering|Marinating|Finagling|thinking' +``` + +**Check 2: Duration exceeds threshold (30+ minutes)** +Parse duration from output like "21h 35m 20s" or "45m 30s": +```bash +# Extract hours and minutes +hours=$(echo "$output" | grep -oE 
'[0-9]+h' | head -1 | tr -d 'h') +minutes=$(echo "$output" | grep -oE '[0-9]+m' | head -1 | tr -d 'm') +total_minutes=$((${hours:-0} * 60 + ${minutes:-0})) +# Threshold: 30 minutes minimum +[ "$total_minutes" -ge 30 ] +``` + +**Check 3: Zero tokens received (definite hang) OR very long duration (>2 hours)** +```bash +# Definite hang: zero tokens received +echo "$output" | grep -qE '↓ 0 tokens' +# OR extremely long duration (>2 hours = 120 minutes) +[ "$total_minutes" -ge 120 ] +``` + +**Check 4: NOT showing active tool execution** +Active sessions show tool markers (⏺). If present, session is actually working: +```bash +# If tool markers present in recent output, DO NOT kill +echo "$output" | grep -qE '⏺|Read|Write|Bash|Edit' && continue +``` + +## RECOVERY (Only after ALL checks pass) + +**Log the action first:** +```bash +echo "[$(date)] RECOVERING HUNG: <session> (${hours}h ${minutes}m, waiting state)" >> $GT_ROOT/logs/hung-sessions.log +``` + +**Kill and restart based on session type:** + +For witness: +```bash +tmux kill-session -t gt-<rig>-witness 2>/dev/null +gt witness start +``` + +For refinery: +```bash +tmux kill-session -t gt-<rig>-refinery 2>/dev/null +gt refinery restart +``` + +For deacon (self-recovery - use with caution): +```bash +# Deacon detecting itself is hung is a paradox +# Only kill if another deacon instance exists or human confirmed +gt mail send mayor/ -s "DEACON SELF-HUNG DETECTED" -m "Deacon appears hung. Human intervention required." +``` + +## VERIFICATION + +After restart, verify new session is healthy: +```bash +sleep 5 +tmux has-session -t <session> && echo "Session restarted successfully" +``` + +**Exit criteria:** All hung Gas Town sessions detected and recovered (or escalated if recovery failed).""" + [[steps]] id = "zombie-scan" title = "Detect zombie polecats (NO KILL AUTHORITY)" -needs = ["health-scan"] +needs = ["hung-session-detection"] description = """ Defense-in-depth DETECTION of zombie polecats that Witness should have cleaned. 
diff --git a/internal/util/orphan.go b/internal/util/orphan.go index c0343542..08480b1a 100644 --- a/internal/util/orphan.go +++ b/internal/util/orphan.go @@ -19,27 +19,31 @@ import ( // processes and avoids killing legitimate short-lived subagents. const minOrphanAge = 60 -// getGasTownSessionPIDs returns a set of PIDs belonging to valid Gas Town tmux sessions. -// This prevents killing Claude processes that are part of witness/refinery/deacon sessions +// getTmuxSessionPIDs returns a set of PIDs belonging to ANY tmux session. +// This prevents killing Claude processes that are running in tmux sessions, // even if they temporarily show TTY "?" during startup or session transitions. -func getGasTownSessionPIDs() map[int]bool { +// +// CRITICAL: We protect ALL tmux sessions, not just Gas Town ones (gt-*, hq-*). +// User's personal Claude sessions (e.g., in sessions named "loomtown", "yaad") +// must never be killed by orphan cleanup. The TTY="?" check is not reliable +// during certain operations, so we must explicitly protect all tmux processes. 
+func getTmuxSessionPIDs() map[int]bool { pids := make(map[int]bool) - // Get list of Gas Town tmux sessions (gt-* and hq-*) + // Get list of ALL tmux sessions (not just gt-*/hq-*) out, err := exec.Command("tmux", "list-sessions", "-F", "#{session_name}").Output() if err != nil { return pids // tmux not available or no sessions } - var gasTownSessions []string - for _, line := range strings.Split(strings.TrimSpace(string(out)), "\n") { - if strings.HasPrefix(line, "gt-") || strings.HasPrefix(line, "hq-") { - gasTownSessions = append(gasTownSessions, line) - } - } + // Protect ALL sessions - user's personal sessions are just as important + sessions := strings.Split(strings.TrimSpace(string(out)), "\n") - // For each Gas Town session, get the PIDs of processes in its panes - for _, session := range gasTownSessions { + // For each session, get the PIDs of processes in its panes + for _, session := range sessions { + if session == "" { + continue + } out, err := exec.Command("tmux", "list-panes", "-t", session, "-F", "#{pane_pid}").Output() if err != nil { continue @@ -285,7 +289,7 @@ type OrphanedProcess struct { func FindOrphanedClaudeProcesses() ([]OrphanedProcess, error) { // Get PIDs belonging to valid Gas Town tmux sessions. // These should not be killed even if they show TTY "?" during startup. - gasTownPIDs := getGasTownSessionPIDs() + protectedPIDs := getTmuxSessionPIDs() // Use ps to get PID, TTY, command, and elapsed time for all processes // TTY "?" indicates no controlling terminal @@ -326,7 +330,7 @@ func FindOrphanedClaudeProcesses() ([]OrphanedProcess, error) { // Skip processes that belong to valid Gas Town tmux sessions. // This prevents killing witnesses/refineries/deacon during startup // when they may temporarily show TTY "?". - if gasTownPIDs[pid] { + if protectedPIDs[pid] { continue } @@ -375,7 +379,7 @@ type ZombieProcess struct { // This is the definitive zombie check because it verifies against tmux reality. 
func FindZombieClaudeProcesses() ([]ZombieProcess, error) { // Get ALL valid PIDs (panes + their children) from active tmux sessions - validPIDs := getGasTownSessionPIDs() + validPIDs := getTmuxSessionPIDs() // SAFETY CHECK: If no valid PIDs found, tmux might be down or no sessions exist. // Returning empty is safer than marking all Claude processes as zombies.