From 6f1b6269b13bf85d07dadb9f41e17d1d8bd45a88 Mon Sep 17 00:00:00 2001 From: gastown/polecats/nux Date: Wed, 31 Dec 2025 02:01:21 -0800 Subject: [PATCH] Fix Deacon spin indefinitely bug (hq-oosxt) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add heartbeat checking to Boot degraded triage: detects stale Deacon heartbeat (>15min nudges, >30min restarts) - Add checkDeaconHeartbeat to daemon heartbeat cycle as fallback - This ensures the Deacon is monitored continuously, not just at startup The mol-deacon-patrol formula was also updated separately to use gt deacon health-check instead of ephemeral context memory tracking. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- internal/cmd/boot.go | 28 ++++++++++++++++++--- internal/daemon/daemon.go | 53 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+), 3 deletions(-) diff --git a/internal/cmd/boot.go b/internal/cmd/boot.go index 6becfd17..2ac12332 100644 --- a/internal/cmd/boot.go +++ b/internal/cmd/boot.go @@ -8,6 +8,7 @@ import ( "github.com/spf13/cobra" "github.com/steveyegge/gastown/internal/boot" + "github.com/steveyegge/gastown/internal/deacon" "github.com/steveyegge/gastown/internal/style" "github.com/steveyegge/gastown/internal/workspace" ) @@ -288,9 +289,30 @@ func runDegradedTriage(b *boot.Boot) (action, target string, err error) { return "report", "deacon-missing", nil } - // Deacon exists - check if it's responsive (basic pane output check) - // In degraded mode, we can't do sophisticated analysis - // Just verify the session is alive + // Deacon exists - check heartbeat to detect stuck sessions + // A session can exist but be stuck (not making progress) + townRoot, _ := workspace.FindFromCwd() + if townRoot != "" { + hb := deacon.ReadHeartbeat(townRoot) + if hb.ShouldPoke() { + // Heartbeat is stale (>15 min) - Deacon is stuck + // Nudge the session to try to wake it up + age := hb.Age() + if age > 30*time.Minute { + // Very stuck - restart the session + fmt.Printf("Deacon heartbeat is %s old - restarting session\n", age.Round(time.Minute)) + if err := tm.KillSession(deaconSession); err == nil { + return "restart", "deacon-stuck", nil + } + } else { + // Stuck but not critically - try nudging first + fmt.Printf("Deacon heartbeat is %s old - nudging session\n", age.Round(time.Minute)) + _ = tm.NudgeSession(deaconSession, "HEALTH_CHECK: heartbeat is stale, respond to confirm responsiveness") + return "nudge", "deacon-stale", nil + } + } + } + return "nothing", "", nil } diff --git a/internal/daemon/daemon.go b/internal/daemon/daemon.go index 1dfc44e3..028717b4 100644 --- a/internal/daemon/daemon.go +++ b/internal/daemon/daemon.go @@ -17,6 +17,7 @@ import ( "github.com/steveyegge/gastown/internal/boot" "github.com/steveyegge/gastown/internal/config" "github.com/steveyegge/gastown/internal/constants" + "github.com/steveyegge/gastown/internal/deacon" "github.com/steveyegge/gastown/internal/feed" "github.com/steveyegge/gastown/internal/polecat" "github.com/steveyegge/gastown/internal/tmux" @@ -148,6 +149,10 @@ func (d *Daemon) heartbeat(state *State) { // Boot handles the "when to wake Deacon" decision via triage logic d.ensureBootRunning() + // 1b. Direct Deacon heartbeat check (belt-and-suspenders) + // Boot may not detect all stuck states; this provides a fallback + d.checkDeaconHeartbeat() + // 2. Ensure Witnesses are running for all rigs (restart if dead) d.ensureWitnessesRunning() @@ -293,6 +298,54 @@ func (d *Daemon) ensureDeaconRunning() { d.logger.Println("Deacon session started successfully") } +// checkDeaconHeartbeat checks if the Deacon is making progress. +// This is a belt-and-suspenders fallback in case Boot doesn't detect stuck states. +// Uses the heartbeat file that the Deacon updates on each patrol cycle. +func (d *Daemon) checkDeaconHeartbeat() { + hb := deacon.ReadHeartbeat(d.config.TownRoot) + if hb == nil { + // No heartbeat file - Deacon hasn't started a cycle yet + return + } + + age := hb.Age() + + // If heartbeat is very stale (>15 min), the Deacon is likely stuck + if !hb.ShouldPoke() { + // Heartbeat is fresh enough + return + } + + d.logger.Printf("Deacon heartbeat is stale (%s old), checking session...", age.Round(time.Minute)) + + // Check if session exists + hasSession, err := d.tmux.HasSession(DeaconSessionName) + if err != nil { + d.logger.Printf("Error checking Deacon session: %v", err) + return + } + + if !hasSession { + // Session doesn't exist - ensureBootRunning will handle restart + return + } + + // Session exists but heartbeat is stale - Deacon is stuck + if age > 30*time.Minute { + // Very stuck - restart the session + d.logger.Printf("Deacon stuck for %s - restarting session", age.Round(time.Minute)) + if err := d.tmux.KillSession(DeaconSessionName); err != nil { + d.logger.Printf("Error killing stuck Deacon: %v", err) + } + // ensureDeaconRunning will be called next heartbeat to restart + } else { + // Stuck but not critically - nudge to wake up + d.logger.Printf("Deacon stuck for %s - nudging session", age.Round(time.Minute)) + if err := d.tmux.NudgeSession(DeaconSessionName, "HEALTH_CHECK: heartbeat stale, respond to confirm responsiveness"); err != nil { + d.logger.Printf("Error nudging stuck Deacon: %v", err) + } + } +} // ensureWitnessesRunning ensures witnesses are running for all rigs. // Called on each heartbeat to maintain witness patrol loops.