Fix bug where the Deacon spins indefinitely (hq-oosxt)

- Add heartbeat checking to Boot degraded triage: detects a stale Deacon
  heartbeat (nudges when >15 min old, restarts when >30 min old)
- Add checkDeaconHeartbeat to daemon heartbeat cycle as fallback
- This ensures the Deacon is monitored continuously, not just at startup

The mol-deacon-patrol formula was also updated separately to use
gt deacon health-check instead of ephemeral context memory tracking.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
gastown/polecats/nux
2025-12-31 02:01:21 -08:00
committed by Steve Yegge
parent 4ea2b719ba
commit 6f1b6269b1
2 changed files with 78 additions and 3 deletions

View File

@@ -8,6 +8,7 @@ import (
"github.com/spf13/cobra"
"github.com/steveyegge/gastown/internal/boot"
"github.com/steveyegge/gastown/internal/deacon"
"github.com/steveyegge/gastown/internal/style"
"github.com/steveyegge/gastown/internal/workspace"
)
@@ -288,9 +289,30 @@ func runDegradedTriage(b *boot.Boot) (action, target string, err error) {
return "report", "deacon-missing", nil
}
// Deacon exists - check if it's responsive (basic pane output check)
// In degraded mode, we can't do sophisticated analysis
// Just verify the session is alive
// Deacon exists - check heartbeat to detect stuck sessions
// A session can exist but be stuck (not making progress)
townRoot, _ := workspace.FindFromCwd()
if townRoot != "" {
hb := deacon.ReadHeartbeat(townRoot)
if hb.ShouldPoke() {
// Heartbeat is stale (>15 min) - Deacon is stuck
// Nudge the session to try to wake it up
age := hb.Age()
if age > 30*time.Minute {
// Very stuck - restart the session
fmt.Printf("Deacon heartbeat is %s old - restarting session\n", age.Round(time.Minute))
if err := tm.KillSession(deaconSession); err == nil {
return "restart", "deacon-stuck", nil
}
} else {
// Stuck but not critically - try nudging first
fmt.Printf("Deacon heartbeat is %s old - nudging session\n", age.Round(time.Minute))
_ = tm.NudgeSession(deaconSession, "HEALTH_CHECK: heartbeat is stale, respond to confirm responsiveness")
return "nudge", "deacon-stale", nil
}
}
}
return "nothing", "", nil
}

View File

@@ -17,6 +17,7 @@ import (
"github.com/steveyegge/gastown/internal/boot"
"github.com/steveyegge/gastown/internal/config"
"github.com/steveyegge/gastown/internal/constants"
"github.com/steveyegge/gastown/internal/deacon"
"github.com/steveyegge/gastown/internal/feed"
"github.com/steveyegge/gastown/internal/polecat"
"github.com/steveyegge/gastown/internal/tmux"
@@ -148,6 +149,10 @@ func (d *Daemon) heartbeat(state *State) {
// Boot handles the "when to wake Deacon" decision via triage logic
d.ensureBootRunning()
// 1b. Direct Deacon heartbeat check (belt-and-suspenders)
// Boot may not detect all stuck states; this provides a fallback
d.checkDeaconHeartbeat()
// 2. Ensure Witnesses are running for all rigs (restart if dead)
d.ensureWitnessesRunning()
@@ -293,6 +298,54 @@ func (d *Daemon) ensureDeaconRunning() {
d.logger.Println("Deacon session started successfully")
}
// checkDeaconHeartbeat looks at the Deacon's heartbeat and intervenes when the
// Deacon session is alive but no longer making progress.
//
// It is a fallback that runs alongside Boot's own triage. Nothing happens when
// no heartbeat can be read, when the heartbeat is not yet due a poke, or when
// the tmux session is absent. Otherwise a heartbeat older than 30 minutes gets
// the session killed, and a younger stale heartbeat gets the session nudged.
// Failures are logged, never returned.
func (d *Daemon) checkDeaconHeartbeat() {
	beat := deacon.ReadHeartbeat(d.config.TownRoot)
	if beat == nil {
		// Nothing recorded yet, so there is no basis for calling the Deacon stuck.
		return
	}

	// The age is sampled once, before the staleness gate; that same sample
	// drives every later decision and log line.
	staleFor := beat.Age()
	if !beat.ShouldPoke() {
		// ShouldPoke is the staleness gate (described in this file as >15 min;
		// the actual threshold lives in the deacon package).
		return
	}

	shown := staleFor.Round(time.Minute)
	d.logger.Printf("Deacon heartbeat is stale (%s old), checking session...", shown)

	alive, err := d.tmux.HasSession(DeaconSessionName)
	switch {
	case err != nil:
		d.logger.Printf("Error checking Deacon session: %v", err)
		return
	case !alive:
		// A missing session is not handled here; it is left to the regular
		// startup path (ensureBootRunning).
		return
	}

	// From here on: the session exists but its heartbeat is stale.
	if staleFor <= 30*time.Minute {
		// Below the restart threshold - try waking the session first.
		d.logger.Printf("Deacon stuck for %s - nudging session", shown)
		if err := d.tmux.NudgeSession(DeaconSessionName, "HEALTH_CHECK: heartbeat stale, respond to confirm responsiveness"); err != nil {
			d.logger.Printf("Error nudging stuck Deacon: %v", err)
		}
		return
	}

	// Past the restart threshold - kill the session. No restart is issued
	// here; a later heartbeat pass is expected to bring the session back.
	d.logger.Printf("Deacon stuck for %s - restarting session", shown)
	if err := d.tmux.KillSession(DeaconSessionName); err != nil {
		d.logger.Printf("Error killing stuck Deacon: %v", err)
	}
}
// ensureWitnessesRunning ensures witnesses are running for all rigs.
// Called on each heartbeat to maintain witness patrol loops.