fix(daemon): restore ensureDeaconRunning to heartbeat and use Manager

The heartbeat now explicitly calls ensureDeaconRunning() for basic
"is Deacon alive" checks, while Boot handles intelligent triage
(stuck/nudge/interrupt decisions).

Changed ensureDeaconRunning to use deacon.Manager.Start() instead of
duplicating startup logic. This gives daemon the same benefits:
- WaitForShellReady (fixes race condition)
- Claude settings setup
- Theming
- StartupNudge and PropulsionNudge (GUPP)

Heartbeat order:
1. ensureDeaconRunning - restart if dead (via Manager)
2. ensureBootRunning - intelligent triage for stuck states
3. checkDeaconHeartbeat - belt-and-suspenders fallback
4-11. Other checks (witnesses, refineries, polecats, etc.)

This was inadvertently removed when Boot was introduced, which
delegated all Deacon checks to Boot. But Boot's mol doesn't actually
restart Deacon - it just reports. Now responsibilities are clear:
daemon ensures alive, Boot ensures responsive.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
julianknutsen
2026-01-07 19:10:09 -08:00
parent f4cbcb4ce9
commit 65ecb6cafd

View File

@@ -169,40 +169,40 @@ const recoveryHeartbeatInterval = 3 * time.Minute
func (d *Daemon) heartbeat(state *State) {
d.logger.Println("Heartbeat starting (recovery-focused)")
// 1. Poke Boot (the Deacon's watchdog) instead of Deacon directly
// Boot handles the "when to wake Deacon" decision via triage logic
// 1. Ensure Deacon is running (restart if dead)
d.ensureDeaconRunning()
// 2. Poke Boot for intelligent triage (stuck/nudge/interrupt)
// Boot handles nuanced "is Deacon responsive" decisions
d.ensureBootRunning()
// 1b. Direct Deacon heartbeat check (belt-and-suspenders)
// 3. Direct Deacon heartbeat check (belt-and-suspenders)
// Boot may not detect all stuck states; this provides a fallback
d.checkDeaconHeartbeat()
// 2. Ensure Witnesses are running for all rigs (restart if dead)
// 4. Ensure Witnesses are running for all rigs (restart if dead)
d.ensureWitnessesRunning()
// 2b. Ensure Refineries are running for all rigs (restart if dead)
// 5. Ensure Refineries are running for all rigs (restart if dead)
d.ensureRefineriesRunning()
// 3. Trigger pending polecat spawns (bootstrap mode - ZFC violation acceptable)
// 6. Trigger pending polecat spawns (bootstrap mode - ZFC violation acceptable)
// This ensures polecats get nudged even when Deacon isn't in a patrol cycle.
// Uses regex-based WaitForClaudeReady, which is acceptable for daemon bootstrap.
d.triggerPendingSpawns()
// 4. Process lifecycle requests
// 7. Process lifecycle requests
d.processLifecycleRequests()
// 5. Stale agent check REMOVED (gt-zecmc)
// Was: d.checkStaleAgents() - marked agents "dead" based on bead update time.
// This violated "discover, don't track" - agent liveness is observable from tmux.
// The daemon now checks tmux directly in ensureXxxRunning() functions.
// 8. (Removed) Stale agent check - violated "discover, don't track"
// 6. Check for GUPP violations (agents with work-on-hook not progressing)
// 9. Check for GUPP violations (agents with work-on-hook not progressing)
d.checkGUPPViolations()
// 7. Check for orphaned work (assigned to dead agents)
// 10. Check for orphaned work (assigned to dead agents)
d.checkOrphanedWork()
// 8. Check polecat session health (proactive crash detection)
// 11. Check polecat session health (proactive crash detection)
// This validates tmux sessions are still alive for polecats with work-on-hook
d.checkPolecatSessionHealth()
@@ -291,51 +291,20 @@ func (d *Daemon) runDegradedBootTriage(b *boot.Boot) {
}
// ensureDeaconRunning ensures the Deacon is running.
// Discover, don't track: checks tmux directly instead of bead state (gt-zecmc).
// The Deacon is the system's heartbeat - it must always be running.
// Uses deacon.Manager for consistent startup behavior (WaitForShellReady, GUPP, etc.).
func (d *Daemon) ensureDeaconRunning() {
deaconSession := d.getDeaconSessionName()
mgr := deacon.NewManager(d.config.TownRoot)
// Check if tmux session exists and Claude is running (observable reality)
hasSession, sessionErr := d.tmux.HasSession(deaconSession)
if sessionErr == nil && hasSession {
if d.tmux.IsClaudeRunning(deaconSession) {
if err := mgr.Start(""); err != nil {
if err == deacon.ErrAlreadyRunning {
// Deacon is running - nothing to do
return
}
// Session exists but Claude not running - zombie session, kill it
d.logger.Println("Deacon session exists but Claude not running, killing zombie session...")
if err := d.tmux.KillSession(deaconSession); err != nil {
d.logger.Printf("Warning: failed to kill zombie Deacon session: %v", err)
}
// Fall through to restart
}
// Deacon not running - start it
d.logger.Println("Deacon not running, starting...")
// Create session in deacon directory (ensures correct CLAUDE.md is loaded)
// Use EnsureSessionFresh to handle zombie sessions that exist but have dead Claude
deaconDir := filepath.Join(d.config.TownRoot, "deacon")
sessionName := d.getDeaconSessionName()
if err := d.tmux.EnsureSessionFresh(sessionName, deaconDir); err != nil {
d.logger.Printf("Error creating Deacon session: %v", err)
d.logger.Printf("Error starting Deacon: %v", err)
return
}
// Set environment (non-fatal: session works without these)
_ = d.tmux.SetEnvironment(sessionName, "GT_ROLE", "deacon")
_ = d.tmux.SetEnvironment(sessionName, "BD_ACTOR", "deacon")
// Launch Claude directly (no shell respawn loop)
// The daemon will detect if Claude exits and restart it on next heartbeat
// Export GT_ROLE and BD_ACTOR so Claude inherits them (tmux SetEnvironment doesn't export to processes)
if err := d.tmux.SendKeys(sessionName, config.BuildAgentStartupCommand("deacon", "deacon", "", "")); err != nil {
d.logger.Printf("Error launching Claude in Deacon session: %v", err)
return
}
d.logger.Println("Deacon session started successfully")
d.logger.Println("Deacon started successfully")
}
// checkDeaconHeartbeat checks if the Deacon is making progress.
@@ -368,7 +337,8 @@ func (d *Daemon) checkDeaconHeartbeat() {
}
if !hasSession {
// Session doesn't exist - ensureBootRunning will handle restart
// Session doesn't exist - ensureDeaconRunning already ran earlier
// in heartbeat, so Deacon should be starting
return
}
@@ -379,7 +349,7 @@ func (d *Daemon) checkDeaconHeartbeat() {
if err := d.tmux.KillSession(sessionName); err != nil {
d.logger.Printf("Error killing stuck Deacon: %v", err)
}
// ensureDeaconRunning will be called next heartbeat to restart
// ensureDeaconRunning will restart on next heartbeat
} else {
// Stuck but not critically - nudge to wake up
d.logger.Printf("Deacon stuck for %s - nudging session", age.Round(time.Minute))