fix(daemon): nudge agents on state divergence instead of silent accept

When the daemon detects that an agent's bead state doesn't match tmux
(e.g., the bead says stopped but Claude is running), it now:

1. Logs the divergence clearly with a STATE DIVERGENCE prefix
2. Nudges the agent with an actionable command to fix its state
3. Still skips the restart (safety - don't kill healthy sessions)

This prevents silent state drift where bead state diverges from reality.
Applied to the Deacon, Witness, and Refinery ensure functions
(ensureDeaconRunning, ensureWitnessRunning, ensureRefineryRunning).

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
gastown/crew/joe
2026-01-06 19:23:37 -08:00
committed by Steve Yegge
parent d89aae5b5c
commit 6dbb841e22

View File

@@ -325,7 +325,14 @@ func (d *Daemon) ensureDeaconRunning() {
hasSession, sessionErr := d.tmux.HasSession(deaconSession) hasSession, sessionErr := d.tmux.HasSession(deaconSession)
if sessionErr == nil && hasSession { if sessionErr == nil && hasSession {
if d.tmux.IsClaudeRunning(deaconSession) { if d.tmux.IsClaudeRunning(deaconSession) {
d.logger.Println("Deacon session healthy (Claude running), skipping restart despite stale bead") // STATE DIVERGENCE: tmux shows running but bead disagrees.
// Don't kill (safety), but nudge the agent to reconcile its state.
// This prevents silent state drift where bead and reality diverge.
d.logger.Printf("STATE DIVERGENCE: Deacon bead='%s' but Claude is running in tmux", beadState)
nudgeMsg := "[DAEMON] State divergence detected: your agent bead shows '" + beadState + "' but you appear running. Please run: bd agent state " + deaconSession + " running"
if err := d.tmux.NudgeSession(deaconSession, nudgeMsg); err != nil {
d.logger.Printf("Warning: failed to nudge Deacon about state divergence: %v", err)
}
return return
} }
} }
@@ -460,8 +467,13 @@ func (d *Daemon) ensureWitnessRunning(rigName string) {
if err := mgr.Start(false); err != nil { if err := mgr.Start(false); err != nil {
if err == witness.ErrAlreadyRunning { if err == witness.ErrAlreadyRunning {
// Session is healthy (Claude running) - bead state was stale // STATE DIVERGENCE: tmux shows running but bead disagrees.
d.logger.Printf("Witness for %s session healthy (Claude running), skipping restart despite stale bead", rigName) // Don't kill (safety), but nudge the agent to reconcile its state.
d.logger.Printf("STATE DIVERGENCE: Witness for %s bead='%s' but Claude is running in tmux", rigName, beadState)
nudgeMsg := "[DAEMON] State divergence detected: your agent bead shows '" + beadState + "' but you appear running. Please run: bd agent state " + agentID + " running"
if err := d.tmux.NudgeSession(sessionName, nudgeMsg); err != nil {
d.logger.Printf("Warning: failed to nudge Witness %s about state divergence: %v", rigName, err)
}
return return
} }
d.logger.Printf("Error starting witness for %s: %v", rigName, err) d.logger.Printf("Error starting witness for %s: %v", rigName, err)
@@ -522,8 +534,13 @@ func (d *Daemon) ensureRefineryRunning(rigName string) {
if err := mgr.Start(false); err != nil { if err := mgr.Start(false); err != nil {
if err == refinery.ErrAlreadyRunning { if err == refinery.ErrAlreadyRunning {
// Session is healthy (Claude running) - bead state was stale // STATE DIVERGENCE: tmux shows running but bead disagrees.
d.logger.Printf("Refinery for %s session healthy (Claude running), skipping restart despite stale bead", rigName) // Don't kill (safety), but nudge the agent to reconcile its state.
d.logger.Printf("STATE DIVERGENCE: Refinery for %s bead='%s' but Claude is running in tmux", rigName, beadState)
nudgeMsg := "[DAEMON] State divergence detected: your agent bead shows '" + beadState + "' but you appear running. Please run: bd agent state " + agentID + " running"
if err := d.tmux.NudgeSession(sessionName, nudgeMsg); err != nil {
d.logger.Printf("Warning: failed to nudge Refinery %s about state divergence: %v", rigName, err)
}
return return
} }
d.logger.Printf("Error starting refinery for %s: %v", rigName, err) d.logger.Printf("Error starting refinery for %s: %v", rigName, err)